From: Andre Vieira Date: Tue, 18 Jan 2022 15:57:39 +0000 (+0000) Subject: [vect] Add main vectorized loop unrolling X-Git-Tag: upstream/12.2.0~2018 X-Git-Url: http://review.tizen.org/git/?a=commitdiff_plain;h=7ca1582ca60dc84cc3fc46b9cda620e2a0bed1bb;p=platform%2Fupstream%2Fgcc.git [vect] Add main vectorized loop unrolling gcc/ChangeLog: * tree-vect-loop.cc (vect_estimate_min_profitable_iters): Pass new argument suggested_unroll_factor. (vect_analyze_loop_costing): Likewise. (_loop_vec_info::_loop_vec_info): Initialize new member suggested_unroll_factor. (vect_determine_partial_vectors_and_peeling): Make epilogue of unrolled main loop use partial vectors. (vect_analyze_loop_2): Pass and use new argument suggested_unroll_factor. (vect_analyze_loop_1): Change to initialize local suggested_unroll_factor and use it. (vectorizable_reduction): Don't use single_defuse_cycle when unrolling. * tree-vectorizer.h (_loop_vec_info::_loop_vec_info): Add new member suggested_unroll_factor. (vector_costs::vector_costs): Add new member m_suggested_unroll_factor. (vector_costs::suggested_unroll_factor): New getter function. (finish_cost): Set return argument suggested_unroll_factor. --- diff --git a/gcc/tree-vect-loop.cc b/gcc/tree-vect-loop.cc index f1410b0..0fe3529 100644 --- a/gcc/tree-vect-loop.cc +++ b/gcc/tree-vect-loop.cc @@ -154,7 +154,8 @@ along with GCC; see the file COPYING3.
If not see http://gcc.gnu.org/projects/tree-ssa/vectorization.html */ -static void vect_estimate_min_profitable_iters (loop_vec_info, int *, int *); +static void vect_estimate_min_profitable_iters (loop_vec_info, int *, int *, + unsigned *); static stmt_vec_info vect_is_simple_reduction (loop_vec_info, stmt_vec_info, bool *, bool *); @@ -831,6 +832,7 @@ _loop_vec_info::_loop_vec_info (class loop *loop_in, vec_info_shared *shared) skip_main_loop_edge (nullptr), skip_this_loop_edge (nullptr), reusable_accumulators (), + suggested_unroll_factor (1), max_vectorization_factor (0), mask_skip_niters (NULL_TREE), rgroup_compare_type (NULL_TREE), @@ -1834,7 +1836,8 @@ vect_known_niters_smaller_than_vf (loop_vec_info loop_vinfo) definitely no, or -1 if it's worth retrying. */ static int -vect_analyze_loop_costing (loop_vec_info loop_vinfo) +vect_analyze_loop_costing (loop_vec_info loop_vinfo, + unsigned *suggested_unroll_factor) { class loop *loop = LOOP_VINFO_LOOP (loop_vinfo); unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo); @@ -1868,7 +1871,8 @@ vect_analyze_loop_costing (loop_vec_info loop_vinfo) int min_profitable_iters, min_profitable_estimate; vect_estimate_min_profitable_iters (loop_vinfo, &min_profitable_iters, - &min_profitable_estimate); + &min_profitable_estimate, + suggested_unroll_factor); if (min_profitable_iters < 0) { @@ -2152,10 +2156,16 @@ vect_determine_partial_vectors_and_peeling (loop_vec_info loop_vinfo, vectors to the epilogue, with the main loop continuing to operate on full vectors. + If we are unrolling we also do not want to use partial vectors. This + is to avoid the overhead of generating multiple masks and also to + avoid having to execute entire iterations of FALSE masked instructions + when dealing with one or less full iterations. + ??? We could then end up failing to use partial vectors if we decide to peel iterations into a prologue, and if the main loop then ends up processing fewer than VF iterations. 
*/ - if (param_vect_partial_vector_usage == 1 + if ((param_vect_partial_vector_usage == 1 + || loop_vinfo->suggested_unroll_factor > 1) && !LOOP_VINFO_EPILOGUE_P (loop_vinfo) && !vect_known_niters_smaller_than_vf (loop_vinfo)) LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P (loop_vinfo) = true; @@ -2222,7 +2232,8 @@ vect_determine_partial_vectors_and_peeling (loop_vec_info loop_vinfo, for it. The different analyses will record information in the loop_vec_info struct. */ static opt_result -vect_analyze_loop_2 (loop_vec_info loop_vinfo, bool &fatal) +vect_analyze_loop_2 (loop_vec_info loop_vinfo, bool &fatal, + unsigned *suggested_unroll_factor) { opt_result ok = opt_result::success (); int res; @@ -2382,6 +2393,12 @@ vect_analyze_loop_2 (loop_vec_info loop_vinfo, bool &fatal) set of rgroups. */ gcc_assert (LOOP_VINFO_MASKS (loop_vinfo).is_empty ()); + /* Apply the suggested unrolling factor, this was determined by the backend + during finish_cost the first time we ran the analysis for this + vector mode. */ + if (loop_vinfo->suggested_unroll_factor > 1) + LOOP_VINFO_VECT_FACTOR (loop_vinfo) *= loop_vinfo->suggested_unroll_factor; + /* This is the point where we can re-start analysis with SLP forced off. */ start_over: @@ -2573,7 +2590,7 @@ start_over: return ok; /* Check the costings of the loop make vectorizing worthwhile. */ - res = vect_analyze_loop_costing (loop_vinfo); + res = vect_analyze_loop_costing (loop_vinfo, suggested_unroll_factor); if (res < 0) { ok = opt_result::failure_at (vect_location, @@ -2851,15 +2868,38 @@ vect_analyze_loop_1 (class loop *loop, vec_info_shared *shared, machine_mode vector_mode = vector_modes[mode_i]; loop_vinfo->vector_mode = vector_mode; + unsigned int suggested_unroll_factor = 1; /* Run the main analysis.
*/ - opt_result res = vect_analyze_loop_2 (loop_vinfo, fatal); + opt_result res = vect_analyze_loop_2 (loop_vinfo, fatal, + &suggested_unroll_factor); if (dump_enabled_p ()) dump_printf_loc (MSG_NOTE, vect_location, "***** Analysis %s with vector mode %s\n", res ? "succeeded" : " failed", GET_MODE_NAME (loop_vinfo->vector_mode)); + if (!main_loop_vinfo && suggested_unroll_factor > 1) + { + if (dump_enabled_p ()) + dump_printf_loc (MSG_NOTE, vect_location, + "***** Re-trying analysis for unrolling" + " with unroll factor %d.\n", + suggested_unroll_factor); + loop_vec_info unroll_vinfo + = vect_create_loop_vinfo (loop, shared, loop_form_info, main_loop_vinfo); + unroll_vinfo->vector_mode = vector_mode; + unroll_vinfo->suggested_unroll_factor = suggested_unroll_factor; + opt_result new_res = vect_analyze_loop_2 (unroll_vinfo, fatal, NULL); + if (new_res) + { + delete loop_vinfo; + loop_vinfo = unroll_vinfo; + } + else + delete unroll_vinfo; + } + /* Remember the autodetected vector mode. */ if (vector_mode == VOIDmode) autodetected_vector_mode = loop_vinfo->vector_mode; @@ -3860,7 +3900,8 @@ vect_get_known_peeling_cost (loop_vec_info loop_vinfo, int peel_iters_prologue, static void vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo, int *ret_min_profitable_niters, - int *ret_min_profitable_estimate) + int *ret_min_profitable_estimate, + unsigned *suggested_unroll_factor) { int min_profitable_iters; int min_profitable_estimate; @@ -4227,7 +4268,22 @@ vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo, /* Complete the target-specific cost calculations. 
*/ finish_cost (loop_vinfo->vector_costs, loop_vinfo->scalar_costs, - &vec_prologue_cost, &vec_inside_cost, &vec_epilogue_cost); + &vec_prologue_cost, &vec_inside_cost, &vec_epilogue_cost, + suggested_unroll_factor); + + if (suggested_unroll_factor && *suggested_unroll_factor > 1 + && LOOP_VINFO_MAX_VECT_FACTOR (loop_vinfo) != MAX_VECTORIZATION_FACTOR + && !known_le (LOOP_VINFO_VECT_FACTOR (loop_vinfo) * + *suggested_unroll_factor, + LOOP_VINFO_MAX_VECT_FACTOR (loop_vinfo))) + { + if (dump_enabled_p ()) + dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, + "can't unroll as unrolled vectorization factor larger" + " than maximum vectorization factor: %d\n", + LOOP_VINFO_MAX_VECT_FACTOR (loop_vinfo)); + *suggested_unroll_factor = 1; + } vec_outside_cost = (int)(vec_prologue_cost + vec_epilogue_cost); @@ -7194,10 +7250,13 @@ vectorizable_reduction (loop_vec_info loop_vinfo, This only works when we see both the reduction PHI and its only consumer in vectorizable_reduction and there are no intermediate stmts - participating. */ + participating. When unrolling we want each unrolled iteration to have its + own reduction accumulator since one of the main goals of unrolling a + reduction is to reduce the aggregate loop-carried latency. */ if (ncopies > 1 && (STMT_VINFO_RELEVANT (stmt_info) <= vect_used_only_live) - && reduc_chain_length == 1) + && reduc_chain_length == 1 + && loop_vinfo->suggested_unroll_factor == 1) single_defuse_cycle = true; if (single_defuse_cycle || lane_reduc_code_p) diff --git a/gcc/tree-vectorizer.h b/gcc/tree-vectorizer.h index 86f90ae..524c86c 100644 --- a/gcc/tree-vectorizer.h +++ b/gcc/tree-vectorizer.h @@ -642,6 +642,13 @@ public: about the reductions that generated them. */ hash_map reusable_accumulators; + /* The number of times that the target suggested we unroll the vector loop + in order to promote more ILP. 
This value will be used to re-analyze the + loop for vectorization and if successful the value will be folded into + vectorization_factor (and therefore exactly divides + vectorization_factor). */ + unsigned int suggested_unroll_factor; + /* Maximum runtime vectorization factor, or MAX_VECTORIZATION_FACTOR if there is no particular limit. */ unsigned HOST_WIDE_INT max_vectorization_factor; @@ -1465,6 +1472,7 @@ public: unsigned int epilogue_cost () const; unsigned int outside_cost () const; unsigned int total_cost () const; + unsigned int suggested_unroll_factor () const; protected: unsigned int record_stmt_cost (stmt_vec_info, vect_cost_model_location, @@ -1484,6 +1492,9 @@ protected: /* The costs of the three regions, indexed by vect_cost_model_location. */ unsigned int m_costs[3]; + /* The suggested unrolling factor determined at finish_cost. */ + unsigned int m_suggested_unroll_factor; + /* True if finish_cost has been called. */ bool m_finished; }; @@ -1496,6 +1507,7 @@ vector_costs::vector_costs (vec_info *vinfo, bool costing_for_scalar) : m_vinfo (vinfo), m_costing_for_scalar (costing_for_scalar), m_costs (), + m_suggested_unroll_factor(1), m_finished (false) { } @@ -1544,6 +1556,15 @@ vector_costs::total_cost () const return body_cost () + outside_cost (); } +/* Return the suggested unroll factor. 
*/ + +inline unsigned int +vector_costs::suggested_unroll_factor () const +{ + gcc_checking_assert (m_finished); + return m_suggested_unroll_factor; +} + #define VECT_MAX_COST 1000 /* The maximum number of intermediate steps required in multi-step type @@ -1720,12 +1741,14 @@ add_stmt_cost (vector_costs *costs, stmt_info_for_cost *i) static inline void finish_cost (vector_costs *costs, const vector_costs *scalar_costs, unsigned *prologue_cost, unsigned *body_cost, - unsigned *epilogue_cost) + unsigned *epilogue_cost, unsigned *suggested_unroll_factor = NULL) { costs->finish_cost (scalar_costs); *prologue_cost = costs->prologue_cost (); *body_cost = costs->body_cost (); *epilogue_cost = costs->epilogue_cost (); + if (suggested_unroll_factor) + *suggested_unroll_factor = costs->suggested_unroll_factor (); } inline void