From 34fbe3f0946f88828765184ed6581bda62cdf49f Mon Sep 17 00:00:00 2001 From: Jan Hubicka Date: Thu, 5 Dec 2019 19:12:51 +0100 Subject: [PATCH] cgraphclones.c (localize_profile): New function. * cgraphclones.c (localize_profile): New function. (cgraph_node::create_clone): Use it for partial profiles. * common.opt (fprofile-partial-training): New flag. * doc/invoke.texi (-fprofile-partial-training): Document. * ipa-cp.c (update_profiling_info): For partial profiles do not set function profile to zero. * profile.c (compute_branch_probabilities): With partial profile watch if edge count is zero and turn all probabilities to guessed. (compute_branch_probabilities): For partial profiles do not apply profile when entry count is zero. * tree-profile.c (tree_profiling): Only do value_profile_transformations when profile is read. From-SVN: r279013 --- gcc/ChangeLog | 15 +++++++++++++++ gcc/cgraphclones.c | 26 ++++++++++++++++++++++++++ gcc/common.opt | 4 ++++ gcc/doc/invoke.texi | 17 +++++++++++++++-- gcc/ipa-cp.c | 9 +++++++++ gcc/profile.c | 29 +++++++++++++++++++++++------ gcc/tree-profile.c | 3 ++- 7 files changed, 94 insertions(+), 9 deletions(-) diff --git a/gcc/ChangeLog b/gcc/ChangeLog index 08aee89..3cd0538 100644 --- a/gcc/ChangeLog +++ b/gcc/ChangeLog @@ -1,3 +1,18 @@ +2019-12-05 Jan Hubicka + + * cgraphclones.c (localize_profile): New function. + (cgraph_node::create_clone): Use it for partial profiles. + * common.opt (fprofile-partial-training): New flag. + * doc/invoke.texi (-fprofile-partial-training): Document. + * ipa-cp.c (update_profiling_info): For partial profiles do not + set function profile to zero. + * profile.c (compute_branch_probabilities): With partial profile + watch if edge count is zero and turn all probabilities to guessed. + (compute_branch_probabilities): For partial profiles do not apply + profile when entry count is zero. + * tree-profile.c (tree_profiling): Only do value_profile_transformations + when profile is read. 
+ 2019-12-05 Sudakshina Das * tree-vect-loop.c (vect_model_reduction_cost): Remove reduction_type diff --git a/gcc/cgraphclones.c b/gcc/cgraphclones.c index 81c5dfd..f2dfb4e 100644 --- a/gcc/cgraphclones.c +++ b/gcc/cgraphclones.c @@ -307,6 +307,22 @@ dump_callgraph_transformation (const cgraph_node *original, } } +/* Turn profile of N to local profile. */ + +static void +localize_profile (cgraph_node *n) +{ + n->count = n->count.guessed_local (); + for (cgraph_edge *e = n->callees; e; e=e->next_callee) + { + e->count = e->count.guessed_local (); + if (!e->inline_failed) + localize_profile (e->callee); + } + for (cgraph_edge *e = n->indirect_calls; e; e=e->next_callee) + e->count = e->count.guessed_local (); +} + /* Create node representing clone of N executed COUNT times. Decrease the execution counts from original node too. The new clone will have decl set to DECL that may or may not be the same @@ -340,6 +356,7 @@ cgraph_node::create_clone (tree new_decl, profile_count prof_count, cgraph_edge *e; unsigned i; profile_count old_count = count; + bool nonzero = count.ipa ().nonzero_p (); if (new_inlined_to) dump_callgraph_transformation (this, new_inlined_to, "inlining to"); @@ -426,6 +443,15 @@ cgraph_node::create_clone (tree new_decl, profile_count prof_count, if (call_duplication_hook) symtab->call_cgraph_duplication_hooks (this, new_node); + /* With partial train run we do not want to assume that original's + count is zero whenever we redirect all executed edges to clone. + Simply drop profile to local one in this case. 
*/ + if (update_original + && opt_for_fn (decl, flag_profile_partial_training) + && nonzero + && count.ipa_p () + && !count.ipa ().nonzero_p ()) + localize_profile (this); if (!new_inlined_to) dump_callgraph_transformation (this, new_node, suffix); diff --git a/gcc/common.opt b/gcc/common.opt index 404b6aa..7e47953 100644 --- a/gcc/common.opt +++ b/gcc/common.opt @@ -2160,6 +2160,10 @@ fprofile-generate= Common Joined RejectNegative Enable common options for generating profile info for profile feedback directed optimizations, and set -fprofile-dir=. +fprofile-partial-training +Common Report Var(flag_profile_partial_training) Optimization +Do not assume that functions never executed during the train run are cold + fprofile-use Common Var(flag_profile_use) Enable common options for performing profile feedback directed optimizations. diff --git a/gcc/doc/invoke.texi b/gcc/doc/invoke.texi index d165f31..af3c7f2 100644 --- a/gcc/doc/invoke.texi +++ b/gcc/doc/invoke.texi @@ -453,8 +453,8 @@ Objective-C and Objective-C++ Dialects}. -fpartial-inlining -fpeel-loops -fpredictive-commoning @gol -fprefetch-loop-arrays @gol -fprofile-correction @gol --fprofile-use -fprofile-use=@var{path} -fprofile-values @gol --fprofile-reorder-functions @gol +-fprofile-use -fprofile-use=@var{path} -fprofile-partial-training @gol +-fprofile-values -fprofile-reorder-functions @gol -freciprocal-math -free -frename-registers -freorder-blocks @gol -freorder-blocks-algorithm=@var{algorithm} @gol -freorder-blocks-and-partition -freorder-functions @gol @@ -10634,6 +10634,19 @@ default, GCC emits an error message when an inconsistent profile is detected. This option is enabled by @option{-fauto-profile}. +@item -fprofile-partial-training +@opindex fprofile-partial-training +With @code{-fprofile-use} all portions of programs not executed during train +run are optimized aggressively for size rather than speed. In some cases it is +not practical to train all possible hot paths in the program. 
(For +example, program may contain functions specific for a given hardware and +training may not cover all hardware configurations program is run on.) With +@code{-fprofile-partial-training} profile feedback will be ignored for all +functions not executed during the train run leading them to be optimized as if +they were compiled without profile feedback. This leads to better performance +when train run is not representative but also leads to significantly bigger +code. + @item -fprofile-use @itemx -fprofile-use=@var{path} @opindex fprofile-use diff --git a/gcc/ipa-cp.c b/gcc/ipa-cp.c index 693c7a2..14064ae 100644 --- a/gcc/ipa-cp.c +++ b/gcc/ipa-cp.c @@ -4295,6 +4295,15 @@ update_profiling_info (struct cgraph_node *orig_node, remainder = orig_node_count.combine_with_ipa_count (orig_node_count.ipa () - new_sum.ipa ()); + + /* With partial train run we do not want to assume that original's + count is zero whenever we redirect all executed edges to clone. + Simply drop profile to local one in this case. 
*/ + if (remainder.ipa_p () && !remainder.ipa ().nonzero_p () + && orig_node->count.ipa_p () && orig_node->count.ipa ().nonzero_p () + && flag_profile_partial_training) + remainder = remainder.guessed_local (); + new_sum = orig_node_count.combine_with_ipa_count (new_sum); new_node->count = new_sum; orig_node->count = remainder; diff --git a/gcc/profile.c b/gcc/profile.c index 8d39a7d..7e2d7d3 100644 --- a/gcc/profile.c +++ b/gcc/profile.c @@ -635,9 +635,20 @@ compute_branch_probabilities (unsigned cfg_checksum, unsigned lineno_checksum) } if (bb_gcov_count (bb)) { + bool set_to_guessed = false; FOR_EACH_EDGE (e, ei, bb->succs) - e->probability = profile_probability::probability_in_gcov_type - (edge_gcov_count (e), bb_gcov_count (bb)); + { + bool prev_never = e->probability == profile_probability::never (); + e->probability = profile_probability::probability_in_gcov_type + (edge_gcov_count (e), bb_gcov_count (bb)); + if (e->probability == profile_probability::never () + && !prev_never + && flag_profile_partial_training) + set_to_guessed = true; + } + if (set_to_guessed) + FOR_EACH_EDGE (e, ei, bb->succs) + e->probability = e->probability.guessed (); if (bb->index >= NUM_FIXED_BLOCKS && block_ends_with_condjump_p (bb) && EDGE_COUNT (bb->succs) >= 2) @@ -697,17 +708,23 @@ compute_branch_probabilities (unsigned cfg_checksum, unsigned lineno_checksum) } } - if (exec_counts) + if (exec_counts + && (bb_gcov_count (ENTRY_BLOCK_PTR_FOR_FN (cfun)) + || !flag_profile_partial_training)) profile_status_for_fn (cfun) = PROFILE_READ; /* If we have real data, use them! 
*/ if (bb_gcov_count (ENTRY_BLOCK_PTR_FOR_FN (cfun)) || !flag_guess_branch_prob) FOR_ALL_BB_FN (bb, cfun) - bb->count = profile_count::from_gcov_type (bb_gcov_count (bb)); + if (bb_gcov_count (bb) || !flag_profile_partial_training) + bb->count = profile_count::from_gcov_type (bb_gcov_count (bb)); + else + bb->count = profile_count::guessed_zero (); /* If function was not trained, preserve local estimates including statically determined zero counts. */ - else if (profile_status_for_fn (cfun) == PROFILE_READ) + else if (profile_status_for_fn (cfun) == PROFILE_READ + && !flag_profile_partial_training) FOR_ALL_BB_FN (bb, cfun) if (!(bb->count == profile_count::zero ())) bb->count = bb->count.global0 (); @@ -1417,7 +1434,7 @@ branch_prob (bool thunk) /* At this moment we have precise loop iteration count estimates. Record them to loop structure before the profile gets out of date. */ FOR_EACH_LOOP (loop, 0) - if (loop->header->count > 0) + if (loop->header->count > 0 && loop->header->count.reliable_p ()) { gcov_type nit = expected_loop_iterations_unbounded (loop); widest_int bound = gcov_type_to_wide_int (nit); diff --git a/gcc/tree-profile.c b/gcc/tree-profile.c index b4435b9..df60eda 100644 --- a/gcc/tree-profile.c +++ b/gcc/tree-profile.c @@ -785,7 +785,8 @@ tree_profiling (void) if (flag_branch_probabilities && !thunk && flag_profile_values - && flag_value_profile_transformations) + && flag_value_profile_transformations + && profile_status_for_fn (cfun) == PROFILE_READ) gimple_value_profile_transformations (); /* The above could hose dominator info. Currently there is -- 2.7.4