cgraphclones.c (localize_profile): New function.
author     Jan Hubicka <hubicka@ucw.cz>
           Thu, 5 Dec 2019 18:12:51 +0000 (19:12 +0100)
committer  Jan Hubicka <hubicka@gcc.gnu.org>
           Thu, 5 Dec 2019 18:12:51 +0000 (18:12 +0000)
* cgraphclones.c (localize_profile): New function.
(cgraph_node::create_clone): Use it for partial profiles.
* common.opt (fprofile-partial-training): New flag.
* doc/invoke.texi (-fprofile-partial-training): Document.
* ipa-cp.c (update_profiling_info): For partial profiles, do not
set the function profile to zero.
* profile.c (compute_branch_probabilities): With partial profiles,
watch for zero edge counts and turn all probabilities to guessed.
(compute_branch_probabilities): For partial profiles, do not apply
the profile when the entry count is zero.
* tree-profile.c (tree_profiling): Only do value_profile_transformations
when the profile is read.

From-SVN: r279013

gcc/ChangeLog
gcc/cgraphclones.c
gcc/common.opt
gcc/doc/invoke.texi
gcc/ipa-cp.c
gcc/profile.c
gcc/tree-profile.c

diff --git a/gcc/ChangeLog b/gcc/ChangeLog
index 08aee89..3cd0538 100644
@@ -1,3 +1,18 @@
+2019-12-05  Jan Hubicka  <hubicka@ucw.cz>
+
+       * cgraphclones.c (localize_profile): New function.
+       (cgraph_node::create_clone): Use it for partial profiles.
+       * common.opt (fprofile-partial-training): New flag.
+       * doc/invoke.texi (-fprofile-partial-training): Document.
+       * ipa-cp.c (update_profiling_info): For partial profiles, do not
+       set the function profile to zero.
+       * profile.c (compute_branch_probabilities): With partial profiles,
+       watch for zero edge counts and turn all probabilities to guessed.
+       (compute_branch_probabilities): For partial profiles, do not apply
+       the profile when the entry count is zero.
+       * tree-profile.c (tree_profiling): Only do value_profile_transformations
+       when the profile is read.
+
 2019-12-05  Sudakshina Das  <sudi.das@arm.com>
 
        * tree-vect-loop.c (vect_model_reduction_cost): Remove reduction_type
diff --git a/gcc/cgraphclones.c b/gcc/cgraphclones.c
index 81c5dfd..f2dfb4e 100644
@@ -307,6 +307,22 @@ dump_callgraph_transformation (const cgraph_node *original,
     }
 }
 
+/* Turn the profile of N into a local (guessed) profile.  */
+
+static void
+localize_profile (cgraph_node *n)
+{
+  n->count = n->count.guessed_local ();
+  for (cgraph_edge *e = n->callees; e; e = e->next_callee)
+    {
+      e->count = e->count.guessed_local ();
+      if (!e->inline_failed)
+       localize_profile (e->callee);
+    }
+  for (cgraph_edge *e = n->indirect_calls; e; e = e->next_callee)
+    e->count = e->count.guessed_local ();
+}
+
 /* Create node representing clone of N executed COUNT times.  Decrease
    the execution counts from original node too.
    The new clone will have decl set to DECL that may or may not be the same
@@ -340,6 +356,7 @@ cgraph_node::create_clone (tree new_decl, profile_count prof_count,
   cgraph_edge *e;
   unsigned i;
   profile_count old_count = count;
+  bool nonzero = count.ipa ().nonzero_p ();
 
   if (new_inlined_to)
     dump_callgraph_transformation (this, new_inlined_to, "inlining to");
@@ -426,6 +443,15 @@ cgraph_node::create_clone (tree new_decl, profile_count prof_count,
 
   if (call_duplication_hook)
     symtab->call_cgraph_duplication_hooks (this, new_node);
+  /* With a partial train run we do not want to assume the original's
+     count is zero whenever we redirect all executed edges to the clone.
+     Simply drop the profile to a local one in this case.  */
+  if (update_original
+      && opt_for_fn (decl, flag_profile_partial_training)
+      && nonzero
+      && count.ipa_p ()
+      && !count.ipa ().nonzero_p ())
+    localize_profile (this);
 
   if (!new_inlined_to)
     dump_callgraph_transformation (this, new_node, suffix);
diff --git a/gcc/common.opt b/gcc/common.opt
index 404b6aa..7e47953 100644
@@ -2160,6 +2160,10 @@ fprofile-generate=
 Common Joined RejectNegative
 Enable common options for generating profile info for profile feedback directed optimizations, and set -fprofile-dir=.
 
+fprofile-partial-training
+Common Report Var(flag_profile_partial_training) Optimization
+Do not assume that functions never executed during the train run are cold.
+
 fprofile-use
 Common Var(flag_profile_use)
 Enable common options for performing profile feedback directed optimizations.
diff --git a/gcc/doc/invoke.texi b/gcc/doc/invoke.texi
index d165f31..af3c7f2 100644
@@ -453,8 +453,8 @@ Objective-C and Objective-C++ Dialects}.
 -fpartial-inlining  -fpeel-loops  -fpredictive-commoning @gol
 -fprefetch-loop-arrays @gol
 -fprofile-correction @gol
--fprofile-use  -fprofile-use=@var{path}  -fprofile-values @gol
--fprofile-reorder-functions @gol
+-fprofile-use  -fprofile-use=@var{path}  -fprofile-partial-training @gol
+-fprofile-values  -fprofile-reorder-functions @gol
 -freciprocal-math  -free  -frename-registers  -freorder-blocks @gol
 -freorder-blocks-algorithm=@var{algorithm} @gol
 -freorder-blocks-and-partition  -freorder-functions @gol
@@ -10634,6 +10634,19 @@ default, GCC emits an error message when an inconsistent profile is detected.
 
 This option is enabled by @option{-fauto-profile}.
 
+@item -fprofile-partial-training
+@opindex fprofile-partial-training
+With @option{-fprofile-use} all portions of programs not executed during the
+train run are optimized aggressively for size rather than speed.  In some
+cases it is not practical to train all possible hot paths in the program.
+(For example, a program may contain functions specific to a given hardware
+and training may not cover all hardware configurations the program is run
+on.)  With @option{-fprofile-partial-training} profile feedback will be
+ignored for all functions not executed during the train run, leading them to
+be optimized as if they were compiled without profile feedback.  This leads
+to better performance when the train run is not representative, but also
+leads to significantly bigger code.
+
 @item -fprofile-use
 @itemx -fprofile-use=@var{path}
 @opindex fprofile-use
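To make the documented behavior concrete, here is a hypothetical end-to-end use of the option.  The program and command lines are illustrative assumptions; only the flag spellings come from the patch:

// Hypothetical dispatch.cc: only generic_path () runs during training.
// Assumed workflow (illustration, not part of the patch):
//   g++ -O2 -fprofile-generate dispatch.cc -o dispatch
//   ./dispatch        # train run; never exercises hw_specific_path ()
//   g++ -O2 -fprofile-use -fprofile-partial-training dispatch.cc -o dispatch
#include <cstdio>

// Never executed during the train run.  Plain -fprofile-use would treat
// this as cold and optimize it for size; -fprofile-partial-training
// compiles it as if no profile feedback existed.
static void hw_specific_path () { std::puts ("hardware-specific path"); }

static void generic_path () { std::puts ("generic path"); }

int
main (int argc, char **)
{
  if (argc > 1)
    hw_specific_path ();        // untrained, hardware-dependent branch
  else
    generic_path ();            // the branch the train run covered
  return 0;
}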
diff --git a/gcc/ipa-cp.c b/gcc/ipa-cp.c
index 693c7a2..14064ae 100644
@@ -4295,6 +4295,15 @@ update_profiling_info (struct cgraph_node *orig_node,
 
   remainder = orig_node_count.combine_with_ipa_count (orig_node_count.ipa ()
                                                      - new_sum.ipa ());
+
+  /* With a partial train run we do not want to assume the original's
+     count is zero whenever we redirect all executed edges to the clone.
+     Simply drop the profile to a local one in this case.  */
+  if (remainder.ipa_p () && !remainder.ipa ().nonzero_p ()
+      && orig_node->count.ipa_p () && orig_node->count.ipa ().nonzero_p ()
+      && flag_profile_partial_training)
+    remainder = remainder.guessed_local ();
+
   new_sum = orig_node_count.combine_with_ipa_count (new_sum);
   new_node->count = new_sum;
   orig_node->count = remainder;
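The same demotion viewed in isolation — a simplified model of the update_profiling_info () guard above.  The type is an invented stand-in for profile_count, and combine_with_ipa_count is elided:

/* Sketch only: a plain integer plus a quality bit stand in for
   profile_count.  */
#include <algorithm>

struct ipa_count
{
  long value;
  bool ipa;   /* true = measured (IPA) quality; false = local guess.  */
};

static ipa_count
remainder_after_cloning (ipa_count orig, ipa_count moved_to_clone,
                         bool partial_training)
{
  ipa_count rem { std::max (orig.value - moved_to_clone.value, 0L), orig.ipa };

  /* The original really executed, but the clone took every counted
     path: under partial training fall back to a local guess rather
     than pinning the original at a trusted zero.  */
  if (rem.ipa && rem.value == 0 && orig.ipa && orig.value != 0
      && partial_training)
    rem.ipa = false;   /* guessed_local () in the real code.  */
  return rem;
}

int
main ()
{
  ipa_count rem = remainder_after_cloning ({ 100, true }, { 100, true }, true);
  return rem.ipa ? 1 : 0;   /* exits 0: remainder became a guess.  */
}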
diff --git a/gcc/profile.c b/gcc/profile.c
index 8d39a7d..7e2d7d3 100644
@@ -635,9 +635,20 @@ compute_branch_probabilities (unsigned cfg_checksum, unsigned lineno_checksum)
        }
       if (bb_gcov_count (bb))
        {
+         bool set_to_guessed = false;
          FOR_EACH_EDGE (e, ei, bb->succs)
-           e->probability = profile_probability::probability_in_gcov_type
-               (edge_gcov_count (e), bb_gcov_count (bb));
+           {
+             bool prev_never = e->probability == profile_probability::never ();
+             e->probability = profile_probability::probability_in_gcov_type
+                 (edge_gcov_count (e), bb_gcov_count (bb));
+             if (e->probability == profile_probability::never ()
+                 && !prev_never
+                 && flag_profile_partial_training)
+               set_to_guessed = true;
+           }
+         if (set_to_guessed)
+           FOR_EACH_EDGE (e, ei, bb->succs)
+             e->probability = e->probability.guessed ();
          if (bb->index >= NUM_FIXED_BLOCKS
              && block_ends_with_condjump_p (bb)
              && EDGE_COUNT (bb->succs) >= 2)
@@ -697,17 +708,23 @@ compute_branch_probabilities (unsigned cfg_checksum, unsigned lineno_checksum)
        }
     }
 
-  if (exec_counts)
+  if (exec_counts
+      && (bb_gcov_count (ENTRY_BLOCK_PTR_FOR_FN (cfun))
+         || !flag_profile_partial_training))
     profile_status_for_fn (cfun) = PROFILE_READ;
 
   /* If we have real data, use them!  */
   if (bb_gcov_count (ENTRY_BLOCK_PTR_FOR_FN (cfun))
       || !flag_guess_branch_prob)
     FOR_ALL_BB_FN (bb, cfun)
-      bb->count = profile_count::from_gcov_type (bb_gcov_count (bb));
+      if (bb_gcov_count (bb) || !flag_profile_partial_training)
+        bb->count = profile_count::from_gcov_type (bb_gcov_count (bb));
+      else
+       bb->count = profile_count::guessed_zero ();
   /* If function was not trained, preserve local estimates including statically
      determined zero counts.  */
-  else if (profile_status_for_fn (cfun) == PROFILE_READ)
+  else if (profile_status_for_fn (cfun) == PROFILE_READ
+          && !flag_profile_partial_training)
     FOR_ALL_BB_FN (bb, cfun)
       if (!(bb->count == profile_count::zero ()))
         bb->count = bb->count.global0 ();
@@ -1417,7 +1434,7 @@ branch_prob (bool thunk)
       /* At this moment we have precise loop iteration count estimates.
         Record them to loop structure before the profile gets out of date. */
       FOR_EACH_LOOP (loop, 0)
-       if (loop->header->count > 0)
+       if (loop->header->count > 0 && loop->header->count.reliable_p ())
          {
            gcov_type nit = expected_loop_iterations_unbounded (loop);
            widest_int bound = gcov_type_to_wide_int (nit);
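Finally, a rough standalone model of the set_to_guessed logic added to compute_branch_probabilities () above; plain doubles stand in for profile_probability and all names are invented for this sketch:

/* Sketch only: doubles plus a flag stand in for profile_probability.  */
#include <cstddef>
#include <vector>

struct prob
{
  double p;
  bool guessed;
  bool never () const { return p == 0.0; }
};

/* Recompute one block's successor probabilities from gcov counters.
   If a counter-zero edge appears that static prediction had not
   already marked dead, a partial train run may simply have missed it,
   so demote the whole block's outgoing probabilities to guesses.  */
static void
scale_block_succs (std::vector<prob> &succ_prob,
                   const std::vector<long> &edge_count,
                   long bb_count, bool partial_training)
{
  bool set_to_guessed = false;
  for (size_t i = 0; i < succ_prob.size (); i++)
    {
      bool prev_never = succ_prob[i].never ();
      succ_prob[i].p = (double) edge_count[i] / bb_count;
      if (succ_prob[i].never () && !prev_never && partial_training)
        set_to_guessed = true;
    }
  if (set_to_guessed)
    for (prob &q : succ_prob)
      q.guessed = true;
}

int
main ()
{
  /* Two successors; the second was statically predicted reachable but
     its counter stayed zero during the (partial) train run.  */
  std::vector<prob> probs = { { 0.5, false }, { 0.5, false } };
  std::vector<long> counts = { 10, 0 };
  scale_block_succs (probs, counts, 10, true);
  return (probs[0].guessed && probs[1].guessed) ? 0 : 1;
}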
diff --git a/gcc/tree-profile.c b/gcc/tree-profile.c
index b4435b9..df60eda 100644
@@ -785,7 +785,8 @@ tree_profiling (void)
       if (flag_branch_probabilities
          && !thunk
          && flag_profile_values
-         && flag_value_profile_transformations)
+         && flag_value_profile_transformations
+         && profile_status_for_fn (cfun) == PROFILE_READ)
        gimple_value_profile_transformations ();
 
       /* The above could hose dominator info.  Currently there is