loop: Fix profile updates after unrolling [PR102385]
authorRichard Sandiford <richard.sandiford@arm.com>
Mon, 4 Oct 2021 22:55:43 +0000 (23:55 +0100)
committerRichard Sandiford <richard.sandiford@arm.com>
Fri, 8 Oct 2021 12:17:47 +0000 (13:17 +0100)
In g:62acc72a957b5614 I'd stopped the unroller from using
an epilogue loop in cases where the iteration count was
known to be a multiple of the unroll factor.  The epilogue
and non-epilogue cases still shared this (preexisting) code
to update the edge frequencies:

  basic_block exit_bb = single_pred (loop->latch);
  new_exit = find_edge (exit_bb, rest);
  new_exit->probability = profile_probability::always ()
                               .apply_scale (1, new_est_niter + 1);
  [etc]

But of course (in hindsight) that only makes sense for the
epilogue case, where we've already moved the main loop's exit edge
to be a sibling of the latch edge.  For the non-epilogue case,
the exit edge stays (and needs to stay) in its original position.

I don't really understand what the code is trying to do for
the epilogue case.  It has:

      /* Ensure that the frequencies in the loop match the new estimated
 number of iterations, and change the probability of the new
 exit edge.  */

      profile_count freq_h = loop->header->count;
      profile_count freq_e = (loop_preheader_edge (loop))->count ();
      if (freq_h.nonzero_p ())
{
          ...
  scale_loop_frequencies (loop, freq_e.probability_in (freq_h));
}

Here, freq_e.probability_in (freq_h) is freq_e / freq_h, so for the
header block, this has the effect of:

  new header count = freq_h * (freq_e / freq_h)

i.e. we say that the header executes exactly as often as the
preheader edge, which would only make sense if the loop never
iterates.  Also, after setting the probability of the nonexit edge
(correctly) to new_est_niter / (new_est_niter + 1), the code does:

    scale_bbs_frequencies (&loop->latch, 1, prob);

for this new probability.  I think that only makes sense if the
nonexit edge was previously unconditional (100%).  But the code
carefully preserved the probability of the original exit edge
when creating the new one.

All I'm trying to do here though is fix the mess I created
and get the probabilities right for the non-epilogue case.
Things are simpler there since we don't have to worry about
loop versioning.  Hopefully the comments explain the approach.

The function's current interface implies that it can cope with
multiple exit edges and that the function only needs the iteration
count relative to one of those edges in order to work correctly.
In practice that's not the case: it assumes there is exactly one
exit edge and all current callers also ensure that the exit test
dominates the latch.  I think the function is easier to follow
if we remove the implied generality.

gcc/
PR tree-optimization/102385
* predict.h (change_edge_frequency): Declare.
* predict.c (change_edge_frequency): New function.
* tree-ssa-loop-manip.h (tree_transform_and_unroll_loop): Remove
edge argument.
(tree_unroll_loop): Likewise.
* gimple-loop-jam.c (tree_loop_unroll_and_jam): Update accordingly.
* tree-predcom.c (pcom_worker::tree_predictive_commoning_loop):
Likewise.
* tree-ssa-loop-prefetch.c (loop_prefetch_arrays): Likewise.
* tree-ssa-loop-manip.c (tree_unroll_loop): Likewise.
(tree_transform_and_unroll_loop): Likewise.  Use single_dom_exit
to retrieve the exit edges.  Make all the old profile update code
conditional on !single_loop_p -- the case it was written for --
and use a different approach for the single-loop case.

gcc/testsuite/
* gcc.dg/pr102385.c: New test.

gcc/gimple-loop-jam.c
gcc/predict.c
gcc/predict.h
gcc/testsuite/gcc.dg/pr102385.c [new file with mode: 0644]
gcc/tree-predcom.c
gcc/tree-ssa-loop-manip.c
gcc/tree-ssa-loop-manip.h
gcc/tree-ssa-loop-prefetch.c

index d212e39..611d380 100644 (file)
@@ -587,8 +587,7 @@ tree_loop_unroll_and_jam (void)
                             "applying unroll and jam with factor %d\n",
                             unroll_factor);
          initialize_original_copy_tables ();
-         tree_unroll_loop (outer, unroll_factor, single_dom_exit (outer),
-                           &desc);
+         tree_unroll_loop (outer, unroll_factor, &desc);
          free_original_copy_tables ();
          fuse_loops (outer->inner);
          todo |= TODO_cleanup_cfg;
index d9c7249..68b1113 100644 (file)
@@ -4481,6 +4481,43 @@ force_edge_cold (edge e, bool impossible)
     }
 }
 
+/* Change E's probability to NEW_E_PROB, redistributing the probabilities
+   of other outgoing edges proportionally.
+
+   Note that this function does not change the profile counts of any
+   basic blocks.  The caller must do that instead, using whatever
+   information it has about the region that needs updating.  */
+
+void
+change_edge_frequency (edge e, profile_probability new_e_prob)
+{
+  profile_probability old_e_prob = e->probability;
+  profile_probability old_other_prob = old_e_prob.invert ();
+  profile_probability new_other_prob = new_e_prob.invert ();
+
+  e->probability = new_e_prob;
+  profile_probability cumulative_prob = new_e_prob;
+
+  unsigned int num_other = EDGE_COUNT (e->src->succs) - 1;
+  edge other_e;
+  edge_iterator ei;
+  FOR_EACH_EDGE (other_e, ei, e->src->succs)
+    if (other_e != e)
+      {
+       num_other -= 1;
+       if (num_other == 0)
+         /* Ensure that the probabilities add up to 1 without
+            rounding error.  */
+         other_e->probability = cumulative_prob.invert ();
+       else
+         {
+           other_e->probability /= old_other_prob;
+           other_e->probability *= new_other_prob;
+           cumulative_prob += other_e->probability;
+         }
+      }
+}
+
 #if CHECKING_P
 
 namespace selftest {
index 8860caf..4df51bd 100644 (file)
@@ -100,6 +100,7 @@ extern void rebuild_frequencies (void);
 extern void report_predictor_hitrates (void);
 extern void force_edge_cold (edge, bool);
 extern void propagate_unlikely_bbs_forward (void);
+extern void change_edge_frequency (edge, profile_probability);
 
 extern void add_reg_br_prob_note (rtx_insn *, profile_probability);
 
diff --git a/gcc/testsuite/gcc.dg/pr102385.c b/gcc/testsuite/gcc.dg/pr102385.c
new file mode 100644 (file)
index 0000000..1339540
--- /dev/null
@@ -0,0 +1,14 @@
+/* { dg-options "-Wall -Wextra -O2 -fno-toplevel-reorder -fno-tree-ch -fno-tree-dce -fno-tree-dominator-opts -fno-tree-dse -fno-tree-loop-ivcanon -fpredictive-commoning" } */
+
+short a, b;
+int c[9];
+void(d)() {}
+void e() {
+  a = 0;
+  for (; a <= 4; a++) {
+    short *f = &b;
+    c[a] || (*f = 0);
+    d(c[a + 2]);
+  }
+}
+int main() {return 0;}
index ce1f08f..208e755 100644 (file)
@@ -3398,8 +3398,7 @@ pcom_worker::tree_predictive_commoning_loop (bool allow_unroll_p)
         the phi nodes in execute_pred_commoning_cbck.  A bit hacky.  */
       replace_phis_by_defined_names (m_chains);
 
-      edge exit = single_dom_exit (m_loop);
-      tree_transform_and_unroll_loop (m_loop, unroll_factor, exit, &desc,
+      tree_transform_and_unroll_loop (m_loop, unroll_factor, &desc,
                                      execute_pred_commoning_cbck, &dta);
       eliminate_temp_copies (m_loop, tmp_vars);
     }
index c7a2f67..2d576bf 100644 (file)
@@ -1182,8 +1182,9 @@ niter_for_unrolled_loop (class loop *loop, unsigned factor)
   return new_est_niter;
 }
 
-/* Unroll LOOP FACTOR times.  DESC describes number of iterations of LOOP.
-   EXIT is the exit of the loop to that DESC corresponds.
+/* Unroll LOOP FACTOR times.  LOOP is known to have a single exit edge
+   whose source block dominates the latch.  DESC describes the number of
+   iterations of LOOP.
 
    If N is number of iterations of the loop and MAY_BE_ZERO is the condition
    under that loop exits in the first iteration even if N != 0,
@@ -1241,7 +1242,7 @@ niter_for_unrolled_loop (class loop *loop, unsigned factor)
 
 void
 tree_transform_and_unroll_loop (class loop *loop, unsigned factor,
-                               edge exit, class tree_niter_desc *desc,
+                               class tree_niter_desc *desc,
                                transform_callback transform,
                                void *data)
 {
@@ -1265,10 +1266,11 @@ tree_transform_and_unroll_loop (class loop *loop, unsigned factor,
 
   gcond *exit_if = nullptr;
   class loop *new_loop = nullptr;
-  basic_block rest;
   edge new_exit;
   if (!single_loop_p)
     {
+      edge exit = single_dom_exit (loop);
+
       /* The values for scales should keep profile consistent, and somewhat
         close to correct.
 
@@ -1291,7 +1293,7 @@ tree_transform_and_unroll_loop (class loop *loop, unsigned factor,
 
       /* Prepare the cfg and update the phi nodes.  Move the loop exit to the
         loop latch (and make its condition dummy, for the moment).  */
-      rest = loop_preheader_edge (new_loop)->src;
+      basic_block rest = loop_preheader_edge (new_loop)->src;
       edge precond_edge = single_pred_edge (rest);
       split_edge (loop_latch_edge (loop));
       basic_block exit_bb = single_pred (loop->latch);
@@ -1373,10 +1375,7 @@ tree_transform_and_unroll_loop (class loop *loop, unsigned factor,
       remove_path (exit);
     }
   else
-    {
-      new_exit = exit;
-      rest = exit->dest;
-    }
+    new_exit = single_dom_exit (loop);
 
   /* Transform the loop.  */
   if (transform)
@@ -1401,6 +1400,7 @@ tree_transform_and_unroll_loop (class loop *loop, unsigned factor,
     }
   update_ssa (TODO_update_ssa);
 
+  new_exit = single_dom_exit (loop);
   if (!single_loop_p)
     {
       /* Ensure that the frequencies in the loop match the new estimated
@@ -1417,29 +1417,24 @@ tree_transform_and_unroll_loop (class loop *loop, unsigned factor,
            freq_e = freq_e.force_nonzero ();
          scale_loop_frequencies (loop, freq_e.probability_in (freq_h));
        }
-    }
 
-  basic_block exit_bb = single_pred (loop->latch);
-  new_exit = find_edge (exit_bb, rest);
-  new_exit->probability = profile_probability::always ()
-                               .apply_scale (1, new_est_niter + 1);
+      basic_block rest = new_exit->dest;
+      new_exit->probability = profile_probability::always ()
+       .apply_scale (1, new_est_niter + 1);
 
-  if (!single_loop_p)
-    rest->count += new_exit->count ();
+      rest->count += new_exit->count ();
 
-  edge new_nonexit = single_pred_edge (loop->latch);
-  profile_probability prob = new_nonexit->probability;
-  new_nonexit->probability = new_exit->probability.invert ();
-  prob = new_nonexit->probability / prob;
-  if (prob.initialized_p ())
-    scale_bbs_frequencies (&loop->latch, 1, prob);
+      edge new_nonexit = single_pred_edge (loop->latch);
+      profile_probability prob = new_nonexit->probability;
+      new_nonexit->probability = new_exit->probability.invert ();
+      prob = new_nonexit->probability / prob;
+      if (prob.initialized_p ())
+       scale_bbs_frequencies (&loop->latch, 1, prob);
 
-  if (!single_loop_p)
-    {
       /* Finally create the new counter for number of iterations and add
         the new exit instruction.  */
       tree ctr_before, ctr_after;
-      gimple_stmt_iterator bsi = gsi_last_nondebug_bb (exit_bb);
+      gimple_stmt_iterator bsi = gsi_last_nondebug_bb (new_exit->src);
       exit_if = as_a <gcond *> (gsi_stmt (bsi));
       create_iv (exit_base, exit_step, NULL_TREE, loop,
                 &bsi, false, &ctr_before, &ctr_after);
@@ -1448,6 +1443,67 @@ tree_transform_and_unroll_loop (class loop *loop, unsigned factor,
       gimple_cond_set_rhs (exit_if, exit_bound);
       update_stmt (exit_if);
     }
+  else
+    {
+      /* gimple_duplicate_loop_to_header_edge has adjusted the loop body's
+        original profile counts in line with the unroll factor.  However,
+        the old counts might not have been consistent with the old
+        iteration count.
+
+        Therefore, if the iteration count is known exactly, make sure that the
+        profile counts of the loop header (and any other blocks that might be
+        executed in the final iteration) are consistent with the combination
+        of (a) the incoming profile count and (b) the new iteration count.  */
+      profile_count in_count = loop_preheader_edge (loop)->count ();
+      profile_count old_header_count = loop->header->count;
+      if (in_count.nonzero_p ()
+         && old_header_count.nonzero_p ()
+         && TREE_CODE (desc->niter) == INTEGER_CST)
+       {
+         /* The + 1 converts latch counts to iteration counts.  */
+         profile_count new_header_count
+           = (in_count.apply_scale (new_est_niter + 1, 1));
+         basic_block *body = get_loop_body (loop);
+         scale_bbs_frequencies_profile_count (body, loop->num_nodes,
+                                              new_header_count,
+                                              old_header_count);
+         free (body);
+       }
+
+      /* gimple_duplicate_loop_to_header_edge discarded FACTOR - 1
+        exit edges and adjusted the loop body's profile counts for the
+        new probabilities of the remaining non-exit edges.  However,
+        the remaining exit edge still has the same probability as it
+        did before, even though it is now more likely.
+
+        Therefore, all blocks executed after a failed exit test now have
+        a profile count that is too high, and the sum of the profile counts
+        for the header's incoming edges is greater than the profile count
+        of the header itself.
+
+        Adjust the profile counts of all code in the loop body after
+        the exit test so that the sum of the counts on entry to the
+        header agree.  */
+      profile_count old_latch_count = loop_latch_edge (loop)->count ();
+      profile_count new_latch_count = loop->header->count - in_count;
+      if (old_latch_count.nonzero_p () && new_latch_count.nonzero_p ())
+       scale_dominated_blocks_in_loop (loop, new_exit->src, new_latch_count,
+                                       old_latch_count);
+
+      /* Set the probability of the exit edge based on NEW_EST_NITER
+        (which estimates latch counts rather than iteration counts).
+        Update the probabilities of other edges to match.
+
+        If the profile counts are large enough to give the required
+        precision, the updates above will have made
+
+           e->dest->count / e->src->count ~= new e->probability
+
+        for every outgoing edge e of NEW_EXIT->src.  */
+      profile_probability new_exit_prob = profile_probability::always ()
+       .apply_scale (1, new_est_niter + 1);
+      change_edge_frequency (new_exit, new_exit_prob);
+    }
 
   checking_verify_flow_info ();
   checking_verify_loop_structure ();
@@ -1461,10 +1517,9 @@ tree_transform_and_unroll_loop (class loop *loop, unsigned factor,
 
 void
 tree_unroll_loop (class loop *loop, unsigned factor,
-                 edge exit, class tree_niter_desc *desc)
+                 class tree_niter_desc *desc)
 {
-  tree_transform_and_unroll_loop (loop, factor, exit, desc,
-                                 NULL, NULL);
+  tree_transform_and_unroll_loop (loop, factor, desc, NULL, NULL);
 }
 
 /* Rewrite the phi node at position PSI in function of the main
index 86fc118..5e2d7fa 100644 (file)
@@ -50,10 +50,9 @@ extern bool can_unroll_loop_p (class loop *loop, unsigned factor,
                               class tree_niter_desc *niter);
 extern gcov_type niter_for_unrolled_loop (class loop *, unsigned);
 extern void tree_transform_and_unroll_loop (class loop *, unsigned,
-                                           edge, class tree_niter_desc *,
+                                           tree_niter_desc *,
                                            transform_callback, void *);
-extern void tree_unroll_loop (class loop *, unsigned,
-                             edge, class tree_niter_desc *);
+extern void tree_unroll_loop (class loop *, unsigned, tree_niter_desc *);
 extern tree canonicalize_loop_ivs (class loop *, tree *, bool);
 extern unsigned int loop_invariant_motion_in_fun (function *, bool);
 
index 85977e2..04b2b33 100644 (file)
@@ -1962,8 +1962,7 @@ loop_prefetch_arrays (class loop *loop)
      iterations so that we do not issue superfluous prefetches.  */
   if (unroll_factor != 1)
     {
-      tree_unroll_loop (loop, unroll_factor,
-                       single_dom_exit (loop), &desc);
+      tree_unroll_loop (loop, unroll_factor, &desc);
       unrolled = true;
     }