From 6e27bc2b885207d51500b2c42f949ca5073dbe72 Mon Sep 17 00:00:00 2001
From: Richard Biener <rguenther@suse.de>
Date: Thu, 9 Sep 2021 10:52:12 +0200
Subject: [PATCH] Avoid full DOM walk in LIM fill_always_executed_in

This avoids a full DOM walk via get_loop_body_in_dom_order in the
loop body walk of fill_always_executed_in which is often terminating
the walk of a loop body early by integrating the DOM walk of
get_loop_body_in_dom_order with the actual processing done by
fill_always_executed_in.  This trades the fully populated loop
body array with a worklist allocation of the same size and thus
should be a strict improvement over the recursive approach of
get_loop_body_in_dom_order.

2021-09-09  Richard Biener  <rguenther@suse.de>

	* tree-ssa-loop-im.c (fill_always_executed_in_1): Integrate
	DOM walk from get_loop_body_in_dom_order using a worklist
	approach.
---
 gcc/tree-ssa-loop-im.c | 39 +++++++++++++++++++++++++++++++--------
 1 file changed, 31 insertions(+), 8 deletions(-)

diff --git a/gcc/tree-ssa-loop-im.c b/gcc/tree-ssa-loop-im.c
index 01f3fc2..5d68454 100644
--- a/gcc/tree-ssa-loop-im.c
+++ b/gcc/tree-ssa-loop-im.c
@@ -3030,19 +3030,19 @@ do_store_motion (void)
 static void
 fill_always_executed_in_1 (class loop *loop, sbitmap contains_call)
 {
-  basic_block bb = NULL, *bbs, last = NULL;
-  unsigned i;
+  basic_block bb = NULL, last = NULL;
   edge e;
   class loop *inn_loop = loop;
 
   if (ALWAYS_EXECUTED_IN (loop->header) == NULL)
     {
-      bbs = get_loop_body_in_dom_order (loop);
-
-      for (i = 0; i < loop->num_nodes; i++)
+      auto_vec<basic_block, 64> worklist;
+      worklist.reserve_exact (loop->num_nodes);
+      worklist.quick_push (loop->header);
+      do
 	{
 	  edge_iterator ei;
-	  bb = bbs[i];
+	  bb = worklist.pop ();
 
 	  if (!flow_bb_inside_loop_p (inn_loop, bb))
 	    {
@@ -3083,7 +3083,32 @@ fill_always_executed_in_1 (class loop *loop, sbitmap contains_call)
 		 since it might not be finite.  */
 	      inn_loop = bb->loop_father;
 	    }
+
+	  /* Walk the body of LOOP sorted by dominance relation.  Additionally,
+	     if a basic block S dominates the latch, then only blocks dominated
+	     by S are after it.
+	     This is get_loop_body_in_dom_order using a worklist algorithm and
+	     stopping once we are no longer interested in visiting further
+	     blocks.  */
+	  unsigned old_len = worklist.length ();
+	  unsigned postpone = 0;
+	  for (basic_block son = first_dom_son (CDI_DOMINATORS, bb);
+	       son;
+	       son = next_dom_son (CDI_DOMINATORS, son))
+	    {
+	      if (!flow_bb_inside_loop_p (loop, son))
+		continue;
+	      if (dominated_by_p (CDI_DOMINATORS, loop->latch, son))
+		postpone = worklist.length ();
+	      worklist.quick_push (son);
+	    }
+	  if (postpone)
+	    /* Postponing the block that dominates the latch means
+	       processing it last and thus putting it earliest in the
+	       worklist.  */
+	    std::swap (worklist[old_len], worklist[postpone]);
 	}
+      while (!worklist.is_empty ());
 
       while (1)
 	{
@@ -3095,8 +3120,6 @@ fill_always_executed_in_1 (class loop *loop, sbitmap contains_call)
 	    break;
 	  last = get_immediate_dominator (CDI_DOMINATORS, last);
 	}
-
-      free (bbs);
     }
 
   for (loop = loop->inner; loop; loop = loop->next)
-- 
2.7.4