From 5955438a2ea81d9fb97d87a913315062ac47672b Mon Sep 17 00:00:00 2001
From: Bin Cheng <bin.cheng@arm.com>
Date: Tue, 23 Jan 2018 16:47:03 +0000
Subject: [PATCH] re PR tree-optimization/82604 (SPEC CPU2006 410.bwaves ~50%
 performance regression with trunk@253679 when ftree-parallelize-loops is
 used)

	PR tree-optimization/82604
	* tree-loop-distribution.c (enum partition_kind): New enum item
	PKIND_PARTIAL_MEMSET.
	(partition_builtin_p): Support above new enum item.
	(generate_code_for_partition): Ditto.
	(compute_access_range): Differentiate cases that equality can be
	proven at all loops, the innermost loops or no loops.
	(classify_builtin_st, classify_builtin_ldst): Adjust call to above
	function.  Set PKIND_PARTIAL_MEMSET for partition appropriately.
	(finalize_partitions, distribute_loop): Don't fuse partition of
	PKIND_PARTIAL_MEMSET kind when distributing 3-level loop nest.
	(prepare_perfect_loop_nest): Distribute 3-level loop nest only if
	parloop is enabled.

From-SVN: r256990
---
 gcc/ChangeLog                |  16 +++++++
 gcc/tree-loop-distribution.c | 100 ++++++++++++++++++++++++++++++-------------
 2 files changed, 87 insertions(+), 29 deletions(-)

diff --git a/gcc/ChangeLog b/gcc/ChangeLog
index 3238e70..576d5a2 100644
--- a/gcc/ChangeLog
+++ b/gcc/ChangeLog
@@ -1,3 +1,19 @@
+2018-01-23  Bin Cheng  <bin.cheng@arm.com>
+
+	PR tree-optimization/82604
+	* tree-loop-distribution.c (enum partition_kind): New enum item
+	PKIND_PARTIAL_MEMSET.
+	(partition_builtin_p): Support above new enum item.
+	(generate_code_for_partition): Ditto.
+	(compute_access_range): Differentiate cases that equality can be
+	proven at all loops, the innermost loops or no loops.
+	(classify_builtin_st, classify_builtin_ldst): Adjust call to above
+	function.  Set PKIND_PARTIAL_MEMSET for partition appropriately.
+	(finalize_partitions, distribute_loop): Don't fuse partition of
+	PKIND_PARTIAL_MEMSET kind when distributing 3-level loop nest.
+	(prepare_perfect_loop_nest): Distribute 3-level loop nest only if
+	parloop is enabled.
+
 2018-01-23  Martin Liska  <mliska@suse.cz>
 
 	* predict.def (PRED_INDIR_CALL): Set probability to PROB_EVEN in
diff --git a/gcc/tree-loop-distribution.c b/gcc/tree-loop-distribution.c
index a3d76e4..67f27ba 100644
--- a/gcc/tree-loop-distribution.c
+++ b/gcc/tree-loop-distribution.c
@@ -584,7 +584,19 @@ build_rdg (struct loop *loop, control_dependences *cd)
 
 /* Kind of distributed loop.  */
 enum partition_kind {
-    PKIND_NORMAL, PKIND_MEMSET, PKIND_MEMCPY, PKIND_MEMMOVE
+    PKIND_NORMAL,
+    /* Partial memset stands for a paritition can be distributed into a loop
+       of memset calls, rather than a single memset call.  It's handled just
+       like a normal parition, i.e, distributed as separate loop, no memset
+       call is generated.
+
+       Note: This is a hacking fix trying to distribute ZERO-ing stmt in a
+       loop nest as deep as possible.  As a result, parloop achieves better
+       parallelization by parallelizing deeper loop nest.  This hack should
+       be unnecessary and removed once distributed memset can be understood
+       and analyzed in data reference analysis.  See PR82604 for more.  */
+    PKIND_PARTIAL_MEMSET,
+    PKIND_MEMSET, PKIND_MEMCPY, PKIND_MEMMOVE
 };
 
 /* Type of distributed loop.  */
@@ -659,7 +671,7 @@ partition_free (partition *partition)
 static bool
 partition_builtin_p (partition *partition)
 {
-  return partition->kind != PKIND_NORMAL;
+  return partition->kind > PKIND_PARTIAL_MEMSET;
 }
 
 /* Returns true if the partition contains a reduction.  */
@@ -1127,6 +1139,7 @@ generate_code_for_partition (struct loop *loop,
   switch (partition->kind)
     {
     case PKIND_NORMAL:
+    case PKIND_PARTIAL_MEMSET:
       /* Reductions all have to be in the last partition.  */
       gcc_assert (!partition_reduction_p (partition)
 		  || !copy_p);
@@ -1399,17 +1412,22 @@ find_single_drs (struct loop *loop, struct graph *rdg, partition *partition,
 
 /* Given data reference DR in LOOP_NEST, this function checks the enclosing
    loops from inner to outer to see if loop's step equals to access size at
-   each level of loop.  Return true if yes; record access base and size in
-   BASE and SIZE; save loop's step at each level of loop in STEPS if it is
-   not null.  For example:
+   each level of loop.  Return 2 if we can prove this at all level loops;
+   record access base and size in BASE and SIZE; save loop's step at each
+   level of loop in STEPS if it is not null.  For example:
 
      int arr[100][100][100];
      for (i = 0; i < 100; i++)       ;steps[2] = 40000
        for (j = 100; j > 0; j--)     ;steps[1] = -400
 	 for (k = 0; k < 100; k++)   ;steps[0] = 4
-	   arr[i][j - 1][k] = 0;     ;base = &arr, size = 4000000.  */
+	   arr[i][j - 1][k] = 0;     ;base = &arr, size = 4000000
 
-static bool
+   Return 1 if we can prove the equality at the innermost loop, but not all
+   level loops.  In this case, no information is recorded.
+
+   Return 0 if no equality can be proven at any level loops.  */
+
+static int
 compute_access_range (loop_p loop_nest, data_reference_p dr, tree *base,
 		      tree *size, vec<tree> *steps = NULL)
 {
@@ -1419,24 +1437,25 @@ compute_access_range (loop_p loop_nest, data_reference_p dr, tree *base,
   tree ref = DR_REF (dr);
   tree access_base = build_fold_addr_expr (ref);
   tree access_size = TYPE_SIZE_UNIT (TREE_TYPE (ref));
+  int res = 0;
 
   do {
       tree scev_fn = analyze_scalar_evolution (loop, access_base);
       if (TREE_CODE (scev_fn) != POLYNOMIAL_CHREC)
-	return false;
+	return res;
 
       access_base = CHREC_LEFT (scev_fn);
       if (tree_contains_chrecs (access_base, NULL))
-	return false;
+	return res;
 
       tree scev_step = CHREC_RIGHT (scev_fn);
       /* Only support constant steps.  */
       if (TREE_CODE (scev_step) != INTEGER_CST)
-	return false;
+	return res;
 
       enum ev_direction access_dir = scev_direction (scev_fn);
       if (access_dir == EV_DIR_UNKNOWN)
-	return false;
+	return res;
 
       if (steps != NULL)
 	steps->safe_push (scev_step);
@@ -1449,7 +1468,11 @@ compute_access_range (loop_p loop_nest, data_reference_p dr, tree *base,
       /* At each level of loop, scev step must equal to access size.  In other
 	 words, DR must access consecutive memory between loop iterations.  */
       if (!operand_equal_p (scev_step, access_size, 0))
-	return false;
+	return res;
+
+      /* Access stride can be computed for data reference at least for the
+	 innermost loop.  */
+      res = 1;
 
       /* Compute DR's execution times in loop.  */
       tree niters = number_of_latch_executions (loop);
@@ -1471,7 +1494,8 @@ compute_access_range (loop_p loop_nest, data_reference_p dr, tree *base,
 
   *base = access_base;
   *size = access_size;
-  return true;
+  /* Access stride can be computed for data reference at each level loop.  */
+  return 2;
 }
 
 /* Allocate and return builtin struct.  Record information like DST_DR,
@@ -1510,8 +1534,14 @@ classify_builtin_st (loop_p loop, partition *partition, data_reference_p dr)
       && flow_bb_inside_loop_p (loop, gimple_bb (SSA_NAME_DEF_STMT (rhs))))
     return;
 
-  if (!compute_access_range (loop, dr, &base, &size))
+  int res = compute_access_range (loop, dr, &base, &size);
+  if (res == 0)
     return;
+  if (res == 1)
+    {
+      partition->kind = PKIND_PARTIAL_MEMSET;
+      return;
+    }
 
   poly_uint64 base_offset;
   unsigned HOST_WIDE_INT const_base_offset;
@@ -1537,11 +1567,16 @@ classify_builtin_ldst (loop_p loop, struct graph *rdg, partition *partition,
   tree base, size, src_base, src_size;
   auto_vec<tree> dst_steps, src_steps;
 
-  /* Compute access range of both load and store.  They much have the same
-     access size.  */
-  if (!compute_access_range (loop, dst_dr, &base, &size, &dst_steps)
-      || !compute_access_range (loop, src_dr, &src_base, &src_size, &src_steps)
-      || !operand_equal_p (size, src_size, 0))
+  /* Compute access range of both load and store.  */
+  int res = compute_access_range (loop, dst_dr, &base, &size, &dst_steps);
+  if (res != 2)
+    return;
+  res = compute_access_range (loop, src_dr, &src_base, &src_size, &src_steps);
+  if (res != 2)
+    return;
+
+  /* They much have the same access size.  */
+  if (!operand_equal_p (size, src_size, 0))
     return;
 
   /* Load and store in loop nest must access memory in the same way, i.e,
@@ -2623,22 +2658,27 @@ finalize_partitions (struct loop *loop, vec<struct partition *> *partitions,
       || alias_ddrs->length () > 0)
     return;
 
-  unsigned num_builtin = 0, num_normal = 0;
+  unsigned num_builtin = 0, num_normal = 0, num_partial_memset = 0;
   bool same_type_p = true;
   enum partition_type type = ((*partitions)[0])->type;
   for (i = 0; partitions->iterate (i, &partition); ++i)
     {
       same_type_p &= (type == partition->type);
-      if (partition->kind != PKIND_NORMAL)
-	num_builtin++;
-      else
-	num_normal++;
+      if (partition_builtin_p (partition))
+	{
+	  num_builtin++;
+	  continue;
+	}
+      num_normal++;
+      if (partition->kind == PKIND_PARTIAL_MEMSET)
+	num_partial_memset++;
     }
 
   /* Don't distribute current loop into too many loops given we don't have
      memory stream cost model.  Be even more conservative in case of loop
      nest distribution.  */
-  if ((same_type_p && num_builtin == 0)
+  if ((same_type_p && num_builtin == 0
+       && (loop->inner == NULL || num_normal != 2 || num_partial_memset != 1))
       || (loop->inner != NULL
 	  && i >= NUM_PARTITION_THRESHOLD && num_normal > 1)
       || (loop->inner == NULL
@@ -2786,7 +2826,7 @@ distribute_loop (struct loop *loop, vec<gimple *> stmts,
   for (i = 0; partitions.iterate (i, &into); ++i)
     {
       bool changed = false;
-      if (partition_builtin_p (into))
+      if (partition_builtin_p (into) || into->kind == PKIND_PARTIAL_MEMSET)
 	continue;
       for (int j = i + 1;
 	   partitions.iterate (j, &partition); ++j)
@@ -2966,10 +3006,12 @@ prepare_perfect_loop_nest (struct loop *loop)
   struct loop *outer = loop_outer (loop);
   tree niters = number_of_latch_executions (loop);
 
-  /* TODO: We only support the innermost 2-level loop nest distribution
+  /* TODO: We only support the innermost 3-level loop nest distribution
      because of compilation time issue for now.  This should be relaxed
-     in the future.  */
-  while (loop->inner == NULL
+     in the future.  Note we only allow 3-level loop nest distribution
+     when parallelizing loops.  */
+  while ((loop->inner == NULL
+	  || (loop->inner->inner == NULL && flag_tree_parallelize_loops > 1))
 	 && loop_outer (outer)
 	 && outer->inner == loop && loop->next == NULL
 	 && single_exit (outer)
-- 
2.7.4