* tree-ssa-loop-prefetch.c (schedule_prefetches): Cleanup and improve
authorrakdver <rakdver@138bc75d-0d04-0410-961f-82ee72b054a4>
Sun, 12 Nov 2006 20:05:49 +0000 (20:05 +0000)
committerrakdver <rakdver@138bc75d-0d04-0410-961f-82ee72b054a4>
Sun, 12 Nov 2006 20:05:49 +0000 (20:05 +0000)
comments.
(issue_prefetch_ref): Move assignment to write_p out of loop.
(determine_unroll_factor): Do not take PARAM_MAX_UNROLL_TIMES and
SIMULTANEOUS_PREFETCHES into account.
(loop_prefetch_arrays): Do not pass ahead to determine_unroll_factor.
* lambda-code.c (lcm): Renamed to ...
(least_common_multiple): ... and exported.
* tree-flow.h (least_common_multiple): Declare.

git-svn-id: svn+ssh://gcc.gnu.org/svn/gcc/trunk@118730 138bc75d-0d04-0410-961f-82ee72b054a4

gcc/ChangeLog
gcc/lambda-code.c
gcc/testsuite/ChangeLog
gcc/testsuite/gcc.dg/tree-ssa/prefetch-3.c [new file with mode: 0644]
gcc/tree-flow.h
gcc/tree-ssa-loop-prefetch.c

index 845e235..e29ed31 100644 (file)
@@ -1,5 +1,17 @@
 2006-11-12  Zdenek Dvorak <dvorakz@suse.cz>
 
+       * tree-ssa-loop-prefetch.c (schedule_prefetches): Cleanup and improve
+       comments.
+       (issue_prefetch_ref): Move assignment to write_p out of loop.
+       (determine_unroll_factor): Do not take PARAM_MAX_UNROLL_TIMES and
+       SIMULTANEOUS_PREFETCHES into account.
+       (loop_prefetch_arrays): Do not pass ahead to determine_unroll_factor.
+       * lambda-code.c (lcm): Renamed to ...
+       (least_common_multiple): ... and exported.
+       * tree-flow.h (least_common_multiple): Declare.
+
+2006-11-12  Zdenek Dvorak <dvorakz@suse.cz>
+
        * Makefile.in (tree-data-ref.o): Add langhooks.h dependency.
        * tree-ssa-loop-niter.c (derive_constant_upper_bound):  Follow
        ud-chains.  Handle AND_EXPR.
index 2a03fd3..3dfad91 100644 (file)
@@ -442,8 +442,8 @@ lambda_lattice_compute_base (lambda_loopnest nest)
 
 /* Compute the least common multiple of two numbers A and B .  */
 
-static int
-lcm (int a, int b)
+int
+least_common_multiple (int a, int b)
 {
   return (abs (a) * abs (b) / gcd (a, b));
 }
@@ -577,7 +577,7 @@ compute_nest_using_fourier_motzkin (int size,
                {
                  if (A[k][i] < 0)
                    {
-                     multiple = lcm (A[j][i], A[k][i]);
+                     multiple = least_common_multiple (A[j][i], A[k][i]);
                      f1 = multiple / A[j][i];
                      f2 = -1 * multiple / A[k][i];
 
index 5a2b502..71597e2 100644 (file)
@@ -1,3 +1,7 @@
+2006-11-12  Zdenek Dvorak <dvorakz@suse.cz>
+
+       * gcc.dg/tree-ssa/prefetch-3.c: New test.
+
 2006-11-12  Roger Sayle  <roger@eyesopen.com>
 
        PR tree-optimization/13827
diff --git a/gcc/testsuite/gcc.dg/tree-ssa/prefetch-3.c b/gcc/testsuite/gcc.dg/tree-ssa/prefetch-3.c
new file mode 100644 (file)
index 0000000..c5767a7
--- /dev/null
@@ -0,0 +1,22 @@
+/* Prefetching used to prefer nonsensical unroll factor of 5 in this testcase.  */
+
+/* { dg-do compile { target i?86-*-* } } */
+/* { dg-options "-O2 -fprefetch-loop-arrays -march=athlon -msse2 -mfpmath=sse -fdump-tree-aprefetch-details" } */
+
+#define N 1000000
+
+double a[N];
+
+double test(void)
+{
+  unsigned i;
+  double sum = 0;
+
+  for (i = 0; i < N; i += 2)
+    sum += (a[i] * a[i+1]);
+
+  return sum;
+}
+
+/* { dg-final { scan-tree-dump-times "unroll factor 4" 1 "aprefetch" } } */
+/* { dg-final { cleanup-tree-dump "aprefetch" } } */
index daa19d4..a880c32 100644 (file)
@@ -1036,4 +1036,7 @@ void swap_tree_operands (tree, tree *, tree *);
 
 extern void recalculate_used_alone (void);
 extern bool updating_used_alone;
+
+int least_common_multiple (int, int);
+
 #endif /* _TREE_FLOW_H  */
index 41ada26..b40fc0f 100644 (file)
@@ -744,19 +744,21 @@ static bool
 schedule_prefetches (struct mem_ref_group *groups, unsigned unroll_factor,
                     unsigned ahead)
 {
-  unsigned max_prefetches, n_prefetches;
+  unsigned remaining_prefetch_slots, n_prefetches, prefetch_slots;
+  unsigned slots_per_prefetch;
   struct mem_ref *ref;
   bool any = false;
 
-  max_prefetches = (SIMULTANEOUS_PREFETCHES * unroll_factor) / ahead;
-  if (max_prefetches > (unsigned) SIMULTANEOUS_PREFETCHES)
-    max_prefetches = SIMULTANEOUS_PREFETCHES;
+  /* At most SIMULTANEOUS_PREFETCHES should be running at the same time.  */
+  remaining_prefetch_slots = SIMULTANEOUS_PREFETCHES;
 
+  /* The prefetch will run for AHEAD iterations of the original loop, i.e.,
+     AHEAD / UNROLL_FACTOR iterations of the unrolled loop.  In each iteration,
+     it will need a prefetch slot.  */
+  slots_per_prefetch = (ahead + unroll_factor / 2) / unroll_factor;
   if (dump_file && (dump_flags & TDF_DETAILS))
-    fprintf (dump_file, "Max prefetches to issue: %d.\n", max_prefetches);
-
-  if (!max_prefetches)
-    return false;
+    fprintf (dump_file, "Each prefetch instruction takes %u prefetch slots.\n",
+            slots_per_prefetch);
 
   /* For now we just take memory references one by one and issue
      prefetches for as many as possible.  The groups are sorted
@@ -769,16 +771,24 @@ schedule_prefetches (struct mem_ref_group *groups, unsigned unroll_factor,
        if (!should_issue_prefetch_p (ref))
          continue;
 
-       ref->issue_prefetch_p = true;
-
-       /* If prefetch_mod is less then unroll_factor, we need to insert
-          several prefetches for the reference.  */
+       /* If we need to prefetch the reference each PREFETCH_MOD iterations,
+          and we unroll the loop UNROLL_FACTOR times, we need to insert
+          ceil (UNROLL_FACTOR / PREFETCH_MOD) instructions in each
+          iteration.  */
        n_prefetches = ((unroll_factor + ref->prefetch_mod - 1)
                        / ref->prefetch_mod);
-       if (max_prefetches <= n_prefetches)
-         return true;
+       prefetch_slots = n_prefetches * slots_per_prefetch;
+
+       /* If more than half of the prefetches would be lost anyway, do not
+          issue the prefetch.  */
+       if (2 * remaining_prefetch_slots < prefetch_slots)
+         continue;
+
+       ref->issue_prefetch_p = true;
 
-       max_prefetches -= n_prefetches;
+       if (remaining_prefetch_slots <= prefetch_slots)
+         return true;
+       remaining_prefetch_slots -= prefetch_slots;
        any = true;
       }
 
@@ -822,6 +832,7 @@ issue_prefetch_ref (struct mem_ref *ref, unsigned unroll_factor, unsigned ahead)
                  / ref->prefetch_mod);
   addr_base = build_fold_addr_expr_with_type (ref->mem, ptr_type_node);
   addr_base = force_gimple_operand_bsi (&bsi, unshare_expr (addr_base), true, NULL);
+  write_p = ref->write_p ? integer_one_node : integer_zero_node;
 
   for (ap = 0; ap < n_prefetches; ap++)
     {
@@ -832,10 +843,9 @@ issue_prefetch_ref (struct mem_ref *ref, unsigned unroll_factor, unsigned ahead)
       addr = force_gimple_operand_bsi (&bsi, unshare_expr (addr), true, NULL);
 
       /* Create the prefetch instruction.  */
-      write_p = ref->write_p ? integer_one_node : integer_zero_node;
       params = tree_cons (NULL_TREE, addr,
                          tree_cons (NULL_TREE, write_p, NULL_TREE));
-                                
+
       prefetch = build_function_call_expr (built_in_decls[BUILT_IN_PREFETCH],
                                           params);
       bsi_insert_before (&bsi, prefetch, BSI_SAME_STMT);
@@ -888,48 +898,36 @@ should_unroll_loop_p (struct loop *loop, struct tree_niter_desc *desc,
 
 static unsigned
 determine_unroll_factor (struct loop *loop, struct mem_ref_group *refs,
-                        unsigned ahead, unsigned ninsns,
-                        struct tree_niter_desc *desc)
+                        unsigned ninsns, struct tree_niter_desc *desc)
 {
-  unsigned upper_bound, size_factor, constraint_factor;
-  unsigned factor, max_mod_constraint, ahead_factor;
+  unsigned upper_bound;
+  unsigned nfactor, factor, mod_constraint;
   struct mem_ref_group *agp;
   struct mem_ref *ref;
 
-  upper_bound = PARAM_VALUE (PARAM_MAX_UNROLL_TIMES);
-
-  /* First check whether the loop is not too large to unroll.  */
-  size_factor = PARAM_VALUE (PARAM_MAX_UNROLLED_INSNS) / ninsns;
-  if (size_factor <= 1)
+  /* First check whether the loop is not too large to unroll.  We ignore
+     PARAM_MAX_UNROLL_TIMES, because for small loops, it prevented us
+     from unrolling them enough to make exactly one cache line covered by each
+     iteration.  Also, the goal of PARAM_MAX_UNROLL_TIMES is to prevent
+     us from unrolling the loops too many times in cases where we only expect
+     gains from better scheduling and decreasing loop overhead, which is not
+     the case here.  */
+  upper_bound = PARAM_VALUE (PARAM_MAX_UNROLLED_INSNS) / ninsns;
+  if (upper_bound <= 1)
     return 1;
 
-  if (size_factor < upper_bound)
-    upper_bound = size_factor;
-
-  max_mod_constraint = 1;
+  /* Choose the factor so that we may prefetch each cache just once,
+     but bound the unrolling by UPPER_BOUND.  */
+  factor = 1;
   for (agp = refs; agp; agp = agp->next)
     for (ref = agp->refs; ref; ref = ref->next)
-      if (should_issue_prefetch_p (ref)
-         && ref->prefetch_mod > max_mod_constraint)
-       max_mod_constraint = ref->prefetch_mod;
-
-  /* Set constraint_factor as large as needed to be able to satisfy the
-     largest modulo constraint.  */
-  constraint_factor = max_mod_constraint;
-
-  /* If ahead is too large in comparison with the number of available
-     prefetches, unroll the loop as much as needed to be able to prefetch
-     at least partially some of the references in the loop.  */
-  ahead_factor = ((ahead + SIMULTANEOUS_PREFETCHES - 1)
-                 / SIMULTANEOUS_PREFETCHES);
-
-  /* Unroll as much as useful, but bound the code size growth.  */
-  if (constraint_factor < ahead_factor)
-    factor = ahead_factor;
-  else
-    factor = constraint_factor;
-  if (factor > upper_bound)
-    factor = upper_bound;
+      if (should_issue_prefetch_p (ref))
+       {
+         mod_constraint = ref->prefetch_mod;
+         nfactor = least_common_multiple (mod_constraint, factor);
+         if (nfactor <= upper_bound)
+           factor = nfactor;
+       }
 
   if (!should_unroll_loop_p (loop, desc, factor))
     return 1;
@@ -964,8 +962,7 @@ loop_prefetch_arrays (struct loops *loops, struct loop *loop)
      instructions executed per iteration of the loop.  */
   ninsns = tree_num_loop_insns (loop);
   ahead = (PREFETCH_LATENCY + ninsns - 1) / ninsns;
-  unroll_factor = determine_unroll_factor (loop, refs, ahead, ninsns,
-                                          &desc);
+  unroll_factor = determine_unroll_factor (loop, refs, ninsns, &desc);
   if (dump_file && (dump_flags & TDF_DETAILS))
     fprintf (dump_file, "Ahead %d, unroll factor %d\n", ahead, unroll_factor);