[nvptx] Fix calls to vector and worker routines

author Nathan Sidwell <nathan@codesourcery.com>

Fri, 20 Apr 2018 13:46:07 +0000 (13:46 +0000)

committer Tom de Vries <vries@gcc.gnu.org>

Fri, 20 Apr 2018 13:46:07 +0000 (13:46 +0000)
author Nathan Sidwell <nathan@codesourcery.com>
Fri, 20 Apr 2018 13:46:07 +0000 (13:46 +0000)
committer Tom de Vries <vries@gcc.gnu.org>
Fri, 20 Apr 2018 13:46:07 +0000 (13:46 +0000)
diff --git a/gcc/ChangeLog b/gcc/ChangeLog

index 4461982..7152a19 100644 (file)
--- a/gcc/ChangeLog
+++ b/gcc/ChangeLog
@@ -1,3 +1,15 @@
+2018-04-20  Nathan Sidwell  <nathan@codesourcery.com>
+           Tom de Vries  <tom@codesourcery.com>
+
+       PR target/85445
+       * config/nvptx/nvptx.c (nvptx_emit_forking, nvptx_emit_joining):
+       Emit insns for calls too.
+       (nvptx_find_par): Always look for worker-level predecessor insn.
+       (nvptx_propagate): Add is_call parm, return bool.  Copy frame for
+       calls.
+       (nvptx_vpropagate, nvptx_wpropagate): Adjust.
+       (nvptx_process_pars): Propagate frames for calls.
+
  2018-04-20  H.J. Lu  <hongjiu.lu@intel.com>
  
         PR target/85469
diff --git a/gcc/config/nvptx/nvptx.c b/gcc/config/nvptx/nvptx.c

index 131b495..ca3fea3 100644 (file)
--- a/gcc/config/nvptx/nvptx.c
+++ b/gcc/config/nvptx/nvptx.c
@@ -399,8 +399,7 @@ nvptx_emit_forking (unsigned mask, bool is_call)
          it creates a block with a single successor before entering a
          partitooned region.  That is a good candidate for the end of
          an SESE region.  */
-      if (!is_call)
-       emit_insn (gen_nvptx_fork (op));
+      emit_insn (gen_nvptx_fork (op));
        emit_insn (gen_nvptx_forked (op));
      }
  }
@@ -419,8 +418,7 @@ nvptx_emit_joining (unsigned mask, bool is_call)
        /* Emit joining for all non-call pars to ensure there's a single
          predecessor for the block the join insn ends up in.  This is
          needed for skipping entire loops.  */
-      if (!is_call)
-       emit_insn (gen_nvptx_joining (op));
+      emit_insn (gen_nvptx_joining (op));
        emit_insn (gen_nvptx_join (op));
      }
  }
@@ -3086,8 +3084,7 @@ nvptx_find_par (bb_insn_map_t *map, parallel *par, basic_block block)
             par = new parallel (par, mask);
             par->forked_block = block;
             par->forked_insn = end;
-           if (!(mask & GOMP_DIM_MASK (GOMP_DIM_MAX))
-               && (mask & GOMP_DIM_MASK (GOMP_DIM_WORKER)))
+           if (mask & GOMP_DIM_MASK (GOMP_DIM_WORKER))
               par->fork_insn
                 = nvptx_discover_pre (block, CODE_FOR_nvptx_fork);
           }
@@ -3102,8 +3099,7 @@ nvptx_find_par (bb_insn_map_t *map, parallel *par, basic_block block)
             gcc_assert (par->mask == mask);
             par->join_block = block;
             par->join_insn = end;
-           if (!(mask & GOMP_DIM_MASK (GOMP_DIM_MAX))
-               && (mask & GOMP_DIM_MASK (GOMP_DIM_WORKER)))
+           if (mask & GOMP_DIM_MASK (GOMP_DIM_WORKER))
               par->joining_insn
                 = nvptx_discover_pre (block, CODE_FOR_nvptx_joining);
             par = par->parent;
@@ -3782,29 +3778,34 @@ nvptx_find_sese (auto_vec<basic_block> &blocks, bb_pair_vec_t &regions)
  #undef BB_SET_SESE
  #undef BB_GET_SESE
  
-/* Propagate live state at the start of a partitioned region.  BLOCK
-   provides the live register information, and might not contain
-   INSN. Propagation is inserted just after INSN. RW indicates whether
-   we are reading and/or writing state.  This
+/* Propagate live state at the start of a partitioned region.  IS_CALL
+   indicates whether the propagation is for a (partitioned) call
+   instruction.  BLOCK provides the live register information, and
+   might not contain INSN. Propagation is inserted just after INSN. RW
+   indicates whether we are reading and/or writing state.  This
     separation is needed for worker-level proppagation where we
     essentially do a spill & fill.  FN is the underlying worker
     function to generate the propagation instructions for single
     register.  DATA is user data.
  
-   We propagate the live register set and the entire frame.  We could
-   do better by (a) propagating just the live set that is used within
-   the partitioned regions and (b) only propagating stack entries that
-   are used.  The latter might be quite hard to determine.  */
+   Returns true if we didn't emit any instructions.
+
+   We propagate the live register set for non-calls and the entire
+   frame for calls and non-calls.  We could do better by (a)
+   propagating just the live set that is used within the partitioned
+   regions and (b) only propagating stack entries that are used.  The
+   latter might be quite hard to determine.  */
  
  typedef rtx (*propagator_fn) (rtx, propagate_mask, unsigned, void *);
  
-static void
-nvptx_propagate (basic_block block, rtx_insn *insn, propagate_mask rw,
-                propagator_fn fn, void *data)
+static bool
+nvptx_propagate (bool is_call, basic_block block, rtx_insn *insn,
+                propagate_mask rw, propagator_fn fn, void *data)
  {
    bitmap live = DF_LIVE_IN (block);
    bitmap_iterator iterator;
    unsigned ix;
+  bool empty = true;
  
    /* Copy the frame array.  */
    HOST_WIDE_INT fs = get_frame_size ();
@@ -3816,6 +3817,7 @@ nvptx_propagate (basic_block block, rtx_insn *insn, propagate_mask rw,
        rtx pred = NULL_RTX;
        rtx_code_label *label = NULL;
  
+      empty = false;
        /* The frame size might not be DImode compatible, but the frame
          array's declaration will be.  So it's ok to round up here.  */
        fs = (fs + GET_MODE_SIZE (DImode) - 1) / GET_MODE_SIZE (DImode);
@@ -3862,18 +3864,21 @@ nvptx_propagate (basic_block block, rtx_insn *insn, propagate_mask rw,
        insn = emit_insn_after (cpy, insn);
      }
  
-  /* Copy live registers.  */
-  EXECUTE_IF_SET_IN_BITMAP (live, 0, ix, iterator)
-    {
-      rtx reg = regno_reg_rtx[ix];
+  if (!is_call)
+    /* Copy live registers.  */
+    EXECUTE_IF_SET_IN_BITMAP (live, 0, ix, iterator)
+      {
+       rtx reg = regno_reg_rtx[ix];
  
-      if (REGNO (reg) >= FIRST_PSEUDO_REGISTER)
-       {
-         rtx bcast = fn (reg, rw, 0, data);
+       if (REGNO (reg) >= FIRST_PSEUDO_REGISTER)
+         {
+           rtx bcast = fn (reg, rw, 0, data);
  
-         insn = emit_insn_after (bcast, insn);
-       }
-    }
+           insn = emit_insn_after (bcast, insn);
+           empty = false;
+         }
+      }
+  return empty;
  }
  
  /* Worker for nvptx_vpropagate.  */
@@ -3889,12 +3894,13 @@ vprop_gen (rtx reg, propagate_mask pm,
  }
  
  /* Propagate state that is live at start of BLOCK across the vectors
-   of a single warp.  Propagation is inserted just after INSN.   */
+   of a single warp.  Propagation is inserted just after INSN.
+   IS_CALL and return as for nvptx_propagate.  */
  
-static void
-nvptx_vpropagate (basic_block block, rtx_insn *insn)
+static bool
+nvptx_vpropagate (bool is_call, basic_block block, rtx_insn *insn)
  {
-  nvptx_propagate (block, insn, PM_read_write, vprop_gen, 0);
+  return nvptx_propagate (is_call, block, insn, PM_read_write, vprop_gen, 0);
  }
  
  /* Worker for nvptx_wpropagate.  */
@@ -3930,10 +3936,10 @@ wprop_gen (rtx reg, propagate_mask pm, unsigned rep, void *data_)
  /* Spill or fill live state that is live at start of BLOCK.  PRE_P
     indicates if this is just before partitioned mode (do spill), or
     just after it starts (do fill). Sequence is inserted just after
-   INSN.  */
+   INSN.  IS_CALL and return as for nvptx_propagate.  */
  
-static void
-nvptx_wpropagate (bool pre_p, basic_block block, rtx_insn *insn)
+static bool
+nvptx_wpropagate (bool pre_p, bool is_call, basic_block block, rtx_insn *insn)
  {
    wcast_data_t data;
  
@@ -3941,7 +3947,9 @@ nvptx_wpropagate (bool pre_p, basic_block block, rtx_insn *insn)
    data.offset = 0;
    data.ptr = NULL_RTX;
  
-  nvptx_propagate (block, insn, pre_p ? PM_read : PM_write, wprop_gen, &data);
+  bool empty = nvptx_propagate (is_call, block, insn,
+                               pre_p ? PM_read : PM_write, wprop_gen, &data);
+  gcc_assert (empty == !data.offset);
    if (data.offset)
      {
        /* Stuff was emitted, initialize the base pointer now.  */
@@ -3951,6 +3959,7 @@ nvptx_wpropagate (bool pre_p, basic_block block, rtx_insn *insn)
        if (worker_bcast_size < data.offset)
         worker_bcast_size = data.offset;
      }
+  return empty;
  }
  
  /* Emit a worker-level synchronization barrier.  We use different
@@ -4311,18 +4320,23 @@ nvptx_process_pars (parallel *par)
        inner_mask |= par->inner_mask;
      }
  
-  if (par->mask & GOMP_DIM_MASK (GOMP_DIM_MAX))
-    /* No propagation needed for a call.  */;
-  else if (par->mask & GOMP_DIM_MASK (GOMP_DIM_WORKER))
+  bool is_call = (par->mask & GOMP_DIM_MASK (GOMP_DIM_MAX)) != 0;
+
+  if (par->mask & GOMP_DIM_MASK (GOMP_DIM_WORKER))
      {
-      nvptx_wpropagate (false, par->forked_block, par->forked_insn);
-      nvptx_wpropagate (true, par->forked_block, par->fork_insn);
-      /* Insert begin and end synchronizations.  */
-      emit_insn_before (nvptx_wsync (false), par->forked_insn);
-      emit_insn_before (nvptx_wsync (true), par->join_insn);
+      nvptx_wpropagate (false, is_call, par->forked_block, par->forked_insn);
+      bool empty = nvptx_wpropagate (true, is_call,
+                                    par->forked_block, par->fork_insn);
+
+      if (!empty || !is_call)
+       {
+         /* Insert begin and end synchronizations.  */
+         emit_insn_before (nvptx_wsync (false), par->forked_insn);
+         emit_insn_before (nvptx_wsync (true), par->join_insn);
+       }
      }
    else if (par->mask & GOMP_DIM_MASK (GOMP_DIM_VECTOR))
-    nvptx_vpropagate (par->forked_block, par->forked_insn);
+    nvptx_vpropagate (is_call, par->forked_block, par->forked_insn);
  
    /* Now do siblings.  */
    if (par->next)
diff --git a/libgomp/ChangeLog b/libgomp/ChangeLog

index f68a4a8..1c9fa44 100644 (file)
--- a/libgomp/ChangeLog
+++ b/libgomp/ChangeLog
@@ -1,3 +1,9 @@
+2018-04-20  Nathan Sidwell  <nathan@codesourcery.com>
+           Tom de Vries  <tom@codesourcery.com>
+
+       PR target/85445
+       * testsuite/libgomp.oacc-c++/ref-1.C: New.
+
  2018-04-19  Thomas Schwinge  <thomas@codesourcery.com>
  
         PR libgomp/85463
diff --git a/libgomp/testsuite/libgomp.oacc-c++/ref-1.C b/libgomp/testsuite/libgomp.oacc-c++/ref-1.C

new file mode 100644 (file)

index 0000000..b3aaf0f
--- /dev/null
+++ b/libgomp/testsuite/libgomp.oacc-c++/ref-1.C
@@ -0,0 +1,78 @@
+/* { dg-do run } */
+
+#include <stdio.h>
+
+#pragma acc routine vector
+void __attribute__((noinline, noclone))
+Vector (int *ptr, int n, const int &inc)
+{
+#pragma acc loop vector
+  for (unsigned ix = 0; ix < n; ix++)
+    ptr[ix] += inc;
+}
+
+#pragma acc routine worker
+void __attribute__((noinline, noclone))
+Worker (int *ptr, int m, int n, const int &inc)
+{
+#pragma acc loop worker
+  for (unsigned ix = 0; ix < m; ix++)
+    Vector(ptr + ix * n, n, inc);
+}
+
+int
+main (void)
+{
+  const int n = 32, m = 32;
+
+  int ary[m][n];
+  unsigned ix,  iy;
+
+  for (ix = m; ix--;)
+    for (iy = n; iy--;)
+      ary[ix][iy] = (ix << 8) + iy;
+
+#pragma acc parallel copy(ary)
+  {
+    Worker (&ary[0][0], m, n, 1 << 16);
+  }
+
+  int err = 0;
+
+  for (ix = m; ix--;)
+    for (iy = n; iy--;)
+      if (ary[ix][iy] != ((1 << 16) + (ix << 8) + iy))
+       {
+         printf ("ary[%u][%u] = %x expected %x\n",
+                 ix, iy, ary[ix][iy], ((1 << 16) + (ix << 8) + iy));
+         err++;
+       }
+
+  if (err)
+    {
+      printf ("%d failed\n", err);
+      return 1;
+    }
+
+#pragma acc parallel copy(ary)
+  {
+    Vector (&ary[0][0], m * n, (1 << 24) - (1 << 16));
+  }
+
+  for (ix = m; ix--;)
+    for (iy = n; iy--;)
+      if (ary[ix][iy] != ((1 << 24) + (ix << 8) + iy))
+       {
+         printf ("ary[%u][%u] = %x expected %x\n",
+                 ix, iy, ary[ix][iy], ((1 << 24) + (ix << 8) + iy));
+         err++;
+       }
+
+  if (err)
+    {
+      printf ("%d failed\n", err);
+      return 1;
+    }
+
+  return 0;
+}
author	Nathan Sidwell <nathan@codesourcery.com>
	Fri, 20 Apr 2018 13:46:07 +0000 (13:46 +0000)
committer	Tom de Vries <vries@gcc.gnu.org>
	Fri, 20 Apr 2018 13:46:07 +0000 (13:46 +0000)
gcc/ChangeLog		patch \| blob \| history
gcc/config/nvptx/nvptx.c		patch \| blob \| history
libgomp/ChangeLog		patch \| blob \| history
libgomp/testsuite/libgomp.oacc-c++/ref-1.C	[new file with mode: 0644]	patch \| blob