[amdgcn] Scale number of threads/workers with VGPR usage

author Kwok Cheung Yeung <kcy@codesourcery.com>

Fri, 31 Jan 2020 14:53:30 +0000 (06:53 -0800)

committer Kwok Cheung Yeung <kcy@codesourcery.com>

Fri, 31 Jan 2020 15:13:05 +0000 (07:13 -0800)
author Kwok Cheung Yeung <kcy@codesourcery.com>
Fri, 31 Jan 2020 14:53:30 +0000 (06:53 -0800)
committer Kwok Cheung Yeung <kcy@codesourcery.com>
Fri, 31 Jan 2020 15:13:05 +0000 (07:13 -0800)
diff --git a/gcc/ChangeLog b/gcc/ChangeLog

index f44a09d..78a8310 100644 (file)
--- a/gcc/ChangeLog
+++ b/gcc/ChangeLog
@@ -1,3 +1,9 @@
+2020-01-31  Kwok Cheung Yeung  <kcy@codesourcery.com>
+
+       * config/gcn/mkoffload.c (process_asm): Add sgpr_count and vgpr_count
+       to definition of hsa_kernel_description.  Parse assembly to find SGPR
+       and VGPR count of kernel and store in hsa_kernel_description.
+
  2020-01-31  Tamar Christina  <tamar.christina@arm.com>
  
         PR rtl-optimization/91838
diff --git a/gcc/config/gcn/mkoffload.c b/gcc/config/gcn/mkoffload.c

index 0062f15..723da10 100644 (file)
--- a/gcc/config/gcn/mkoffload.c
+++ b/gcc/config/gcn/mkoffload.c
@@ -211,12 +211,13 @@ access_check (const char *name, int mode)
  static void
  process_asm (FILE *in, FILE *out, FILE *cfile)
  {
-  int fn_count = 0, var_count = 0, dims_count = 0;
-  struct obstack fns_os, vars_os, varsizes_os, dims_os;
+  int fn_count = 0, var_count = 0, dims_count = 0, regcount_count = 0;
+  struct obstack fns_os, vars_os, varsizes_os, dims_os, regcounts_os;
    obstack_init (&fns_os);
    obstack_init (&vars_os);
    obstack_init (&varsizes_os);
    obstack_init (&dims_os);
+  obstack_init (&regcounts_os);
  
    struct oaccdims
    {
@@ -224,13 +225,20 @@ process_asm (FILE *in, FILE *out, FILE *cfile)
      char *name;
    } dim;
  
+  struct regcount
+  {
+    int sgpr_count;
+    int vgpr_count;
+    char *kernel_name;
+  } regcount;
+
    /* Always add _init_array and _fini_array as kernels.  */
    obstack_ptr_grow (&fns_os, xstrdup ("_init_array"));
    obstack_ptr_grow (&fns_os, xstrdup ("_fini_array"));
    fn_count += 2;
  
    char buf[1000];
-  enum { IN_CODE, IN_VARS, IN_FUNCS } state = IN_CODE;
+  enum { IN_CODE, IN_AMD_KERNEL_CODE_T, IN_VARS, IN_FUNCS } state = IN_CODE;
    while (fgets (buf, sizeof (buf), in))
      {
        switch (state)
@@ -243,6 +251,22 @@ process_asm (FILE *in, FILE *out, FILE *cfile)
                 obstack_grow (&dims_os, &dim, sizeof (dim));
                 dims_count++;
               }
+           else if (sscanf (buf, " .amdgpu_hsa_kernel %ms\n",
+                            &regcount.kernel_name) == 1)
+             break;
+
+           break;
+         }
+       case IN_AMD_KERNEL_CODE_T:
+         {
+           gcc_assert (regcount.kernel_name);
+           if (sscanf (buf, " wavefront_sgpr_count = %d\n",
+                       &regcount.sgpr_count) == 1)
+             break;
+           else if (sscanf (buf, " workitem_vgpr_count = %d\n",
+                            &regcount.vgpr_count) == 1)
+             break;
+
             break;
           }
         case IN_VARS:
@@ -282,19 +306,36 @@ process_asm (FILE *in, FILE *out, FILE *cfile)
         state = IN_VARS;
        else if (sscanf (buf, " .section .gnu.offload_funcs%c", &dummy) > 0)
         state = IN_FUNCS;
+      else if (sscanf (buf, " .amd_kernel_code_%c", &dummy) > 0)
+       {
+         state = IN_AMD_KERNEL_CODE_T;
+         regcount.sgpr_count = regcount.vgpr_count = -1;
+       }
        else if (sscanf (buf, " .section %c", &dummy) > 0
                || sscanf (buf, " .text%c", &dummy) > 0
                || sscanf (buf, " .bss%c", &dummy) > 0
                || sscanf (buf, " .data%c", &dummy) > 0
                || sscanf (buf, " .ident %c", &dummy) > 0)
         state = IN_CODE;
+      else if (sscanf (buf, " .end_amd_kernel_code_%c", &dummy) > 0)
+       {
+         state = IN_CODE;
+         gcc_assert (regcount.kernel_name != NULL
+                     && regcount.sgpr_count >= 0
+                     && regcount.vgpr_count >= 0);
+         obstack_grow (&regcounts_os, &regcount, sizeof (regcount));
+         regcount_count++;
+         regcount.kernel_name = NULL;
+         regcount.sgpr_count = regcount.vgpr_count = -1;
+       }
  
-      if (state == IN_CODE)
+      if (state == IN_CODE || state == IN_AMD_KERNEL_CODE_T)
         fputs (buf, out);
      }
  
    char **fns = XOBFINISH (&fns_os, char **);
    struct oaccdims *dims = XOBFINISH (&dims_os, struct oaccdims *);
+  struct regcount *regcounts = XOBFINISH (&regcounts_os, struct regcount *);
  
    fprintf (cfile, "#include <stdlib.h>\n");
    fprintf (cfile, "#include <stdbool.h>\n\n");
@@ -322,6 +363,8 @@ process_asm (FILE *in, FILE *out, FILE *cfile)
    fprintf (cfile, "static const struct hsa_kernel_description {\n"
            "  const char *name;\n"
            "  int oacc_dims[3];\n"
+          "  int sgpr_count;\n"
+          "  int vgpr_count;\n"
            "} gcn_kernels[] = {\n  ");
    dim.d[0] = dim.d[1] = dim.d[2] = 0;
    const char *comma;
@@ -329,15 +372,24 @@ process_asm (FILE *in, FILE *out, FILE *cfile)
      {
        /* Find if we recorded dimensions for this function.  */
        int *d = dim.d;          /* Previously zeroed.  */
+      int sgpr_count = 0;
+      int vgpr_count = 0;
        for (int j = 0; j < dims_count; j++)
         if (strcmp (fns[i], dims[j].name) == 0)
           {
             d = dims[j].d;
             break;
           }
+      for (int j = 0; j < regcount_count; j++)
+       if (strcmp (fns[i], regcounts[j].kernel_name) == 0)
+         {
+           sgpr_count = regcounts[j].sgpr_count;
+           vgpr_count = regcounts[j].vgpr_count;
+           break;
+         }
  
-      fprintf (cfile, "%s{\"%s\", {%d, %d, %d}}", comma,
-              fns[i], d[0], d[1], d[2]);
+      fprintf (cfile, "%s{\"%s\", {%d, %d, %d}, %d, %d}", comma,
+              fns[i], d[0], d[1], d[2], sgpr_count, vgpr_count);
  
        free (fns[i]);
      }
@@ -346,7 +398,10 @@ process_asm (FILE *in, FILE *out, FILE *cfile)
    obstack_free (&fns_os, NULL);
    for (i = 0; i < dims_count; i++)
      free (dims[i].name);
+  for (i = 0; i < regcount_count; i++)
+    free (regcounts[i].kernel_name);
    obstack_free (&dims_os, NULL);
+  obstack_free (&regcounts_os, NULL);
  }
  
  /* Embed an object file into a C source file.  */
diff --git a/libgomp/ChangeLog b/libgomp/ChangeLog

index 0970724..557bec3 100644 (file)
--- a/libgomp/ChangeLog
+++ b/libgomp/ChangeLog
@@ -1,3 +1,13 @@
+2020-01-31  Kwok Cheung Yeung  <kcy@codesourcery.com>
+
+       * plugin/plugin-gcn.c (struct hsa_kernel_description): Add sgpr_count
+       and vgpr_count fields.
+       (struct kernel_info): Add a field for a hsa_kernel_description.
+       (run_kernel): Reduce the number of threads/workers if the requested
+       number would require too many VGPRs.
+       (init_basic_kernel_info): Initialize description field with
+       the hsa_kernel_description entry for the kernel.
+
  2020-01-29  Tobias Burnus  <tobias@codesourcery.com>
  
         PR bootstrap/93409
diff --git a/libgomp/plugin/plugin-gcn.c b/libgomp/plugin/plugin-gcn.c

index 22676b4..25547ef 100644 (file)
--- a/libgomp/plugin/plugin-gcn.c
+++ b/libgomp/plugin/plugin-gcn.c
@@ -371,6 +371,8 @@ struct hsa_kernel_description
  {
    const char *name;
    int oacc_dims[3];  /* Only present for GCN kernels.  */
+  int sgpr_count;
+  int vpgr_count;
  };
  
  /* Mkoffload uses this structure to describe an offload variable.  */
@@ -478,6 +480,8 @@ struct kernel_info
    struct agent_info *agent;
    /* The specific module where the kernel takes place.  */
    struct module_info *module;
+  /* Information provided by mkoffload associated with the kernel.  */
+  struct hsa_kernel_description *description;
    /* Mutex enforcing that at most one thread ever initializes a kernel for
       use.  A thread should have locked agent->module_rwlock for reading before
       acquiring it.  */
@@ -2102,6 +2106,24 @@ run_kernel (struct kernel_info *kernel, void *vars,
             struct GOMP_kernel_launch_attributes *kla,
             struct goacc_asyncqueue *aq, bool module_locked)
  {
+  GCN_DEBUG ("SGPRs: %d, VGPRs: %d\n", kernel->description->sgpr_count,
+            kernel->description->vpgr_count);
+
+  /* Reduce the number of threads/workers if there are insufficient
+     VGPRs available to run the kernels together.  */
+  if (kla->ndim == 3 && kernel->description->vpgr_count > 0)
+    {
+      int granulated_vgprs = (kernel->description->vpgr_count + 3) & ~3;
+      int max_threads = (256 / granulated_vgprs) * 4;
+      if (kla->gdims[2] > max_threads)
+       {
+         GCN_WARNING ("Too many VGPRs required to support %d threads/workers"
+                      " per team/gang - reducing to %d threads/workers.\n",
+                      kla->gdims[2], max_threads);
+         kla->gdims[2] = max_threads;
+       }
+    }
+
    GCN_DEBUG ("GCN launch on queue: %d:%d\n", kernel->agent->device_id,
              (aq ? aq->id : 0));
    GCN_DEBUG ("GCN launch attribs: gdims:[");
@@ -2303,6 +2325,7 @@ init_basic_kernel_info (struct kernel_info *kernel,
    kernel->agent = agent;
    kernel->module = module;
    kernel->name = d->name;
+  kernel->description = d;
    if (pthread_mutex_init (&kernel->init_mutex, NULL))
      {
        GOMP_PLUGIN_error ("Failed to initialize a GCN kernel mutex");
author	Kwok Cheung Yeung <kcy@codesourcery.com>
	Fri, 31 Jan 2020 14:53:30 +0000 (06:53 -0800)
committer	Kwok Cheung Yeung <kcy@codesourcery.com>
	Fri, 31 Jan 2020 15:13:05 +0000 (07:13 -0800)
gcc/ChangeLog		patch \| blob \| history
gcc/config/gcn/mkoffload.c		patch \| blob \| history
libgomp/ChangeLog		patch \| blob \| history
libgomp/plugin/plugin-gcn.c		patch \| blob \| history