From 5a28e2727f718f14a2afccf16c4dda814af5b6f8 Mon Sep 17 00:00:00 2001 From: Kwok Cheung Yeung Date: Fri, 31 Jan 2020 06:53:30 -0800 Subject: [PATCH] [amdgcn] Scale number of threads/workers with VGPR usage 2020-01-31 Kwok Cheung Yeung gcc/ * config/gcn/mkoffload.c (process_asm): Add sgpr_count and vgpr_count to definition of hsa_kernel_description. Parse assembly to find SGPR and VGPR count of kernel and store in hsa_kernel_description. libgomp/ * plugin/plugin-gcn.c (struct hsa_kernel_description): Add sgpr_count and vgpr_count fields. (struct kernel_info): Add a field for a hsa_kernel_description. (run_kernel): Reduce the number of threads/workers if the requested number would require too many VGPRs. (init_basic_kernel_info): Initialize description field with the hsa_kernel_description entry for the kernel. --- gcc/ChangeLog | 6 ++++ gcc/config/gcn/mkoffload.c | 67 +++++++++++++++++++++++++++++++++++++++++---- libgomp/ChangeLog | 10 +++++++ libgomp/plugin/plugin-gcn.c | 23 ++++++++++++++++ 4 files changed, 100 insertions(+), 6 deletions(-) diff --git a/gcc/ChangeLog b/gcc/ChangeLog index f44a09d..78a8310 100644 --- a/gcc/ChangeLog +++ b/gcc/ChangeLog @@ -1,3 +1,9 @@ +2020-01-31 Kwok Cheung Yeung + + * config/gcn/mkoffload.c (process_asm): Add sgpr_count and vgpr_count + to definition of hsa_kernel_description. Parse assembly to find SGPR + and VGPR count of kernel and store in hsa_kernel_description. 
+ 2020-01-31 Tamar Christina PR rtl-optimization/91838 diff --git a/gcc/config/gcn/mkoffload.c b/gcc/config/gcn/mkoffload.c index 0062f15..723da10 100644 --- a/gcc/config/gcn/mkoffload.c +++ b/gcc/config/gcn/mkoffload.c @@ -211,12 +211,13 @@ access_check (const char *name, int mode) static void process_asm (FILE *in, FILE *out, FILE *cfile) { - int fn_count = 0, var_count = 0, dims_count = 0; - struct obstack fns_os, vars_os, varsizes_os, dims_os; + int fn_count = 0, var_count = 0, dims_count = 0, regcount_count = 0; + struct obstack fns_os, vars_os, varsizes_os, dims_os, regcounts_os; obstack_init (&fns_os); obstack_init (&vars_os); obstack_init (&varsizes_os); obstack_init (&dims_os); + obstack_init (&regcounts_os); struct oaccdims { @@ -224,13 +225,20 @@ process_asm (FILE *in, FILE *out, FILE *cfile) char *name; } dim; + struct regcount + { + int sgpr_count; + int vgpr_count; + char *kernel_name; + } regcount; + /* Always add _init_array and _fini_array as kernels. */ obstack_ptr_grow (&fns_os, xstrdup ("_init_array")); obstack_ptr_grow (&fns_os, xstrdup ("_fini_array")); fn_count += 2; char buf[1000]; - enum { IN_CODE, IN_AMD_KERNEL_CODE_T, IN_VARS, IN_FUNCS } state = IN_CODE; while (fgets (buf, sizeof (buf), in)) { switch (state) @@ -243,6 +251,22 @@ process_asm (FILE *in, FILE *out, FILE *cfile) obstack_grow (&dims_os, &dim, sizeof (dim)); dims_count++; } + else if (sscanf (buf, " .amdgpu_hsa_kernel %ms\n", + &regcount.kernel_name) == 1) + break; + + break; + } + case IN_AMD_KERNEL_CODE_T: + { + gcc_assert (regcount.kernel_name); + if (sscanf (buf, " wavefront_sgpr_count = %d\n", + &regcount.sgpr_count) == 1) + break; + else if (sscanf (buf, " workitem_vgpr_count = %d\n", + &regcount.vgpr_count) == 1) + break; + break; } case IN_VARS: @@ -282,19 +306,36 @@ process_asm (FILE *in, FILE *out, FILE *cfile) state = IN_VARS; else if (sscanf (buf, " .section .gnu.offload_funcs%c", &dummy) > 0) state = IN_FUNCS; + else if 
(sscanf (buf, " .amd_kernel_code_%c", &dummy) > 0) + { + state = IN_AMD_KERNEL_CODE_T; + regcount.sgpr_count = regcount.vgpr_count = -1; + } else if (sscanf (buf, " .section %c", &dummy) > 0 || sscanf (buf, " .text%c", &dummy) > 0 || sscanf (buf, " .bss%c", &dummy) > 0 || sscanf (buf, " .data%c", &dummy) > 0 || sscanf (buf, " .ident %c", &dummy) > 0) state = IN_CODE; + else if (sscanf (buf, " .end_amd_kernel_code_%c", &dummy) > 0) + { + state = IN_CODE; + gcc_assert (regcount.kernel_name != NULL + && regcount.sgpr_count >= 0 + && regcount.vgpr_count >= 0); + obstack_grow (&regcounts_os, &regcount, sizeof (regcount)); + regcount_count++; + regcount.kernel_name = NULL; + regcount.sgpr_count = regcount.vgpr_count = -1; + } - if (state == IN_CODE) + if (state == IN_CODE || state == IN_AMD_KERNEL_CODE_T) fputs (buf, out); } char **fns = XOBFINISH (&fns_os, char **); struct oaccdims *dims = XOBFINISH (&dims_os, struct oaccdims *); + struct regcount *regcounts = XOBFINISH (&regcounts_os, struct regcount *); fprintf (cfile, "#include \n"); fprintf (cfile, "#include \n\n"); @@ -322,6 +363,8 @@ process_asm (FILE *in, FILE *out, FILE *cfile) fprintf (cfile, "static const struct hsa_kernel_description {\n" " const char *name;\n" " int oacc_dims[3];\n" + " int sgpr_count;\n" + " int vgpr_count;\n" "} gcn_kernels[] = {\n "); dim.d[0] = dim.d[1] = dim.d[2] = 0; const char *comma; @@ -329,15 +372,24 @@ process_asm (FILE *in, FILE *out, FILE *cfile) { /* Find if we recorded dimensions for this function. */ int *d = dim.d; /* Previously zeroed. 
*/ + int sgpr_count = 0; + int vgpr_count = 0; for (int j = 0; j < dims_count; j++) if (strcmp (fns[i], dims[j].name) == 0) { d = dims[j].d; break; } + for (int j = 0; j < regcount_count; j++) + if (strcmp (fns[i], regcounts[j].kernel_name) == 0) + { + sgpr_count = regcounts[j].sgpr_count; + vgpr_count = regcounts[j].vgpr_count; + break; + } - fprintf (cfile, "%s{\"%s\", {%d, %d, %d}}", comma, - fns[i], d[0], d[1], d[2]); + fprintf (cfile, "%s{\"%s\", {%d, %d, %d}, %d, %d}", comma, + fns[i], d[0], d[1], d[2], sgpr_count, vgpr_count); free (fns[i]); } @@ -346,7 +398,10 @@ process_asm (FILE *in, FILE *out, FILE *cfile) obstack_free (&fns_os, NULL); for (i = 0; i < dims_count; i++) free (dims[i].name); + for (i = 0; i < regcount_count; i++) + free (regcounts[i].kernel_name); obstack_free (&dims_os, NULL); + obstack_free (&regcounts_os, NULL); } /* Embed an object file into a C source file. */ diff --git a/libgomp/ChangeLog b/libgomp/ChangeLog index 0970724..557bec3 100644 --- a/libgomp/ChangeLog +++ b/libgomp/ChangeLog @@ -1,3 +1,13 @@ +2020-01-31 Kwok Cheung Yeung + + * plugin/plugin-gcn.c (struct hsa_kernel_description): Add sgpr_count + and vgpr_count fields. + (struct kernel_info): Add a field for a hsa_kernel_description. + (run_kernel): Reduce the number of threads/workers if the requested + number would require too many VGPRs. + (init_basic_kernel_info): Initialize description field with + the hsa_kernel_description entry for the kernel. + 2020-01-29 Tobias Burnus PR bootstrap/93409 diff --git a/libgomp/plugin/plugin-gcn.c b/libgomp/plugin/plugin-gcn.c index 22676b4..25547ef 100644 --- a/libgomp/plugin/plugin-gcn.c +++ b/libgomp/plugin/plugin-gcn.c @@ -371,6 +371,8 @@ struct hsa_kernel_description { const char *name; int oacc_dims[3]; /* Only present for GCN kernels. */ + int sgpr_count; + int vpgr_count; }; /* Mkoffload uses this structure to describe an offload variable. 
*/ @@ -478,6 +480,8 @@ struct kernel_info struct agent_info *agent; /* The specific module where the kernel takes place. */ struct module_info *module; + /* Information provided by mkoffload associated with the kernel. */ + struct hsa_kernel_description *description; /* Mutex enforcing that at most once thread ever initializes a kernel for use. A thread should have locked agent->module_rwlock for reading before acquiring it. */ @@ -2102,6 +2106,24 @@ run_kernel (struct kernel_info *kernel, void *vars, struct GOMP_kernel_launch_attributes *kla, struct goacc_asyncqueue *aq, bool module_locked) { + GCN_DEBUG ("SGPRs: %d, VGPRs: %d\n", kernel->description->sgpr_count, + kernel->description->vpgr_count); + + /* Reduce the number of threads/workers if there are insufficient + VGPRs available to run the kernels together. */ + if (kla->ndim == 3 && kernel->description->vpgr_count > 0) + { + int granulated_vgprs = (kernel->description->vpgr_count + 3) & ~3; + int max_threads = (256 / granulated_vgprs) * 4; + if (kla->gdims[2] > max_threads) + { + GCN_WARNING ("Too many VGPRs required to support %d threads/workers" + " per team/gang - reducing to %d threads/workers.\n", + kla->gdims[2], max_threads); + kla->gdims[2] = max_threads; + } + } + GCN_DEBUG ("GCN launch on queue: %d:%d\n", kernel->agent->device_id, (aq ? aq->id : 0)); GCN_DEBUG ("GCN launch attribs: gdims:["); @@ -2303,6 +2325,7 @@ init_basic_kernel_info (struct kernel_info *kernel, kernel->agent = agent; kernel->module = module; kernel->name = d->name; + kernel->description = d; if (pthread_mutex_init (&kernel->init_mutex, NULL)) { GOMP_PLUGIN_error ("Failed to initialize a GCN kernel mutex"); -- 2.7.4