libhsail-rt/rt/workitems.c

   1 /* workitems.c -- The main runtime entry that performs work-item execution in
   2    various ways and the builtin functions closely related to the
   3    implementation.
   4
   5    Copyright (C) 2015-2021 Free Software Foundation, Inc.
   6    Contributed by Pekka Jaaskelainen <pekka.jaaskelainen@parmance.com>
   7    for General Processor Tech.
   8
   9    Permission is hereby granted, free of charge, to any person obtaining a
  10    copy of this software and associated documentation files
  11    (the "Software"), to deal in the Software without restriction, including
  12    without limitation the rights to use, copy, modify, merge, publish,
  13    distribute, sublicense, and/or sell copies of the Software, and to
  14    permit persons to whom the Software is furnished to do so, subject to
  15    the following conditions:
  16
  17    The above copyright notice and this permission notice shall be included
  18    in all copies or substantial portions of the Software.
  19
  20    THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
  21    OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
  22    MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
  23    IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
  24    DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
  25    OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
  26    USE OR OTHER DEALINGS IN THE SOFTWARE.
  27 */
  28
  29 /* The fiber based multiple work-item work-group execution uses ucontext
  30    based user mode threading.  However, if gccbrig is able to optimize the
  31    kernel to a much faster work-group function that implements the multiple
  32    WI execution using loops instead of fibers requiring slow context switches,
  33    the fiber-based implementation won't be called.
  34  */
  35
  36 #include <stdlib.h>
  37 #include <signal.h>
  38 #include <string.h>
  39
  40 #include "workitems.h"
  41 #include "phsa-rt.h"
  42
  43 #ifdef HAVE_FIBERS
  44 #include "fibers.h"
  45 #endif
  46
  47 #ifdef BENCHMARK_PHSA_RT
  48 #include <stdio.h>
  49 #include <time.h>
  50
  51 static uint64_t wi_count = 0;
  52 static uint64_t wis_skipped = 0;
  53 static uint64_t wi_total = 0;
  54 static clock_t start_time;
  55
  56 #endif
  57
  58 #ifdef DEBUG_PHSA_RT
  59 #include <stdio.h>
  60 #endif
  61
  62 #define PRIVATE_SEGMENT_ALIGN 256
  63 #define FIBER_STACK_SIZE (64*1024)
  64 #define GROUP_SEGMENT_ALIGN 256
  65
  66 /* Preserve this amount of additional space in the alloca stack as we need to
  67    store the alloca frame pointer to the alloca frame, thus must preserve
  68    space for it.  This thus supports at most 1024 functions with allocas in
  69    a call chain.  */
  70 #define ALLOCA_OVERHEAD 1024*4
  71
  72 uint32_t __hsail_workitemabsid (uint32_t dim, PHSAWorkItem *context);
  73
  74 uint32_t __hsail_workitemid (uint32_t dim, PHSAWorkItem *context);
  75
  76 uint32_t __hsail_gridgroups (uint32_t dim, PHSAWorkItem *context);
  77
  78 uint32_t __hsail_currentworkgroupsize (uint32_t dim, PHSAWorkItem *wi);
  79
  80 uint32_t __hsail_workgroupsize (uint32_t dim, PHSAWorkItem *wi);
  81
  82 void
  83 phsa_fatal_error (int code)
  84 {
  85   exit (code);
  86 }
  87
  88 #ifdef HAVE_FIBERS
  89 /* ucontext-based work-item thread implementation.  Runs all work-items in
  90    separate fibers.  */
  91
  92 static void
  93 phsa_work_item_thread (int arg0, int arg1)
  94 {
  95   void *arg = fiber_int_args_to_ptr (arg0, arg1);
  96
  97   PHSAWorkItem *wi = (PHSAWorkItem *) arg;
  98   volatile PHSAWorkGroup *wg = wi->wg;
  99   PHSAKernelLaunchData *l_data = wi->launch_data;
 100
 101   do
 102     {
 103       int retcode
 104         = fiber_barrier_reach ((fiber_barrier_t *) l_data->wg_start_barrier);
 105
 106       /* At this point the threads can assume that either more_wgs is 0 or
 107          the current_work_group_* is set to point to the WG executed next.  */
 108       if (!wi->wg->more_wgs)
 109         break;
 110
 111       wi->group_x = wg->x;
 112       wi->group_y = wg->y;
 113       wi->group_z = wg->z;
 114
 115       wi->cur_wg_size_x = __hsail_currentworkgroupsize (0, wi);
 116       wi->cur_wg_size_y = __hsail_currentworkgroupsize (1, wi);
 117       wi->cur_wg_size_z = __hsail_currentworkgroupsize (2, wi);
 118
 119 #ifdef DEBUG_PHSA_RT
 120       printf (
 121         "Running work-item %lu/%lu/%lu for wg %lu/%lu/%lu / %lu/%lu/%lu...\n",
 122         wi->x, wi->y, wi->z, wi->group_x, wi->group_y, wi->group_z,
 123         l_data->wg_max_x, l_data->wg_max_y, l_data->wg_max_z);
 124 #endif
 125
 126       if (wi->x < __hsail_currentworkgroupsize (0, wi)
 127           && wi->y < __hsail_currentworkgroupsize (1, wi)
 128           && wi->z < __hsail_currentworkgroupsize (2, wi))
 129         {
 130           l_data->kernel (l_data->kernarg_addr, wi, wg->group_base_ptr,
 131                           wg->initial_group_offset, wg->private_base_ptr);
 132 #ifdef DEBUG_PHSA_RT
 133           printf ("done.\n");
 134 #endif
 135 #ifdef BENCHMARK_PHSA_RT
 136           wi_count++;
 137 #endif
 138         }
 139       else
 140         {
 141 #ifdef DEBUG_PHSA_RT
 142           printf ("skipped (partial WG).\n");
 143 #endif
 144 #ifdef BENCHMARK_PHSA_RT
 145           wis_skipped++;
 146 #endif
 147         }
 148
 149       retcode
 150         = fiber_barrier_reach ((fiber_barrier_t *)
 151                                l_data->wg_completion_barrier);
 152
 153       /* The first thread updates the WG to execute next etc.  */
 154
 155       if (retcode == 0)
 156         {
 157 #ifdef EXECUTE_WGS_BACKWARDS
 158           if (wg->x == l_data->wg_min_x)
 159             {
 160               wg->x = l_data->wg_max_x - 1;
 161               if (wg->y == l_data->wg_min_y)
 162                 {
 163                   wg->y = l_data->wg_max_y - 1;
 164                   if (wg->z == l_data->wg_min_z)
 165                     wg->more_wgs = 0;
 166                   else
 167                     wg->z--;
 168                 }
 169               else
 170                 wg->y--;
 171             }
 172           else
 173             wg->x--;
 174 #else
 175           if (wg->x + 1 >= l_data->wg_max_x)
 176             {
 177               wg->x = l_data->wg_min_x;
 178               if (wg->y + 1 >= l_data->wg_max_y)
 179                 {
 180                   wg->y = l_data->wg_min_y;
 181                   if (wg->z + 1 >= l_data->wg_max_z)
 182                     wg->more_wgs = 0;
 183                   else
 184                     wg->z++;
 185                 }
 186               else
 187                 wg->y++;
 188             }
 189           else
 190             wg->x++;
 191 #endif
 192           wi->group_x = wg->x;
 193           wi->group_y = wg->y;
 194           wi->group_z = wg->z;
 195
 196           wi->cur_wg_size_x = __hsail_currentworkgroupsize (0, wi);
 197           wi->cur_wg_size_y = __hsail_currentworkgroupsize (1, wi);
 198           wi->cur_wg_size_z = __hsail_currentworkgroupsize (2, wi);
 199
 200           /* Reinitialize the work-group barrier according to the new WG's
 201              size, which might not be the same as the previous ones, due
 202              to "partial WGs".  */
 203           size_t wg_size = __hsail_currentworkgroupsize (0, wi)
 204                            * __hsail_currentworkgroupsize (1, wi)
 205                            * __hsail_currentworkgroupsize (2, wi);
 206
 207 #ifdef DEBUG_PHSA_RT
 208           printf ("Reinitializing the WG barrier to %lu.\n", wg_size);
 209 #endif
 210           fiber_barrier_init ((fiber_barrier_t *)
 211                               wi->launch_data->wg_sync_barrier,
 212                               wg_size);
 213
 214 #ifdef BENCHMARK_PHSA_RT
 215           if (wi_count % 1000 == 0)
 216             {
 217               clock_t spent_time = clock () - start_time;
 218               double spent_time_sec = (double) spent_time / CLOCKS_PER_SEC;
 219               double wis_per_sec = wi_count / spent_time_sec;
 220               uint64_t eta_sec
 221                 = (wi_total - wi_count - wis_skipped) / wis_per_sec;
 222
 223               printf ("%lu WIs executed %lu skipped in %lus (%lu WIs/s, ETA in "
 224                       "%lu s)\n",
 225                       wi_count, wis_skipped, (uint64_t) spent_time_sec,
 226                       (uint64_t) wis_per_sec, (uint64_t) eta_sec);
 227             }
 228 #endif
 229         }
 230     }
 231   while (1);
 232
 233   fiber_exit ();
 234 }
 235 #endif
 236
 237 #define MIN(a, b) ((a < b) ? a : b)
 238 #define MAX(a, b) ((a > b) ? a : b)
 239
 240 #ifdef HAVE_FIBERS
 241 /* Spawns a given number of work-items to execute a set of work-groups,
 242    blocks until their completion.  */
 243
 244 static void
 245 phsa_execute_wi_gang (PHSAKernelLaunchData *context, void *group_base_ptr,
 246                       uint32_t group_local_offset, size_t wg_size_x,
 247                       size_t wg_size_y, size_t wg_size_z)
 248 {
 249   PHSAWorkItem *wi_threads = NULL;
 250   PHSAWorkGroup wg;
 251   size_t flat_wi_id = 0, x, y, z, max_x, max_y, max_z;
 252   uint32_t group_x, group_y, group_z;
 253   fiber_barrier_t wg_start_barrier;
 254   fiber_barrier_t wg_completion_barrier;
 255   fiber_barrier_t wg_sync_barrier;
 256
 257   max_x = wg_size_x == 0 ? 1 : wg_size_x;
 258   max_y = wg_size_y == 0 ? 1 : wg_size_y;
 259   max_z = wg_size_z == 0 ? 1 : wg_size_z;
 260
 261   size_t wg_size = max_x * max_y * max_z;
 262   if (wg_size > PHSA_MAX_WG_SIZE)
 263     phsa_fatal_error (2);
 264
 265   wg.private_segment_total_size = context->dp->private_segment_size * wg_size;
 266   if (wg.private_segment_total_size > 0
 267       && posix_memalign (&wg.private_base_ptr, PRIVATE_SEGMENT_ALIGN,
 268                          wg.private_segment_total_size)
 269            != 0)
 270     phsa_fatal_error (3);
 271
 272   wg.alloca_stack_p = wg.private_segment_total_size + ALLOCA_OVERHEAD;
 273   wg.alloca_frame_p = wg.alloca_stack_p;
 274   wg.initial_group_offset = group_local_offset;
 275
 276 #ifdef EXECUTE_WGS_BACKWARDS
 277   group_x = context->wg_max_x - 1;
 278   group_y = context->wg_max_y - 1;
 279   group_z = context->wg_max_z - 1;
 280 #else
 281   group_x = context->wg_min_x;
 282   group_y = context->wg_min_y;
 283   group_z = context->wg_min_z;
 284 #endif
 285
 286   fiber_barrier_init (&wg_sync_barrier, wg_size);
 287   fiber_barrier_init (&wg_start_barrier, wg_size);
 288   fiber_barrier_init (&wg_completion_barrier, wg_size);
 289
 290   context->wg_start_barrier = &wg_start_barrier;
 291   context->wg_sync_barrier = &wg_sync_barrier;
 292   context->wg_completion_barrier = &wg_completion_barrier;
 293
 294   wg.more_wgs = 1;
 295   wg.group_base_ptr = group_base_ptr;
 296
 297 #ifdef BENCHMARK_PHSA_RT
 298   wi_count = 0;
 299   wis_skipped = 0;
 300   start_time = clock ();
 301 #endif
 302   wi_threads = malloc (sizeof (PHSAWorkItem) * max_x * max_y * max_z);
 303   for (x = 0; x < max_x; ++x)
 304     for (y = 0; y < max_y; ++y)
 305       for (z = 0; z < max_z; ++z)
 306         {
 307           PHSAWorkItem *wi = &wi_threads[flat_wi_id];
 308           wi->launch_data = context;
 309           wi->wg = &wg;
 310
 311           wg.x = wi->group_x = group_x;
 312           wg.y = wi->group_y = group_y;
 313           wg.z = wi->group_z = group_z;
 314
 315           wi->wg_size_x = context->dp->workgroup_size_x;
 316           wi->wg_size_y = context->dp->workgroup_size_y;
 317           wi->wg_size_z = context->dp->workgroup_size_z;
 318
 319           wi->cur_wg_size_x = __hsail_currentworkgroupsize (0, wi);
 320           wi->cur_wg_size_y = __hsail_currentworkgroupsize (1, wi);
 321           wi->cur_wg_size_z = __hsail_currentworkgroupsize (2, wi);
 322
 323           wi->x = x;
 324           wi->y = y;
 325           wi->z = z;
 326
 327           /* TODO: set the stack size according to the private
 328                    segment size.  Too big stack consumes huge amount of
 329                    memory in case of huge number of WIs and a too small stack
 330                    will fail in mysterious and potentially dangerous ways.  */
 331
 332           fiber_init (&wi->fiber, phsa_work_item_thread, wi,
 333                       FIBER_STACK_SIZE, PRIVATE_SEGMENT_ALIGN);
 334           ++flat_wi_id;
 335         }
 336
 337   do
 338     {
 339       --flat_wi_id;
 340       fiber_join (&wi_threads[flat_wi_id].fiber);
 341     }
 342   while (flat_wi_id > 0);
 343
 344   if (wg.private_segment_total_size > 0)
 345     free (wg.private_base_ptr);
 346
 347   free (wi_threads);
 348 }
 349
 350 /* Spawn the work-item threads to execute work-groups and let
 351    them execute all the WGs, including a potential partial WG.  */
 352
 353 static void
 354 phsa_spawn_work_items (PHSAKernelLaunchData *context, void *group_base_ptr,
 355                        uint32_t group_local_offset)
 356 {
 357   hsa_kernel_dispatch_packet_t *dp = context->dp;
 358   size_t x, y, z;
 359
 360   context->group_segment_start_addr = (size_t) group_base_ptr;
 361
 362   /* HSA seems to allow the WG size to be larger than the grid size.  We need to
 363      saturate the effective WG size to the grid size to prevent the extra WIs
 364      from executing.  */
 365   size_t sat_wg_size_x, sat_wg_size_y, sat_wg_size_z, sat_wg_size;
 366   sat_wg_size_x = MIN (dp->workgroup_size_x, dp->grid_size_x);
 367   sat_wg_size_y = MIN (dp->workgroup_size_y, dp->grid_size_y);
 368   sat_wg_size_z = MIN (dp->workgroup_size_z, dp->grid_size_z);
 369   sat_wg_size = sat_wg_size_x * sat_wg_size_y * sat_wg_size_z;
 370
 371 #ifdef BENCHMARK_PHSA_RT
 372   wi_total = (uint64_t) dp->grid_size_x
 373              * (dp->grid_size_y > 0 ? dp->grid_size_y : 1)
 374              * (dp->grid_size_z > 0 ? dp->grid_size_z : 1);
 375 #endif
 376
 377   /* For now execute all work groups in a single coarse thread (does not utilize
 378      multicore/multithread).  */
 379   context->wg_min_x = context->wg_min_y = context->wg_min_z = 0;
 380
 381   int dims = dp->setup & 0x3;
 382
 383   context->wg_max_x = ((uint64_t) dp->grid_size_x + dp->workgroup_size_x - 1)
 384                       / dp->workgroup_size_x;
 385
 386   context->wg_max_y
 387     = dims < 2 ? 1 : ((uint64_t) dp->grid_size_y + dp->workgroup_size_y - 1)
 388                        / dp->workgroup_size_y;
 389
 390   context->wg_max_z
 391     = dims < 3 ? 1 : ((uint64_t) dp->grid_size_z + dp->workgroup_size_z - 1)
 392                        / dp->workgroup_size_z;
 393
 394 #ifdef DEBUG_PHSA_RT
 395   printf ("### launching work-groups %lu/%lu/%lu to %lu/%lu/%lu with "
 396           "wg size %lu/%lu/%lu grid size %u/%u/%u\n",
 397           context->wg_min_x, context->wg_min_y, context->wg_min_z,
 398           context->wg_max_x, context->wg_max_y, context->wg_max_z,
 399           sat_wg_size_x, sat_wg_size_y, sat_wg_size_z, dp->grid_size_x,
 400           dp->grid_size_y, dp->grid_size_z);
 401 #endif
 402
 403   phsa_execute_wi_gang (context, group_base_ptr, group_local_offset,
 404                         sat_wg_size_x, sat_wg_size_y, sat_wg_size_z);
 405 }
 406 #endif
 407
 408 /* Executes the given work-group function for all work groups in the grid.
 409
 410    A work-group function is a version of the original kernel which executes
 411    the kernel for all work-items in a work-group.  It is produced by gccbrig
 412    if it can handle the kernel's barrier usage and is much faster way to
 413    execute massive numbers of work-items in a non-SPMD machine than fibers
 414    (easily 100x faster).  */
 415 static void
 416 phsa_execute_work_groups (PHSAKernelLaunchData *context, void *group_base_ptr,
 417                           uint32_t group_local_offset)
 418 {
 419   hsa_kernel_dispatch_packet_t *dp = context->dp;
 420   size_t x, y, z, wg_x, wg_y, wg_z;
 421
 422   context->group_segment_start_addr = (size_t) group_base_ptr;
 423
 424   /* HSA seems to allow the WG size to be larger than the grid size.  We need
 425      to saturate the effective WG size to the grid size to prevent the extra WIs
 426      from executing.  */
 427   size_t sat_wg_size_x, sat_wg_size_y, sat_wg_size_z, sat_wg_size;
 428   sat_wg_size_x = MIN (dp->workgroup_size_x, dp->grid_size_x);
 429   sat_wg_size_y = MIN (dp->workgroup_size_y, dp->grid_size_y);
 430   sat_wg_size_z = MIN (dp->workgroup_size_z, dp->grid_size_z);
 431   sat_wg_size = sat_wg_size_x * sat_wg_size_y * sat_wg_size_z;
 432
 433 #ifdef BENCHMARK_PHSA_RT
 434   wi_total = (uint64_t) dp->grid_size_x
 435              * (dp->grid_size_y > 0 ? dp->grid_size_y : 1)
 436              * (dp->grid_size_z > 0 ? dp->grid_size_z : 1);
 437 #endif
 438
 439   context->wg_min_x = context->wg_min_y = context->wg_min_z = 0;
 440
 441   int dims = dp->setup & 0x3;
 442
 443   context->wg_max_x = ((uint64_t) dp->grid_size_x + dp->workgroup_size_x - 1)
 444                       / dp->workgroup_size_x;
 445
 446   context->wg_max_y
 447     = dims < 2 ? 1 : ((uint64_t) dp->grid_size_y + dp->workgroup_size_y - 1)
 448                        / dp->workgroup_size_y;
 449
 450   context->wg_max_z
 451     = dims < 3 ? 1 : ((uint64_t) dp->grid_size_z + dp->workgroup_size_z - 1)
 452                        / dp->workgroup_size_z;
 453
 454 #ifdef DEBUG_PHSA_RT
 455   printf ("### launching work-groups %lu/%lu/%lu to %lu/%lu/%lu with "
 456           "wg size %lu/%lu/%lu grid size %u/%u/%u\n",
 457           context->wg_min_x, context->wg_min_y, context->wg_min_z,
 458           context->wg_max_x, context->wg_max_y, context->wg_max_z,
 459           sat_wg_size_x, sat_wg_size_y, sat_wg_size_z, dp->grid_size_x,
 460           dp->grid_size_y, dp->grid_size_z);
 461 #endif
 462
 463   PHSAWorkItem wi;
 464   PHSAWorkGroup wg;
 465   wi.wg = &wg;
 466   wi.x = wi.y = wi.z = 0;
 467   wi.launch_data = context;
 468
 469 #ifdef BENCHMARK_PHSA_RT
 470   start_time = clock ();
 471   uint64_t wg_count = 0;
 472 #endif
 473
 474   size_t wg_size = __hsail_workgroupsize (0, &wi)
 475                    * __hsail_workgroupsize (1, &wi)
 476                    * __hsail_workgroupsize (2, &wi);
 477
 478   void *private_base_ptr = NULL;
 479   if (dp->private_segment_size > 0
 480       && posix_memalign (&private_base_ptr, PRIVATE_SEGMENT_ALIGN,
 481                          dp->private_segment_size * wg_size)
 482            != 0)
 483     phsa_fatal_error (3);
 484
 485   wg.alloca_stack_p = dp->private_segment_size * wg_size + ALLOCA_OVERHEAD;
 486   wg.alloca_frame_p = wg.alloca_stack_p;
 487
 488   wg.private_base_ptr = private_base_ptr;
 489   wg.group_base_ptr = group_base_ptr;
 490
 491 #ifdef DEBUG_PHSA_RT
 492   printf ("priv seg size %u wg_size %lu @ %p\n", dp->private_segment_size,
 493           wg_size, private_base_ptr);
 494 #endif
 495
 496   for (wg_z = context->wg_min_z; wg_z < context->wg_max_z; ++wg_z)
 497     for (wg_y = context->wg_min_y; wg_y < context->wg_max_y; ++wg_y)
 498       for (wg_x = context->wg_min_x; wg_x < context->wg_max_x; ++wg_x)
 499         {
 500           wi.group_x = wg_x;
 501           wi.group_y = wg_y;
 502           wi.group_z = wg_z;
 503
 504           wi.wg_size_x = context->dp->workgroup_size_x;
 505           wi.wg_size_y = context->dp->workgroup_size_y;
 506           wi.wg_size_z = context->dp->workgroup_size_z;
 507
 508           wi.cur_wg_size_x = __hsail_currentworkgroupsize (0, &wi);
 509           wi.cur_wg_size_y = __hsail_currentworkgroupsize (1, &wi);
 510           wi.cur_wg_size_z = __hsail_currentworkgroupsize (2, &wi);
 511
 512           context->kernel (context->kernarg_addr, &wi, group_base_ptr,
 513                            group_local_offset, private_base_ptr);
 514
 515 #if defined (BENCHMARK_PHSA_RT)
 516           wg_count++;
 517           if (wg_count % 1000000 == 0)
 518             {
 519               clock_t spent_time = clock () - start_time;
 520               uint64_t wi_count = wg_x * sat_wg_size_x + wg_y * sat_wg_size_y
 521                                   + wg_z * sat_wg_size_z;
 522               double spent_time_sec = (double) spent_time / CLOCKS_PER_SEC;
 523               double wis_per_sec = wi_count / spent_time_sec;
 524               uint64_t eta_sec = (wi_total - wi_count) / wis_per_sec;
 525
 526               printf ("%lu WIs executed in %lus (%lu WIs/s, ETA in %lu s)\n",
 527                       wi_count, (uint64_t) spent_time_sec,
 528                       (uint64_t) wis_per_sec, (uint64_t) eta_sec);
 529             }
 530 #endif
 531         }
 532
 533 #ifdef BENCHMARK_PHSA_RT
 534   clock_t spent_time = clock () - start_time;
 535   double spent_time_sec = (double) spent_time / CLOCKS_PER_SEC;
 536   double wis_per_sec = wi_total / spent_time_sec;
 537
 538   printf ("### %lu WIs executed in %lu s (%lu WIs / s)\n", wi_total,
 539           (uint64_t) spent_time_sec, (uint64_t) wis_per_sec);
 540 #endif
 541   free (private_base_ptr);
 542   private_base_ptr = NULL;
 543 }
 544
 545 /* gccbrig generates the following from each HSAIL kernel:
 546
 547    1) The actual kernel function (a single work-item kernel or a work-group
 548       function) generated from HSAIL (BRIG).
 549
 550          static void _Kernel (void* args, void* context, void* group_base_ptr)
 551          {
 552            ...
 553          }
 554
 555   2) A public facing kernel function that is called from the PHSA runtime:
 556
 557    a) A single work-item function (that requires fibers for multi-WI):
 558
 559       void Kernel (void* context)
 560       {
 561          __launch_launch_kernel (_Kernel, context);
 562       }
 563
 564       or
 565
 566     b) a when gccbrig could generate a work-group function:
 567
 568       void Kernel (void* context)
 569       {
 570                 __hsail_launch_wg_function (_Kernel, context);
 571       }
 572 */
 573
 574 #ifdef HAVE_FIBERS
 575
 576 void
 577 __hsail_launch_kernel (gccbrigKernelFunc kernel, PHSAKernelLaunchData *context,
 578                        void *group_base_ptr, uint32_t group_local_offset)
 579 {
 580   context->kernel = kernel;
 581   phsa_spawn_work_items (context, group_base_ptr, group_local_offset);
 582 }
 583 #endif
 584
 585 void
 586 __hsail_launch_wg_function (gccbrigKernelFunc kernel,
 587                             PHSAKernelLaunchData *context, void *group_base_ptr,
 588                             uint32_t group_local_offset)
 589 {
 590   context->kernel = kernel;
 591   phsa_execute_work_groups (context, group_base_ptr, group_local_offset);
 592 }
 593
 594 uint32_t
 595 __hsail_workitemabsid (uint32_t dim, PHSAWorkItem *context)
 596 {
 597   hsa_kernel_dispatch_packet_t *dp = context->launch_data->dp;
 598
 599   uint32_t id;
 600   switch (dim)
 601     {
 602     default:
 603     case 0:
 604       /* Overflow semantics in the case of WG dim > grid dim.  */
 605       id = ((uint64_t) context->group_x * dp->workgroup_size_x + context->x)
 606            % dp->grid_size_x;
 607       break;
 608     case 1:
 609       id = ((uint64_t) context->group_y * dp->workgroup_size_y + context->y)
 610            % dp->grid_size_y;
 611       break;
 612     case 2:
 613       id = ((uint64_t) context->group_z * dp->workgroup_size_z + context->z)
 614            % dp->grid_size_z;
 615       break;
 616     }
 617   return id;
 618 }
 619
 620 uint64_t
 621 __hsail_workitemabsid_u64 (uint32_t dim, PHSAWorkItem *context)
 622 {
 623   hsa_kernel_dispatch_packet_t *dp = context->launch_data->dp;
 624
 625   uint64_t id;
 626   switch (dim)
 627     {
 628     default:
 629     case 0:
 630       /* Overflow semantics in the case of WG dim > grid dim.  */
 631       id = ((uint64_t) context->group_x * dp->workgroup_size_x + context->x)
 632            % dp->grid_size_x;
 633       break;
 634     case 1:
 635       id = ((uint64_t) context->group_y * dp->workgroup_size_y + context->y)
 636            % dp->grid_size_y;
 637       break;
 638     case 2:
 639       id = ((uint64_t) context->group_z * dp->workgroup_size_z + context->z)
 640            % dp->grid_size_z;
 641       break;
 642     }
 643   return id;
 644 }
 645
 646
 647 uint32_t
 648 __hsail_workitemid (uint32_t dim, PHSAWorkItem *context)
 649 {
 650   PHSAWorkItem *c = (PHSAWorkItem *) context;
 651   hsa_kernel_dispatch_packet_t *dp = context->launch_data->dp;
 652
 653   /* The number of dimensions is in the two least significant bits.  */
 654   int dims = dp->setup & 0x3;
 655
 656   uint32_t id;
 657   switch (dim)
 658     {
 659     default:
 660     case 0:
 661       id = c->x;
 662       break;
 663     case 1:
 664       id = dims < 2 ? 0 : c->y;
 665       break;
 666     case 2:
 667       id = dims < 3 ? 0 : c->z;
 668       break;
 669     }
 670   return id;
 671 }
 672
 673 uint32_t
 674 __hsail_gridgroups (uint32_t dim, PHSAWorkItem *context)
 675 {
 676   hsa_kernel_dispatch_packet_t *dp = context->launch_data->dp;
 677   int dims = dp->setup & 0x3;
 678
 679   uint32_t id;
 680   switch (dim)
 681     {
 682     default:
 683     case 0:
 684       id = (dp->grid_size_x + dp->workgroup_size_x - 1) / dp->workgroup_size_x;
 685       break;
 686     case 1:
 687       id = dims < 2 ? 1 : (dp->grid_size_y + dp->workgroup_size_y - 1)
 688                             / dp->workgroup_size_y;
 689       break;
 690     case 2:
 691       id = dims < 3 ? 1 : (dp->grid_size_z + dp->workgroup_size_z - 1)
 692                             / dp->workgroup_size_z;
 693       break;
 694     }
 695   return id;
 696 }
 697
 698 uint32_t
 699 __hsail_workitemflatid (PHSAWorkItem *c)
 700 {
 701   hsa_kernel_dispatch_packet_t *dp = c->launch_data->dp;
 702
 703   return c->x + c->y * dp->workgroup_size_x
 704          + c->z * dp->workgroup_size_x * dp->workgroup_size_y;
 705 }
 706
 707 uint32_t
 708 __hsail_currentworkitemflatid (PHSAWorkItem *c)
 709 {
 710   hsa_kernel_dispatch_packet_t *dp = c->launch_data->dp;
 711
 712   return c->x + c->y * __hsail_currentworkgroupsize (0, c)
 713          + c->z * __hsail_currentworkgroupsize (0, c)
 714              * __hsail_currentworkgroupsize (1, c);
 715 }
 716
 717 void
 718 __hsail_setworkitemid (uint32_t dim, uint32_t id, PHSAWorkItem *context)
 719 {
 720   switch (dim)
 721     {
 722     default:
 723     case 0:
 724       context->x = id;
 725       break;
 726     case 1:
 727       context->y = id;
 728       break;
 729     case 2:
 730       context->z = id;
 731       break;
 732     }
 733 }
 734
 735 uint64_t
 736 __hsail_workitemflatabsid_u64 (PHSAWorkItem *context)
 737 {
 738   PHSAWorkItem *c = (PHSAWorkItem *) context;
 739   hsa_kernel_dispatch_packet_t *dp = context->launch_data->dp;
 740
 741   /* Work-item flattened absolute ID = ID0 + ID1 * max0 + ID2 * max0 * max1.  */
 742   uint64_t id0 = __hsail_workitemabsid (0, context);
 743   uint64_t id1 = __hsail_workitemabsid (1, context);
 744   uint64_t id2 = __hsail_workitemabsid (2, context);
 745
 746   uint64_t max0 = dp->grid_size_x;
 747   uint64_t max1 = dp->grid_size_y;
 748   uint64_t id = id0 + id1 * max0 + id2 * max0 * max1;
 749
 750   return id;
 751 }
 752
 753 uint32_t
 754 __hsail_workitemflatabsid_u32 (PHSAWorkItem *context)
 755 {
 756   PHSAWorkItem *c = (PHSAWorkItem *) context;
 757   hsa_kernel_dispatch_packet_t *dp = context->launch_data->dp;
 758
 759   /* work-item flattened absolute ID = ID0 + ID1 * max0 + ID2 * max0 * max1.  */
 760   uint64_t id0 = __hsail_workitemabsid (0, context);
 761   uint64_t id1 = __hsail_workitemabsid (1, context);
 762   uint64_t id2 = __hsail_workitemabsid (2, context);
 763
 764   uint64_t max0 = dp->grid_size_x;
 765   uint64_t max1 = dp->grid_size_y;
 766   uint64_t id = id0 + id1 * max0 + id2 * max0 * max1;
 767   return (uint32_t) id;
 768 }
 769
 770 uint32_t
 771 __hsail_currentworkgroupsize (uint32_t dim, PHSAWorkItem *wi)
 772 {
 773   hsa_kernel_dispatch_packet_t *dp = wi->launch_data->dp;
 774   uint32_t wg_size = 0;
 775   switch (dim)
 776     {
 777     default:
 778     case 0:
 779       if ((uint64_t) wi->group_x < dp->grid_size_x / dp->workgroup_size_x)
 780         wg_size = dp->workgroup_size_x; /* Full WG.  */
 781       else
 782         wg_size = dp->grid_size_x % dp->workgroup_size_x; /* Partial WG.  */
 783       break;
 784     case 1:
 785       if ((uint64_t) wi->group_y < dp->grid_size_y / dp->workgroup_size_y)
 786         wg_size = dp->workgroup_size_y; /* Full WG.  */
 787       else
 788         wg_size = dp->grid_size_y % dp->workgroup_size_y; /* Partial WG.  */
 789       break;
 790     case 2:
 791       if ((uint64_t) wi->group_z < dp->grid_size_z / dp->workgroup_size_z)
 792         wg_size = dp->workgroup_size_z; /* Full WG.  */
 793       else
 794         wg_size = dp->grid_size_z % dp->workgroup_size_z; /* Partial WG.  */
 795       break;
 796     }
 797   return wg_size;
 798 }
 799
 800 uint32_t
 801 __hsail_workgroupsize (uint32_t dim, PHSAWorkItem *wi)
 802 {
 803   hsa_kernel_dispatch_packet_t *dp = wi->launch_data->dp;
 804   switch (dim)
 805     {
 806     default:
 807     case 0:
 808       return dp->workgroup_size_x;
 809     case 1:
 810       return dp->workgroup_size_y;
 811     case 2:
 812       return dp->workgroup_size_z;
 813     }
 814 }
 815
 816 uint32_t
 817 __hsail_gridsize (uint32_t dim, PHSAWorkItem *wi)
 818 {
 819   hsa_kernel_dispatch_packet_t *dp = wi->launch_data->dp;
 820   switch (dim)
 821     {
 822     default:
 823     case 0:
 824       return dp->grid_size_x;
 825     case 1:
 826       return dp->grid_size_y;
 827     case 2:
 828       return dp->grid_size_z;
 829     }
 830 }
 831
 832 uint32_t
 833 __hsail_workgroupid (uint32_t dim, PHSAWorkItem *wi)
 834 {
 835   switch (dim)
 836     {
 837     default:
 838     case 0:
 839       return wi->group_x;
 840     case 1:
 841       return wi->group_y;
 842     case 2:
 843       return wi->group_z;
 844     }
 845 }
 846
 847 uint32_t
 848 __hsail_dim (PHSAWorkItem *wi)
 849 {
 850   hsa_kernel_dispatch_packet_t *dp = wi->launch_data->dp;
 851   return dp->setup & 0x3;
 852 }
 853
 854 uint64_t
 855 __hsail_packetid (PHSAWorkItem *wi)
 856 {
 857   return wi->launch_data->packet_id;
 858 }
 859
 860 uint32_t
 861 __hsail_packetcompletionsig_sig32 (PHSAWorkItem *wi)
 862 {
 863   return (uint32_t) wi->launch_data->dp->completion_signal.handle;
 864 }
 865
 866 uint64_t
 867 __hsail_packetcompletionsig_sig64 (PHSAWorkItem *wi)
 868 {
 869   return (uint64_t) (wi->launch_data->dp->completion_signal.handle);
 870 }
 871
 872 #ifdef HAVE_FIBERS
 873 void
 874 __hsail_barrier (PHSAWorkItem *wi)
 875 {
 876   fiber_barrier_reach ((fiber_barrier_t *) wi->launch_data->wg_sync_barrier);
 877 }
 878 #endif
 879
 880 /* Return a 32b private segment address that points to a dynamically
 881    allocated chunk of 'size' with 'align'.
 882
 883    Allocates the space from the end of the private segment allocated
 884    for the whole work group.  In implementations with separate private
 885    memories per WI, we will need to have a stack pointer per WI.  But in
 886    the current implementation, the segment is shared, so we possibly
 887    save some space in case all WIs do not call the alloca.
 888
 889    The "alloca frames" are organized as follows:
 890
 891    wg->alloca_stack_p points to the last allocated data (initially
 892    outside the private segment)
 893    wg->alloca_frame_p points to the first address _outside_ the current
 894    function's allocations (initially to the same as alloca_stack_p)
 895
 896    The data is allocated downwards from the end of the private segment.
 897
 898    In the beginning of a new function which has allocas, a new alloca
 899    frame is pushed which adds the current alloca_frame_p (the current
 900    function's frame starting point) to the top of the alloca stack and
 901    alloca_frame_p is set to the current stack position.
 902
 903    At the exit points of a function with allocas, the alloca frame
 904    is popped before returning.  This involves popping the alloca_frame_p
 905    to the one of the previous function in the call stack, and alloca_stack_p
 906    similarly, to the position of the last word alloca'd by the previous
 907    function.
 908  */
 909
 910 uint32_t
 911 __hsail_alloca (uint32_t size, uint32_t align, PHSAWorkItem *wi)
 912 {
 913   volatile PHSAWorkGroup *wg = wi->wg;
 914   int64_t new_pos = wg->alloca_stack_p - size;
 915   while (new_pos % align != 0)
 916     new_pos--;
 917   if (new_pos < 0)
 918     phsa_fatal_error (2);
 919
 920   wg->alloca_stack_p = new_pos;
 921
 922 #ifdef DEBUG_ALLOCA
 923   printf ("--- alloca (%u, %u) sp @%u fp @%u\n", size, align,
 924           wg->alloca_stack_p, wg->alloca_frame_p);
 925 #endif
 926   return new_pos;
 927 }
 928
 929 /* Initializes a new "alloca frame" in the private segment.
 930    This should be called at all the function entry points in case
 931    the function contains at least one call to alloca.  */
 932
 933 void
 934 __hsail_alloca_push_frame (PHSAWorkItem *wi)
 935 {
 936   volatile PHSAWorkGroup *wg = wi->wg;
 937
 938   /* Store the alloca_frame_p without any alignment padding so
 939      we know exactly where the previous frame ended after popping
 940      it.  */
 941 #ifdef DEBUG_ALLOCA
 942   printf ("--- push frame ");
 943 #endif
 944   uint32_t last_word_offs = __hsail_alloca (4, 1, wi);
 945   memcpy (wg->private_base_ptr + last_word_offs,
 946           (const void *) &wg->alloca_frame_p, 4);
 947   wg->alloca_frame_p = last_word_offs;
 948
 949 #ifdef DEBUG_ALLOCA
 950   printf ("--- sp @%u fp @%u\n", wg->alloca_stack_p, wg->alloca_frame_p);
 951 #endif
 952 }
 953
 954 /* Frees the current "alloca frame" and restores the frame
 955    pointer.
 956    This should be called at all the function return points in case
 957    the function contains at least one call to alloca.  Restores the
 958    alloca stack to the condition it was before pushing the frame
 959    the last time.  */
 960 void
 961 __hsail_alloca_pop_frame (PHSAWorkItem *wi)
 962 {
 963   volatile PHSAWorkGroup *wg = wi->wg;
 964
 965   wg->alloca_stack_p = wg->alloca_frame_p;
 966   memcpy ((void *) &wg->alloca_frame_p,
 967           (const void *) (wg->private_base_ptr + wg->alloca_frame_p), 4);
 968   /* Now frame_p points to the beginning of the previous function's
 969      frame and stack_p to its end.  */
 970
 971   wg->alloca_stack_p += 4;
 972
 973 #ifdef DEBUG_ALLOCA
 974   printf ("--- pop frame sp @%u fp @%u\n", wg->alloca_stack_p,
 975           wg->alloca_frame_p);
 976 #endif
 977 }