#include <signal.h>
#include "hsa.h"
+#include "../../../libgomp/config/gcn/libgomp-gcn.h"
#ifndef HSA_RUNTIME_LIB
#define HSA_RUNTIME_LIB "libhsa-runtime64.so.1"
automatically assign the exit value to *return_value. */
struct kernargs
{
- /* Kernargs. */
- int32_t argc;
- int64_t argv;
- int64_t out_ptr;
- int64_t heap_ptr;
-
- /* Output data. */
- struct output
- {
- int return_value;
- unsigned int next_output;
- struct printf_data
- {
- int written;
- char msg[128];
- int type;
- union
- {
- int64_t ivalue;
- double dvalue;
- char text[128];
- };
- } queue[1024];
- unsigned int consumed;
- } output_data;
+ union {
+ struct {
+ int32_t argc;
+ int64_t argv;
+ } args;
+ struct kernargs_abi abi;
+ };
+ struct output output_data;
};
-struct heap
-{
- int64_t size;
- char data[0];
-} heap;
-
/* Print any console output from the kernel.
We print all entries from "consumed" to the next entry without a "written"
flag, or "next_output" is reached. The buffer is circular, but the
for (int i = 0; i < kernel_argc; i++)
args_size += strlen (kernel_argv[i]) + 1;
+  /* The device stack size can be adjusted via the GCN_STACK_SIZE
+     environment variable. */
+ char *envvar = getenv ("GCN_STACK_SIZE");
+ int stack_size = 1 * 1024 * 1024; /* 1MB default. */
+ if (envvar)
+ {
+ int val = atoi (envvar);
+ if (val)
+ stack_size = val;
+ }
+
/* Allocate device memory for both function parameters and the argv
data. */
struct kernargs *kernargs = device_malloc (sizeof (*kernargs),
XHSA (hsa_fns.hsa_memory_assign_agent_fn (heap, device,
HSA_ACCESS_PERMISSION_RW),
"Assign heap to device agent");
+ void *stack = device_malloc (stack_size, heap_region);
/* Write the data to the target. */
- kernargs->argc = kernel_argc;
- kernargs->argv = (int64_t) args->argv_data;
- kernargs->out_ptr = (int64_t) &kernargs->output_data;
+ kernargs->args.argc = kernel_argc;
+ kernargs->args.argv = (int64_t) args->argv_data;
+ kernargs->abi.out_ptr = (int64_t) &kernargs->output_data;
kernargs->output_data.return_value = 0xcafe0000; /* Default return value. */
kernargs->output_data.next_output = 0;
for (unsigned i = 0; i < (sizeof (kernargs->output_data.queue)
memcpy (&args->strings[offset], kernel_argv[i], arg_len + 1);
offset += arg_len;
}
- kernargs->heap_ptr = (int64_t) heap;
+ kernargs->abi.heap_ptr = (int64_t) heap;
hsa_fns.hsa_memory_copy_fn (&heap->size, &heap_size, sizeof (heap_size));
+ kernargs->abi.arena_ptr = 0;
+ kernargs->abi.stack_ptr = (int64_t) stack;
+ kernargs->abi.stack_size_per_thread = stack_size;
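As an aside, here is a minimal sketch (not part of the patch) of the stack addressing these fields imply: each wavefront gets a contiguous stack_size_per_thread slice starting at stack_ptr, indexed by a flat wavefront number. The prologue code emitted by gcn.cc performs the same arithmetic in RTL; the helper name and parameters below are illustrative only.

#include <stdint.h>

/* Hypothetical helper: where one wavefront's private stack begins, given
   the values written into the kernargs ABI block above.  */
static inline char *
wavefront_stack_base (int64_t stack_ptr, int stack_size_per_thread,
		      int workgroup_id, int waves_per_group, int wave_id)
{
  /* Flat wavefront number across the whole launch.  */
  int thread_id = workgroup_id * waves_per_group + wave_id;

  /* Each wavefront owns one stack_size_per_thread slice.  */
  return (char *) stack_ptr + (int64_t) thread_id * stack_size_per_thread;
}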
/* Run constructors on the GPU. */
run (init_array_kernel, kernargs);
: ISA_UNKNOWN);
gcc_assert (gcn_isa != ISA_UNKNOWN);
- /* The default stack size needs to be small for offload kernels because
- there may be many, many threads. Also, a smaller stack gives a
- measureable performance boost. But, a small stack is insufficient
- for running the testsuite, so we use a larger default for the stand
- alone case. */
- if (stack_size_opt == -1)
- {
- if (flag_openacc || flag_openmp)
- /* 512 bytes per work item = 32kB total. */
- stack_size_opt = 512 * 64;
- else
- /* 1MB total. */
- stack_size_opt = 1048576;
- }
-
/* Reserve 1Kb (somewhat arbitrarily) of LDS space for reduction results and
worker broadcasts. */
if (gang_private_size_opt == -1)
};
static const long default_requested_args
- = (1 << PRIVATE_SEGMENT_BUFFER_ARG)
- | (1 << DISPATCH_PTR_ARG)
+ = (1 << DISPATCH_PTR_ARG)
| (1 << QUEUE_PTR_ARG)
| (1 << KERNARG_SEGMENT_PTR_ARG)
- | (1 << PRIVATE_SEGMENT_WAVE_OFFSET_ARG)
| (1 << WORKGROUP_ID_X_ARG)
| (1 << WORK_ITEM_ID_X_ARG)
| (1 << WORK_ITEM_ID_Y_ARG)
if (AS_LDS_P (as_from) && AS_FLAT_P (as_to))
{
- rtx queue = gen_rtx_REG (DImode,
- cfun->machine->args.reg[QUEUE_PTR_ARG]);
+ /* The high bits of the QUEUE_PTR_ARG register are used by
+ GCN_BUILTIN_FIRST_CALL_THIS_THREAD_P, so mask them out. */
+ rtx queue_reg = gen_rtx_REG (DImode,
+ cfun->machine->args.reg[QUEUE_PTR_ARG]);
+ rtx queue_ptr = gen_reg_rtx (DImode);
+ emit_insn (gen_anddi3 (queue_ptr, queue_reg, GEN_INT (0xffffffffffff)));
rtx group_seg_aperture_hi = gen_rtx_MEM (SImode,
- gen_rtx_PLUS (DImode, queue,
+ gen_rtx_PLUS (DImode, queue_ptr,
gen_int_mode (64, SImode)));
rtx tmp = gen_reg_rtx (DImode);
fixed_regs[cfun->machine->args.reg[DISPATCH_PTR_ARG]] = 1;
fixed_regs[cfun->machine->args.reg[DISPATCH_PTR_ARG] + 1] = 1;
}
+ if (cfun->machine->args.reg[QUEUE_PTR_ARG] >= 0)
+ {
+ fixed_regs[cfun->machine->args.reg[QUEUE_PTR_ARG]] = 1;
+ fixed_regs[cfun->machine->args.reg[QUEUE_PTR_ARG] + 1] = 1;
+ }
if (cfun->machine->args.reg[WORKGROUP_ID_X_ARG] >= 0)
fixed_regs[cfun->machine->args.reg[WORKGROUP_ID_X_ARG]] = 1;
if (cfun->machine->args.reg[WORK_ITEM_ID_X_ARG] >= 0)
}
else
{
- rtx wave_offset = gen_rtx_REG (SImode,
- cfun->machine->args.
- reg[PRIVATE_SEGMENT_WAVE_OFFSET_ARG]);
+ if (TARGET_PACKED_WORK_ITEMS)
+ {
+	  /* v0 contains the X, Y and Z dimensions all in one.
+ Expand them out for ABI compatibility. */
+ /* TODO: implement and use zero_extract. */
+ rtx v1 = gen_rtx_REG (V64SImode, VGPR_REGNO (1));
+ emit_insn (gen_andv64si3 (v1, gen_rtx_REG (V64SImode, VGPR_REGNO (0)),
+ gen_rtx_CONST_INT (VOIDmode, 0x3FF << 10)));
+ emit_insn (gen_lshrv64si3 (v1, v1, gen_rtx_CONST_INT (VOIDmode, 10)));
+ emit_insn (gen_prologue_use (v1));
+
+ rtx v2 = gen_rtx_REG (V64SImode, VGPR_REGNO (2));
+ emit_insn (gen_andv64si3 (v2, gen_rtx_REG (V64SImode, VGPR_REGNO (0)),
+ gen_rtx_CONST_INT (VOIDmode, 0x3FF << 20)));
+ emit_insn (gen_lshrv64si3 (v2, v2, gen_rtx_CONST_INT (VOIDmode, 20)));
+ emit_insn (gen_prologue_use (v2));
+ }
+
+ /* We no longer use the private segment for the stack (it's not
+ accessible to reverse offload), so we must calculate a wave offset
+	 from the grid dimensions and the stack size, which is calculated on
+	 the host and passed in via the kernargs region.
+ See libgomp-gcn.h for details. */
+ rtx wave_offset = gen_rtx_REG (SImode, FIRST_PARM_REG);
+
+ rtx num_waves_mem = gcn_oacc_dim_size (1);
+ rtx num_waves = gen_rtx_REG (SImode, FIRST_PARM_REG+1);
+ set_mem_addr_space (num_waves_mem, ADDR_SPACE_SCALAR_FLAT);
+ emit_move_insn (num_waves, num_waves_mem);
+
+ rtx workgroup_num = gcn_oacc_dim_pos (0);
+ rtx wave_num = gen_rtx_REG (SImode, FIRST_PARM_REG+2);
+      emit_move_insn (wave_num, gcn_oacc_dim_pos (1));
+ rtx thread_id = gen_rtx_REG (SImode, FIRST_PARM_REG+3);
+ emit_insn (gen_mulsi3 (thread_id, num_waves, workgroup_num));
+ emit_insn (gen_addsi3_scc (thread_id, thread_id, wave_num));
+
+ rtx kernarg_reg = gen_rtx_REG (DImode, cfun->machine->args.reg
+ [KERNARG_SEGMENT_PTR_ARG]);
+ rtx stack_size_mem = gen_rtx_MEM (SImode,
+ gen_rtx_PLUS (DImode, kernarg_reg,
+ GEN_INT (52)));
+ set_mem_addr_space (stack_size_mem, ADDR_SPACE_SCALAR_FLAT);
+ emit_move_insn (wave_offset, stack_size_mem);
+
+ emit_insn (gen_mulsi3 (wave_offset, wave_offset, thread_id));
+
+ /* The FLAT_SCRATCH_INIT is not usually needed, but can be enabled
+ via the function attributes. */
if (cfun->machine->args.requested & (1 << FLAT_SCRATCH_INIT_ARG))
{
rtx fs_init_lo =
HOST_WIDE_INT sp_adjust = (offsets->local_vars
+ offsets->outgoing_args_size);
- /* Initialise FP and SP from the buffer descriptor in s[0:3]. */
- emit_move_insn (fp_lo, gen_rtx_REG (SImode, 0));
- emit_insn (gen_andsi3_scc (fp_hi, gen_rtx_REG (SImode, 1),
- gen_int_mode (0xffff, SImode)));
+ /* Initialize FP and SP from space allocated on the host. */
+ rtx stack_addr_mem = gen_rtx_MEM (DImode,
+ gen_rtx_PLUS (DImode, kernarg_reg,
+ GEN_INT (40)));
+ set_mem_addr_space (stack_addr_mem, ADDR_SPACE_SCALAR_FLAT);
+ emit_move_insn (fp, stack_addr_mem);
rtx scc = gen_rtx_REG (BImode, SCC_REG);
emit_insn (gen_addsi3_scalar_carry (fp_lo, fp_lo, wave_offset, scc));
emit_insn (gen_addcsi3_scalar_zero (fp_hi, fp_hi, scc));
emit_insn (gen_prologue_use (gen_rtx_REG (SImode, M0_REG)));
}
- if (TARGET_PACKED_WORK_ITEMS
- && cfun && cfun->machine && !cfun->machine->normal_function)
- {
- /* v0 conatins the X, Y and Z dimensions all in one.
- Expand them out for ABI compatibility. */
- /* TODO: implement and use zero_extract. */
- rtx v1 = gen_rtx_REG (V64SImode, VGPR_REGNO (1));
- emit_insn (gen_andv64si3 (v1, gen_rtx_REG (V64SImode, VGPR_REGNO (0)),
- gen_rtx_CONST_INT (VOIDmode, 0x3FF << 10)));
- emit_insn (gen_lshrv64si3 (v1, v1, gen_rtx_CONST_INT (VOIDmode, 10)));
- emit_insn (gen_prologue_use (v1));
-
- rtx v2 = gen_rtx_REG (V64SImode, VGPR_REGNO (2));
- emit_insn (gen_andv64si3 (v2, gen_rtx_REG (V64SImode, VGPR_REGNO (0)),
- gen_rtx_CONST_INT (VOIDmode, 0x3FF << 20)));
- emit_insn (gen_lshrv64si3 (v2, v2, gen_rtx_CONST_INT (VOIDmode, 20)));
- emit_insn (gen_prologue_use (v2));
- }
-
if (cfun && cfun->machine && !cfun->machine->normal_function && flag_openmp)
{
/* OpenMP kernels have an implicit call to gomp_gcn_enter_kernel. */
cf. struct hsa_kernel_dispatch_packet_s in the HSA doc. */
rtx ptr;
if (cfun->machine->args.reg[DISPATCH_PTR_ARG] >= 0
- && cfun->machine->args.reg[PRIVATE_SEGMENT_BUFFER_ARG] >= 0)
+ && cfun->machine->args.reg[KERNARG_SEGMENT_PTR_ARG] >= 0)
{
- rtx size_rtx = gen_rtx_REG (DImode,
- cfun->machine->args.reg[DISPATCH_PTR_ARG]);
- size_rtx = gen_rtx_MEM (SImode,
- gen_rtx_PLUS (DImode, size_rtx,
- GEN_INT (6*2 + 3*4)));
- size_rtx = gen_rtx_MULT (SImode, size_rtx, GEN_INT (64));
-
- ptr = gen_rtx_REG (DImode,
- cfun->machine->args.reg[PRIVATE_SEGMENT_BUFFER_ARG]);
- ptr = gen_rtx_AND (DImode, ptr, GEN_INT (0x0000ffffffffffff));
- ptr = gen_rtx_PLUS (DImode, ptr, size_rtx);
- if (cfun->machine->args.reg[PRIVATE_SEGMENT_WAVE_OFFSET_ARG] >= 0)
- {
- rtx off;
- off = gen_rtx_REG (SImode,
- cfun->machine->args.reg[PRIVATE_SEGMENT_WAVE_OFFSET_ARG]);
- ptr = gen_rtx_PLUS (DImode, ptr, off);
- }
+ rtx num_waves_mem = gcn_oacc_dim_size (1);
+ rtx num_waves = gen_reg_rtx (SImode);
+ set_mem_addr_space (num_waves_mem, ADDR_SPACE_SCALAR_FLAT);
+ emit_move_insn (num_waves, num_waves_mem);
+
+ rtx workgroup_num = gcn_oacc_dim_pos (0);
+ rtx wave_num = gen_reg_rtx (SImode);
+	  emit_move_insn (wave_num, gcn_oacc_dim_pos (1));
+
+ rtx thread_id = gen_reg_rtx (SImode);
+ emit_insn (gen_mulsi3 (thread_id, num_waves, workgroup_num));
+ emit_insn (gen_addsi3_scc (thread_id, thread_id, wave_num));
+
+ rtx kernarg_reg = gen_rtx_REG (DImode, cfun->machine->args.reg
+ [KERNARG_SEGMENT_PTR_ARG]);
+ rtx stack_size_mem = gen_rtx_MEM (SImode,
+ gen_rtx_PLUS (DImode,
+ kernarg_reg,
+ GEN_INT (52)));
+ set_mem_addr_space (stack_size_mem, ADDR_SPACE_SCALAR_FLAT);
+ rtx stack_size = gen_reg_rtx (SImode);
+ emit_move_insn (stack_size, stack_size_mem);
+
+ rtx wave_offset = gen_reg_rtx (SImode);
+ emit_insn (gen_mulsi3 (wave_offset, stack_size, thread_id));
+
+ rtx stack_limit_offset = gen_reg_rtx (SImode);
+ emit_insn (gen_addsi3 (stack_limit_offset, wave_offset,
+ stack_size));
+
+ rtx stack_limit_offset_di = gen_reg_rtx (DImode);
+ emit_move_insn (gen_rtx_SUBREG (SImode, stack_limit_offset_di, 4),
+ const0_rtx);
+ emit_move_insn (gen_rtx_SUBREG (SImode, stack_limit_offset_di, 0),
+ stack_limit_offset);
+
+ rtx stack_addr_mem = gen_rtx_MEM (DImode,
+ gen_rtx_PLUS (DImode,
+ kernarg_reg,
+ GEN_INT (40)));
+ set_mem_addr_space (stack_addr_mem, ADDR_SPACE_SCALAR_FLAT);
+ rtx stack_addr = gen_reg_rtx (DImode);
+ emit_move_insn (stack_addr, stack_addr_mem);
+
+ ptr = gen_rtx_PLUS (DImode, stack_addr, stack_limit_offset_di);
}
else
{
whether it was the first call. */
rtx result = gen_reg_rtx (BImode);
emit_move_insn (result, const0_rtx);
- if (cfun->machine->args.reg[PRIVATE_SEGMENT_BUFFER_ARG] >= 0)
+ if (cfun->machine->args.reg[QUEUE_PTR_ARG] >= 0)
{
rtx not_first = gen_label_rtx ();
rtx reg = gen_rtx_REG (DImode,
- cfun->machine->args.reg[PRIVATE_SEGMENT_BUFFER_ARG]);
+ cfun->machine->args.reg[QUEUE_PTR_ARG]);
reg = gcn_operand_part (DImode, reg, 1);
rtx cmp = force_reg (SImode,
gen_rtx_LSHIFTRT (SImode, reg, GEN_INT (16)));
"\t .amdhsa_reserve_vcc\t1\n"
"\t .amdhsa_reserve_flat_scratch\t0\n"
"\t .amdhsa_reserve_xnack_mask\t%i\n"
- "\t .amdhsa_private_segment_fixed_size\t%i\n"
+ "\t .amdhsa_private_segment_fixed_size\t0\n"
"\t .amdhsa_group_segment_fixed_size\t%u\n"
"\t .amdhsa_float_denorm_mode_32\t3\n"
"\t .amdhsa_float_denorm_mode_16_64\t3\n",
vgpr,
sgpr,
xnack_enabled,
- /* workitem_private_segment_bytes_size needs to be
- one 64th the wave-front stack size. */
- stack_size_opt / 64,
LDS_SIZE);
if (gcn_arch == PROCESSOR_GFX90a)
fprintf (file,
" .kernarg_segment_size: %i\n"
" .kernarg_segment_align: %i\n"
" .group_segment_fixed_size: %u\n"
- " .private_segment_fixed_size: %i\n"
+ " .private_segment_fixed_size: 0\n"
" .wavefront_size: 64\n"
" .sgpr_count: %i\n"
" .vgpr_count: %i\n"
cfun->machine->kernarg_segment_byte_size,
cfun->machine->kernarg_segment_alignment,
LDS_SIZE,
- stack_size_opt / 64,
sgpr, vgpr);
if (gcn_arch == PROCESSOR_GFX90a)
fprintf (file, " .agpr_count: 0\n"); // AGPRs are not used, yet
\f
#define FIXED_REGISTERS { \
/* Scalars. */ \
- 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, \
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, \
/* fp sp lr. */ \
1, 1, 0, 0, 0, 0, 1, 1, 0, 0, \
/* exec_save, cc_save */ \
mstack-size=
Target RejectNegative Joined UInteger Var(stack_size_opt) Init(-1)
--mstack-size=<number> Set the private segment size per wave-front, in bytes.
+Obsolete; use GCN_STACK_SIZE at runtime.
int gang_private_size_opt = -1
-/* { dg-xfail-if "can cause stack underflow" { nios2-*-* } } */
+/* { dg-xfail-run-if "can cause stack underflow" { nios2-*-* amdgcn-*-* } } */
/* { dg-require-effective-target untyped_assembly } */
#define INTEGER_ARG 5
/* { dg-do run } */
/* { dg-require-effective-target untyped_assembly } */
+/* { dg-xfail-run-if "can cause stack underflow" { amdgcn-*-* } } */
#define INTEGER_ARG 5
/* { dg-additional-options "-mno-mmx" { target { { i?86-*-* x86_64-*-* } && ia32 } } } */
/* { dg-do run } */
/* { dg-require-effective-target untyped_assembly } */
+/* { dg-xfail-run-if "can cause stack underflow" { amdgcn-*-* } } */
extern void abort (void);
/* { dg-do run } */
/* { dg-require-effective-target untyped_assembly } */
+/* { dg-xfail-run-if "can cause stack underflow" { amdgcn-*-* } } */
#define INTEGER_ARG 5
/* { dg-do run } */
/* { dg-additional-options "-fgnu89-inline" } */
/* { dg-require-effective-target untyped_assembly } */
+/* { dg-xfail-run-if "can cause stack underflow" { amdgcn-*-* } } */
extern void abort (void);
to the plugin interface defined in libgomp/libgomp.h. */
#define GOMP_VERSION 2
#define GOMP_VERSION_NVIDIA_PTX 1
-#define GOMP_VERSION_GCN 2
+#define GOMP_VERSION_GCN 3
#define GOMP_VERSION_PACK(LIB, DEV) (((LIB) << 16) | (DEV))
#define GOMP_VERSION_LIB(PACK) (((PACK) >> 16) & 0xffff)
#ifndef LIBGOMP_GCN_H
#define LIBGOMP_GCN_H 1
+#define DEFAULT_GCN_STACK_SIZE (32*1024)
+#define DEFAULT_TEAM_ARENA_SIZE (64*1024)
+
+struct heap
+{
+ int64_t size;
+ char data[0];
+};
+
+/* This struct defines the (unofficial) ABI-defined values the compiler
+   expects to find in the first bytes of the kernargs space.
+ The plugin may choose to place additional data later in the kernargs
+ memory allocation, but those are not in any fixed location. */
+struct kernargs_abi {
+ /* Leave space for the real kernel arguments.
+ OpenACC and OpenMP only use one pointer. */
+ int64_t dummy1;
+ int64_t dummy2;
+
+ /* A pointer to struct output, below, for console output data. */
+ int64_t out_ptr; /* Offset 16. */
+
+ /* A pointer to struct heap. */
+ int64_t heap_ptr; /* Offset 24. */
+
+ /* A pointer to the ephemeral memory areas.
+ The team arena is only needed for OpenMP.
+ Each should have enough space for all the teams and threads. */
+ int64_t arena_ptr; /* Offset 32. */
+ int64_t stack_ptr; /* Offset 40. */
+ int arena_size_per_team; /* Offset 48. */
+ int stack_size_per_thread; /* Offset 52. */
+};
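As a purely illustrative sanity check (not part of the patch), the fixed offsets noted in the comments above are exactly what the GCN back end hard-codes when it loads these fields relative to the kernarg segment pointer (GEN_INT (40) and GEN_INT (52) in gcn.cc):

#include <stddef.h>

/* Assumed layout checks; the struct definition above must stay in sync
   with the byte offsets the compiler reads.  */
_Static_assert (offsetof (struct kernargs_abi, stack_ptr) == 40,
		"stack_ptr expected at kernargs offset 40");
_Static_assert (offsetof (struct kernargs_abi, stack_size_per_thread) == 52,
		"stack_size_per_thread expected at kernargs offset 52");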
+
/* This struct is also used in Newlib's libc/sys/amdgcn/write.c. */
struct output
{
/* Initialize the team arena for optimized memory allocation.
The arena has been allocated on the host side, and the address
passed in via the kernargs. Each team takes a small slice of it. */
- void **kernargs = (void**) __builtin_gcn_kernarg_ptr ();
- void *team_arena = (kernargs[4] + TEAM_ARENA_SIZE*teamid);
+ struct kernargs_abi *kernargs =
+ (struct kernargs_abi*) __builtin_gcn_kernarg_ptr ();
+ void *team_arena = ((void*)kernargs->arena_ptr
+ + kernargs->arena_size_per_team * teamid);
void * __lds *arena_start = (void * __lds *)TEAM_ARENA_START;
void * __lds *arena_free = (void * __lds *)TEAM_ARENA_FREE;
void * __lds *arena_end = (void * __lds *)TEAM_ARENA_END;
*arena_start = team_arena;
*arena_free = team_arena;
- *arena_end = team_arena + TEAM_ARENA_SIZE;
+ *arena_end = team_arena + kernargs->arena_size_per_team;
/* Allocate and initialize the team-local-storage data. */
struct gomp_thread *thrs = team_malloc_cleared (sizeof (*thrs)
/* Optimized allocators for team-specific data that will die with the team. */
#ifdef __AMDGCN__
+#include "libgomp-gcn.h"
/* The arena is initialized in config/gcn/team.c. */
-#define TEAM_ARENA_SIZE 64*1024 /* Must match the value in plugin-gcn.c. */
#define TEAM_ARENA_START 16 /* LDS offset of free pointer. */
#define TEAM_ARENA_FREE 24 /* LDS offset of free pointer. */
#define TEAM_ARENA_END 32 /* LDS offset of end pointer. */
{
/* While this is experimental, let's make sure we know when OOM
happens. */
- const char msg[] = "GCN team arena exhausted\n";
+ const char msg[] = "GCN team arena exhausted;"
+ " configure with GCN_TEAM_ARENA_SIZE=bytes\n";
write (2, msg, sizeof(msg)-1);
/* Fall back to using the heap (slowly). */
in libgomp target code. */
struct kernargs {
- /* Leave space for the real kernel arguments.
- OpenACC and OpenMP only use one pointer. */
- int64_t dummy1;
- int64_t dummy2;
-
- /* A pointer to struct output, below, for console output data. */
- int64_t out_ptr;
-
- /* A pointer to struct heap, below. */
- int64_t heap_ptr;
-
- /* A pointer to an ephemeral memory arena.
- Only needed for OpenMP. */
- int64_t arena_ptr;
+ struct kernargs_abi abi;
/* Output data. */
struct output output_data;
/* The HSA memory region from which to allocate device data. */
hsa_region_t data_region;
- /* Allocated team arenas. */
- struct team_arena_list *team_arena_list;
- pthread_mutex_t team_arena_write_lock;
+ /* Allocated ephemeral memories (team arena and stack space). */
+ struct ephemeral_memories_list *ephemeral_memories_list;
+ pthread_mutex_t ephemeral_memories_write_lock;
/* Read-write lock that protects kernels which are running or about to be run
from interference with loading and unloading of images. Needs to be
};
/* A linked list of memory arenas allocated on the device.
- These are only used by OpenMP, as a means to optimize per-team malloc. */
+ These are used by OpenMP, as a means to optimize per-team malloc,
+ and for host-accessible stack space. */
-struct team_arena_list
+struct ephemeral_memories_list
{
- struct team_arena_list *next;
+ struct ephemeral_memories_list *next;
- /* The number of teams determines the size of the allocation. */
- int num_teams;
- /* The device address of the arena itself. */
- void *arena;
- /* A flag to prevent two asynchronous kernels trying to use the same arena.
+ /* The size is determined by the number of teams and threads. */
+ size_t size;
+  /* The device address of the allocated memory. */
+ void *address;
+ /* A flag to prevent two asynchronous kernels trying to use the same memory.
The mutex is locked until the kernel exits. */
pthread_mutex_t in_use;
};
char driver_version_s[30];
};
-/* Format of the on-device heap.
-
- This must match the definition in Newlib and gcn-run. */
-
-struct heap {
- int64_t size;
- char data[0];
-};
-
/* }}} */
/* {{{ Global variables */
static size_t gcn_kernel_heap_size = DEFAULT_GCN_HEAP_SIZE;
+/* Ephemeral memory sizes for each kernel launch. */
+
+static int team_arena_size = DEFAULT_TEAM_ARENA_SIZE;
+static int stack_size = DEFAULT_GCN_STACK_SIZE;
+
/* Flag to decide whether print to stderr information about what is going on.
Set in init_debug depending on environment variables. */
fprintf (stderr, "%*squeue: %p\n", indent, "", dispatch->queue);
fprintf (stderr, "%*skernarg_address: %p\n", indent, "", kernargs);
fprintf (stderr, "%*sheap address: %p\n", indent, "",
- (void*)kernargs->heap_ptr);
- fprintf (stderr, "%*sarena address: %p\n", indent, "",
- (void*)kernargs->arena_ptr);
+ (void*)kernargs->abi.heap_ptr);
+ fprintf (stderr, "%*sarena address: %p (%d bytes per workgroup)\n", indent,
+ "", (void*)kernargs->abi.arena_ptr,
+ kernargs->abi.arena_size_per_team);
+ fprintf (stderr, "%*sstack address: %p (%d bytes per wavefront)\n", indent,
+ "", (void*)kernargs->abi.stack_ptr,
+ kernargs->abi.stack_size_per_thread);
fprintf (stderr, "%*sobject: %lu\n", indent, "", dispatch->object);
fprintf (stderr, "%*sprivate_segment_size: %u\n", indent, "",
dispatch->private_segment_size);
if (tmp)
gcn_kernel_heap_size = tmp;
}
+
+ const char *arena = secure_getenv ("GCN_TEAM_ARENA_SIZE");
+ if (arena)
+ {
+ int tmp = atoi (arena);
+ if (tmp)
+	team_arena_size = tmp;
+ }
+
+ const char *stack = secure_getenv ("GCN_STACK_SIZE");
+ if (stack)
+ {
+ int tmp = atoi (stack);
+ if (tmp)
+	stack_size = tmp;
+ }
}
/* Return malloc'd string with name of SYMBOL. */
/* }}} */
/* {{{ Run */
-/* Create or reuse a team arena.
+/* Create or reuse a team arena and stack space.
Team arenas are used by OpenMP to avoid calling malloc multiple times
while setting up each team. This is purely a performance optimization.
- Allocating an arena also costs performance, albeit on the host side, so
- this function will reuse an existing arena if a large enough one is idle.
- The arena is released, but not deallocated, when the kernel exits. */
+ The stack space is used by all kernels. We must allocate it in such a
+   way that the reverse offload implementation can access the data.
-static void *
-get_team_arena (struct agent_info *agent, int num_teams)
+ Allocating this memory costs performance, so this function will reuse an
+ existing allocation if a large enough one is idle.
+   The memory lock is released, but the memory is not deallocated, when
+   the kernel exits. */
+
+static void
+configure_ephemeral_memories (struct kernel_info *kernel,
+ struct kernargs_abi *kernargs, int num_teams,
+ int num_threads)
{
- struct team_arena_list **next_ptr = &agent->team_arena_list;
- struct team_arena_list *item;
+ struct agent_info *agent = kernel->agent;
+ struct ephemeral_memories_list **next_ptr = &agent->ephemeral_memories_list;
+ struct ephemeral_memories_list *item;
+
+ int actual_arena_size = (kernel->kind == KIND_OPENMP
+ ? team_arena_size : 0);
+ int actual_arena_total_size = actual_arena_size * num_teams;
+ size_t size = (actual_arena_total_size
+ + num_teams * num_threads * stack_size);
for (item = *next_ptr; item; next_ptr = &item->next, item = item->next)
{
- if (item->num_teams < num_teams)
+ if (item->size < size)
continue;
- if (pthread_mutex_trylock (&item->in_use))
- continue;
-
- return item->arena;
+ if (pthread_mutex_trylock (&item->in_use) == 0)
+ break;
}
- GCN_DEBUG ("Creating a new arena for %d teams\n", num_teams);
-
- if (pthread_mutex_lock (&agent->team_arena_write_lock))
+ if (!item)
{
- GOMP_PLUGIN_error ("Could not lock a GCN agent program mutex");
- return false;
- }
- item = malloc (sizeof (*item));
- item->num_teams = num_teams;
- item->next = NULL;
- *next_ptr = item;
+ GCN_DEBUG ("Creating a new %sstack for %d teams with %d threads"
+ " (%zd bytes)\n", (actual_arena_size ? "arena and " : ""),
+ num_teams, num_threads, size);
- if (pthread_mutex_init (&item->in_use, NULL))
- {
- GOMP_PLUGIN_error ("Failed to initialize a GCN team arena write mutex");
- return false;
- }
- if (pthread_mutex_lock (&item->in_use))
- {
- GOMP_PLUGIN_error ("Could not lock a GCN agent program mutex");
- return false;
- }
- if (pthread_mutex_unlock (&agent->team_arena_write_lock))
- {
- GOMP_PLUGIN_error ("Could not unlock a GCN agent program mutex");
- return false;
- }
+ if (pthread_mutex_lock (&agent->ephemeral_memories_write_lock))
+ {
+ GOMP_PLUGIN_error ("Could not lock a GCN agent program mutex");
+ return;
+ }
+ item = malloc (sizeof (*item));
+ item->size = size;
+ item->next = NULL;
+ *next_ptr = item;
- const int TEAM_ARENA_SIZE = 64*1024; /* Must match libgomp.h. */
- hsa_status_t status;
- status = hsa_fns.hsa_memory_allocate_fn (agent->data_region,
- TEAM_ARENA_SIZE*num_teams,
- &item->arena);
- if (status != HSA_STATUS_SUCCESS)
- hsa_fatal ("Could not allocate memory for GCN kernel arena", status);
- status = hsa_fns.hsa_memory_assign_agent_fn (item->arena, agent->id,
- HSA_ACCESS_PERMISSION_RW);
- if (status != HSA_STATUS_SUCCESS)
- hsa_fatal ("Could not assign arena memory to device", status);
+ if (pthread_mutex_init (&item->in_use, NULL))
+ {
+ GOMP_PLUGIN_error ("Failed to initialize a GCN memory write mutex");
+ return;
+ }
+ if (pthread_mutex_lock (&item->in_use))
+ {
+ GOMP_PLUGIN_error ("Could not lock a GCN agent program mutex");
+ return;
+ }
+ if (pthread_mutex_unlock (&agent->ephemeral_memories_write_lock))
+ {
+ GOMP_PLUGIN_error ("Could not unlock a GCN agent program mutex");
+ return;
+ }
+
+ hsa_status_t status;
+ status = hsa_fns.hsa_memory_allocate_fn (agent->data_region, size,
+ &item->address);
+ if (status != HSA_STATUS_SUCCESS)
+ hsa_fatal ("Could not allocate memory for GCN kernel arena", status);
+ status = hsa_fns.hsa_memory_assign_agent_fn (item->address, agent->id,
+ HSA_ACCESS_PERMISSION_RW);
+ if (status != HSA_STATUS_SUCCESS)
+ hsa_fatal ("Could not assign arena & stack memory to device", status);
+ }
- return item->arena;
+ kernargs->arena_ptr = (actual_arena_total_size
+ ? (uint64_t)item->address
+ : 0);
+ kernargs->stack_ptr = (uint64_t)item->address + actual_arena_total_size;
+ kernargs->arena_size_per_team = actual_arena_size;
+ kernargs->stack_size_per_thread = stack_size;
}
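To make the sizing rule above concrete, a small worked example under assumed launch parameters (4 teams of 16 wavefronts each, with the libgomp-gcn.h defaults of 64 KiB arena per team and 32 KiB stack per wavefront); the numbers are hypothetical:

#include <stdio.h>

int
main (void)
{
  int num_teams = 4, num_threads = 16;
  long arena = 64L * 1024 * num_teams;			/* 262144 bytes */
  long stacks = 32L * 1024 * num_teams * num_threads;	/* 2097152 bytes */
  /* configure_ephemeral_memories would make one 2359296-byte allocation
     covering both: the team arenas first, the stacks after them.  */
  printf ("%ld bytes\n", arena + stacks);
  return 0;
}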
-/* Mark a team arena available for reuse. */
+/* Mark an ephemeral memory space available for reuse. */
static void
-release_team_arena (struct agent_info* agent, void *arena)
+release_ephemeral_memories (struct agent_info* agent, void *address)
{
- struct team_arena_list *item;
+ struct ephemeral_memories_list *item;
- for (item = agent->team_arena_list; item; item = item->next)
+ for (item = agent->ephemeral_memories_list; item; item = item->next)
{
- if (item->arena == arena)
+ if (item->address == address)
{
if (pthread_mutex_unlock (&item->in_use))
GOMP_PLUGIN_error ("Could not unlock a GCN agent program mutex");
/* Clean up all the allocated team arenas. */
static bool
-destroy_team_arenas (struct agent_info *agent)
+destroy_ephemeral_memories (struct agent_info *agent)
{
- struct team_arena_list *item, *next;
+ struct ephemeral_memories_list *item, *next;
- for (item = agent->team_arena_list; item; item = next)
+ for (item = agent->ephemeral_memories_list; item; item = next)
{
next = item->next;
- hsa_fns.hsa_memory_free_fn (item->arena);
+ hsa_fns.hsa_memory_free_fn (item->address);
if (pthread_mutex_destroy (&item->in_use))
{
- GOMP_PLUGIN_error ("Failed to destroy a GCN team arena mutex");
+ GOMP_PLUGIN_error ("Failed to destroy a GCN memory mutex");
return false;
}
free (item);
}
- agent->team_arena_list = NULL;
+ agent->ephemeral_memories_list = NULL;
return true;
}
the necessary device signals and memory allocations. */
static struct kernel_dispatch *
-create_kernel_dispatch (struct kernel_info *kernel, int num_teams)
+create_kernel_dispatch (struct kernel_info *kernel, int num_teams,
+ int num_threads)
{
struct agent_info *agent = kernel->agent;
struct kernel_dispatch *shadow
struct kernargs *kernargs = shadow->kernarg_address;
/* Zero-initialize the output_data (minimum needed). */
- kernargs->out_ptr = (int64_t)&kernargs->output_data;
+ kernargs->abi.out_ptr = (int64_t)&kernargs->output_data;
kernargs->output_data.next_output = 0;
for (unsigned i = 0;
i < (sizeof (kernargs->output_data.queue)
kernargs->output_data.consumed = 0;
/* Pass in the heap location. */
- kernargs->heap_ptr = (int64_t)kernel->module->heap;
+ kernargs->abi.heap_ptr = (int64_t)kernel->module->heap;
- /* Create an arena. */
- if (kernel->kind == KIND_OPENMP)
- kernargs->arena_ptr = (int64_t)get_team_arena (agent, num_teams);
- else
- kernargs->arena_ptr = 0;
+ /* Create the ephemeral memory spaces. */
+ configure_ephemeral_memories (kernel, &kernargs->abi, num_teams, num_threads);
/* Ensure we can recognize unset return values. */
kernargs->output_data.return_value = 0xcafe0000;
GCN_DEBUG ("Released kernel dispatch: %p\n", shadow);
struct kernargs *kernargs = shadow->kernarg_address;
- void *arena = (void *)kernargs->arena_ptr;
- if (arena)
- release_team_arena (shadow->agent, arena);
+ void *addr = (void *)kernargs->abi.arena_ptr;
+ if (!addr)
+ addr = (void *)kernargs->abi.stack_ptr;
+ release_ephemeral_memories (shadow->agent, addr);
hsa_fns.hsa_memory_free_fn (shadow->kernarg_address);
packet->workgroup_size_z);
struct kernel_dispatch *shadow
- = create_kernel_dispatch (kernel, packet->grid_size_x);
+ = create_kernel_dispatch (kernel, packet->grid_size_x,
+ packet->grid_size_z);
shadow->queue = command_q;
if (debug)
GOMP_PLUGIN_error ("Failed to initialize a GCN agent queue mutex");
return false;
}
- if (pthread_mutex_init (&agent->team_arena_write_lock, NULL))
+ if (pthread_mutex_init (&agent->ephemeral_memories_write_lock, NULL))
{
GOMP_PLUGIN_error ("Failed to initialize a GCN team arena write mutex");
return false;
}
agent->async_queues = NULL;
agent->omp_async_queue = NULL;
- agent->team_arena_list = NULL;
+ agent->ephemeral_memories_list = NULL;
uint32_t queue_size;
hsa_status_t status;
agent->module = NULL;
}
- if (!destroy_team_arenas (agent))
+ if (!destroy_ephemeral_memories (agent))
return false;
if (!destroy_hsa_program (agent))
GOMP_PLUGIN_error ("Failed to destroy a GCN agent queue mutex");
return false;
}
- if (pthread_mutex_destroy (&agent->team_arena_write_lock))
+ if (pthread_mutex_destroy (&agent->ephemeral_memories_write_lock))
{
- GOMP_PLUGIN_error ("Failed to destroy a GCN team arena mutex");
+ GOMP_PLUGIN_error ("Failed to destroy a GCN memory mutex");
return false;
}
agent->initialized = false;