# Test executables for the OpenCL implementation.
# Lines prefixed with '+' come from a patch adding the new test_eot target;
# every test links against the cl_test helper library and libm.
ADD_EXECUTABLE(test_private_memory tests/test_private_memory.c)
ADD_EXECUTABLE(test_constant_memory tests/test_constant_memory.c)
ADD_EXECUTABLE(test_memory_leak tests/test_memory_leak.c)
+ADD_EXECUTABLE(test_eot tests/test_eot.c)
ADD_EXECUTABLE(mandelbrot tests/mandelbrot.c)
ADD_EXECUTABLE(mersenneTwister tests/mersenneTwister.c)
ADD_EXECUTABLE(blackscholes tests/blackscholes.c)
TARGET_LINK_LIBRARIES(test_constant_memory cl_test m)
TARGET_LINK_LIBRARIES(test_memory_leak cl_test m)
TARGET_LINK_LIBRARIES(test_write_only cl_test m)
+TARGET_LINK_LIBRARIES(test_eot cl_test m)
TARGET_LINK_LIBRARIES(mandelbrot cl_test m)
TARGET_LINK_LIBRARIES(mersenneTwister cl_test m)
TARGET_LINK_LIBRARIES(blackscholes cl_test m)
#include <string.h>
#include <assert.h>
-#ifdef _PLASMA
-#define PRINTF(x) printf("CL API => cl%s\n", x)
-#else
-#define PRINTF(x)
-#endif
-
cl_int
clGetPlatformIDs(cl_uint num_entries,
cl_platform_id * platforms,
void * user_data,
cl_int * errcode_ret)
{
- PRINTF("CreateContext");
return cl_create_context(properties,
num_devices,
devices,
cl_int
clReleaseContext(cl_context context)
{
- PRINTF("ReleaseContext");
-
cl_int err = CL_SUCCESS;
CHECK_CONTEXT (context);
cl_context_delete(context);
cl_command_queue_properties properties,
cl_int * errcode_ret)
{
- PRINTF("CreateCommandQueue");
cl_command_queue queue = NULL;
cl_int err = CL_SUCCESS;
CHECK_CONTEXT (context);
cl_int
clReleaseCommandQueue(cl_command_queue command_queue)
{
- PRINTF("ReleaseCommandQueue");
cl_int err = CL_SUCCESS;
CHECK_QUEUE (command_queue);
cl_command_queue_delete(command_queue);
void * host_ptr,
cl_int * errcode_ret)
{
- PRINTF("CreateBuffer");
cl_mem mem = NULL;
cl_int err = CL_SUCCESS;
CHECK_CONTEXT (context);
void * host_ptr,
cl_int * errcode_ret)
{
- PRINTF("CreateImage2D");
cl_mem mem = NULL;
cl_int err = CL_SUCCESS;
CHECK_CONTEXT (context);
cl_int * binary_status,
cl_int * errcode_ret)
{
- PRINTF("CreateProgramWithBinary");
cl_program program = NULL;
cl_int err = CL_SUCCESS;
cl_int
clReleaseProgram(cl_program program)
{
- PRINTF("clReleaseProgram");
cl_int err = CL_SUCCESS;
CHECK_PROGRAM (program);
cl_program_delete(program);
const char * kernel_name,
cl_int * errcode_ret)
{
- PRINTF("CreateKernel");
cl_kernel kernel = NULL;
cl_int err = CL_SUCCESS;
cl_int
clReleaseKernel(cl_kernel kernel)
{
- PRINTF("clReleaseKernel");
cl_int err = CL_SUCCESS;
CHECK_KERNEL(kernel);
cl_kernel_delete(kernel);
const cl_event * event_wait_list,
cl_event * event)
{
- PRINTF("EnqueueNDRangeKernel");
size_t fixed_global_off[] = {0,0,0};
size_t fixed_global_sz[] = {1,1,1};
size_t fixed_local_sz[] = {16,1,1};
atomic_inc(&queue->ref_n);
}
+/* Bind the surfaces a kernel needs before execution and allocate a small
+ * buffer object used to synchronize with batch completion.
+ *
+ * queue    - command queue whose gpgpu state receives the bindings
+ * k        - kernel being launched
+ * curbe    - constant buffer to patch with image parameters (may be NULL)
+ * local    - out: SLM surface bo (NULL when no SLM is needed)
+ * priv     - out: private memory surface bo (NULL when none is needed)
+ * scratch  - out: scratch surface bo (NULL when none is needed)
+ * local_sz - requested SLM size in bytes
+ *
+ * The sync bo ownership is transferred to queue->last_batch; the previous
+ * last_batch bo (if any) is unreferenced. Always returns CL_SUCCESS.
+ *
+ * NOTE(review): the large #if 0 region below redeclares ctx/gpgpu/bufmgr/err
+ * and must be merged with the declarations above before being re-enabled.
+ */
+ LOCAL cl_int
+cl_command_queue_bind_surface(cl_command_queue queue,
+                              cl_kernel k,
+                              char *curbe,
+                              drm_intel_bo **local,
+                              drm_intel_bo **priv,
+                              drm_intel_bo **scratch,
+                              uint32_t local_sz)
+{
+  cl_context ctx = queue->ctx;
+  intel_gpgpu_t *gpgpu = queue->gpgpu;
+  drm_intel_bufmgr *bufmgr = cl_context_get_intel_bufmgr(ctx);
+  drm_intel_bo *sync_bo = NULL;
+  cl_int err = CL_SUCCESS;
+#if 0
+  cl_context ctx = queue->ctx;
+  intel_gpgpu_t *gpgpu = queue->gpgpu;
+  drm_intel_bufmgr *bufmgr = cl_context_get_intel_bufmgr(ctx);
+  cl_mem mem = NULL;
+  drm_intel_bo *bo = NULL, *sync_bo = NULL;
+  const size_t max_thread = ctx->device->max_compute_unit;
+  cl_int err = CL_SUCCESS;
+  uint32_t i, index;
+
+  /* Bind user defined surface */
+  for (i = 0; i < k->arg_info_n; ++i) {
+    assert(k->arg_info[i].offset % SURFACE_SZ == 0);
+    index = k->arg_info[i].offset / SURFACE_SZ;
+    mem = (cl_mem) k->args[k->arg_info[i].arg_index];
+    assert(index != MAX_SURFACES - 1);
+    CHECK_MEM(mem);
+    bo = mem->bo;
+    assert(bo);
+    if (mem->is_image) {
+      const int32_t w = mem->w, h = mem->h, pitch = mem->pitch;
+      const uint32_t fmt = mem->intel_fmt;
+      gpgpu_tiling_t tiling = GPGPU_NO_TILE;
+      if (mem->tiling == CL_TILE_X)
+        tiling = GPGPU_TILE_X;
+      else if (mem->tiling == CL_TILE_Y)
+        tiling = GPGPU_TILE_Y;
+      gpgpu_bind_image2D(gpgpu, index, bo, fmt, w, h, pitch, tiling);
+
+      /* Copy the image parameters (width, height) in the constant buffer if the
+       * user requests them
+       */
+      cl_kernel_copy_image_parameters(k, mem, index, curbe);
+    } else
+      gpgpu_bind_buf(gpgpu, index, bo, cc_llc_l3);
+  }
+
+  /* Allocate the constant surface (if any) */
+  if (k->const_bo) {
+    assert(k->const_bo_index != MAX_SURFACES - 1);
+    gpgpu_bind_buf(gpgpu, k->const_bo_index,
+                   k->const_bo,
+                   cc_llc_l3);
+  }
+
+  /* Allocate local surface needed for SLM and bind it */
+  if (local && local_sz != 0) {
+    const size_t sz = 16 * local_sz; /* XXX 16 == maximum barrier number */
+    assert(k->patch.local_surf.offset % SURFACE_SZ == 0);
+    index = k->patch.local_surf.offset / SURFACE_SZ;
+    assert(index != MAX_SURFACES - 1);
+    *local = drm_intel_bo_alloc(bufmgr, "CL local surface", sz, 64);
+    gpgpu_bind_buf(gpgpu, index, *local, cc_llc_l3);
+  }
+  else if (local)
+    *local = NULL;
+
+  /* Allocate private surface and bind it */
+  if (priv && k->patch.private_surf.size != 0) {
+    const size_t sz = max_thread *
+                      k->patch.private_surf.size *
+                      k->patch.exec_env.largest_compiled_simd_sz;
+    // assert(k->patch.exec_env.largest_compiled_simd_sz == 16);
+    assert(k->patch.private_surf.offset % SURFACE_SZ == 0);
+    index = k->patch.private_surf.offset / SURFACE_SZ;
+    assert(index != MAX_SURFACES - 1);
+    *priv = drm_intel_bo_alloc(bufmgr, "CL private surface", sz, 64);
+    gpgpu_bind_buf(gpgpu, index, *priv, cc_llc_l3);
+  }
+  else if(priv)
+    *priv = NULL;
+
+  /* Allocate scratch surface and bind it */
+  if (scratch && k->patch.scratch.size != 0) {
+    const size_t sz = max_thread * /* XXX is it given per lane ??? */
+                      k->patch.scratch.size *
+                      k->patch.exec_env.largest_compiled_simd_sz;
+    // assert(k->patch.exec_env.largest_compiled_simd_sz == 16);
+    assert(k->patch.scratch.offset % SURFACE_SZ == 0);
+    /* Compute the binding table index BEFORE checking it: the previous code
+     * asserted on the stale index left over from the private surface above. */
+    index = k->patch.scratch.offset / SURFACE_SZ;
+    assert(index != MAX_SURFACES - 1);
+    *scratch = drm_intel_bo_alloc(bufmgr, "CL scratch surface", sz, 64);
+    gpgpu_bind_buf(gpgpu, index, *scratch, cc_llc_l3);
+  }
+  else if (scratch)
+    *scratch = NULL;
+#endif
+  /* Now bind a bo used for synchronization */
+  sync_bo = drm_intel_bo_alloc(bufmgr, "sync surface", 64, 64);
+  gpgpu_bind_buf(gpgpu, MAX_SURFACES-1, sync_bo, cc_llc_l3);
+  /* The queue keeps the only reference to the sync bo; drop the previous one */
+  if (queue->last_batch != NULL)
+    drm_intel_bo_unreference(queue->last_batch);
+  queue->last_batch = sync_bo;
+
+// error:
+  assert(err == CL_SUCCESS); /* Cannot fail here */
+  return err;
+}
+
#if USE_FULSIM
extern void drm_intel_bufmgr_gem_stop_aubfile(drm_intel_bufmgr*);
extern void drm_intel_bufmgr_gem_set_aubfile(drm_intel_bufmgr*, FILE*);
/* Upper bound (in bytes) of the batch buffer needed to launch one kernel:
 * 256 bytes for the commands plus 32 bytes of slack. The patch removes the
 * 4KB _PLASMA-specific size. The kernel argument is currently unused. */
static INLINE size_t
cl_kernel_compute_batch_sz(cl_kernel k)
{
-#ifdef _PLASMA
- size_t sz = 0x1000; // _PLASMA
-#else
- size_t sz = 256 + 32;
-#endif
+  size_t sz = 256 + 32;
return sz;
}
-#if 0
static cl_int
cl_set_local_ids(char *data,
const size_t *local_wk_sz,
cl_int err = CL_SUCCESS;
for (i = 0; i < 3; ++i)
- TRY_ALLOC(ids[i], (uint16_t*) cl_calloc(sizeof(uint16_t), thread_n*simd_sz));
+ TRY_ALLOC(ids[i], (uint16_t*) alloca(sizeof(uint16_t)*thread_n*simd_sz));
/* Compute the IDs */
for (k = 0; k < local_wk_sz[2]; ++k)
}
error:
- for (i = 0; i < 3; ++i)
- cl_free(ids[i]);
return err;
}
-#endif
/* Enqueue an ND range kernel on a Gen7 GPU: validate the work group size,
 * build the CURBE (constants + per-thread local IDs), bind the surfaces,
 * then emit and flush a GPGPU_WALKER batch.
 * NOTE(review): the body uses `ker` and `global_wk_off` which do not appear
 * in the (apparently truncated) parameter list shown here — confirm the full
 * signature against the complete file. */
LOCAL cl_int
cl_command_queue_ND_range_gen7(cl_command_queue queue,
const size_t *global_wk_sz,
const size_t *local_wk_sz)
{
+  cl_context ctx = queue->ctx;
+  intel_gpgpu_t *gpgpu = queue->gpgpu;
+  char *curbe = NULL; /* Does not include per-thread local IDs */
+  char *final_curbe = NULL; /* Includes them */
+  drm_intel_bo *private_bo = NULL, *scratch_bo = NULL;
+  genx_gpgpu_kernel_t kernel;
+  const uint32_t simd_sz = cl_kernel_get_simd_width(ker);
+  size_t i, batch_sz = 0u, local_sz = 0u, thread_n = 0u, id_offset = 0u, cst_sz = 0u;
+  cl_int err = CL_SUCCESS;
+
+  /* Setup kernel */
+  kernel.name = "OCL kernel";
+  kernel.grf_blocks = 128;
+  kernel.bo = ker->bo;
+  kernel.barrierID = 0;
+  kernel.use_barrier = 0;
+  kernel.slm_sz = 0;
+  kernel.cst_sz = 0;
+
+  /* Compute the number of HW threads we are going to need */
+  TRY (cl_kernel_work_group_sz, ker, local_wk_sz, 3, &local_sz);
+  kernel.thread_n = thread_n = local_sz / simd_sz;
+  id_offset = cst_sz = ALIGN(cst_sz, 32); /* Align the user data on 32 bytes */
+  kernel.cst_sz = cst_sz += 3 * 32; /* Add local IDs (16 words) */
+
+  /* Setup the kernel */
+  gpgpu_state_init(gpgpu, ctx->device->max_compute_unit, cst_sz / 32);
+  /* Drop the previous sync bo; bind_surface installs a fresh one below */
+  if (queue->last_batch != NULL)
+    drm_intel_bo_unreference(queue->last_batch);
+  queue->last_batch = NULL;
+  cl_command_queue_bind_surface(queue, ker, curbe, NULL, &private_bo, &scratch_bo, 0);
+  gpgpu_states_setup(gpgpu, &kernel, 1);
+
+  /* CURBE step 2. Give the localID and upload it to video memory */
+  TRY_ALLOC (final_curbe, (char*) alloca(thread_n * cst_sz));
+  if (curbe)
+    for (i = 0; i < thread_n; ++i)
+      memcpy(final_curbe + cst_sz * i, curbe, cst_sz - 3*32);
+  TRY (cl_set_local_ids, final_curbe, local_wk_sz, simd_sz, cst_sz, id_offset, thread_n);
+  gpgpu_upload_constants(gpgpu, final_curbe, thread_n*cst_sz);
+
+  /* Start a new batch buffer */
+  batch_sz = cl_kernel_compute_batch_sz(ker);
+  gpgpu_batch_reset(gpgpu, batch_sz);
+  gpgpu_batch_start(gpgpu);
+
+  /* Issue the GPGPU_WALKER command */
+  gpgpu_walker(gpgpu, simd_sz, thread_n, global_wk_off, global_wk_sz, local_wk_sz);
+
+  /* Close the batch buffer and submit it */
+  gpgpu_batch_end(gpgpu, 0);
+  gpgpu_flush(gpgpu);
+
#if 0
cl_context ctx = queue->ctx;
intel_gpgpu_t *gpgpu = queue->gpgpu;
cl_free(curbe);
return err;
#endif
-  return CL_SUCCESS;
+
+error:
+  return err;
}
#include "cl_mem.h"
#include "cl_alloc.h"
#include "cl_utils.h"
-
#include "CL/cl.h"
-
-#ifdef _PLASMA
-#include "plasma/plasma_export.h"
-#else
#include "intel_bufmgr.h"
#include "intel/intel_gpgpu.h"
-#endif
+#include "gen/program.h"
#include <stdio.h>
#include <string.h>
}
/* Allocate an empty kernel owned by program p (patch changes the signature
 * from cl_kernel_new(void) so the kernel can record its owning program).
 * NOTE(review): the trailing `goto exit;` after `return k;` looks like the
 * remnant of a truncated `error:` label — verify against the complete file. */
LOCAL cl_kernel
-cl_kernel_new(void)
+cl_kernel_new(const cl_program p)
{
cl_kernel k = NULL;
TRY_ALLOC_NO_ERR (k, CALLOC(struct _cl_kernel));
k->ref_n = 1;
k->magic = CL_MAGIC_KERNEL_HEADER;
+  k->program = p;
exit:
return k;
goto exit;
}
+/* Return the kernel name as reported by the compiler back-end, or NULL when
+ * no kernel is given. */
+LOCAL const char*
+cl_kernel_get_name(const cl_kernel k)
+{
+  return UNLIKELY(k == NULL) ? NULL : GenKernelGetName(k->gen_kernel);
+}
+
LOCAL void
cl_kernel_add_ref(cl_kernel k)
{
return err;
}
+/* SIMD width the kernel code was compiled for, queried from the (opaque)
+ * Gen kernel structure. The kernel must be non-NULL. */
+LOCAL uint32_t
+cl_kernel_get_simd_width(const cl_kernel k)
+{
+  assert(k != NULL);
+  const uint32_t width = GenKernelGetSIMDWidth(k->gen_kernel);
+  return width;
+}
+
+/* Attach the compiler-produced Gen kernel to the CL kernel: allocate a bo in
+ * the context's buffer manager, upload the ISA into it and remember the
+ * compiler structure for later queries (name, SIMD width, ...). */
+LOCAL void
+cl_kernel_setup(cl_kernel k, const struct GenKernel *gen_kernel)
+{
+  drm_intel_bufmgr *bufmgr = cl_context_get_intel_bufmgr(k->program->ctx);
+  const uint32_t code_sz = GenKernelGetCodeSize(gen_kernel);
+  const char *code = GenKernelGetCode(gen_kernel);
+
+  /* Buffer object that will hold the kernel ISA (64 byte aligned) */
+  k->bo = drm_intel_bo_alloc(bufmgr, "CL kernel", code_sz, 64u);
+
+  /* Upload the code and keep the compiler handle around */
+  drm_intel_bo_subdata(k->bo, 0, code_sz, code);
+  k->gen_kernel = gen_kernel;
+}
+
+/* Build a user-visible copy of an internal kernel. The copy shares the code
+ * and constant buffer objects (both are re-referenced) and retains the
+ * owning program, since it borrows pointers from it. Returns NULL when
+ * `from` is NULL or allocation fails. */
+LOCAL cl_kernel
+cl_kernel_dup(const cl_kernel from)
+{
+  cl_kernel to = NULL;
+
+  if (UNLIKELY(from == NULL))
+    return NULL;
+  TRY_ALLOC_NO_ERR (to, CALLOC(struct _cl_kernel));
+  to->magic = CL_MAGIC_KERNEL_HEADER;
+  to->ref_n = 1;
+  to->program = from->program;
+  to->gen_kernel = from->gen_kernel;
+  to->bo = from->bo;
+  to->const_bo = from->const_bo;
+
+  /* The copy shares the code / constant buffers: retain them */
+  if (from->bo) drm_intel_bo_reference(from->bo);
+  if (from->const_bo) drm_intel_bo_reference(from->const_bo);
+
+  /* We retain the program destruction since this kernel (user allocated)
+   * depends on the program for some of its pointers
+   */
+  assert(from->program);
+  cl_program_add_ref(from->program);
+  to->ref_its_program = CL_TRUE;
+
+exit:
+  return to;
+error:
+  cl_kernel_delete(to);
+  to = NULL;
+  goto exit;
+}
+
+/* Validate local_wk_sz against any work group size required at compile time
+ * and compute the total number of work items per work group.
+ * Returns CL_INVALID_WORK_ITEM_SIZE when a dimension mismatches the required
+ * size or the total exceeds the device maximum; aborts if the total is not a
+ * multiple of 16. Note that *wk_grp_sz is written even on the error paths
+ * (it is 0 when the per-dimension check fails). */
+LOCAL cl_int
+cl_kernel_work_group_sz(cl_kernel ker,
+                        const size_t *local_wk_sz,
+                        uint32_t wk_dim,
+                        size_t *wk_grp_sz)
+{
+  cl_int err = CL_SUCCESS;
+  size_t sz = 0;
+  cl_uint dim;
+
+  /* A required size of 0 means "no compile-time requirement" */
+  for (dim = 0; dim < wk_dim; ++dim) {
+    const uint32_t required_sz = GenKernelGetRequiredWorkGroupSize(ker->gen_kernel, dim);
+    if (required_sz != 0 && required_sz != local_wk_sz[dim]) {
+      err = CL_INVALID_WORK_ITEM_SIZE;
+      goto error;
+    }
+  }
+
+  /* Total size is the product of the per-dimension sizes */
+  sz = local_wk_sz[0];
+  for (dim = 1; dim < wk_dim; ++dim)
+    sz *= local_wk_sz[dim];
+  FATAL_IF (sz % 16, "Work group size must be a multiple of 16");
+  if (sz > ker->program->ctx->device->max_work_group_size) {
+    err = CL_INVALID_WORK_ITEM_SIZE;
+    goto error;
+  }
+
+error:
+  if (wk_grp_sz)
+    *wk_grp_sz = sz;
+  return err;
+}
+
+
#include "cl_defs.h"
#include "cl_internals.h"
#include "CL/cl.h"
-#include "gen/program.h"
#include <stdint.h>
#include <stdlib.h>
+/* This is the kernel as it is interfaced by the compiler */
+struct GenKernel;
+
+/*! One OCL function */
struct _cl_kernel {
- uint64_t magic; /* To identify it as a kernel */
- volatile int ref_n; /* We reference count this object */
- struct _drm_intel_bo *bo; /* The code itself */
- struct _drm_intel_bo *const_bo;/* Buffer for all __constants values in the OCL program */
- cl_program program; /* Owns this structure (and pointers) */
- uint8_t ref_its_program; /* True only for the user kernel (those created by clCreateKernel) */
+  uint64_t magic;                     /* To identify it as a kernel */
+  volatile int ref_n;                 /* We reference count this object */
+  struct _drm_intel_bo *bo;           /* The code itself */
+  struct _drm_intel_bo *const_bo;     /* Buffer for all __constants values in the OCL program */
+  cl_program program;                 /* Owns this structure (and pointers) */
+  const struct GenKernel *gen_kernel; /* (Opaque) compiler structure for the OCL kernel */
+  uint8_t ref_its_program;            /* True only for the user kernel (those created by clCreateKernel) */
+  /* NOTE(review): bo and const_bo may be shared between duplicated kernels
+   * (see cl_kernel_dup, which re-references both) — confirm cl_kernel_delete
+   * drops exactly one reference per copy. */
};
/* Allocate an empty kernel */
-extern cl_kernel cl_kernel_new(void);
+extern cl_kernel cl_kernel_new(const cl_program);
/* Destroy and deallocate an empty kernel */
extern void cl_kernel_delete(cl_kernel);
+/* Setup the kernel with the given Gen Kernel */
+extern void cl_kernel_setup(cl_kernel k, const struct GenKernel *gen_kernel);
+
+/* Get the kernel name */
+extern const char *cl_kernel_get_name(const cl_kernel k);
+
+/* Get the simd width as used in the code */
+extern uint32_t cl_kernel_get_simd_width(const cl_kernel k);
+
/* When a kernel is created from outside, we just duplicate the structure we
* have internally and give it back to the user
*/
#include "cl_alloc.h"
#include "cl_utils.h"
#include "CL/cl.h"
+#include "CL/cl_intel.h"
#include "gen/program.h"
#include <stdio.h>
}
LOCAL cl_program
-cl_program_new(cl_context ctx, const char *data, size_t sz)
+cl_program_new(cl_context ctx)
{
cl_program p = NULL;
/* Allocate the structure */
TRY_ALLOC_NO_ERR (p, CALLOC(struct _cl_program));
- TRY_ALLOC_NO_ERR (p->bin, CALLOC_ARRAY(char, sz));
- memcpy(p->bin, data, sz);
- p->bin_sz = sz;
p->ref_n = 1;
p->magic = CL_MAGIC_PROGRAM_HEADER;
p->ctx = ctx;
atomic_inc(&p->ref_n);
}
+/* Instantiate one cl_kernel per kernel exposed by the compiled Gen program:
+ * allocate the kernel array, then create and set up each entry. Returns an
+ * error code when an allocation fails (partially built entries are left in
+ * p->ker — presumably reclaimed by cl_program_delete; verify). */
+static cl_int
+cl_program_load_gen_program(cl_program p)
+{
+  cl_int err = CL_SUCCESS;
+  uint32_t idx;
+
+  assert(p->gen_program != NULL);
+  p->ker_n = GenProgramGetKernelNum(p->gen_program);
+
+  /* One slot per kernel found by the compiler */
+  TRY_ALLOC (p->ker, CALLOC_ARRAY(cl_kernel, p->ker_n));
+
+  for (idx = 0; idx < p->ker_n; ++idx) {
+    const GenKernel *gen_kernel = GenProgramGetKernel(p->gen_program, idx);
+    assert(gen_kernel != NULL);
+    TRY_ALLOC (p->ker[idx], cl_kernel_new(p));
+    cl_kernel_setup(p->ker[idx], gen_kernel);
+  }
+
+error:
+  return err;
+}
+
LOCAL cl_program
cl_program_create_from_binary(cl_context ctx,
cl_uint num_devices,
INVALID_DEVICE_IF (devices[0] != ctx->device);
INVALID_VALUE_IF (file_name == NULL);
+ program = cl_program_new(ctx);
+
program->gen_program = GenProgramNewFromLLVM(file_name, 0, NULL, NULL);
if (program->gen_program == NULL) {
err = CL_INVALID_PROGRAM;
goto error;
}
+ /* Create all the kernels */
+ TRY (cl_program_load_gen_program, program);
+
exit:
if (errcode_ret)
*errcode_ret = err;
/* Look up a kernel by name in the program and return a duplicate the user
 * owns (see cl_kernel_dup). On failure *errcode_ret (when non-NULL) receives
 * CL_INVALID_KERNEL_NAME (bad/unknown name) or the allocation error, and
 * NULL is returned. */
LOCAL cl_kernel
cl_program_create_kernel(cl_program p, const char *name, cl_int *errcode_ret)
{
- return NULL;
+  cl_kernel from = NULL, to = NULL;
+  cl_int err = CL_SUCCESS;
+  uint32_t i = 0;
+
+  if (UNLIKELY(name == NULL)) {
+    err = CL_INVALID_KERNEL_NAME;
+    goto error;
+  }
+
+  /* Find the kernel with that name first */
+  for (i = 0; i < p->ker_n; ++i) {
+    assert(p->ker[i]);
+    const char *ker_name = cl_kernel_get_name(p->ker[i]);
+    /* Guard against a NULL name from the compiler back-end:
+     * strcmp(NULL, ...) is undefined behavior */
+    if (ker_name != NULL && strcmp(ker_name, name) == 0) {
+      from = p->ker[i];
+      break;
+    }
+  }
+
+  /* We were not able to find this named kernel */
+  if (UNLIKELY(from == NULL)) {
+    err = CL_INVALID_KERNEL_NAME;
+    goto error;
+  }
+
+  /* Hand back a duplicate so the user holds an independent reference */
+  TRY_ALLOC(to, cl_kernel_dup(from));
+
+exit:
+  if (errcode_ret)
+    *errcode_ret = err;
+  return to;
+error:
+  cl_kernel_delete(to);
+  to = NULL;
+  goto exit;
}
uint32_t is_built:1; /* Did we call clBuildProgram on it? */
};
-/* Create a program from */
-extern cl_program cl_program_new(cl_context, const char*, size_t);
+/* Create a empty program */
+extern cl_program cl_program_new(cl_context);
/* Destroy and deallocate an empty kernel */
extern void cl_program_delete(cl_program);
const char *name; /* kernel name and bo name */
uint32_t grf_blocks; /* register blocks kernel wants (in 8 reg blocks) */
uint32_t cst_sz; /* total size of all constants */
- const char *bin; /* binary code of the kernel */
- int32_t size; /* kernel code size */
struct _drm_intel_bo *bo;/* kernel code in the proper addr space */
int32_t barrierID; /* barrierID for _this_ kernel */
uint32_t use_barrier:1; /* For gen7 (automatic barrier management) */