From d6b119cc43c7db6597f45056a835a106de94b9ad Mon Sep 17 00:00:00 2001 From: bsegovia Date: Sat, 30 Jul 2011 01:35:53 +0000 Subject: [PATCH] Cleaned up code to start GPGPU_WALKER integration --- kernels/test_barrier/test_barrier_kernels_0.bin | Bin 1267 -> 1203 bytes kernels/test_barrier/test_barrier_kernels_0.ghal3d | 18 +- src/CMakeLists.txt | 2 + src/cl_api.c | 12 +- src/cl_command_queue.c | 372 ++------------------- src/cl_command_queue.h | 20 +- src/cl_kernel.c | 81 +++++ src/cl_kernel.h | 21 +- src/intel/intel_gpgpu.c | 6 + src/intel/intel_gpgpu.h | 3 + 10 files changed, 165 insertions(+), 370 deletions(-) diff --git a/kernels/test_barrier/test_barrier_kernels_0.bin b/kernels/test_barrier/test_barrier_kernels_0.bin index fbb28b17114b70a59eaa5bb97983c100c8803844..23ab7d78c2d68833575cf042a891cc99ccdc43f6 100644 GIT binary patch delta 97 zcmey&xtUYbImFNNB{Ks92Ll5GBM@KHe)&uQNZSB88<>C$AaDR;kByp385ub@pJ()C nWOSM=%WOK?h*^nIVe(dHV_uK~kO3k<>@fK&vpP^*mgO1%)Yubo delta 109 zcmdnY`I%GGImFNNB{Ks92Ll5GBM`4D@#+x((l$WO1|}c_2poX8W25F$Mn>t)=NWw& w8Ji}{GMi3fmYKxNGC7D@3BqD@m|V(i%-aA|3^Ggvh#MwvWmX4@-(|iA0K6g=+5i9m diff --git a/kernels/test_barrier/test_barrier_kernels_0.ghal3d b/kernels/test_barrier/test_barrier_kernels_0.ghal3d index 3e68040..92489ce 100644 --- a/kernels/test_barrier/test_barrier_kernels_0.ghal3d +++ b/kernels/test_barrier/test_barrier_kernels_0.ghal3d @@ -32,7 +32,7 @@ DCL_INPUT i0.xyz, THREAD_ID; - DCL_TEMPS r0..r23; + DCL_TEMPS r0..r29; DCL_POINTERS ptr0..ptr3; ADDRESS_OF ptr0, u0; MOV r0, c0.x; @@ -54,11 +54,17 @@ IADD r19.x, r18.x, r5.x; MOV r20.x, i0.x; SHL r21.x, r20.x, 2; - PADD ptr2, ptr1, r21.x; - LD_RAW_PTR r22.x, ptr2; - IADD r23.x, r19.x, r22.x; - PADD ptr3, ptr0, r21.x; - STORE_RAW_PTR ptr3.x, r23.x; + SHR r22.x, r20.x, 30; + MOV r23.x, 0; + MOV_SWZ r24, r21.x, r22.x, r23.x, r23.x; + MOV r25.xy, r24; + MOV r26.xy, r25.xyxy; + PADD ptr2, ptr1, r26.x; + LD_RAW_PTR r27.x, ptr2; + IADD r28.x, r19.x, r27.x; + MOV r29.xy, r25.xyxy; + PADD ptr3, ptr0, r29.x; + STORE_RAW_PTR ptr3.x, r28.x; SYNC.t.g; RET; diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 71f2ea8..1380ad7 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -14,6 +14,8 @@ SET(OPENCL_SRC cl_device_id.c cl_context.c cl_command_queue.c + cl_command_queue_gen6.c + cl_command_queue_gen7.c intel/intel_gpgpu.c intel/intel_batchbuffer.c intel/intel_driver.c diff --git a/src/cl_api.c b/src/cl_api.c index 3e17af9..d9875cd 100644 --- a/src/cl_api.c +++ b/src/cl_api.c @@ -962,12 +962,12 @@ clEnqueueNDRangeKernel(cl_command_queue command_queue, FATAL_IF(event != NULL, "Events are not supported"); /* Do device specific checks are enqueue the kernel */ - err = cl_command_queue_ND_kernel(command_queue, - kernel, - work_dim, - global_work_offset, - global_work_size, - local_work_size); + err = cl_command_queue_ND_range(command_queue, + kernel, + work_dim, + global_work_offset, + global_work_size, + local_work_size); error: return err; diff --git a/src/cl_command_queue.c b/src/cl_command_queue.c index d1b1439..59dcf77 100644 --- a/src/cl_command_queue.c +++ b/src/cl_command_queue.c @@ -95,37 +95,24 @@ cl_command_queue_add_ref(cl_command_queue queue) atomic_inc(&queue->ref_n); } -/* Header used by kernels */ -typedef struct cl_inline_header { - uint32_t grp_n[3]; - uint32_t local_sz[3]; - uint32_t exec_mask; - uint32_t local_mem_sz; -} cl_inline_header_t; - -/* ID inside the work group */ -typedef struct cl_local_id { - uint16_t data[16]; -} cl_local_id_t; - #define SURFACE_SZ 32 -static cl_int +extern cl_int 
cl_command_queue_bind_surface(cl_command_queue queue, cl_kernel k, drm_intel_bo **local, - uint32_t local_sz, drm_intel_bo **priv, - drm_intel_bo **scratch) + drm_intel_bo **scratch, + uint32_t local_sz) { cl_context ctx = queue->ctx; intel_gpgpu_t *gpgpu = queue->gpgpu; drm_intel_bufmgr *bufmgr = cl_context_get_intel_bufmgr(ctx); cl_mem mem = NULL; drm_intel_bo *bo = NULL, *sync_bo = NULL; + const size_t max_thread = ctx->device->max_compute_unit; cl_int err = CL_SUCCESS; uint32_t i, index; - const size_t max_thread = ctx->device->max_compute_unit; /* Bind user defined surface */ for (i = 0; i < k->arg_info_n; ++i) { @@ -205,7 +192,7 @@ error: return err; } -static INLINE cl_int +LOCAL cl_int cl_kernel_check_args(cl_kernel k) { uint32_t i; @@ -215,129 +202,8 @@ cl_kernel_check_args(cl_kernel k) return CL_SUCCESS; } -static INLINE void -cl_command_queue_enqueue_wrk_grp3(cl_command_queue queue, - cl_local_id_t **ids, - const cl_inline_header_t *header, - uint32_t thread_n, - uint32_t barrierID) -{ - intel_gpgpu_t *gpgpu = queue->gpgpu; - uint32_t i; - for (i = 0; i < thread_n; ++i) { - const size_t sz = sizeof(cl_inline_header_t) + 3*sizeof(cl_local_id_t); - char *data = gpgpu_run_with_inline(gpgpu, barrierID, sz); - size_t offset = 0; - assert(data); - *((cl_inline_header_t *) (data + offset)) = *header; - offset += sizeof(cl_inline_header_t); - *((cl_local_id_t *) (data + offset)) = ids[0][i]; - offset += sizeof(cl_local_id_t); - *((cl_local_id_t *) (data + offset)) = ids[1][i]; - offset += sizeof(cl_local_id_t); - *((cl_local_id_t *) (data + offset)) = ids[2][i]; - } -} - -static INLINE void -cl_command_queue_enqueue_wrk_grp2(cl_command_queue queue, - cl_local_id_t **ids, - const cl_inline_header_t *header, - uint32_t thread_n, - uint32_t barrierID) -{ - intel_gpgpu_t *gpgpu = queue->gpgpu; - uint32_t i; - for (i = 0; i < thread_n; ++i) { - const size_t sz = sizeof(cl_inline_header_t) + 2*sizeof(cl_local_id_t); - char *data = gpgpu_run_with_inline(gpgpu, barrierID, sz); - size_t offset = 0; - assert(data); - *((cl_inline_header_t *) (data + offset)) = *header; - offset += sizeof(cl_inline_header_t); - *((cl_local_id_t *) (data + offset)) = ids[0][i]; - offset += sizeof(cl_local_id_t); - *((cl_local_id_t *) (data + offset)) = ids[1][i]; - } -} - -static INLINE void -cl_command_queue_enqueue_wrk_grp1(cl_command_queue queue, - cl_local_id_t **ids, - const cl_inline_header_t *header, - uint32_t thread_n, - uint32_t barrierID) -{ - intel_gpgpu_t *gpgpu = queue->gpgpu; - uint32_t i; - for (i = 0; i < thread_n; ++i) { - const size_t sz = sizeof(cl_inline_header_t) + sizeof(cl_local_id_t); - char *data = gpgpu_run_with_inline(gpgpu, barrierID, sz); - size_t offset = 0; - assert(data); - *((cl_inline_header_t *) (data + offset)) = *header; - offset += sizeof(cl_inline_header_t); - *((cl_local_id_t *) (data + offset)) = ids[0][i]; - } -} - -static INLINE int32_t -cl_kernel_get_first_local(cl_kernel k) -{ - int32_t i; - for (i = 0; i < (int32_t) k->curbe_info_n; ++i) - if (k->curbe_info[i].type == DATA_PARAMETER_SUM_OF_LOCAL_MEMORY_ARGUMENT_SIZES) - return i; - return k->curbe_info_n; -} - -static void -cl_kernel_handle_local_memory(cl_kernel k, cl_inline_header_t *header) -{ - int32_t i; - - if (k->has_local_buffer) { - header->local_mem_sz = 0; - - /* Look for all local surfaces offset to set */ - i = cl_kernel_get_first_local(k); - - /* Now, set the offsets for all local surfaces */ - for (; i < (int32_t) k->curbe_info_n; ++i) { - cl_curbe_patch_info_t *info = k->curbe_info + i; - const size_t 
offset = header->local_mem_sz; - if (info->type != DATA_PARAMETER_SUM_OF_LOCAL_MEMORY_ARGUMENT_SIZES) - break; - assert(info->last == 0); - assert(sizeof(int32_t) + info->offsets[0] <= k->patch.curbe.sz); - memcpy(k->cst_buffer + info->offsets[0], &offset, sizeof(int32_t)); - header->local_mem_sz += info->sz; - } - header->local_mem_sz += k->patch.local_surf.sz; - } - else - header->local_mem_sz = 0; -} - -static INLINE size_t -cl_ker_compute_batch_sz(cl_kernel k, - size_t wrk_dim_n, - size_t wrk_grp_n, - size_t thread_n) -{ - size_t sz = 256; /* upper bound of the complete prelude */ - size_t media_obj_sz = 6 * 4; /* size of one MEDIA OBJECT */ - media_obj_sz += sizeof(cl_inline_header_t); /* header for all threads */ - media_obj_sz += wrk_dim_n * sizeof(cl_local_id_t);/* for each dimension */ - if (k->patch.exec_env.has_barriers) - media_obj_sz += 4 * 4; /* one barrier update per object */ - sz += media_obj_sz * wrk_grp_n * thread_n; - return sz; -} - LOCAL cl_int -cl_command_queue_set_report_buffer(cl_command_queue queue, - cl_mem mem) +cl_command_queue_set_report_buffer(cl_command_queue queue, cl_mem mem) { cl_int err = CL_SUCCESS; if (queue->perf != NULL) { @@ -357,50 +223,6 @@ error: return err; } -static char* -cl_kernel_create_cst_buffer(cl_kernel k, - cl_uint work_dim, - const size_t *global_wk_sz, - const size_t *local_wk_sz) -{ - cl_curbe_patch_info_t *info = NULL; - const size_t sz = k->patch.curbe.sz; - uint64_t key = 0; - char *data = NULL; - - TRY_ALLOC_NO_ERR (data, (char *) cl_calloc(sz, 1)); - memcpy(data, k->cst_buffer, sz); - - /* Global work group size */ - key = cl_curbe_key(DATA_PARAMETER_GLOBAL_WORK_SIZE, 0, 0); - if ((info = cl_kernel_get_curbe_info(k, key)) != NULL) - memcpy(data+info->offsets[0], global_wk_sz, sizeof(uint32_t)); - key = cl_curbe_key(DATA_PARAMETER_GLOBAL_WORK_SIZE, 0, 4); - if ((info = cl_kernel_get_curbe_info(k, key)) != NULL) - memcpy(data+info->offsets[0], global_wk_sz+1, sizeof(uint32_t)); - key = cl_curbe_key(DATA_PARAMETER_GLOBAL_WORK_SIZE, 0, 8); - if ((info = cl_kernel_get_curbe_info(k, key)) != NULL) - memcpy(data+info->offsets[0], global_wk_sz+2, sizeof(uint32_t)); - - /* Local work group size */ - key = cl_curbe_key(DATA_PARAMETER_LOCAL_WORK_SIZE, 0, 0); - if ((info = cl_kernel_get_curbe_info(k, key)) != NULL) - memcpy(data+info->offsets[0], local_wk_sz, sizeof(uint32_t)); - key = cl_curbe_key(DATA_PARAMETER_LOCAL_WORK_SIZE, 0, 4); - if ((info = cl_kernel_get_curbe_info(k, key)) != NULL) - memcpy(data+info->offsets[0], local_wk_sz+1, sizeof(uint32_t)); - key = cl_curbe_key(DATA_PARAMETER_LOCAL_WORK_SIZE, 0, 8); - if ((info = cl_kernel_get_curbe_info(k, key)) != NULL) - memcpy(data+info->offsets[0], local_wk_sz+2, sizeof(uint32_t)); - -exit: - return data; -error: - cl_free(data); - data = NULL; - goto exit; -} - #if USE_FULSIM LOCAL void cl_run_fulsim(void) @@ -423,181 +245,33 @@ cl_run_fulsim(void) } #endif /* USE_FULSIM */ +extern cl_int cl_command_queue_ND_range_gen6(cl_command_queue, cl_kernel, cl_uint, const size_t*, const size_t*, const size_t*); +extern cl_int cl_command_queue_ND_range_gen7(cl_command_queue, cl_kernel, cl_uint, const size_t *, const size_t *, const size_t *); + LOCAL cl_int -cl_command_queue_ND_kernel(cl_command_queue queue, - cl_kernel ker, - cl_uint work_dim, - const size_t *global_work_offset, - const size_t *global_wk_sz, - const size_t *local_wk_sz) +cl_command_queue_ND_range(cl_command_queue queue, + cl_kernel ker, + cl_uint wk_dim, + const size_t *global_wk_off, + const size_t *global_wk_sz, + const 
size_t *local_wk_sz) { - cl_context ctx = queue->ctx; intel_gpgpu_t *gpgpu = queue->gpgpu; - drm_intel_bo *slm_bo = NULL, *private_bo = NULL, *scratch_bo = NULL; - size_t cst_sz = ker->patch.curbe.sz; - size_t wrk_grp_sz, wrk_grp_n, batch_sz; - uint32_t grp_end[3], offset[3], thread_n; /* per work group */ - uint32_t i, j, k, curr; - uint32_t barrierID = 0; - genx_gpgpu_kernel_t *kernels = NULL; - - cl_inline_header_t header; - cl_local_id_t *ids[3] = {NULL,NULL,NULL}; + const int32_t ver = intel_gpgpu_version(gpgpu); cl_int err = CL_SUCCESS; - /* Allocate 16 kernels (one for each barrier) */ - TRY_ALLOC (kernels, CALLOC_ARRAY(genx_gpgpu_kernel_t, 16)); - for (i = 0; i < 16; ++i) { - kernels[i].name = "OCL kernel"; - kernels[i].grf_blocks = 128; - kernels[i].cst_sz = cst_sz; - kernels[i].bin = NULL, - kernels[i].size = 0, - kernels[i].bo = ker->bo; - kernels[i].barrierID = i; - } - - /* All arguments must have been set */ - TRY (cl_kernel_check_args, ker); - - /* Total number of elements in the work group */ - for (i = 0; i < work_dim; ++i) - if ((&ker->patch.exec_env.required_wgr_sz_x)[i] && - (&ker->patch.exec_env.required_wgr_sz_x)[i] != local_wk_sz[i]) { - err = CL_INVALID_WORK_ITEM_SIZE; - goto error; - } - wrk_grp_sz = local_wk_sz[0]; - for (i = 1; i < work_dim; ++i) - wrk_grp_sz *= local_wk_sz[i]; - FATAL_IF (wrk_grp_sz % 16, "Work group size must be a multiple of 16"); - if (wrk_grp_sz > ctx->device->max_work_group_size) { - err = CL_INVALID_WORK_ITEM_SIZE; - goto error; - } - - /* Directly from the user defined values */ - header.local_sz[0] = local_wk_sz[0]; - header.local_sz[1] = local_wk_sz[1]; - header.local_sz[2] = local_wk_sz[2]; - offset[0] = header.grp_n[0] = 0; - offset[1] = header.grp_n[1] = 0; - offset[2] = header.grp_n[2] = 0; - header.exec_mask = ~0; - - /* offsets are evenly divided by the local sizes */ - if (global_work_offset) - for (i = 0; i < work_dim; ++i) - offset[i] = global_work_offset[i]/local_wk_sz[i]; - - /* Compute the local size per wg and the offsets for each local buffer */ - cl_kernel_handle_local_memory(ker, &header); - - if (queue->perf) - gpgpu_set_perf_counters(gpgpu, queue->perf->bo); - - /* Setup the kernel */ - gpgpu_state_init(gpgpu, ctx->device->max_compute_unit, 4, 64, cst_sz / 32, 64); - if (queue->last_batch != NULL) - drm_intel_bo_unreference(queue->last_batch); - queue->last_batch = NULL; - cl_command_queue_bind_surface(queue, - ker, - &slm_bo, - header.local_mem_sz, - &private_bo, - &scratch_bo); - gpgpu_states_setup(gpgpu, kernels, 16); - - /* Fill the constant buffer */ - if (cst_sz > 0) { - char *data = NULL; - assert(ker->cst_buffer); - data = cl_kernel_create_cst_buffer(ker,work_dim,global_wk_sz,local_wk_sz); - gpgpu_upload_constants(gpgpu, data, cst_sz); - cl_free(data); - } - - wrk_grp_n = 1; - for (i = 0; i < work_dim; ++i) { - TRY_ALLOC (ids[i], (cl_local_id_t*) cl_malloc(wrk_grp_sz*sizeof(uint16_t))); - grp_end[i] = offset[i] + global_wk_sz[i] / local_wk_sz[i]; - wrk_grp_n *= grp_end[i]-offset[i]; - } - thread_n = wrk_grp_sz / 16; - batch_sz = cl_ker_compute_batch_sz(ker, work_dim, wrk_grp_n, thread_n); - - /* Start a new batch buffer */ - gpgpu_batch_reset(gpgpu, batch_sz); - gpgpu_batch_start(gpgpu); -#if 1 - /* Push all media objects. We implement three paths to make it (a bit) faster. - * Local IDs are shared from work group to work group. 
We allocate once the - * buffers and reuse them - */ - if (work_dim == 3) { - curr = 0; - for (i = 0; i < local_wk_sz[0]; ++i) - for (j = 0; j < local_wk_sz[1]; ++j) - for (k = 0; k < local_wk_sz[2]; ++k, ++curr) { - ((uint16_t*) ids[0])[curr] = i; - ((uint16_t*) ids[1])[curr] = j; - ((uint16_t*) ids[2])[curr] = k; - } - for (header.grp_n[0] = offset[0]; header.grp_n[0] < grp_end[0]; ++header.grp_n[0]) - for (header.grp_n[1] = offset[1]; header.grp_n[1] < grp_end[1]; ++header.grp_n[1]) - for (header.grp_n[2] = offset[2]; header.grp_n[2] < grp_end[2]; ++header.grp_n[2]) { - if (ker->patch.exec_env.has_barriers) - gpgpu_update_barrier(gpgpu, barrierID, thread_n); - cl_command_queue_enqueue_wrk_grp3(queue, ids, &header, thread_n, barrierID); - barrierID = (barrierID + 1) % 16; - } - } - else if (work_dim == 2) { - curr = 0; - for (i = 0; i < local_wk_sz[0]; ++i) - for (j = 0; j < local_wk_sz[1]; ++j, ++curr) { - ((uint16_t*) ids[0])[curr] = i; - ((uint16_t*) ids[1])[curr] = j; - } - for (header.grp_n[0] = offset[0]; header.grp_n[0] < grp_end[0]; ++header.grp_n[0]) - for (header.grp_n[1] = offset[1]; header.grp_n[1] < grp_end[1]; ++header.grp_n[1]) { - if (ker->patch.exec_env.has_barriers) - gpgpu_update_barrier(gpgpu, barrierID, thread_n); - cl_command_queue_enqueue_wrk_grp2(queue, ids, &header, thread_n, barrierID); - barrierID = (barrierID + 1) % 16; - } - } - else { - for (i = 0; i < local_wk_sz[0]; ++i) - ((uint16_t*) ids[0])[i] = i; - for (header.grp_n[0] = offset[0]; header.grp_n[0] < grp_end[0]; ++header.grp_n[0]) { - if (ker->patch.exec_env.has_barriers) - gpgpu_update_barrier(gpgpu, barrierID, thread_n); - cl_command_queue_enqueue_wrk_grp1(queue, ids, &header, thread_n, barrierID); - barrierID = (barrierID + 1) % 16; - } - } -#endif - gpgpu_batch_end(gpgpu, 0); - gpgpu_flush(gpgpu); - - if (slm_bo) - drm_intel_bo_unreference(slm_bo); - if (private_bo) - drm_intel_bo_unreference(private_bo); - if (scratch_bo) - drm_intel_bo_unreference(scratch_bo); + if (ver == 6) + TRY (cl_command_queue_ND_range_gen6, queue, ker, wk_dim, global_wk_off, global_wk_sz, local_wk_sz); + else if (ver == 7) + TRY (cl_command_queue_ND_range_gen7, queue, ker, wk_dim, global_wk_off, global_wk_sz, local_wk_sz); + else + FATAL ("Unknown Gen Device"); #if USE_FULSIM cl_run_fulsim(); #endif /* USE_FULSIM */ error: - cl_free(kernels); - cl_free(ids[0]); - cl_free(ids[1]); - cl_free(ids[2]); return err; } @@ -612,6 +286,8 @@ cl_command_queue_finish(cl_command_queue queue) return CL_SUCCESS; } +extern int drm_intel_aub_set_bo_to_dump(drm_intel_bufmgr*, drm_intel_bo*); + LOCAL cl_int cl_command_queue_set_fulsim_buffer(cl_command_queue queue, cl_mem mem) { diff --git a/src/cl_command_queue.h b/src/cl_command_queue.h index 0e0a37f..3e60e69 100644 --- a/src/cl_command_queue.h +++ b/src/cl_command_queue.h @@ -50,12 +50,12 @@ extern void cl_command_queue_delete(cl_command_queue); extern void cl_command_queue_add_ref(cl_command_queue); /* Map ND range kernel from OCL API */ -extern cl_int cl_command_queue_ND_kernel(cl_command_queue queue, - cl_kernel ker, - cl_uint work_dim, - const size_t *global_work_offset, - const size_t *global_work_size, - const size_t *local_work_size); +extern cl_int cl_command_queue_ND_range(cl_command_queue queue, + cl_kernel ker, + cl_uint work_dim, + const size_t *global_work_offset, + const size_t *global_work_size, + const size_t *local_work_size); /* The memory object where to report the performance */ extern cl_int cl_command_queue_set_report_buffer(cl_command_queue, cl_mem); @@ -66,5 +66,13 
@@ cl_int cl_command_queue_set_fulsim_buffer(cl_command_queue, cl_mem); /* Wait for the completion of the command queue */ extern cl_int cl_command_queue_finish(cl_command_queue); +/* Bind all the surfaces in the GPGPU state */ +extern cl_int cl_command_queue_bind_surface(cl_command_queue queue, + cl_kernel k, + struct _drm_intel_bo **local, + struct _drm_intel_bo **priv, + struct _drm_intel_bo **scratch, + uint32_t local_sz); + #endif /* __CL_COMMAND_QUEUE_H__ */ diff --git a/src/cl_kernel.c b/src/cl_kernel.c index 20c0f42..0268a51 100644 --- a/src/cl_kernel.c +++ b/src/cl_kernel.c @@ -698,3 +698,84 @@ error: return err; } +static INLINE int32_t +cl_kernel_get_first_local(cl_kernel k) +{ + int32_t i; + for (i = 0; i < (int32_t) k->curbe_info_n; ++i) + if (k->curbe_info[i].type == DATA_PARAMETER_SUM_OF_LOCAL_MEMORY_ARGUMENT_SIZES) + return i; + return k->curbe_info_n; +} + +LOCAL uint32_t +cl_kernel_local_memory_sz(cl_kernel k) +{ + int32_t i; + uint32_t local_mem_sz = 0; + + if (k->has_local_buffer) { + + /* Look for all local surfaces offset to set */ + i = cl_kernel_get_first_local(k); + + /* Now, set the offsets for all local surfaces */ + for (; i < (int32_t) k->curbe_info_n; ++i) { + cl_curbe_patch_info_t *info = k->curbe_info + i; + const size_t offset = local_mem_sz; + if (info->type != DATA_PARAMETER_SUM_OF_LOCAL_MEMORY_ARGUMENT_SIZES) + break; + assert(info->last == 0); + assert(sizeof(int32_t) + info->offsets[0] <= k->patch.curbe.sz); + memcpy(k->cst_buffer + info->offsets[0], &offset, sizeof(int32_t)); + local_mem_sz += info->sz; + } + local_mem_sz += k->patch.local_surf.sz; + } + return local_mem_sz; +} + +LOCAL char* +cl_kernel_create_cst_buffer(cl_kernel k, + cl_uint wk_dim, + const size_t *global_wk_sz, + const size_t *local_wk_sz) +{ + cl_curbe_patch_info_t *info = NULL; + const size_t sz = k->patch.curbe.sz; + uint64_t key = 0; + char *data = NULL; + + TRY_ALLOC_NO_ERR (data, (char *) cl_calloc(sz, 1)); + memcpy(data, k->cst_buffer, sz); + + /* Global work group size */ + key = cl_curbe_key(DATA_PARAMETER_GLOBAL_WORK_SIZE, 0, 0); + if ((info = cl_kernel_get_curbe_info(k, key)) != NULL) + memcpy(data+info->offsets[0], global_wk_sz, sizeof(uint32_t)); + key = cl_curbe_key(DATA_PARAMETER_GLOBAL_WORK_SIZE, 0, 4); + if ((info = cl_kernel_get_curbe_info(k, key)) != NULL) + memcpy(data+info->offsets[0], global_wk_sz+1, sizeof(uint32_t)); + key = cl_curbe_key(DATA_PARAMETER_GLOBAL_WORK_SIZE, 0, 8); + if ((info = cl_kernel_get_curbe_info(k, key)) != NULL) + memcpy(data+info->offsets[0], global_wk_sz+2, sizeof(uint32_t)); + + /* Local work group size */ + key = cl_curbe_key(DATA_PARAMETER_LOCAL_WORK_SIZE, 0, 0); + if ((info = cl_kernel_get_curbe_info(k, key)) != NULL) + memcpy(data+info->offsets[0], local_wk_sz, sizeof(uint32_t)); + key = cl_curbe_key(DATA_PARAMETER_LOCAL_WORK_SIZE, 0, 4); + if ((info = cl_kernel_get_curbe_info(k, key)) != NULL) + memcpy(data+info->offsets[0], local_wk_sz+1, sizeof(uint32_t)); + key = cl_curbe_key(DATA_PARAMETER_LOCAL_WORK_SIZE, 0, 8); + if ((info = cl_kernel_get_curbe_info(k, key)) != NULL) + memcpy(data+info->offsets[0], local_wk_sz+2, sizeof(uint32_t)); + +exit: + return data; +error: + cl_free(data); + data = NULL; + goto exit; +} + diff --git a/src/cl_kernel.h b/src/cl_kernel.h index 28dae57..8500209 100644 --- a/src/cl_kernel.h +++ b/src/cl_kernel.h @@ -295,10 +295,16 @@ extern void cl_kernel_add_ref(cl_kernel); extern int cl_kernel_setup(cl_kernel, const char*); /* Set the argument before kernel execution */ -extern int 
cl_kernel_set_arg(cl_kernel kernel, - uint32_t arg_index, - size_t arg_size, - const void * arg_value); +extern int cl_kernel_set_arg(cl_kernel, + uint32_t arg_index, + size_t arg_size, + const void *arg_value); + +/* Check that all arguments are set before running the kernel */ +extern cl_int cl_kernel_check_args(cl_kernel); + +/* Get the size of shared local memory bound to the kernel */ +extern uint32_t cl_kernel_local_memory_sz(cl_kernel); /* Return a curbe entry if it exists. NULL otherwise */ extern cl_curbe_patch_info_t *cl_kernel_get_curbe_info(cl_kernel, uint64_t); @@ -312,5 +318,12 @@ cl_curbe_key(uint32_t type, uint32_t index, uint32_t src_offset) (uint64_t) src_offset; } +/* Allocate and fill the CURBE */ +extern char* +cl_kernel_create_cst_buffer(cl_kernel k, + cl_uint wk_dim, + const size_t *global_wk_sz, + const size_t *local_wk_sz); + #endif /* __CL_KERNEL_H__ */ diff --git a/src/intel/intel_gpgpu.c b/src/intel/intel_gpgpu.c index 81d5ced..83209b5 100644 --- a/src/intel/intel_gpgpu.c +++ b/src/intel/intel_gpgpu.c @@ -1004,3 +1004,9 @@ gpgpu_run_with_inline(intel_gpgpu_t *state, int32_t ki, size_t sz) return (char*) intel_batchbuffer_alloc_space(state->batch,sz); } +LOCAL int32_t +intel_gpgpu_version(intel_gpgpu_t *gpgpu) +{ + return gpgpu->drv->gen_ver; +} + diff --git a/src/intel/intel_gpgpu.h b/src/intel/intel_gpgpu.h index 8f2e867..789203b 100644 --- a/src/intel/intel_gpgpu.h +++ b/src/intel/intel_gpgpu.h @@ -62,6 +62,9 @@ extern intel_gpgpu_t* intel_gpgpu_new(struct intel_driver*); /* Destroy and deallocate a GPGPU state */ extern void intel_gpgpu_delete(intel_gpgpu_t*); +/* Get the device generation */ +extern int32_t intel_gpgpu_version(intel_gpgpu_t*); + /* Set surface descriptor in the current binding table */ extern void gpgpu_bind_surf_2d(intel_gpgpu_t*, int32_t index, -- 2.7.4