From d6b119cc43c7db6597f45056a835a106de94b9ad Mon Sep 17 00:00:00 2001 From: bsegovia Date: Sat, 30 Jul 2011 01:35:53 +0000 Subject: [PATCH] Cleaned up code to start GPGPU_WALKER integration --- kernels/test_barrier/test_barrier_kernels_0.bin | Bin 1267 -> 1203 bytes kernels/test_barrier/test_barrier_kernels_0.ghal3d | 18 +- src/CMakeLists.txt | 2 + src/cl_api.c | 12 +- src/cl_command_queue.c | 372 ++------------------- src/cl_command_queue.h | 20 +- src/cl_kernel.c | 81 +++++ src/cl_kernel.h | 21 +- src/intel/intel_gpgpu.c | 6 + src/intel/intel_gpgpu.h | 3 + 10 files changed, 165 insertions(+), 370 deletions(-) diff --git a/kernels/test_barrier/test_barrier_kernels_0.bin b/kernels/test_barrier/test_barrier_kernels_0.bin index fbb28b17114b70a59eaa5bb97983c100c8803844..23ab7d78c2d68833575cf042a891cc99ccdc43f6 100644 GIT binary patch delta 97 zcmey&xtUYbImFNNB{Ks92Ll5GBM@KHe)&uQNZSB88<>C$AaDR;kByp385ub@pJ()C nWOSM=%WOK?h*^nIVe(dHV_uK~kO3k<>@fK&vpP^*mgO1%)Yubo delta 109 zcmdnY`I%GGImFNNB{Ks92Ll5GBM`4D@#+x((l$WO1|}c_2poX8W25F$Mn>t)=NWw& w8Ji}{GMi3fmYKxNGC7D@3BqD@m|V(i%-aA|3^Ggvh#MwvWmX4@-(|iA0K6g=+5i9m diff --git a/kernels/test_barrier/test_barrier_kernels_0.ghal3d b/kernels/test_barrier/test_barrier_kernels_0.ghal3d index 3e68040..92489ce 100644 --- a/kernels/test_barrier/test_barrier_kernels_0.ghal3d +++ b/kernels/test_barrier/test_barrier_kernels_0.ghal3d @@ -32,7 +32,7 @@ DCL_INPUT i0.xyz, THREAD_ID; - DCL_TEMPS r0..r23; + DCL_TEMPS r0..r29; DCL_POINTERS ptr0..ptr3; ADDRESS_OF ptr0, u0; MOV r0, c0.x; @@ -54,11 +54,17 @@ IADD r19.x, r18.x, r5.x; MOV r20.x, i0.x; SHL r21.x, r20.x, 2; - PADD ptr2, ptr1, r21.x; - LD_RAW_PTR r22.x, ptr2; - IADD r23.x, r19.x, r22.x; - PADD ptr3, ptr0, r21.x; - STORE_RAW_PTR ptr3.x, r23.x; + SHR r22.x, r20.x, 30; + MOV r23.x, 0; + MOV_SWZ r24, r21.x, r22.x, r23.x, r23.x; + MOV r25.xy, r24; + MOV r26.xy, r25.xyxy; + PADD ptr2, ptr1, r26.x; + LD_RAW_PTR r27.x, ptr2; + IADD r28.x, r19.x, r27.x; + MOV r29.xy, r25.xyxy; + PADD ptr3, ptr0, r29.x; + STORE_RAW_PTR ptr3.x, r28.x; SYNC.t.g; RET; diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 71f2ea8..1380ad7 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -14,6 +14,8 @@ SET(OPENCL_SRC cl_device_id.c cl_context.c cl_command_queue.c + cl_command_queue_gen6.c + cl_command_queue_gen7.c intel/intel_gpgpu.c intel/intel_batchbuffer.c intel/intel_driver.c diff --git a/src/cl_api.c b/src/cl_api.c index 3e17af9..d9875cd 100644 --- a/src/cl_api.c +++ b/src/cl_api.c @@ -962,12 +962,12 @@ clEnqueueNDRangeKernel(cl_command_queue command_queue, FATAL_IF(event != NULL, "Events are not supported"); /* Do device specific checks are enqueue the kernel */ - err = cl_command_queue_ND_kernel(command_queue, - kernel, - work_dim, - global_work_offset, - global_work_size, - local_work_size); + err = cl_command_queue_ND_range(command_queue, + kernel, + work_dim, + global_work_offset, + global_work_size, + local_work_size); error: return err; diff --git a/src/cl_command_queue.c b/src/cl_command_queue.c index d1b1439..59dcf77 100644 --- a/src/cl_command_queue.c +++ b/src/cl_command_queue.c @@ -95,37 +95,24 @@ cl_command_queue_add_ref(cl_command_queue queue) atomic_inc(&queue->ref_n); } -/* Header used by kernels */ -typedef struct cl_inline_header { - uint32_t grp_n[3]; - uint32_t local_sz[3]; - uint32_t exec_mask; - uint32_t local_mem_sz; -} cl_inline_header_t; - -/* ID inside the work group */ -typedef struct cl_local_id { - uint16_t data[16]; -} cl_local_id_t; - #define SURFACE_SZ 32 -static cl_int +extern cl_int 
cl_command_queue_bind_surface(cl_command_queue queue, cl_kernel k, drm_intel_bo **local, - uint32_t local_sz, drm_intel_bo **priv, - drm_intel_bo **scratch) + drm_intel_bo **scratch, + uint32_t local_sz) { cl_context ctx = queue->ctx; intel_gpgpu_t *gpgpu = queue->gpgpu; drm_intel_bufmgr *bufmgr = cl_context_get_intel_bufmgr(ctx); cl_mem mem = NULL; drm_intel_bo *bo = NULL, *sync_bo = NULL; + const size_t max_thread = ctx->device->max_compute_unit; cl_int err = CL_SUCCESS; uint32_t i, index; - const size_t max_thread = ctx->device->max_compute_unit; /* Bind user defined surface */ for (i = 0; i < k->arg_info_n; ++i) { @@ -205,7 +192,7 @@ error: return err; } -static INLINE cl_int +LOCAL cl_int cl_kernel_check_args(cl_kernel k) { uint32_t i; @@ -215,129 +202,8 @@ cl_kernel_check_args(cl_kernel k) return CL_SUCCESS; } -static INLINE void -cl_command_queue_enqueue_wrk_grp3(cl_command_queue queue, - cl_local_id_t **ids, - const cl_inline_header_t *header, - uint32_t thread_n, - uint32_t barrierID) -{ - intel_gpgpu_t *gpgpu = queue->gpgpu; - uint32_t i; - for (i = 0; i < thread_n; ++i) { - const size_t sz = sizeof(cl_inline_header_t) + 3*sizeof(cl_local_id_t); - char *data = gpgpu_run_with_inline(gpgpu, barrierID, sz); - size_t offset = 0; - assert(data); - *((cl_inline_header_t *) (data + offset)) = *header; - offset += sizeof(cl_inline_header_t); - *((cl_local_id_t *) (data + offset)) = ids[0][i]; - offset += sizeof(cl_local_id_t); - *((cl_local_id_t *) (data + offset)) = ids[1][i]; - offset += sizeof(cl_local_id_t); - *((cl_local_id_t *) (data + offset)) = ids[2][i]; - } -} - -static INLINE void -cl_command_queue_enqueue_wrk_grp2(cl_command_queue queue, - cl_local_id_t **ids, - const cl_inline_header_t *header, - uint32_t thread_n, - uint32_t barrierID) -{ - intel_gpgpu_t *gpgpu = queue->gpgpu; - uint32_t i; - for (i = 0; i < thread_n; ++i) { - const size_t sz = sizeof(cl_inline_header_t) + 2*sizeof(cl_local_id_t); - char *data = gpgpu_run_with_inline(gpgpu, barrierID, sz); - size_t offset = 0; - assert(data); - *((cl_inline_header_t *) (data + offset)) = *header; - offset += sizeof(cl_inline_header_t); - *((cl_local_id_t *) (data + offset)) = ids[0][i]; - offset += sizeof(cl_local_id_t); - *((cl_local_id_t *) (data + offset)) = ids[1][i]; - } -} - -static INLINE void -cl_command_queue_enqueue_wrk_grp1(cl_command_queue queue, - cl_local_id_t **ids, - const cl_inline_header_t *header, - uint32_t thread_n, - uint32_t barrierID) -{ - intel_gpgpu_t *gpgpu = queue->gpgpu; - uint32_t i; - for (i = 0; i < thread_n; ++i) { - const size_t sz = sizeof(cl_inline_header_t) + sizeof(cl_local_id_t); - char *data = gpgpu_run_with_inline(gpgpu, barrierID, sz); - size_t offset = 0; - assert(data); - *((cl_inline_header_t *) (data + offset)) = *header; - offset += sizeof(cl_inline_header_t); - *((cl_local_id_t *) (data + offset)) = ids[0][i]; - } -} - -static INLINE int32_t -cl_kernel_get_first_local(cl_kernel k) -{ - int32_t i; - for (i = 0; i < (int32_t) k->curbe_info_n; ++i) - if (k->curbe_info[i].type == DATA_PARAMETER_SUM_OF_LOCAL_MEMORY_ARGUMENT_SIZES) - return i; - return k->curbe_info_n; -} - -static void -cl_kernel_handle_local_memory(cl_kernel k, cl_inline_header_t *header) -{ - int32_t i; - - if (k->has_local_buffer) { - header->local_mem_sz = 0; - - /* Look for all local surfaces offset to set */ - i = cl_kernel_get_first_local(k); - - /* Now, set the offsets for all local surfaces */ - for (; i < (int32_t) k->curbe_info_n; ++i) { - cl_curbe_patch_info_t *info = k->curbe_info + i; - const size_t 
offset = header->local_mem_sz; - if (info->type != DATA_PARAMETER_SUM_OF_LOCAL_MEMORY_ARGUMENT_SIZES) - break; - assert(info->last == 0); - assert(sizeof(int32_t) + info->offsets[0] <= k->patch.curbe.sz); - memcpy(k->cst_buffer + info->offsets[0], &offset, sizeof(int32_t)); - header->local_mem_sz += info->sz; - } - header->local_mem_sz += k->patch.local_surf.sz; - } - else - header->local_mem_sz = 0; -} - -static INLINE size_t -cl_ker_compute_batch_sz(cl_kernel k, - size_t wrk_dim_n, - size_t wrk_grp_n, - size_t thread_n) -{ - size_t sz = 256; /* upper bound of the complete prelude */ - size_t media_obj_sz = 6 * 4; /* size of one MEDIA OBJECT */ - media_obj_sz += sizeof(cl_inline_header_t); /* header for all threads */ - media_obj_sz += wrk_dim_n * sizeof(cl_local_id_t);/* for each dimension */ - if (k->patch.exec_env.has_barriers) - media_obj_sz += 4 * 4; /* one barrier update per object */ - sz += media_obj_sz * wrk_grp_n * thread_n; - return sz; -} - LOCAL cl_int -cl_command_queue_set_report_buffer(cl_command_queue queue, - cl_mem mem) +cl_command_queue_set_report_buffer(cl_command_queue queue, cl_mem mem) { cl_int err = CL_SUCCESS; if (queue->perf != NULL) { @@ -357,50 +223,6 @@ error: return err; } -static char* -cl_kernel_create_cst_buffer(cl_kernel k, - cl_uint work_dim, - const size_t *global_wk_sz, - const size_t *local_wk_sz) -{ - cl_curbe_patch_info_t *info = NULL; - const size_t sz = k->patch.curbe.sz; - uint64_t key = 0; - char *data = NULL; - - TRY_ALLOC_NO_ERR (data, (char *) cl_calloc(sz, 1)); - memcpy(data, k->cst_buffer, sz); - - /* Global work group size */ - key = cl_curbe_key(DATA_PARAMETER_GLOBAL_WORK_SIZE, 0, 0); - if ((info = cl_kernel_get_curbe_info(k, key)) != NULL) - memcpy(data+info->offsets[0], global_wk_sz, sizeof(uint32_t)); - key = cl_curbe_key(DATA_PARAMETER_GLOBAL_WORK_SIZE, 0, 4); - if ((info = cl_kernel_get_curbe_info(k, key)) != NULL) - memcpy(data+info->offsets[0], global_wk_sz+1, sizeof(uint32_t)); - key = cl_curbe_key(DATA_PARAMETER_GLOBAL_WORK_SIZE, 0, 8); - if ((info = cl_kernel_get_curbe_info(k, key)) != NULL) - memcpy(data+info->offsets[0], global_wk_sz+2, sizeof(uint32_t)); - - /* Local work group size */ - key = cl_curbe_key(DATA_PARAMETER_LOCAL_WORK_SIZE, 0, 0); - if ((info = cl_kernel_get_curbe_info(k, key)) != NULL) - memcpy(data+info->offsets[0], local_wk_sz, sizeof(uint32_t)); - key = cl_curbe_key(DATA_PARAMETER_LOCAL_WORK_SIZE, 0, 4); - if ((info = cl_kernel_get_curbe_info(k, key)) != NULL) - memcpy(data+info->offsets[0], local_wk_sz+1, sizeof(uint32_t)); - key = cl_curbe_key(DATA_PARAMETER_LOCAL_WORK_SIZE, 0, 8); - if ((info = cl_kernel_get_curbe_info(k, key)) != NULL) - memcpy(data+info->offsets[0], local_wk_sz+2, sizeof(uint32_t)); - -exit: - return data; -error: - cl_free(data); - data = NULL; - goto exit; -} - #if USE_FULSIM LOCAL void cl_run_fulsim(void) @@ -423,181 +245,33 @@ cl_run_fulsim(void) } #endif /* USE_FULSIM */ +extern cl_int cl_command_queue_ND_range_gen6(cl_command_queue, cl_kernel, cl_uint, const size_t*, const size_t*, const size_t*); +extern cl_int cl_command_queue_ND_range_gen7(cl_command_queue, cl_kernel, cl_uint, const size_t *, const size_t *, const size_t *); + LOCAL cl_int -cl_command_queue_ND_kernel(cl_command_queue queue, - cl_kernel ker, - cl_uint work_dim, - const size_t *global_work_offset, - const size_t *global_wk_sz, - const size_t *local_wk_sz) +cl_command_queue_ND_range(cl_command_queue queue, + cl_kernel ker, + cl_uint wk_dim, + const size_t *global_wk_off, + const size_t *global_wk_sz, + const 
size_t *local_wk_sz) { - cl_context ctx = queue->ctx; intel_gpgpu_t *gpgpu = queue->gpgpu; - drm_intel_bo *slm_bo = NULL, *private_bo = NULL, *scratch_bo = NULL; - size_t cst_sz = ker->patch.curbe.sz; - size_t wrk_grp_sz, wrk_grp_n, batch_sz; - uint32_t grp_end[3], offset[3], thread_n; /* per work group */ - uint32_t i, j, k, curr; - uint32_t barrierID = 0; - genx_gpgpu_kernel_t *kernels = NULL; - - cl_inline_header_t header; - cl_local_id_t *ids[3] = {NULL,NULL,NULL}; + const int32_t ver = intel_gpgpu_version(gpgpu); cl_int err = CL_SUCCESS; - /* Allocate 16 kernels (one for each barrier) */ - TRY_ALLOC (kernels, CALLOC_ARRAY(genx_gpgpu_kernel_t, 16)); - for (i = 0; i < 16; ++i) { - kernels[i].name = "OCL kernel"; - kernels[i].grf_blocks = 128; - kernels[i].cst_sz = cst_sz; - kernels[i].bin = NULL, - kernels[i].size = 0, - kernels[i].bo = ker->bo; - kernels[i].barrierID = i; - } - - /* All arguments must have been set */ - TRY (cl_kernel_check_args, ker); - - /* Total number of elements in the work group */ - for (i = 0; i < work_dim; ++i) - if ((&ker->patch.exec_env.required_wgr_sz_x)[i] && - (&ker->patch.exec_env.required_wgr_sz_x)[i] != local_wk_sz[i]) { - err = CL_INVALID_WORK_ITEM_SIZE; - goto error; - } - wrk_grp_sz = local_wk_sz[0]; - for (i = 1; i < work_dim; ++i) - wrk_grp_sz *= local_wk_sz[i]; - FATAL_IF (wrk_grp_sz % 16, "Work group size must be a multiple of 16"); - if (wrk_grp_sz > ctx->device->max_work_group_size) { - err = CL_INVALID_WORK_ITEM_SIZE; - goto error; - } - - /* Directly from the user defined values */ - header.local_sz[0] = local_wk_sz[0]; - header.local_sz[1] = local_wk_sz[1]; - header.local_sz[2] = local_wk_sz[2]; - offset[0] = header.grp_n[0] = 0; - offset[1] = header.grp_n[1] = 0; - offset[2] = header.grp_n[2] = 0; - header.exec_mask = ~0; - - /* offsets are evenly divided by the local sizes */ - if (global_work_offset) - for (i = 0; i < work_dim; ++i) - offset[i] = global_work_offset[i]/local_wk_sz[i]; - - /* Compute the local size per wg and the offsets for each local buffer */ - cl_kernel_handle_local_memory(ker, &header); - - if (queue->perf) - gpgpu_set_perf_counters(gpgpu, queue->perf->bo); - - /* Setup the kernel */ - gpgpu_state_init(gpgpu, ctx->device->max_compute_unit, 4, 64, cst_sz / 32, 64); - if (queue->last_batch != NULL) - drm_intel_bo_unreference(queue->last_batch); - queue->last_batch = NULL; - cl_command_queue_bind_surface(queue, - ker, - &slm_bo, - header.local_mem_sz, - &private_bo, - &scratch_bo); - gpgpu_states_setup(gpgpu, kernels, 16); - - /* Fill the constant buffer */ - if (cst_sz > 0) { - char *data = NULL; - assert(ker->cst_buffer); - data = cl_kernel_create_cst_buffer(ker,work_dim,global_wk_sz,local_wk_sz); - gpgpu_upload_constants(gpgpu, data, cst_sz); - cl_free(data); - } - - wrk_grp_n = 1; - for (i = 0; i < work_dim; ++i) { - TRY_ALLOC (ids[i], (cl_local_id_t*) cl_malloc(wrk_grp_sz*sizeof(uint16_t))); - grp_end[i] = offset[i] + global_wk_sz[i] / local_wk_sz[i]; - wrk_grp_n *= grp_end[i]-offset[i]; - } - thread_n = wrk_grp_sz / 16; - batch_sz = cl_ker_compute_batch_sz(ker, work_dim, wrk_grp_n, thread_n); - - /* Start a new batch buffer */ - gpgpu_batch_reset(gpgpu, batch_sz); - gpgpu_batch_start(gpgpu); -#if 1 - /* Push all media objects. We implement three paths to make it (a bit) faster. - * Local IDs are shared from work group to work group. 
We allocate once the - * buffers and reuse them - */ - if (work_dim == 3) { - curr = 0; - for (i = 0; i < local_wk_sz[0]; ++i) - for (j = 0; j < local_wk_sz[1]; ++j) - for (k = 0; k < local_wk_sz[2]; ++k, ++curr) { - ((uint16_t*) ids[0])[curr] = i; - ((uint16_t*) ids[1])[curr] = j; - ((uint16_t*) ids[2])[curr] = k; - } - for (header.grp_n[0] = offset[0]; header.grp_n[0] < grp_end[0]; ++header.grp_n[0]) - for (header.grp_n[1] = offset[1]; header.grp_n[1] < grp_end[1]; ++header.grp_n[1]) - for (header.grp_n[2] = offset[2]; header.grp_n[2] < grp_end[2]; ++header.grp_n[2]) { - if (ker->patch.exec_env.has_barriers) - gpgpu_update_barrier(gpgpu, barrierID, thread_n); - cl_command_queue_enqueue_wrk_grp3(queue, ids, &header, thread_n, barrierID); - barrierID = (barrierID + 1) % 16; - } - } - else if (work_dim == 2) { - curr = 0; - for (i = 0; i < local_wk_sz[0]; ++i) - for (j = 0; j < local_wk_sz[1]; ++j, ++curr) { - ((uint16_t*) ids[0])[curr] = i; - ((uint16_t*) ids[1])[curr] = j; - } - for (header.grp_n[0] = offset[0]; header.grp_n[0] < grp_end[0]; ++header.grp_n[0]) - for (header.grp_n[1] = offset[1]; header.grp_n[1] < grp_end[1]; ++header.grp_n[1]) { - if (ker->patch.exec_env.has_barriers) - gpgpu_update_barrier(gpgpu, barrierID, thread_n); - cl_command_queue_enqueue_wrk_grp2(queue, ids, &header, thread_n, barrierID); - barrierID = (barrierID + 1) % 16; - } - } - else { - for (i = 0; i < local_wk_sz[0]; ++i) - ((uint16_t*) ids[0])[i] = i; - for (header.grp_n[0] = offset[0]; header.grp_n[0] < grp_end[0]; ++header.grp_n[0]) { - if (ker->patch.exec_env.has_barriers) - gpgpu_update_barrier(gpgpu, barrierID, thread_n); - cl_command_queue_enqueue_wrk_grp1(queue, ids, &header, thread_n, barrierID); - barrierID = (barrierID + 1) % 16; - } - } -#endif - gpgpu_batch_end(gpgpu, 0); - gpgpu_flush(gpgpu); - - if (slm_bo) - drm_intel_bo_unreference(slm_bo); - if (private_bo) - drm_intel_bo_unreference(private_bo); - if (scratch_bo) - drm_intel_bo_unreference(scratch_bo); + if (ver == 6) + TRY (cl_command_queue_ND_range_gen6, queue, ker, wk_dim, global_wk_off, global_wk_sz, local_wk_sz); + else if (ver == 7) + TRY (cl_command_queue_ND_range_gen7, queue, ker, wk_dim, global_wk_off, global_wk_sz, local_wk_sz); + else + FATAL ("Unknown Gen Device"); #if USE_FULSIM cl_run_fulsim(); #endif /* USE_FULSIM */ error: - cl_free(kernels); - cl_free(ids[0]); - cl_free(ids[1]); - cl_free(ids[2]); return err; } @@ -612,6 +286,8 @@ cl_command_queue_finish(cl_command_queue queue) return CL_SUCCESS; } +extern int drm_intel_aub_set_bo_to_dump(drm_intel_bufmgr*, drm_intel_bo*); + LOCAL cl_int cl_command_queue_set_fulsim_buffer(cl_command_queue queue, cl_mem mem) { diff --git a/src/cl_command_queue.h b/src/cl_command_queue.h index 0e0a37f..3e60e69 100644 --- a/src/cl_command_queue.h +++ b/src/cl_command_queue.h @@ -50,12 +50,12 @@ extern void cl_command_queue_delete(cl_command_queue); extern void cl_command_queue_add_ref(cl_command_queue); /* Map ND range kernel from OCL API */ -extern cl_int cl_command_queue_ND_kernel(cl_command_queue queue, - cl_kernel ker, - cl_uint work_dim, - const size_t *global_work_offset, - const size_t *global_work_size, - const size_t *local_work_size); +extern cl_int cl_command_queue_ND_range(cl_command_queue queue, + cl_kernel ker, + cl_uint work_dim, + const size_t *global_work_offset, + const size_t *global_work_size, + const size_t *local_work_size); /* The memory object where to report the performance */ extern cl_int cl_command_queue_set_report_buffer(cl_command_queue, cl_mem); @@ -66,5 +66,13 
@@ cl_int cl_command_queue_set_fulsim_buffer(cl_command_queue, cl_mem); /* Wait for the completion of the command queue */ extern cl_int cl_command_queue_finish(cl_command_queue); +/* Bind all the surfaces in the GPGPU state */ +extern cl_int cl_command_queue_bind_surface(cl_command_queue queue, + cl_kernel k, + struct _drm_intel_bo **local, + struct _drm_intel_bo **priv, + struct _drm_intel_bo **scratch, + uint32_t local_sz); + #endif /* __CL_COMMAND_QUEUE_H__ */ diff --git a/src/cl_kernel.c b/src/cl_kernel.c index 20c0f42..0268a51 100644 --- a/src/cl_kernel.c +++ b/src/cl_kernel.c @@ -698,3 +698,84 @@ error: return err; } +static INLINE int32_t +cl_kernel_get_first_local(cl_kernel k) +{ + int32_t i; + for (i = 0; i < (int32_t) k->curbe_info_n; ++i) + if (k->curbe_info[i].type == DATA_PARAMETER_SUM_OF_LOCAL_MEMORY_ARGUMENT_SIZES) + return i; + return k->curbe_info_n; +} + +LOCAL uint32_t +cl_kernel_local_memory_sz(cl_kernel k) +{ + int32_t i; + uint32_t local_mem_sz = 0; + + if (k->has_local_buffer) { + + /* Look for all local surfaces offset to set */ + i = cl_kernel_get_first_local(k); + + /* Now, set the offsets for all local surfaces */ + for (; i < (int32_t) k->curbe_info_n; ++i) { + cl_curbe_patch_info_t *info = k->curbe_info + i; + const size_t offset = local_mem_sz; + if (info->type != DATA_PARAMETER_SUM_OF_LOCAL_MEMORY_ARGUMENT_SIZES) + break; + assert(info->last == 0); + assert(sizeof(int32_t) + info->offsets[0] <= k->patch.curbe.sz); + memcpy(k->cst_buffer + info->offsets[0], &offset, sizeof(int32_t)); + local_mem_sz += info->sz; + } + local_mem_sz += k->patch.local_surf.sz; + } + return local_mem_sz; +} + +LOCAL char* +cl_kernel_create_cst_buffer(cl_kernel k, + cl_uint wk_dim, + const size_t *global_wk_sz, + const size_t *local_wk_sz) +{ + cl_curbe_patch_info_t *info = NULL; + const size_t sz = k->patch.curbe.sz; + uint64_t key = 0; + char *data = NULL; + + TRY_ALLOC_NO_ERR (data, (char *) cl_calloc(sz, 1)); + memcpy(data, k->cst_buffer, sz); + + /* Global work group size */ + key = cl_curbe_key(DATA_PARAMETER_GLOBAL_WORK_SIZE, 0, 0); + if ((info = cl_kernel_get_curbe_info(k, key)) != NULL) + memcpy(data+info->offsets[0], global_wk_sz, sizeof(uint32_t)); + key = cl_curbe_key(DATA_PARAMETER_GLOBAL_WORK_SIZE, 0, 4); + if ((info = cl_kernel_get_curbe_info(k, key)) != NULL) + memcpy(data+info->offsets[0], global_wk_sz+1, sizeof(uint32_t)); + key = cl_curbe_key(DATA_PARAMETER_GLOBAL_WORK_SIZE, 0, 8); + if ((info = cl_kernel_get_curbe_info(k, key)) != NULL) + memcpy(data+info->offsets[0], global_wk_sz+2, sizeof(uint32_t)); + + /* Local work group size */ + key = cl_curbe_key(DATA_PARAMETER_LOCAL_WORK_SIZE, 0, 0); + if ((info = cl_kernel_get_curbe_info(k, key)) != NULL) + memcpy(data+info->offsets[0], local_wk_sz, sizeof(uint32_t)); + key = cl_curbe_key(DATA_PARAMETER_LOCAL_WORK_SIZE, 0, 4); + if ((info = cl_kernel_get_curbe_info(k, key)) != NULL) + memcpy(data+info->offsets[0], local_wk_sz+1, sizeof(uint32_t)); + key = cl_curbe_key(DATA_PARAMETER_LOCAL_WORK_SIZE, 0, 8); + if ((info = cl_kernel_get_curbe_info(k, key)) != NULL) + memcpy(data+info->offsets[0], local_wk_sz+2, sizeof(uint32_t)); + +exit: + return data; +error: + cl_free(data); + data = NULL; + goto exit; +} + diff --git a/src/cl_kernel.h b/src/cl_kernel.h index 28dae57..8500209 100644 --- a/src/cl_kernel.h +++ b/src/cl_kernel.h @@ -295,10 +295,16 @@ extern void cl_kernel_add_ref(cl_kernel); extern int cl_kernel_setup(cl_kernel, const char*); /* Set the argument before kernel execution */ -extern int 
cl_kernel_set_arg(cl_kernel kernel, - uint32_t arg_index, - size_t arg_size, - const void * arg_value); +extern int cl_kernel_set_arg(cl_kernel, + uint32_t arg_index, + size_t arg_size, + const void *arg_value); + +/* Check that all arguments are set before running the kernel */ +extern cl_int cl_kernel_check_args(cl_kernel); + +/* Get the size of shared local memory bound to the kernel */ +extern uint32_t cl_kernel_local_memory_sz(cl_kernel); /* Return a curbe entry if it exists. NULL otherwise */ extern cl_curbe_patch_info_t *cl_kernel_get_curbe_info(cl_kernel, uint64_t); @@ -312,5 +318,12 @@ cl_curbe_key(uint32_t type, uint32_t index, uint32_t src_offset) (uint64_t) src_offset; } +/* Allocate and fill the CURBE */ +extern char* +cl_kernel_create_cst_buffer(cl_kernel k, + cl_uint wk_dim, + const size_t *global_wk_sz, + const size_t *local_wk_sz); + #endif /* __CL_KERNEL_H__ */ diff --git a/src/intel/intel_gpgpu.c b/src/intel/intel_gpgpu.c index 81d5ced..83209b5 100644 --- a/src/intel/intel_gpgpu.c +++ b/src/intel/intel_gpgpu.c @@ -1004,3 +1004,9 @@ gpgpu_run_with_inline(intel_gpgpu_t *state, int32_t ki, size_t sz) return (char*) intel_batchbuffer_alloc_space(state->batch,sz); } +LOCAL int32_t +intel_gpgpu_version(intel_gpgpu_t *gpgpu) +{ + return gpgpu->drv->gen_ver; +} + diff --git a/src/intel/intel_gpgpu.h b/src/intel/intel_gpgpu.h index 8f2e867..789203b 100644 --- a/src/intel/intel_gpgpu.h +++ b/src/intel/intel_gpgpu.h @@ -62,6 +62,9 @@ extern intel_gpgpu_t* intel_gpgpu_new(struct intel_driver*); /* Destroy and deallocate a GPGPU state */ extern void intel_gpgpu_delete(intel_gpgpu_t*); +/* Get the device generation */ +extern int32_t intel_gpgpu_version(intel_gpgpu_t*); + /* Set surface descriptor in the current binding table */ extern void gpgpu_bind_surf_2d(intel_gpgpu_t*, int32_t index, -- 2.7.4