Cleaned and simplified code for gen6. Code now starts to work for gen7.
author    bsegovia <devnull@localhost>
Thu, 4 Aug 2011 01:33:16 +0000 (01:33 +0000)
committer Keith Packard <keithp@keithp.com>
Fri, 10 Aug 2012 23:14:40 +0000 (16:14 -0700)
src/cl_api.c
src/cl_command_queue.c
src/cl_command_queue.h
src/cl_command_queue_gen6.c
src/cl_command_queue_gen7.c
src/cl_kernel.c
src/cl_kernel.h
src/intel/intel_defines.h
src/intel/intel_gpgpu.c
src/intel/intel_gpgpu.h

src/cl_api.c
index d9875cd..3defcae 100644
@@ -909,6 +909,9 @@ clEnqueueNDRangeKernel(cl_command_queue  command_queue,
                        const cl_event *  event_wait_list,
                        cl_event *        event)
 {
+  size_t fixed_global_off[] = {0,0,0};
+  size_t fixed_global_sz[] = {1,1,1};
+  size_t fixed_local_sz[] = {16,1,1};
   cl_int err = CL_SUCCESS;
   cl_int i;
 
@@ -961,13 +964,22 @@ clEnqueueNDRangeKernel(cl_command_queue  command_queue,
   FATAL_IF(event_wait_list != NULL, "Events are not supported");
   FATAL_IF(event != NULL, "Events are not supported");
 
+  if (local_work_size != NULL)
+    for (i = 0; i < work_dim; ++i)
+      fixed_local_sz[i] = local_work_size[i];
+  if (global_work_size != NULL)
+    for (i = 0; i < work_dim; ++i)
+      fixed_global_sz[i] = global_work_size[i];
+  if (global_work_offset != NULL)
+    for (i = 0; i < work_dim; ++i)
+      fixed_global_off[i] = global_work_offset[i];
+
   /* Do device specific checks and enqueue the kernel */
   err = cl_command_queue_ND_range(command_queue,
                                   kernel,
-                                  work_dim,
-                                  global_work_offset,
-                                  global_work_size,
-                                  local_work_size);
+                                  fixed_global_off,
+                                  fixed_global_sz,
+                                  fixed_local_sz);
 
 error:
   return err;
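
The three fixed_* arrays above normalize whatever the caller passed into full 3D triples, which is why the back-ends can drop their work_dim parameter later in this commit. A minimal, standalone sketch of the effect for a hypothetical 2D enqueue (the sizes are illustrative, not from the original):

#include <stdio.h>

/* Editorial sketch, not part of the patch. */
int main(void)
{
  /* Defaults used when a dimension is not supplied by the caller */
  size_t fixed_global_off[] = {0, 0, 0};
  size_t fixed_global_sz[]  = {1, 1, 1};
  size_t fixed_local_sz[]   = {16, 1, 1};

  /* Hypothetical user arguments for a 2D NDRange */
  const unsigned work_dim = 2;
  const size_t global_work_size[] = {1024, 256};
  const size_t local_work_size[]  = {16, 16};

  for (unsigned i = 0; i < work_dim; ++i) {
    fixed_global_sz[i] = global_work_size[i];
    fixed_local_sz[i]  = local_work_size[i];
  }

  /* Prints: global = 1024x256x1, local = 16x16x1, offset = 0,0,0 */
  printf("global = %zux%zux%zu, local = %zux%zux%zu, offset = %zu,%zu,%zu\n",
         fixed_global_sz[0], fixed_global_sz[1], fixed_global_sz[2],
         fixed_local_sz[0], fixed_local_sz[1], fixed_local_sz[2],
         fixed_global_off[0], fixed_global_off[1], fixed_global_off[2]);
  return 0;
}
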
src/cl_command_queue.c
index 68b44b4..904df7e 100644
@@ -139,7 +139,7 @@ cl_command_queue_bind_surface(cl_command_queue queue,
   }
 
   /* Allocate local surface needed for SLM and bind it */
-  if (local_sz != 0) {
+  if (local && local_sz != 0) {
     const size_t sz = 16 * local_sz; /* XXX 16 == maximum barrier number */
     assert(k->patch.local_surf.offset % SURFACE_SZ == 0);
     index = k->patch.local_surf.offset / SURFACE_SZ;
@@ -147,11 +147,11 @@ cl_command_queue_bind_surface(cl_command_queue queue,
     *local = drm_intel_bo_alloc(bufmgr, "CL local surface", sz, 64);
     gpgpu_bind_buf(gpgpu, index, *local, 0, sz, cc_llc_mlc);
   }
-  else
+  else if (local)
     *local = NULL;
 
   /* Allocate private surface and bind it */
-  if (k->patch.private_surf.size != 0) {
+  if (priv && k->patch.private_surf.size != 0) {
     const size_t sz = max_thread *
                       k->patch.private_surf.size *
                       k->patch.exec_env.largest_compiled_simd_sz;
@@ -162,11 +162,11 @@ cl_command_queue_bind_surface(cl_command_queue queue,
     *priv = drm_intel_bo_alloc(bufmgr, "CL private surface", sz, 64);
     gpgpu_bind_buf(gpgpu, index, *priv, 0, sz, cc_llc_mlc);
   }
-  else
+  else if (priv)
     *priv = NULL;
 
   /* Allocate scratch surface and bind it */
-  if (k->patch.scratch.size != 0) {
+  if (scratch && k->patch.scratch.size != 0) {
     const size_t sz = max_thread * /* XXX is it given per lane ??? */
                       k->patch.scratch.size *
                       k->patch.exec_env.largest_compiled_simd_sz;
@@ -177,7 +177,7 @@ cl_command_queue_bind_surface(cl_command_queue queue,
     *scratch = drm_intel_bo_alloc(bufmgr, "CL scratch surface", sz, 64);
     gpgpu_bind_buf(gpgpu, index, *scratch, 0, sz, cc_llc_mlc);
   }
-  else
+  else if (scratch)
     *scratch = NULL;
 
   /* Now bind a bo used for synchronization */
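
The new local/priv/scratch guards let a caller skip any surface it does not need; the gen7 path added later in this commit passes NULL for the SLM slot. A small, self-contained sketch of the same NULL-tolerant out-parameter pattern (names are illustrative):

#include <assert.h>
#include <stddef.h>

/* Editorial sketch, not part of the patch: the pattern used by
 * cl_command_queue_bind_surface after this change. */
static void bind_optional(int **out, int *bo, int wanted)
{
  if (out && wanted)   /* bind only when the caller asked for it */
    *out = bo;
  else if (out)        /* caller asked, but there is nothing to bind */
    *out = NULL;
}

int main(void)
{
  int bo = 42;
  int *got = &bo;
  bind_optional(&got, &bo, 1);   /* caller wants the surface */
  assert(got == &bo);
  bind_optional(&got, &bo, 0);   /* nothing to bind: out pointer cleared */
  assert(got == NULL);
  bind_optional(NULL, &bo, 1);   /* gen7-style caller: not interested */
  return 0;
}
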
@@ -245,13 +245,12 @@ cl_run_fulsim(void)
 }
 #endif /* USE_FULSIM */
 
-extern cl_int cl_command_queue_ND_range_gen6(cl_command_queue, cl_kernel, cl_uint, const size_t*, const size_t*, const size_t*);
-extern cl_int cl_command_queue_ND_range_gen7(cl_command_queue, cl_kernel, cl_uint, const size_t *, const size_t *, const size_t *);
+extern cl_int cl_command_queue_ND_range_gen6(cl_command_queue, cl_kernel, const size_t*, const size_t*, const size_t*);
+extern cl_int cl_command_queue_ND_range_gen7(cl_command_queue, cl_kernel, const size_t *, const size_t *, const size_t *);
 
 LOCAL cl_int
 cl_command_queue_ND_range(cl_command_queue queue,
-                          cl_kernel ker,
-                          cl_uint wk_dim,
+                          cl_kernel k,
                           const size_t *global_wk_off,
                           const size_t *global_wk_sz,
                           const size_t *local_wk_sz)
@@ -261,19 +260,9 @@ cl_command_queue_ND_range(cl_command_queue queue,
   cl_int err = CL_SUCCESS;
 
   if (ver == 6)
-    TRY (cl_command_queue_ND_range_gen6, queue,
-                                         ker,
-                                         wk_dim,
-                                         global_wk_off,
-                                         global_wk_sz,
-                                         local_wk_sz);
+    TRY (cl_command_queue_ND_range_gen6, queue, k, global_wk_off, global_wk_sz, local_wk_sz);
   else if (ver == 7)
-    TRY (cl_command_queue_ND_range_gen7, queue,
-                                         ker,
-                                         wk_dim,
-                                         global_wk_off,
-                                         global_wk_sz,
-                                         local_wk_sz);
+    TRY (cl_command_queue_ND_range_gen7, queue, k, global_wk_off, global_wk_sz, local_wk_sz);
   else
     FATAL ("Unknown Gen Device");
 
src/cl_command_queue.h
index 3e60e69..95a6f45 100644
@@ -52,7 +52,6 @@ extern void cl_command_queue_add_ref(cl_command_queue);
 /* Map ND range kernel from OCL API */
 extern cl_int cl_command_queue_ND_range(cl_command_queue queue,
                                         cl_kernel ker,
-                                        cl_uint work_dim,
                                         const size_t *global_work_offset,
                                         const size_t *global_work_size,
                                         const size_t *local_work_size);
src/cl_command_queue_gen6.c
index a09c553..882d6bc 100644
@@ -47,15 +47,12 @@ typedef struct cl_local_id {
 } cl_local_id_t;
 
 static INLINE size_t
-cl_ker_compute_batch_sz(cl_kernel k,
-                        size_t wrk_dim_n,
-                        size_t wk_grp_n,
-                        size_t thread_n)
+cl_kernel_compute_batch_sz(cl_kernel k, size_t wk_grp_n, size_t thread_n)
 {
   size_t sz = 256; /* upper bound of the complete prelude */
   size_t media_obj_sz = 6 * 4; /* size of one MEDIA OBJECT */
   media_obj_sz += sizeof(cl_inline_header_t); /* header for all threads */
-  media_obj_sz += wrk_dim_n * sizeof(cl_local_id_t);/* for each dimension */ 
+  media_obj_sz += 3 * sizeof(cl_local_id_t); /* always 3 dimensions */
   if (k->patch.exec_env.has_barriers)
     media_obj_sz += 4 * 4; /* one barrier update per object */
   sz += media_obj_sz * wk_grp_n * thread_n;
@@ -63,11 +60,11 @@ cl_ker_compute_batch_sz(cl_kernel k,
 }
 
 static INLINE void
-cl_command_queue_enqueue_wk_grp3(cl_command_queue queue,
-                                  cl_local_id_t **ids,
-                                  const cl_inline_header_t *header,
-                                  uint32_t thread_n,
-                                  uint32_t barrierID)
+cl_command_queue_enqueue_wk_grp(cl_command_queue queue,
+                                cl_local_id_t **ids,
+                                const cl_inline_header_t *header,
+                                uint32_t thread_n,
+                                uint32_t barrierID)
 {
   intel_gpgpu_t *gpgpu = queue->gpgpu;
   uint32_t i;
@@ -86,53 +83,10 @@ cl_command_queue_enqueue_wk_grp3(cl_command_queue queue,
   }
 }
 
-static INLINE void
-cl_command_queue_enqueue_wk_grp2(cl_command_queue queue,
-                                  cl_local_id_t **ids,
-                                  const cl_inline_header_t *header,
-                                  uint32_t thread_n,
-                                  uint32_t barrierID)
-{
-  intel_gpgpu_t *gpgpu = queue->gpgpu;
-  uint32_t i;
-  for (i = 0; i < thread_n; ++i) {
-    const size_t sz = sizeof(cl_inline_header_t) + 2*sizeof(cl_local_id_t);
-    char *data = gpgpu_run_with_inline(gpgpu, barrierID, sz);
-    size_t offset = 0;
-    assert(data);
-    *((cl_inline_header_t *) (data + offset)) = *header;
-    offset += sizeof(cl_inline_header_t);
-    *((cl_local_id_t *) (data + offset)) = ids[0][i];
-    offset += sizeof(cl_local_id_t);
-    *((cl_local_id_t *) (data + offset)) = ids[1][i];
-  }
-}
-
-static INLINE void
-cl_command_queue_enqueue_wk_grp1(cl_command_queue queue,
-                                  cl_local_id_t **ids,
-                                  const cl_inline_header_t *header,
-                                  uint32_t thread_n,
-                                  uint32_t barrierID)
-{
-  intel_gpgpu_t *gpgpu = queue->gpgpu;
-  uint32_t i;
-  for (i = 0; i < thread_n; ++i) {
-    const size_t sz = sizeof(cl_inline_header_t) + sizeof(cl_local_id_t);
-    char *data = gpgpu_run_with_inline(gpgpu, barrierID, sz);
-    size_t offset = 0;
-    assert(data);
-    *((cl_inline_header_t *) (data + offset)) = *header;
-    offset += sizeof(cl_inline_header_t);
-    *((cl_local_id_t *) (data + offset)) = ids[0][i];
-  }
-}
-
 LOCAL cl_int
 cl_command_queue_ND_range_gen6(cl_command_queue queue,
                                 cl_kernel ker,
-                                cl_uint wk_dim,
-                                const size_t *global_work_offset,
+                                const size_t *global_wk_off,
                                 const size_t *global_wk_sz,
                                 const size_t *local_wk_sz)
 {
@@ -164,7 +118,7 @@ cl_command_queue_ND_range_gen6(cl_command_queue queue,
   TRY (cl_kernel_check_args, ker);
 
   /* Check that the local work sizes are OK */
-  TRY (cl_kernel_work_group_sz, ker, local_wk_sz, wk_dim, &wk_grp_sz);
+  TRY (cl_kernel_work_group_sz, ker, local_wk_sz, 3, &wk_grp_sz);
 
   /* Directly from the user defined values */
   header.local_sz[0] = local_wk_sz[0];
@@ -176,9 +130,9 @@ cl_command_queue_ND_range_gen6(cl_command_queue queue,
   header.exec_mask = ~0;
 
   /* offsets are evenly divided by the local sizes */
-  if (global_work_offset)
-    for (i = 0; i < wk_dim; ++i)
-      offset[i] = global_work_offset[i]/local_wk_sz[i];
+  offset[0] = global_wk_off[0] / local_wk_sz[0];
+  offset[1] = global_wk_off[1] / local_wk_sz[1];
+  offset[2] = global_wk_off[2] / local_wk_sz[2];
 
   /* Compute the local size per wg and the offsets for each local buffer */
   header.local_mem_sz = cl_kernel_local_memory_sz(ker);
@@ -203,19 +157,19 @@ cl_command_queue_ND_range_gen6(cl_command_queue queue,
   if (cst_sz > 0) {
     char *data = NULL;
     assert(ker->cst_buffer);
-    data = cl_kernel_create_cst_buffer(ker,wk_dim,global_wk_sz,local_wk_sz);
+    data = cl_kernel_create_cst_buffer(ker, global_wk_sz, local_wk_sz);
     gpgpu_upload_constants(gpgpu, data, cst_sz);
     cl_free(data);
   }
 
   wk_grp_n = 1;
-  for (i = 0; i < wk_dim; ++i) {
+  for (i = 0; i < 3; ++i) {
     TRY_ALLOC (ids[i], (cl_local_id_t*) cl_malloc(wk_grp_sz*sizeof(uint16_t)));
     grp_end[i] = offset[i] + global_wk_sz[i] / local_wk_sz[i];
     wk_grp_n *= grp_end[i]-offset[i];
   }
   thread_n = wk_grp_sz / 16;
-  batch_sz = cl_ker_compute_batch_sz(ker, wk_dim, wk_grp_n, thread_n);
+  batch_sz = cl_kernel_compute_batch_sz(ker, wk_grp_n, thread_n);
 
   /* Start a new batch buffer */
   gpgpu_batch_reset(gpgpu, batch_sz);
@@ -225,48 +179,21 @@ cl_command_queue_ND_range_gen6(cl_command_queue queue,
    * Local IDs are shared from work group to work group. We allocate once the
    * buffers and reuse them
    */
-  if (wk_dim == 3) {
-    curr = 0;
-    for (i = 0; i < local_wk_sz[0]; ++i)
-    for (j = 0; j < local_wk_sz[1]; ++j)
-    for (k = 0; k < local_wk_sz[2]; ++k, ++curr) {
-      ((uint16_t*) ids[0])[curr] = i;
-      ((uint16_t*) ids[1])[curr] = j;
-      ((uint16_t*) ids[2])[curr] = k;
-    }
-    for (header.grp_n[0] = offset[0]; header.grp_n[0] < grp_end[0]; ++header.grp_n[0])
-    for (header.grp_n[1] = offset[1]; header.grp_n[1] < grp_end[1]; ++header.grp_n[1])
-    for (header.grp_n[2] = offset[2]; header.grp_n[2] < grp_end[2]; ++header.grp_n[2]) {
-      if (ker->patch.exec_env.has_barriers)
-        gpgpu_update_barrier(gpgpu, barrierID, thread_n);
-      cl_command_queue_enqueue_wk_grp3(queue, ids, &header, thread_n, barrierID);
-      barrierID = (barrierID + 1) % 16;
-    }
-  }
-  else if (wk_dim == 2) {
-    curr = 0;
-    for (i = 0; i < local_wk_sz[0]; ++i)
-    for (j = 0; j < local_wk_sz[1]; ++j, ++curr) {
-      ((uint16_t*) ids[0])[curr] = i;
-      ((uint16_t*) ids[1])[curr] = j;
-    }
-    for (header.grp_n[0] = offset[0]; header.grp_n[0] < grp_end[0]; ++header.grp_n[0])
-    for (header.grp_n[1] = offset[1]; header.grp_n[1] < grp_end[1]; ++header.grp_n[1]) {
-      if (ker->patch.exec_env.has_barriers)
-        gpgpu_update_barrier(gpgpu, barrierID, thread_n);
-      cl_command_queue_enqueue_wk_grp2(queue, ids, &header, thread_n, barrierID);
-      barrierID = (barrierID + 1) % 16;
-    }
+  curr = 0;
+  for (i = 0; i < local_wk_sz[0]; ++i)
+  for (j = 0; j < local_wk_sz[1]; ++j)
+  for (k = 0; k < local_wk_sz[2]; ++k, ++curr) {
+    ((uint16_t*) ids[0])[curr] = i;
+    ((uint16_t*) ids[1])[curr] = j;
+    ((uint16_t*) ids[2])[curr] = k;
   }
-  else {
-    for (i = 0; i < local_wk_sz[0]; ++i)
-      ((uint16_t*) ids[0])[i] = i;
-    for (header.grp_n[0] = offset[0]; header.grp_n[0] < grp_end[0]; ++header.grp_n[0]) {
-      if (ker->patch.exec_env.has_barriers)
-        gpgpu_update_barrier(gpgpu, barrierID, thread_n);
-      cl_command_queue_enqueue_wk_grp1(queue, ids, &header, thread_n, barrierID);
-      barrierID = (barrierID + 1) % 16;
-    }
+  for (header.grp_n[0] = offset[0]; header.grp_n[0] < grp_end[0]; ++header.grp_n[0])
+  for (header.grp_n[1] = offset[1]; header.grp_n[1] < grp_end[1]; ++header.grp_n[1])
+  for (header.grp_n[2] = offset[2]; header.grp_n[2] < grp_end[2]; ++header.grp_n[2]) {
+    if (ker->patch.exec_env.has_barriers)
+      gpgpu_update_barrier(gpgpu, barrierID, thread_n);
+    cl_command_queue_enqueue_wk_grp(queue, ids, &header, thread_n, barrierID);
+    barrierID = (barrierID + 1) % 16;
   }
 
   gpgpu_batch_end(gpgpu, 0);
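
Collapsing the 1D/2D/3D special cases works because the enqueue path now always supplies three local sizes, with unused dimensions set to 1; the triple loop then degenerates to exactly the IDs the removed helpers produced. A small standalone check (the sizes are illustrative):

#include <stdint.h>
#include <stdio.h>

/* Editorial sketch, not part of the patch: with local_wk_sz = {16,1,1}
 * (a 1D kernel after the normalization done in cl_api.c), the 3-deep loop
 * yields the same 16 IDs the removed 1D helper produced. */
int main(void)
{
  const size_t local_wk_sz[3] = {16, 1, 1};
  uint16_t ids0[16], ids1[16], ids2[16];
  size_t i, j, k, curr = 0;

  for (i = 0; i < local_wk_sz[0]; ++i)
  for (j = 0; j < local_wk_sz[1]; ++j)
  for (k = 0; k < local_wk_sz[2]; ++k, ++curr) {
    ids0[curr] = (uint16_t) i;
    ids1[curr] = (uint16_t) j;
    ids2[curr] = (uint16_t) k;
  }

  /* Prints: 16 work items, last id = (15,0,0) */
  printf("%zu work items, last id = (%u,%u,%u)\n", curr,
         (unsigned) ids0[15], (unsigned) ids1[15], (unsigned) ids2[15]);
  return 0;
}
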
src/cl_command_queue_gen7.c
index 28802dc..7fa7e21 100644
 #include <stdio.h>
 #include <string.h>
 
+static INLINE size_t
+cl_kernel_compute_batch_sz(cl_kernel k)
+{
+  size_t sz = 256 + 16;
+  return sz;
+}
+
+static cl_int
+cl_set_local_ids(char *data,
+                 const size_t *local_wk_sz,
+                 size_t cst_sz,
+                 size_t id_offset,
+                 size_t thread_n)
+{
+  uint16_t *ids[3] = {NULL,NULL,NULL};
+  size_t i, j, k, curr = 0;
+  cl_int err = CL_SUCCESS;
+
+  for (i = 0; i < 3; ++i)
+    TRY_ALLOC(ids[i], (uint16_t*) cl_calloc(sizeof(uint16_t), thread_n*16));
+
+  /* Compute the IDs */
+  for (i = 0; i < local_wk_sz[0]; ++i)
+  for (j = 0; j < local_wk_sz[1]; ++j)
+  for (k = 0; k < local_wk_sz[2]; ++k, ++curr) {
+    ids[0][curr] = i;
+    ids[1][curr] = j;
+    ids[2][curr] = k;
+  }
+
+  /* Copy them to the constant buffer */
+  curr = 0;
+  data += id_offset;
+  for (i = 0; i < thread_n; ++i, data += cst_sz) {
+    uint16_t *ids0 = (uint16_t *) (data +  0);
+    uint16_t *ids1 = (uint16_t *) (data + 32);
+    uint16_t *ids2 = (uint16_t *) (data + 64);
+    for (j = 0; j < 16; ++j, ++curr) { /* SIMD16 */
+      ids0[j] = ids[0][curr];
+      ids1[j] = ids[1][curr];
+      ids2[j] = ids[2][curr];
+    }
+  }
+
+error:
+  for (i = 0; i < 3; ++i)
+    cl_free(ids[i]);
+  return err;
+}
+
 LOCAL cl_int
 cl_command_queue_ND_range_gen7(cl_command_queue queue,
-                                cl_kernel ker,
-                                cl_uint wk_dim,
-                                const size_t *global_work_offset,
-                                const size_t *global_wk_sz,
-                                const size_t *local_wk_sz)
+                               cl_kernel ker,
+                               const size_t *global_wk_off,
+                               const size_t *global_wk_sz,
+                               const size_t *local_wk_sz)
 {
-#if 0
   cl_context ctx = queue->ctx;
   intel_gpgpu_t *gpgpu = queue->gpgpu;
   drm_intel_bo *private_bo = NULL, *scratch_bo = NULL;
+  char *user = NULL;  /* User defined constants first */
+  char *data = NULL;  /* Complete constant buffer to upload */
   genx_gpgpu_kernel_t kernel;
-  const size_t cst_sz = ker->patch.curbe.sz;
-#endif
+  const size_t local_mem_sz = cl_kernel_local_memory_sz(ker);
+  size_t local_sz, batch_sz, cst_sz = ker->patch.curbe.sz;
+  size_t i, thread_n, id_offset;
+  cl_int err = CL_SUCCESS;
+
+  /* Setup kernel */
+  kernel.name = "OCL kernel";
+  kernel.grf_blocks = 128;
+  kernel.bin = NULL;
+  kernel.size = 0;
+  kernel.bo = ker->bo;
+  kernel.barrierID = 0;
+
+  /* All arguments must have been set */
+  TRY (cl_kernel_check_args, ker);
+
+  /* Check that the local work sizes are OK */
+  TRY (cl_kernel_work_group_sz, ker, local_wk_sz, 3, &local_sz);
+  thread_n = local_sz / 16; /* SIMD16 only */
+
+  /* Fill the constant buffer. Basically, we have to build one set of
+   * constants for each thread. The constants also include the local IDs,
+   * appended after all the other regular values (function parameters...).
+   */
+  if (cst_sz > 0) {
+    assert(ker->cst_buffer);
+    user = cl_kernel_create_cst_buffer(ker, global_wk_sz, local_wk_sz);
+  }
+  id_offset = cst_sz = ALIGN(cst_sz, 32); /* Align the user data on 32 bytes */
+  kernel.cst_sz = cst_sz += 3 * 32;       /* Add local IDs (3 x 16 words) */
+  TRY_ALLOC (data, (char*) cl_calloc(thread_n, cst_sz));
+  if (user != NULL)
+    for (i = 0; i < thread_n; ++i)
+      memcpy(data + cst_sz * i, user, ker->patch.curbe.sz);
+  TRY (cl_set_local_ids, data, local_wk_sz, cst_sz, id_offset, thread_n);
+
+  /* Setup the kernel */
+  gpgpu_state_init(gpgpu, ctx->device->max_compute_unit, 4, 64, cst_sz / 32, 64);
+  if (queue->last_batch != NULL)
+    drm_intel_bo_unreference(queue->last_batch);
+  queue->last_batch = NULL;
+  cl_command_queue_bind_surface(queue, ker, NULL, &private_bo, &scratch_bo, local_mem_sz);
+  gpgpu_states_setup(gpgpu, &kernel, 1);
+
+  /* We always have constants with Gen7 (local IDs are used) */
+  gpgpu_upload_constants(gpgpu, data, thread_n*cst_sz);
+
+  /* Start a new batch buffer */
+  batch_sz = cl_kernel_compute_batch_sz(ker);
+  gpgpu_batch_reset(gpgpu, batch_sz);
+  gpgpu_batch_start(gpgpu);
+
+  /* Issue the GPGPU_WALKER command */
+  gpgpu_walker(gpgpu, thread_n, global_wk_off, global_wk_sz, local_wk_sz);
+
+  /* Close the batch buffer and submit it */
+  gpgpu_batch_end(gpgpu, 0);
+  gpgpu_flush(gpgpu);
 
-  return CL_SUCCESS;
+error:
+  /* Release all temporary buffers */
+  if (private_bo)
+    drm_intel_bo_unreference(private_bo);
+  if (scratch_bo)
+    drm_intel_bo_unreference(scratch_bo);
+  cl_free(data);
+  cl_free(user);
+  return err;
 }
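
The per-thread constant buffer built above is laid out as: user curbe data first, padded to a 32-byte boundary (id_offset), then three SIMD16 blocks of 16 uint16 local IDs (3 x 32 bytes), which is exactly what cl_set_local_ids writes at offsets 0, 32 and 64 past id_offset. A small check of that arithmetic (the 68-byte user size is made up):

#include <stddef.h>
#include <stdio.h>

/* Editorial sketch, not part of the patch. ALIGN is assumed to round up to
 * the given power-of-two boundary, as its use above implies. */
#define ALIGN(x, a) (((x) + (a) - 1) & ~((size_t)(a) - 1))

int main(void)
{
  size_t cst_sz = 68;                         /* hypothetical user curbe size */
  const size_t id_offset = ALIGN(cst_sz, 32); /* 96: local IDs start here */
  cst_sz = id_offset + 3 * 32;                /* 192: per-thread stride */

  /* Prints: ids[0]@96 ids[1]@128 ids[2]@160, stride 192 */
  printf("ids[0]@%zu ids[1]@%zu ids[2]@%zu, stride %zu\n",
         id_offset + 0, id_offset + 32, id_offset + 64, cst_sz);
  return 0;
}
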
 
src/cl_kernel.c
index 98f20a3..aece675 100644
@@ -737,10 +737,7 @@ cl_kernel_local_memory_sz(cl_kernel k)
 }
 
 LOCAL char*
-cl_kernel_create_cst_buffer(cl_kernel k,
-                            cl_uint wk_dim,
-                            const size_t *global_wk_sz,
-                            const size_t *local_wk_sz)
+cl_kernel_create_cst_buffer(cl_kernel k, const size_t *global_wk_sz, const size_t *local_wk_sz)
 {
   cl_curbe_patch_info_t *info = NULL;
   const size_t sz = k->patch.curbe.sz;
@@ -783,10 +780,11 @@ error:
 LOCAL cl_int
 cl_kernel_work_group_sz(cl_kernel ker,
                         const size_t *local_wk_sz,
-                        cl_uint wk_dim,
+                        uint32_t wk_dim,
                         size_t *wk_grp_sz)
 {
   cl_int err = CL_SUCCESS;
+  size_t sz = 0;
   cl_uint i;
 
   for (i = 0; i < wk_dim; ++i)
@@ -795,16 +793,18 @@ cl_kernel_work_group_sz(cl_kernel ker,
       err = CL_INVALID_WORK_ITEM_SIZE;
       goto error;
     }
-  *wk_grp_sz = local_wk_sz[0];
+  sz = local_wk_sz[0];
   for (i = 1; i < wk_dim; ++i)
-    *wk_grp_sz *= local_wk_sz[i];
-  FATAL_IF (*wk_grp_sz % 16, "Work group size must be a multiple of 16");
-  if (*wk_grp_sz > ker->program->ctx->device->max_work_group_size) {
+    sz *= local_wk_sz[i];
+  FATAL_IF (sz % 16, "Work group size must be a multiple of 16");
+  if (sz > ker->program->ctx->device->max_work_group_size) {
     err = CL_INVALID_WORK_ITEM_SIZE;
     goto error;
   }
 
 error:
+  if (wk_grp_sz)
+    *wk_grp_sz = sz;
   return err;
 }
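
The visible constraints in cl_kernel_work_group_sz are that the flattened local size must be a multiple of 16 (SIMD16) and must not exceed the device limit; the per-dimension check sits in the elided part of the hunk. A standalone sketch of the visible part (the device limit is illustrative):

#include <assert.h>
#include <stddef.h>

/* Editorial sketch, not part of the patch: the checks applied to a
 * {16,16,1} local size. The 512 device limit is made up for illustration. */
int main(void)
{
  const size_t local_wk_sz[3] = {16, 16, 1};
  const size_t max_work_group_size = 512;
  size_t sz = local_wk_sz[0] * local_wk_sz[1] * local_wk_sz[2]; /* 256 */

  assert(sz % 16 == 0);               /* work group must be a multiple of 16 */
  assert(sz <= max_work_group_size);  /* otherwise CL_INVALID_WORK_ITEM_SIZE */
  return 0;
}
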
 
src/cl_kernel.h
index b16077a..1c9b08e 100644
@@ -320,10 +320,7 @@ cl_curbe_key(uint32_t type, uint32_t index, uint32_t src_offset)
 
 /* Allocate, fill and return the CURBE */
 extern char*
-cl_kernel_create_cst_buffer(cl_kernel k,
-                            cl_uint wk_dim,
-                            const size_t *global_wk_sz,
-                            const size_t *local_wk_sz);
+cl_kernel_create_cst_buffer(cl_kernel k, const size_t *global_wk_sz, const size_t *local_wk_sz);
 
 /* Compute and check the work group size from the user provided local size */
 extern cl_int
src/intel/intel_defines.h
index af0e3db..24aabf5 100644
@@ -31,6 +31,7 @@
 #define CMD_STATE_PREFETCH                      CMD(0, 0, 3)
 #define CMD_MEDIA_GATEWAY_STATE                 CMD(2, 0, 3)
 #define CMD_MEDIA_STATE_FLUSH                   CMD(2, 0, 4)
+#define CMD_GPGPU_WALKER                        CMD(2, 1, 5)
 
 #define CMD_STATE_BASE_ADDRESS                  CMD(0, 1, 1)
 #define CMD_STATE_SIP                           CMD(0, 1, 2)
@@ -38,8 +39,8 @@
 #define CMD_SAMPLER_PALETTE_LOAD                CMD(3, 1, 2)
 
 #define CMD_MEDIA_STATE_POINTERS                CMD(2, 0, 0)
-#define CMD_MEDIA                        CMD(2, 1, 0)
-#define CMD_MEDIA_EX                     CMD(2, 1, 1)
+#define CMD_MEDIA                               CMD(2, 1, 0)
+#define CMD_MEDIA_EX                            CMD(2, 1, 1)
 
 #define CMD_PIPELINED_POINTERS                  CMD(3, 0, 0)
 #define CMD_BINDING_TABLE_POINTERS              CMD(3, 0, 1)
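
CMD_GPGPU_WALKER is declared with the same (pipeline, opcode, sub-opcode) triplet as the neighboring MEDIA commands. The CMD macro itself is outside this diff; assuming the usual GFXPIPE packing, CMD(2, 1, 5) yields the 0x7105xxxx header that gen7 expects for GPGPU_WALKER:

#include <assert.h>
#include <stdint.h>

/* Editorial sketch, not part of the patch. The real CMD macro is not shown
 * in this diff; this packing (type 3, pipeline, opcode, sub-opcode) is an
 * assumption based on i965-style headers. */
#define CMD(PIPELINE, OP, SUB_OP)                              \
  ((3u << 29) | ((uint32_t)(PIPELINE) << 27) |                 \
   ((uint32_t)(OP) << 24) | ((uint32_t)(SUB_OP) << 16))

int main(void)
{
  assert(CMD(2, 1, 5) == 0x71050000u); /* GPGPU_WALKER command header */
  assert(CMD(2, 1, 0) == 0x71000000u); /* CMD_MEDIA, for comparison */
  return 0;
}
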
src/intel/intel_gpgpu.c
index 83209b5..2562239 100644
@@ -427,11 +427,12 @@ gpgpu_load_vfe_state(intel_gpgpu_t *state)
   BEGIN_BATCH(state->batch, 8);
   OUT_BATCH(state->batch, CMD_MEDIA_STATE_POINTERS | (8-2));
 
-  struct gen6_vfe_state_inline* vfe = (struct gen6_vfe_state_inline*)
+  gen6_vfe_state_inline_t* vfe = (gen6_vfe_state_inline_t*)
     intel_batchbuffer_alloc_space(state->batch,0);
 
   memset(vfe, 0, sizeof(struct gen6_vfe_state_inline));
   vfe->vfe1.fast_preempt = 1;
+  vfe->vfe1.gpgpu_mode = state->drv->gen_ver > 6 ? 1 : 0;
   vfe->vfe1.bypass_gateway_ctl = 1;
   vfe->vfe1.reset_gateway_timer = 1;
   vfe->vfe1.urb_entries = state->urb.num_vfe_entries;
@@ -441,7 +442,7 @@ gpgpu_load_vfe_state(intel_gpgpu_t *state)
 /*  vfe->vfe3.curbe_size = 63; */
 /*  vfe->vfe3.urbe_size = 13; */
   vfe->vfe4.scoreboard_enable = 1;
-  intel_batchbuffer_alloc_space(state->batch, sizeof(struct gen6_vfe_state_inline));
+  intel_batchbuffer_alloc_space(state->batch, sizeof(gen6_vfe_state_inline_t));
   ADVANCE_BATCH(state->batch);
 }
 
@@ -1004,6 +1005,28 @@ gpgpu_run_with_inline(intel_gpgpu_t *state, int32_t ki, size_t sz)
   return (char*) intel_batchbuffer_alloc_space(state->batch,sz);
 }
 
+LOCAL void
+gpgpu_walker(intel_gpgpu_t *state,
+             uint32_t thread_n,
+             const size_t global_wk_off[3],
+             const size_t global_wk_sz[3],
+             const size_t local_wk_sz[3])
+{
+  BEGIN_BATCH(state->batch, 11);
+  OUT_BATCH(state->batch, CMD_GPGPU_WALKER | 9);
+  OUT_BATCH(state->batch, 0);                       /* kernel index */
+  OUT_BATCH(state->batch, (1 << 30) | (thread_n-1)); /* SIMD16 | thread max */
+  OUT_BATCH(state->batch, global_wk_off[0]);
+  OUT_BATCH(state->batch, global_wk_sz[0]-1);
+  OUT_BATCH(state->batch, global_wk_off[1]);
+  OUT_BATCH(state->batch, global_wk_sz[1]-1);
+  OUT_BATCH(state->batch, global_wk_off[2]);
+  OUT_BATCH(state->batch, global_wk_sz[2]-1);
+  OUT_BATCH(state->batch, ~0x0);
+  OUT_BATCH(state->batch, ~0x0);
+  ADVANCE_BATCH(state->batch);
+}
+
 LOCAL int32_t
 intel_gpgpu_version(intel_gpgpu_t *gpgpu)
 {
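
For reference, the walker packet bookkeeping: BEGIN_BATCH reserves 11 dwords and the command header advertises a length of dwords minus two, hence the "| 9". The (1 << 30) in the third dword selects SIMD16 and the low bits hold the per-group thread count minus one. A small standalone check of that arithmetic (the hardware field naming is an assumption, not taken from this diff):

#include <assert.h>
#include <stdint.h>

/* Editorial sketch, not part of the patch. */
int main(void)
{
  const uint32_t walker_dwords = 11;       /* BEGIN_BATCH(state->batch, 11) */
  const uint32_t length_field  = walker_dwords - 2;
  assert(length_field == 9);               /* matches CMD_GPGPU_WALKER | 9 */

  const uint32_t thread_n = 4;             /* e.g. a 64-item group / SIMD16 */
  const uint32_t dw2 = (1u << 30) | (thread_n - 1u);
  assert((dw2 >> 30) == 1u);               /* SIMD16 select (assumed field) */
  assert((dw2 & 0x3fu) == 3u);             /* thread max = thread_n - 1 */
  return 0;
}
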
src/intel/intel_gpgpu.h
index 789203b..ceb7982 100644
@@ -145,5 +145,13 @@ extern void gpgpu_run(intel_gpgpu_t*, int32_t ki);
  */
 extern char* gpgpu_run_with_inline(intel_gpgpu_t*, int32_t ki, size_t sz);
 
+/* Spawn all threads of the ND range with a single GPGPU_WALKER command */
+extern void
+gpgpu_walker(intel_gpgpu_t *state,
+             uint32_t thread_n,
+             const size_t global_wk_off[3],
+             const size_t global_wk_sz[3],
+             const size_t local_wk_sz[3]);
+
 #endif /* __GENX_GPGPU_H__ */