From 4dc837fa175a1c1d6e2e9c3aea621a5d17a4ed00 Mon Sep 17 00:00:00 2001
From: Benjamin Segovia <segovia.benjamin@gmail.com>
Date: Wed, 25 Apr 2012 20:31:03 +0000
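Subject: [PATCH] Align driver code with the new way of handling the curbe

The driver no longer appends the local IDs after the 32-byte-aligned
user data. The per-thread curbe size is now taken directly from
ker->curbe_sz, and the offset of each local-ID vector inside the curbe
is queried with gbe_kernel_get_curbe_offset() and passed to
cl_set_local_ids() as an array of three offsets.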

---
 src/cl_command_queue_gen7.c | 27 ++++++++++++++-------------
 1 file changed, 14 insertions(+), 13 deletions(-)

diff --git a/src/cl_command_queue_gen7.c b/src/cl_command_queue_gen7.c
index ae0bd67..89d65b9 100644
--- a/src/cl_command_queue_gen7.c
+++ b/src/cl_command_queue_gen7.c
@@ -40,9 +40,9 @@ cl_kernel_compute_batch_sz(cl_kernel k)
 static cl_int
 cl_set_local_ids(char *data,
                  const size_t *local_wk_sz,
+                 const size_t *id_offset,
                  size_t simd_sz,
                  size_t cst_sz,
-                 size_t id_offset,
                  size_t thread_n)
 {
   uint32_t *ids[3] = {NULL,NULL,NULL};
@@ -63,11 +63,10 @@ cl_set_local_ids(char *data,
 
   /* Copy them to the constant buffer */
   curr = 0;
-  data += id_offset;
   for (i = 0; i < thread_n; ++i, data += cst_sz) {
-    uint32_t *ids0 = (uint32_t *) (data + 0);
-    uint32_t *ids1 = (uint32_t *) (data + 1*simd_sz*sizeof(uint32_t));
-    uint32_t *ids2 = (uint32_t *) (data + 2*simd_sz*sizeof(uint32_t));
+    uint32_t *ids0 = (uint32_t *) (data + id_offset[0]);
+    uint32_t *ids1 = (uint32_t *) (data + id_offset[1]);
+    uint32_t *ids2 = (uint32_t *) (data + id_offset[2]);
     for (j = 0; j < simd_sz; ++j, ++curr) {
       ids0[j] = ids[0][curr];
       ids1[j] = ids[1][curr];
@@ -119,8 +118,8 @@ cl_command_queue_ND_range_gen7(cl_command_queue queue,
   cl_buffer private_bo = NULL, scratch_bo = NULL;
   cl_gpgpu_kernel kernel;
   const uint32_t simd_sz = cl_kernel_get_simd_width(ker);
-  size_t i, batch_sz = 0u, local_sz = 0u, local_id_sz = 0u, cst_sz = ker->curbe_sz;
-  size_t thread_n = 0u, id_offset = 0u;
+  size_t i, batch_sz = 0u, local_sz = 0u, cst_sz = ker->curbe_sz;
+  size_t thread_n = 0u, id_offset[3];
   cl_int err = CL_SUCCESS;
 
   /* Setup kernel */
@@ -135,12 +134,10 @@ cl_command_queue_ND_range_gen7(cl_command_queue queue,
   curbe = alloca(ker->curbe_sz);
   cl_curbe_fill(ker, curbe, global_wk_off, global_wk_sz, local_wk_sz);
 
-  /* Compute the number of HW threads we are going to need */
+  /* Compute the number of HW threads we need */
   TRY (cl_kernel_work_group_sz, ker, local_wk_sz, 3, &local_sz);
   kernel.thread_n = thread_n = local_sz / simd_sz;
-  id_offset = cst_sz = ALIGN(cst_sz, 32); /* Align the user data on 32 bytes */
-  local_id_sz = 3 * simd_sz * sizeof(uint32_t); /* Add local IDs */
-  kernel.cst_sz = cst_sz += local_id_sz;
+  kernel.cst_sz = cst_sz;
 
   /* Setup the kernel */
   cl_gpgpu_state_init(gpgpu, ctx->device->max_compute_unit, cst_sz / 32);
@@ -154,8 +151,12 @@ cl_command_queue_ND_range_gen7(cl_command_queue queue,
   TRY_ALLOC (final_curbe, (char*) alloca(thread_n * cst_sz));
   if (curbe)
     for (i = 0; i < thread_n; ++i)
-      memcpy(final_curbe + cst_sz * i, curbe, cst_sz - local_id_sz);
-  TRY (cl_set_local_ids, final_curbe, local_wk_sz, simd_sz, cst_sz, id_offset, thread_n);
+      memcpy(final_curbe + cst_sz * i, curbe, cst_sz);
+  id_offset[0] = gbe_kernel_get_curbe_offset(ker->opaque, GBE_CURBE_LOCAL_ID_X, 0);
+  id_offset[1] = gbe_kernel_get_curbe_offset(ker->opaque, GBE_CURBE_LOCAL_ID_X, 1);
+  id_offset[2] = gbe_kernel_get_curbe_offset(ker->opaque, GBE_CURBE_LOCAL_ID_X, 2);
+  assert(id_offset[0] >= 0 && id_offset[1] >= 0 && id_offset[2] >= 0);
+  TRY (cl_set_local_ids, final_curbe, local_wk_sz, id_offset, simd_sz, cst_sz, thread_n);
   cl_gpgpu_upload_constants(gpgpu, final_curbe, thread_n*cst_sz);
 
   /* Start a new batch buffer */
-- 
2.7.4