From 9196b68dd5e7c0b2664d9f8339ee52ed454200c1 Mon Sep 17 00:00:00 2001 From: bsegovia Date: Thu, 4 Aug 2011 23:26:44 +0000 Subject: [PATCH] All unit tests now pass --- setup_fulsim.sh | 5 +++ src/cl_command_queue.c | 2 +- src/cl_command_queue_gen6.c | 8 +++- src/cl_command_queue_gen7.c | 18 +++++--- src/cl_kernel.c | 23 +++++++++- src/cl_kernel.h | 9 +++- src/intel/intel_defines.h | 2 + src/intel/intel_gpgpu.c | 106 +++++++++++++++++++++++++++++++++++++++++--- src/intel/intel_gpgpu.h | 8 ++-- 9 files changed, 160 insertions(+), 21 deletions(-) create mode 100644 setup_fulsim.sh diff --git a/setup_fulsim.sh b/setup_fulsim.sh new file mode 100644 index 0000000..9e355a0 --- /dev/null +++ b/setup_fulsim.sh @@ -0,0 +1,5 @@ +export INTEL_DEVID_OVERRIDE=0x0166 # or, 0x0112 +export DEVICE=ivb_m_gt2 # snb_gt2 for SNB GT2 desktop +export OCL_FULSIM_RUN=1 +export OCL_FULSIM_DEBUG_MODE=$1 + diff --git a/src/cl_command_queue.c b/src/cl_command_queue.c index 904df7e..b4e2735 100644 --- a/src/cl_command_queue.c +++ b/src/cl_command_queue.c @@ -236,7 +236,7 @@ cl_run_fulsim(void) system("wine AubLoad.exe dump.aub -device sbrB0"); else system("wine AubLoad.exe dump.aub -device sbrB0 -debug"); -#elif EMULATE_GEN == 7 +#elif EMULATE_GEN == 7 /* IVB */ if (debug_mode == NULL || strcmp(debug_mode, "1")) system("wine AubLoad.exe dump.aub -device ivb2"); else diff --git a/src/cl_command_queue_gen6.c b/src/cl_command_queue_gen6.c index 882d6bc..763b757 100644 --- a/src/cl_command_queue_gen6.c +++ b/src/cl_command_queue_gen6.c @@ -112,6 +112,8 @@ cl_command_queue_ND_range_gen6(cl_command_queue queue, kernels[i].size = 0, kernels[i].bo = ker->bo; kernels[i].barrierID = i; + kernels[i].use_barrier = 0; /* unused in gen6 */ + kernels[i].thread_n = 0; /* unused in gen6 */ } /* All arguments must have been set */ @@ -157,7 +159,11 @@ cl_command_queue_ND_range_gen6(cl_command_queue queue, if (cst_sz > 0) { char *data = NULL; assert(ker->cst_buffer); - data = cl_kernel_create_cst_buffer(ker, global_wk_sz, local_wk_sz); + data = cl_kernel_create_cst_buffer(ker, + global_wk_off, + global_wk_sz, + local_wk_sz, + 0, 0); /* unused on Gen6 */ gpgpu_upload_constants(gpgpu, data, cst_sz); cl_free(data); } diff --git a/src/cl_command_queue_gen7.c b/src/cl_command_queue_gen7.c index 7fa7e21..9a65d98 100644 --- a/src/cl_command_queue_gen7.c +++ b/src/cl_command_queue_gen7.c @@ -36,7 +36,7 @@ static INLINE size_t cl_kernel_compute_batch_sz(cl_kernel k) { - size_t sz = 256 + 16; + size_t sz = 256 + 32; return sz; } @@ -96,7 +96,6 @@ cl_command_queue_ND_range_gen7(cl_command_queue queue, char *user = NULL; /* User defined constants first */ char *data = NULL; /* Complete constant buffer to upload */ genx_gpgpu_kernel_t kernel; - const size_t local_mem_sz = cl_kernel_local_memory_sz(ker); size_t local_sz, batch_sz, cst_sz = ker->patch.curbe.sz; size_t i, thread_n, id_offset; cl_int err = CL_SUCCESS; @@ -108,13 +107,15 @@ cl_command_queue_ND_range_gen7(cl_command_queue queue, kernel.size = 0, kernel.bo = ker->bo; kernel.barrierID = 0; + kernel.use_barrier = ker->patch.exec_env.has_barriers; + kernel.slm_sz = cl_kernel_local_memory_sz(ker); /* All arguments must have been set */ TRY (cl_kernel_check_args, ker); /* Check that the local work sizes are OK */ TRY (cl_kernel_work_group_sz, ker, local_wk_sz, 3, &local_sz); - thread_n = local_sz / 16; /* SIMD16 only */ + kernel.thread_n = thread_n = local_sz / 16; /* SIMD16 only */ /* Fill the constant buffer. Basically, we have to build one set of * constants for each thread. 
The constants also includes the local ids we @@ -122,9 +123,14 @@ cl_command_queue_ND_range_gen7(cl_command_queue queue, */ if (cst_sz > 0) { assert(ker->cst_buffer); - user = cl_kernel_create_cst_buffer(ker, global_wk_sz, local_wk_sz); + user = cl_kernel_create_cst_buffer(ker, + global_wk_off, + global_wk_sz, + local_wk_sz, + 3, + thread_n); } - id_offset = cst_sz = ALIGN(cst_sz, 32); /* Align the user data on 32 bytes */ + id_offset = cst_sz = ALIGN(cst_sz, 32); /* Align the user data on 32 bytes */ kernel.cst_sz = cst_sz += 3 * 32; /* Add local IDs (16 words) */ TRY_ALLOC (data, (char*) cl_calloc(thread_n, cst_sz)); for (i = 0; i < thread_n; ++i) @@ -136,7 +142,7 @@ cl_command_queue_ND_range_gen7(cl_command_queue queue, if (queue->last_batch != NULL) drm_intel_bo_unreference(queue->last_batch); queue->last_batch = NULL; - cl_command_queue_bind_surface(queue, ker, NULL, &private_bo, &scratch_bo, local_mem_sz); + cl_command_queue_bind_surface(queue, ker, NULL, &private_bo, &scratch_bo, 0); gpgpu_states_setup(gpgpu, &kernel, 1); /* We always have constant with Gen7 (local_ids are used) */ diff --git a/src/cl_kernel.c b/src/cl_kernel.c index 1efb159..ac12d80 100644 --- a/src/cl_kernel.c +++ b/src/cl_kernel.c @@ -737,7 +737,12 @@ cl_kernel_local_memory_sz(cl_kernel k) } LOCAL char* -cl_kernel_create_cst_buffer(cl_kernel k, const size_t *global_wk_sz, const size_t *local_wk_sz) +cl_kernel_create_cst_buffer(cl_kernel k, + const size_t *global_wk_off, + const size_t *global_wk_sz, + const size_t *local_wk_sz, + cl_uint wk_dim, + cl_uint thread_n) { cl_curbe_patch_info_t *info = NULL; const size_t sz = k->patch.curbe.sz; @@ -747,6 +752,17 @@ cl_kernel_create_cst_buffer(cl_kernel k, const size_t *global_wk_sz, const size_ TRY_ALLOC_NO_ERR (data, (char *) cl_calloc(sz, 1)); memcpy(data, k->cst_buffer, sz); + /* Global work group offset */ + key = cl_curbe_key(DATA_PARAMETER_GLOBAL_WORK_OFFSET, 0, 0); + if ((info = cl_kernel_get_curbe_info(k, key)) != NULL) + memcpy(data+info->offsets[0], global_wk_off, sizeof(uint32_t)); + key = cl_curbe_key(DATA_PARAMETER_GLOBAL_WORK_OFFSET, 0, 4); + if ((info = cl_kernel_get_curbe_info(k, key)) != NULL) + memcpy(data+info->offsets[0], global_wk_off+1, sizeof(uint32_t)); + key = cl_curbe_key(DATA_PARAMETER_GLOBAL_WORK_OFFSET, 0, 8); + if ((info = cl_kernel_get_curbe_info(k, key)) != NULL) + memcpy(data+info->offsets[0], global_wk_off+2, sizeof(uint32_t)); + /* Global work group size */ key = cl_curbe_key(DATA_PARAMETER_GLOBAL_WORK_SIZE, 0, 0); if ((info = cl_kernel_get_curbe_info(k, key)) != NULL) @@ -769,6 +785,11 @@ cl_kernel_create_cst_buffer(cl_kernel k, const size_t *global_wk_sz, const size_ if ((info = cl_kernel_get_curbe_info(k, key)) != NULL) memcpy(data+info->offsets[0], local_wk_sz+2, sizeof(uint32_t)); + /* HW thread number (Gen7+) */ + key = cl_curbe_key(DATA_PARAMETER_NUM_HARDWARE_THREADS, 0, 0); + if ((info = cl_kernel_get_curbe_info(k, key)) != NULL) + memcpy(data+info->offsets[0], &thread_n, sizeof(uint32_t)); + exit: return data; error: diff --git a/src/cl_kernel.h b/src/cl_kernel.h index 1c9b08e..9c1cd5f 100644 --- a/src/cl_kernel.h +++ b/src/cl_kernel.h @@ -251,7 +251,7 @@ struct _cl_kernel { volatile int ref_n; /* We reference count this object */ struct _drm_intel_bo *bo; /* The code itself */ struct _drm_intel_bo *const_bo;/* Buffer for all __constants values in the OCL program */ - cl_program program; /* Owns this structure (and pointers) */ + cl_program program; /* Owns this structure (and pointers) */ cl_arg_info_t *arg_info; /* List of 
arguments */ cl_curbe_patch_info_t *curbe_info; /* List of patch locations for the curbe */ char *name; /* User defined name */ @@ -320,7 +320,12 @@ cl_curbe_key(uint32_t type, uint32_t index, uint32_t src_offset) /* Allocate, fill and return the CURBE */ extern char* -cl_kernel_create_cst_buffer(cl_kernel k, const size_t *global_wk_sz, const size_t *local_wk_sz); +cl_kernel_create_cst_buffer(cl_kernel k, + const size_t *global_wk_off, + const size_t *global_wk_sz, + const size_t *local_wk_sz, + cl_uint wk_dim, + cl_uint thread_n); /* Compute and check the work group size from the user provided local size */ extern cl_int diff --git a/src/intel/intel_defines.h b/src/intel/intel_defines.h index 24aabf5..753bfc1 100644 --- a/src/intel/intel_defines.h +++ b/src/intel/intel_defines.h @@ -33,6 +33,8 @@ #define CMD_MEDIA_STATE_FLUSH CMD(2, 0, 4) #define CMD_GPGPU_WALKER CMD(2, 1, 5) +#define CMD_LOAD_REGISTER_IMM (0x22 << 23) + #define CMD_STATE_BASE_ADDRESS CMD(0, 1, 1) #define CMD_STATE_SIP CMD(0, 1, 2) #define CMD_PIPELINE_SELECT CMD(1, 1, 4) diff --git a/src/intel/intel_gpgpu.c b/src/intel/intel_gpgpu.c index 2562239..002992e 100644 --- a/src/intel/intel_gpgpu.c +++ b/src/intel/intel_gpgpu.c @@ -280,7 +280,7 @@ typedef struct gen6_interface_descriptor struct { uint32_t group_threads_num:8; /* 0..64, 0 - no barrier use */ uint32_t barrier_return_byte:8; - uint32_t shared_local_mem_size:5; /* 0..16 - 0K..64K */ + uint32_t slm_sz:5; /* 0..16 - 0K..64K */ uint32_t barrier_enable:1; uint32_t rounding_mode:2; uint32_t barrier_return_grf_offset:8; @@ -310,6 +310,7 @@ struct intel_gpgpu { intel_driver_t *drv; intel_batchbuffer_t *batch; + genx_gpgpu_kernel_t *ker; struct { dri_bo *bo; @@ -431,8 +432,8 @@ gpgpu_load_vfe_state(intel_gpgpu_t *state) intel_batchbuffer_alloc_space(state->batch,0); memset(vfe, 0, sizeof(struct gen6_vfe_state_inline)); - vfe->vfe1.fast_preempt = 1; - vfe->vfe1.gpgpu_mode = state->drv->gen_ver > 6 ? 1 : 0; + vfe->vfe1.fast_preempt = 0; + vfe->vfe1.gpgpu_mode = state->drv->gen_ver >= 7 ? 
1 : 0; vfe->vfe1.bypass_gateway_ctl = 1; vfe->vfe1.reset_gateway_timer = 1; vfe->vfe1.urb_entries = state->urb.num_vfe_entries; @@ -441,7 +442,7 @@ gpgpu_load_vfe_state(intel_gpgpu_t *state) vfe->vfe1.max_threads = state->max_threads - 1; /* vfe->vfe3.curbe_size = 63; */ /* vfe->vfe3.urbe_size = 13; */ - vfe->vfe4.scoreboard_enable = 1; + vfe->vfe4.scoreboard_enable = 0; intel_batchbuffer_alloc_space(state->batch, sizeof(gen6_vfe_state_inline_t)); ADVANCE_BATCH(state->batch); } @@ -470,11 +471,73 @@ gpgpu_load_idrt(intel_gpgpu_t *state) ADVANCE_BATCH(state->batch); } +static const uint32_t Gen7L3CacheConfigReg2DataTable[] = +{ + // SLM URB DC RO I/S C T + 0x00080040, //{ 0, 256, 0, 256, 0, 0, 0, } + 0x02040040, //{ 0, 256, 128, 128, 0, 0, 0, } + 0x00800040, //{ 0, 256, 32, 0, 64, 32, 128, } + 0x01000038, //{ 0, 224, 64, 0, 64, 32, 128, } + 0x02000030, //{ 0, 224, 128, 0, 64, 32, 64, } + 0x01000038, //{ 0, 224, 64, 0, 128, 32, 64, } + 0x00000038, //{ 0, 224, 0, 0, 128, 32, 128, } + 0x00000040, //{ 0, 256, 0, 0, 128, 0, 128, } + 0x0A140091, //{ 128, 128, 128, 128, 0, 0, 0, } + 0x09100091, //{ 128, 128, 64, 0, 64, 64, 64, } + 0x08900091, //{ 128, 128, 32, 0, 64, 32, 128, } + 0x08900091 //{ 128, 128, 32, 0, 128, 32, 64, } +}; + +static const uint32_t Gen7L3CacheConfigReg3DataTable[] = +{ + // SLM URB DC RO I/S C T + 0x00000000, //{ 0, 256, 0, 256, 0, 0, 0, } + 0x00000000, //{ 0, 256, 128, 128, 0, 0, 0, } + 0x00080410, //{ 0, 256, 32, 0, 64, 32, 128, } + 0x00080410, //{ 0, 224, 64, 0, 64, 32, 128, } + 0x00040410, //{ 0, 224, 128, 0, 64, 32, 64, } + 0x00040420, //{ 0, 224, 64, 0, 128, 32, 64, } + 0x00080420, //{ 0, 224, 0, 0, 128, 32, 128, } + 0x00080020, //{ 0, 256, 0, 0, 128, 0, 128, } + 0x00204080, //{ 128, 128, 128, 128, 0, 0, 0, } + 0x00244890, //{ 128, 128, 64, 0, 64, 64, 64, } + 0x00284490, //{ 128, 128, 32, 0, 64, 32, 128, } + 0x002444A0 //{ 128, 128, 32, 0, 128, 32, 64, } +}; + +// L3 cache stuff +#define L3_CNTL_REG2_ADDRESS_OFFSET ( 0xB020 ) +#define L3_CNTL_REG3_ADDRESS_OFFSET ( 0xB024 ) + +LOCAL void +intel_gpgpu_set_L3(intel_gpgpu_t *state, uint32_t use_barrier) +{ + BEGIN_BATCH(state->batch, 6); + OUT_BATCH(state->batch, CMD_LOAD_REGISTER_IMM | 1); /* length - 2 */ + OUT_BATCH(state->batch, L3_CNTL_REG2_ADDRESS_OFFSET); + if (use_barrier) + OUT_BATCH(state->batch, Gen7L3CacheConfigReg2DataTable[8]); + else + OUT_BATCH(state->batch, Gen7L3CacheConfigReg2DataTable[4]); + + OUT_BATCH(state->batch, CMD_LOAD_REGISTER_IMM | 1); /* length - 2 */ + OUT_BATCH(state->batch, L3_CNTL_REG3_ADDRESS_OFFSET); + if (use_barrier) + OUT_BATCH(state->batch, Gen7L3CacheConfigReg3DataTable[8]); + else + OUT_BATCH(state->batch, Gen7L3CacheConfigReg3DataTable[4]); + ADVANCE_BATCH(state->batch); + + intel_batchbuffer_emit_mi_flush(state->batch); +} + LOCAL void gpgpu_batch_start(intel_gpgpu_t *state) { intel_batchbuffer_start_atomic(state->batch, 256); intel_batchbuffer_emit_mi_flush(state->batch); + if (state->drv->gen_ver >= 7) + intel_gpgpu_set_L3(state, state->ker->use_barrier); gpgpu_select_pipeline(state); gpgpu_set_base_address(state); gpgpu_load_vfe_state(state); @@ -883,6 +946,8 @@ gpgpu_build_binding_table(intel_gpgpu_t *state) dri_bo_unmap(state->binding_table_b.bo); } +#define KB 1024 + static void gpgpu_build_idrt(intel_gpgpu_t *state, genx_gpgpu_kernel_t *kernel, @@ -907,11 +972,32 @@ gpgpu_build_idrt(intel_gpgpu_t *state, desc->desc3.binding_table_pointer = state->binding_table_b.bo->offset >> 5; desc->desc4.curbe_read_len = kernel[i].cst_sz / 32; desc->desc4.curbe_read_offset = 0; - 
desc->desc5.group_threads_num = kernel[i].barrierID; /* BarrierID on GEN6 */ - /* desc->desc5 = 0; - no barriers, groups, etc. */ /* desc->desc6 = 0; - mbz */ /* desc->desc7 = 0; - mbz */ + /* Barriers / SLM are automatically handled on Gen7+ */ + if (state->drv->gen_ver >= 7) { + size_t slm_sz = kernel[i].slm_sz; + desc->desc5.group_threads_num = kernel[i].use_barrier ? kernel[i].thread_n : 0; + desc->desc5.barrier_enable = kernel[i].use_barrier; + if (slm_sz > 0) { + if (slm_sz <= 4 * KB) + slm_sz = 4 * KB; //4KB + else if (slm_sz <= 8 * KB) + slm_sz = 8 * KB; //8KB + else if (slm_sz <= 16 * KB) + slm_sz = 16 * KB; //16KB + else if (slm_sz <= 32 * KB) + slm_sz = 32 * KB; //32KB + else if (slm_sz <= 64 * KB) + slm_sz = 64 * KB; //64KB + slm_sz = slm_sz >> 12; + } + desc->desc5.slm_sz = slm_sz; + } + else + desc->desc5.group_threads_num = kernel[i].barrierID; /* BarrierID on GEN6 */ + dri_bo_emit_reloc(bo, I915_GEM_DOMAIN_INSTRUCTION, 0, 0, @@ -950,6 +1036,7 @@ gpgpu_upload_constants(intel_gpgpu_t *state, void* data, uint32_t size) LOCAL void gpgpu_states_setup(intel_gpgpu_t *state, genx_gpgpu_kernel_t *kernel, uint32_t ker_n) { + state->ker = kernel; gpgpu_build_sampler_table(state); gpgpu_build_binding_table(state); gpgpu_build_idrt(state, kernel, ker_n); @@ -1014,7 +1101,7 @@ gpgpu_walker(intel_gpgpu_t *state, { BEGIN_BATCH(state->batch, 11); OUT_BATCH(state->batch, CMD_GPGPU_WALKER | 9); - OUT_BATCH(state->batch, 0); /* kernel index */ + OUT_BATCH(state->batch, 0); /* kernel index == 0 */ OUT_BATCH(state->batch, (1 << 30) | (thread_n-1)); /* SIMD16 | thread max */ OUT_BATCH(state->batch, global_wk_off[0]); OUT_BATCH(state->batch, global_wk_sz[0]-1); @@ -1025,6 +1112,11 @@ gpgpu_walker(intel_gpgpu_t *state, OUT_BATCH(state->batch, ~0x0); OUT_BATCH(state->batch, ~0x0); ADVANCE_BATCH(state->batch); + + BEGIN_BATCH(state->batch, 2); + OUT_BATCH(state->batch, CMD_MEDIA_STATE_FLUSH | 0); + OUT_BATCH(state->batch, 0); /* kernel index == 0 */ + ADVANCE_BATCH(state->batch); } LOCAL int32_t diff --git a/src/intel/intel_gpgpu.h b/src/intel/intel_gpgpu.h index ceb7982..1cd5eb0 100644 --- a/src/intel/intel_gpgpu.h +++ b/src/intel/intel_gpgpu.h @@ -36,15 +36,17 @@ enum gen6_cache_control { #define MAX_SAMPLERS 16 /* Use this structure to bind kernels in the gpgpu state */ -typedef struct genx_gpgpu_kernel -{ +typedef struct genx_gpgpu_kernel { const char *name; /* kernel name and bo name */ uint32_t grf_blocks; /* register blocks kernel wants (in 8 reg blocks) */ - uint32_t cst_sz; /* indicates if kernel needs constants */ + uint32_t cst_sz; /* total size of all constants */ const uint32_t *bin; /* binary code of the kernel */ int32_t size; /* kernel code size */ struct _drm_intel_bo *bo;/* kernel code in the proper addr space */ int32_t barrierID; /* barrierID for _this_ kernel */ + uint32_t use_barrier:1; /* For gen7 (automatic barrier management) */ + uint32_t thread_n:15; /* For gen7 (automatic barrier management) */ + uint32_t slm_sz:16; /* For gen7 (automatic SLM allocation) */ } genx_gpgpu_kernel_t; /* Convenient abstraction of the device */ -- 2.7.4
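
Notes on the patch:

intel_gpgpu_set_L3() programs L3CNTLREG2 (0xB020) and L3CNTLREG3 (0xB024)
with MI_LOAD_REGISTER_IMM; the DW0 length field of that packet holds the
total dword count minus two, hence the "| 1" on the 3-dword form. The code
only ever selects two rows of the config tables: row 8, the first
SLM-enabled partitioning (128KB SLM / 128KB URB / 128KB DC / 128KB RO per
the table comments), when the kernel uses barriers, and row 4, an SLM-less
split (224KB URB / 128KB DC), otherwise. A sketch of that selection under
those assumptions (gen7_l3_config is a hypothetical helper; the patch
open-codes the choice):

  /* Rows of Gen7L3CacheConfigReg{2,3}DataTable used by the patch. */
  enum { GEN7_L3_CFG_NO_SLM = 4, GEN7_L3_CFG_SLM = 8 };

  static void
  gen7_l3_config(uint32_t use_barrier, uint32_t *reg2, uint32_t *reg3)
  {
    const int row = use_barrier ? GEN7_L3_CFG_SLM : GEN7_L3_CFG_NO_SLM;
    *reg2 = Gen7L3CacheConfigReg2DataTable[row];
    *reg3 = Gen7L3CacheConfigReg3DataTable[row];
  }

The patch follows the two register writes with an MI flush before the rest
of the batch is built. Note also that when use_barrier is set,
gpgpu_build_idrt() puts thread_n (the HW thread count of the group,
local_sz / 16 for SIMD16) in group_threads_num, whereas Gen6 keeps using
that field for the barrierID.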
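
The Gen7 interface descriptor encodes shared local memory in a 5-bit field
(see the slm_sz rename in gen6_interface_descriptor): the byte size is
rounded up to the next power-of-two bucket between 4KB and 64KB, then
divided by 4KB. A minimal standalone sketch of the cascade that
gpgpu_build_idrt() open-codes; the helper name gen7_slm_encode is
hypothetical:

  #include <stddef.h>
  #include <stdint.h>

  #define KB 1024

  /* Encode an SLM byte size for the 5-bit slm_sz descriptor field:
   * 0 -> 0 (no SLM), 4KB -> 1, 8KB -> 2, 16KB -> 4, 32KB -> 8,
   * 64KB -> 16, matching the "0..16 - 0K..64K" comment. */
  static uint32_t
  gen7_slm_encode(size_t slm_sz)
  {
    if (slm_sz == 0)
      return 0;
    if (slm_sz <= 4 * KB)       slm_sz = 4 * KB;
    else if (slm_sz <= 8 * KB)  slm_sz = 8 * KB;
    else if (slm_sz <= 16 * KB) slm_sz = 16 * KB;
    else if (slm_sz <= 32 * KB) slm_sz = 32 * KB;
    else                        slm_sz = 64 * KB;
    return (uint32_t) (slm_sz >> 12);
  }

Only the values 0, 1, 2, 4, 8 and 16 are produced. One caveat worth noting:
genx_gpgpu_kernel_t stores the raw byte size in a 16-bit slm_sz bitfield,
so a kernel needing exactly 64KB (65536 needs 17 bits) would truncate
there, while anything below 64KB rounds and encodes correctly.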
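
On Gen7 the CURBE is built per HW thread: each thread gets a cst_sz slice
holding the user constants (aligned to 32 bytes, with id_offset recording
that aligned size) followed by 3 x 32 bytes of local IDs -- one GRF per
dimension, i.e. 16 uint16_t lanes for SIMD16. The fill loop body falls
outside the hunk; a sketch of what it could look like, assuming a
row-major decomposition of the linear work-item index
(curbe_fill_local_ids and the decomposition order are assumptions, not
code from this patch):

  #include <stdint.h>
  #include <string.h>

  /* Fill the three local-ID GRFs of one SIMD16 thread. Lane c of
   * thread t covers work item t*16 + c of the work group. */
  static void
  curbe_fill_local_ids(char *slice, size_t id_offset, uint32_t thread_id,
                       const size_t *local_wk_sz)
  {
    uint16_t ids[3][16]; /* local sizes fit in 16 bits */
    uint32_t c;
    for (c = 0; c < 16; ++c) {
      const uint32_t linear = thread_id * 16 + c;
      ids[0][c] = (uint16_t) (linear % local_wk_sz[0]);
      ids[1][c] = (uint16_t) ((linear / local_wk_sz[0]) % local_wk_sz[1]);
      ids[2][c] = (uint16_t) (linear / (local_wk_sz[0] * local_wk_sz[1]));
    }
    memcpy(slice + id_offset,      ids[0], 32);
    memcpy(slice + id_offset + 32, ids[1], 32);
    memcpy(slice + id_offset + 64, ids[2], 32);
  }

With the cl_calloc(thread_n, cst_sz) allocation shown above, the call site
would be curbe_fill_local_ids(data + i * cst_sz, id_offset, i,
local_wk_sz) inside the for (i = 0; i < thread_n; ++i) loop.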
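
cl_kernel_create_cst_buffer() patches each vector parameter through three
separate curbe keys, one uint32 slot per dimension at src_offset 0, 4 and
8. Copying sizeof(uint32_t) bytes from global_wk_off + n takes the low 32
bits of a size_t, which matches the 32-bit slots on a little-endian
target. The repeated lookup pattern could be folded into a loop; a
hypothetical refactoring sketch, as it would live in cl_kernel.c next to
the code above (cst_patch_vec3 is not part of the patch):

  static void
  cst_patch_vec3(cl_kernel k, char *data, uint32_t type, const size_t *v)
  {
    uint32_t dim;
    for (dim = 0; dim < 3; ++dim) {
      cl_curbe_patch_info_t *info =
        cl_kernel_get_curbe_info(k, cl_curbe_key(type, 0, dim * 4));
      if (info != NULL) {
        const uint32_t value = (uint32_t) v[dim]; /* low 32 bits */
        memcpy(data + info->offsets[0], &value, sizeof(uint32_t));
      }
    }
  }

The three blocks for DATA_PARAMETER_GLOBAL_WORK_OFFSET,
DATA_PARAMETER_GLOBAL_WORK_SIZE and DATA_PARAMETER_LOCAL_WORK_SIZE would
then collapse to one call each, e.g. cst_patch_vec3(k, data,
DATA_PARAMETER_GLOBAL_WORK_OFFSET, global_wk_off).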
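
Finally, on the batch budget: the DW0 length field of these pipeline
packets holds the total dword count minus two, so the 11-dword
GPGPU_WALKER is emitted with "| 9" and the new 2-dword MEDIA_STATE_FLUSH
with "| 0". The trailing flush adds 8 bytes per enqueue, which the
cl_kernel_compute_batch_sz bump from 256 + 16 to 256 + 32 covers with
slack. A small sketch of the bias (the macro names are hypothetical;
CMD_GPGPU_WALKER and CMD_MEDIA_STATE_FLUSH come from intel_defines.h):

  /* DW0 length bias: total packet dwords minus two. */
  #define CMD_LEN(total_dwords)  ((total_dwords) - 2)
  #define GPGPU_WALKER_DW0       (CMD_GPGPU_WALKER | CMD_LEN(11))     /* | 9 */
  #define MEDIA_STATE_FLUSH_DW0  (CMD_MEDIA_STATE_FLUSH | CMD_LEN(2)) /* | 0 */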