Implemented SIMD8 for IVB (only tested on Fulsim but should work out of the box on...

author bsegovia <devnull@localhost>

Wed, 18 Jan 2012 03:49:31 +0000 (03:49 +0000)

committer Keith Packard <keithp@keithp.com>

Fri, 10 Aug 2012 23:15:07 +0000 (16:15 -0700)
author bsegovia <devnull@localhost>
Wed, 18 Jan 2012 03:49:31 +0000 (03:49 +0000)
committer Keith Packard <keithp@keithp.com>
Fri, 10 Aug 2012 23:15:07 +0000 (16:15 -0700)
diff --git a/kernels/urng_output.bmp b/kernels/urng_output.bmp

index c004fa5..42b1718 100644 (file)

Binary files a/kernels/urng_output.bmp and b/kernels/urng_output.bmp differ
diff --git a/src/cl_command_queue.c b/src/cl_command_queue.c

index 055f8ee..af3a822 100644 (file)
--- a/src/cl_command_queue.c
+++ b/src/cl_command_queue.c
@@ -194,7 +194,7 @@ cl_command_queue_bind_surface(cl_command_queue queue,
      const size_t sz = max_thread *
                        k->patch.private_surf.size *
                        k->patch.exec_env.largest_compiled_simd_sz;
-    assert(k->patch.exec_env.largest_compiled_simd_sz == 16);
+    // assert(k->patch.exec_env.largest_compiled_simd_sz == 16);
      assert(k->patch.private_surf.offset % SURFACE_SZ == 0);
      index = k->patch.private_surf.offset / SURFACE_SZ;
      assert(index != MAX_SURFACES - 1);
@@ -209,7 +209,7 @@ cl_command_queue_bind_surface(cl_command_queue queue,
      const size_t sz = max_thread * /* XXX is it given per lane ??? */
                        k->patch.scratch.size *
                        k->patch.exec_env.largest_compiled_simd_sz;
-    assert(k->patch.exec_env.largest_compiled_simd_sz == 16);
+    // assert(k->patch.exec_env.largest_compiled_simd_sz == 16);
      assert(k->patch.scratch.offset % SURFACE_SZ == 0);
      assert(index != MAX_SURFACES - 1);
      index = k->patch.scratch.offset / SURFACE_SZ;
diff --git a/src/cl_command_queue_gen7.c b/src/cl_command_queue_gen7.c

index d783f9a..8c93b73 100644 (file)
--- a/src/cl_command_queue_gen7.c
+++ b/src/cl_command_queue_gen7.c
@@ -43,6 +43,7 @@ cl_kernel_compute_batch_sz(cl_kernel k)
  static cl_int
  cl_set_local_ids(char *data,
                   const size_t *local_wk_sz,
+                 size_t simd_sz,
                   size_t cst_sz,
                   size_t id_offset,
                   size_t thread_n)
@@ -52,7 +53,7 @@ cl_set_local_ids(char *data,
    cl_int err = CL_SUCCESS;
  
    for (i = 0; i < 3; ++i)
-    TRY_ALLOC(ids[i], (uint16_t*) cl_calloc(sizeof(uint16_t), thread_n*16));
+    TRY_ALLOC(ids[i], (uint16_t*) cl_calloc(sizeof(uint16_t), thread_n*simd_sz));
  
    /* Compute the IDs */
    for (k = 0; k < local_wk_sz[2]; ++k)
@@ -67,10 +68,11 @@ cl_set_local_ids(char *data,
    curr = 0;
    data += id_offset;
    for (i = 0; i < thread_n; ++i, data += cst_sz) {
-    uint16_t *ids0 = (uint16_t *) (data +  0);
-    uint16_t *ids1 = (uint16_t *) (data + 32);
-    uint16_t *ids2 = (uint16_t *) (data + 64);
-    for (j = 0; j < 16; ++j, ++curr) {
+    /* The compiler uses one GRF for each local ID (8 x 32 bits == 16 x 16 bits) */
+    uint16_t *ids0 = (uint16_t *) (data + 0);
+    uint16_t *ids1 = (uint16_t *) (data + 1*16*sizeof(uint16_t));
+    uint16_t *ids2 = (uint16_t *) (data + 2*16*sizeof(uint16_t));
+    for (j = 0; j < simd_sz; ++j, ++curr) {
        ids0[j] = ids[0][curr];
        ids1[j] = ids[1][curr];
        ids2[j] = ids[2][curr];
@@ -96,6 +98,7 @@ cl_command_queue_ND_range_gen7(cl_command_queue queue,
    char *curbe = NULL;        /* Does not include per-thread local IDs */
    char *final_curbe = NULL;  /* Includes them */
    genx_gpgpu_kernel_t kernel;
+  const size_t simd_sz = ker->patch.exec_env.largest_compiled_simd_sz;
    size_t local_sz, batch_sz, cst_sz = ker->patch.curbe.sz;
    size_t i, thread_n, id_offset;
    cl_int err = CL_SUCCESS;
@@ -115,7 +118,7 @@ cl_command_queue_ND_range_gen7(cl_command_queue queue,
  
    /* Check that the local work sizes are OK */
    TRY (cl_kernel_work_group_sz, ker, local_wk_sz, 3, &local_sz);
-  kernel.thread_n = thread_n = local_sz / 16; /* SIMD16 only */
+  kernel.thread_n = thread_n = local_sz / simd_sz;
  
    /* CURBE step 1. Allocate and fill fields shared by threads in workgroup */
    if (cst_sz > 0) {
@@ -142,7 +145,7 @@ cl_command_queue_ND_range_gen7(cl_command_queue queue,
    TRY_ALLOC (final_curbe, (char*) cl_calloc(thread_n, cst_sz));
    for (i = 0; i < thread_n; ++i)
      memcpy(final_curbe + cst_sz * i, curbe, cst_sz);
-  TRY (cl_set_local_ids, final_curbe, local_wk_sz, cst_sz, id_offset, thread_n);
+  TRY (cl_set_local_ids, final_curbe, local_wk_sz, simd_sz, cst_sz, id_offset, thread_n);
    gpgpu_upload_constants(gpgpu, final_curbe, thread_n*cst_sz);
  
    /* Start a new batch buffer */
@@ -151,7 +154,7 @@ cl_command_queue_ND_range_gen7(cl_command_queue queue,
    gpgpu_batch_start(gpgpu);
  
    /* Issue the GPGPU_WALKER command */
-  gpgpu_walker(gpgpu, thread_n, global_wk_off, global_wk_sz, local_wk_sz);
+  gpgpu_walker(gpgpu, simd_sz, thread_n, global_wk_off, global_wk_sz, local_wk_sz);
  
    /* Close the batch buffer and submit it */
    gpgpu_batch_end(gpgpu, 0);
diff --git a/src/cl_context.c b/src/cl_context.c

index 6dee771..c0fb80c 100644 (file)
--- a/src/cl_context.c
+++ b/src/cl_context.c
@@ -33,6 +33,13 @@
  #include <stdint.h>
  #include <assert.h>
  
+/* Do not include the full dependency */
+struct intel_driver;
+/* Get the command buffer interface */
+extern struct _drm_intel_bufmgr* intel_driver_get_buf(struct intel_driver*);
+/* Get the Gen HW version */
+extern uint32_t intel_driver_get_ver(struct intel_driver*);
+
  static cl_int
  cl_context_properties_is_ok(const cl_context_properties *properties)
  {
@@ -126,6 +133,7 @@ cl_context_new(void)
    TRY_ALLOC_NO_ERR (ctx->intel_drv, cl_intel_driver_new());
    ctx->magic = CL_MAGIC_CONTEXT_HEADER;
    ctx->ref_n = 1;
+  ctx->ver = intel_driver_get_ver(ctx->intel_drv);
    pthread_mutex_init(&ctx->program_lock, NULL);
    pthread_mutex_init(&ctx->queue_lock, NULL);
    pthread_mutex_init(&ctx->buffer_lock, NULL);
@@ -194,9 +202,6 @@ error:
    goto exit;
  }
  
-struct intel_driver;
-extern struct _drm_intel_bufmgr* intel_driver_get_buf(struct intel_driver*);
-
  struct _drm_intel_bufmgr*
  cl_context_get_intel_bufmgr(cl_context ctx)
  {
diff --git a/src/cl_context.h b/src/cl_context.h

index dae7657..f2c6302 100644 (file)
--- a/src/cl_context.h
+++ b/src/cl_context.h
@@ -43,6 +43,7 @@ struct _cl_context {
    pthread_mutex_t program_lock;     /* To allocate and deallocate programs */
    pthread_mutex_t buffer_lock;      /* To allocate and deallocate buffers */
    pthread_mutex_t sampler_lock;     /* To allocate and deallocate samplers */
+  uint32_t ver;                     /* Gen version */
  };
  
  /* Implement OpenCL function */
diff --git a/src/cl_kernel.c b/src/cl_kernel.c

index 8f77ebb..a3064c2 100644 (file)
--- a/src/cl_kernel.c
+++ b/src/cl_kernel.c
@@ -516,9 +516,13 @@ cl_kernel_setup(cl_kernel k, const char *ker)
                                         64));
    drm_intel_bo_subdata(k->bo, 0, k->kernel_heap_sz, k->kernel_heap);
  
-  /* We have some restrictions on the compiled binary */
-  FATAL_IF (k->patch.exec_env.largest_compiled_simd_sz != 16, "Unsupported SIMD size");
-  FATAL_IF (k->patch.exec_env.compiled_simd16 == 0, "Unsupported SIMD size");
+  /* Restrictions on the compiled binary: SNB needs SIMD16; later Gens reject SIMD32 */
+  FATAL_IF (k->program->ctx->ver == 6 &&
+            k->patch.exec_env.largest_compiled_simd_sz != 16, "Unsupported SIMD size");
+  FATAL_IF (k->program->ctx->ver == 6 &&
+            k->patch.exec_env.compiled_simd16 == 0, "Unsupported SIMD size");
+  FATAL_IF (k->program->ctx->ver > 6 &&
+            k->patch.exec_env.largest_compiled_simd_sz == 32, "Unsupported SIMD size");
  
  error:
    return err;
diff --git a/src/cl_kernel.h b/src/cl_kernel.h

index cd4d00e..d11d551 100644 (file)
--- a/src/cl_kernel.h
+++ b/src/cl_kernel.h
@@ -288,7 +288,7 @@ struct _cl_kernel {
  };
  
  /* Size of the surface state as encoded in the binary blob */
-#define SURFACE_SZ 32
+#define SURFACE_SZ 64
  
  /* Allocate an empty kernel */
  extern cl_kernel cl_kernel_new(void);
diff --git a/src/intel/intel_driver.c b/src/intel/intel_driver.c

index 305cd4b..a5778ee 100644 (file)
--- a/src/intel/intel_driver.c
+++ b/src/intel/intel_driver.c
@@ -294,6 +294,12 @@ intel_driver_get_buf(intel_driver_t *drv)
    return drv->bufmgr;
  }
  
+LOCAL uint32_t
+intel_driver_get_ver(struct intel_driver *drv)
+{
+  return drv->gen_ver;
+}
+
  LOCAL int
  cl_intel_get_device_id(void)
  {
diff --git a/src/intel/intel_gpgpu.c b/src/intel/intel_gpgpu.c

index 5e3f119..753a9bd 100644 (file)
--- a/src/intel/intel_gpgpu.c
+++ b/src/intel/intel_gpgpu.c
@@ -770,6 +770,7 @@ gpgpu_run_with_inline(intel_gpgpu_t *state, int32_t ki, size_t sz)
  
  LOCAL void
  gpgpu_walker(intel_gpgpu_t *state,
+             uint32_t simd_sz,
               uint32_t thread_n,
               const size_t global_wk_off[3],
               const size_t global_wk_sz[3],
@@ -780,11 +781,14 @@ gpgpu_walker(intel_gpgpu_t *state,
      global_wk_sz[1] / local_wk_sz[1],
      global_wk_sz[2] / local_wk_sz[2]
    };
-
+  assert(simd_sz == 8 || simd_sz == 16);
    BEGIN_BATCH(state->batch, 11);
    OUT_BATCH(state->batch, CMD_GPGPU_WALKER | 9);
    OUT_BATCH(state->batch, 0);                        /* kernel index == 0 */
-  OUT_BATCH(state->batch, (1 << 30) | (thread_n-1)); /* SIMD16 | thread max */
+  if (simd_sz == 16)
+    OUT_BATCH(state->batch, (1 << 30) | (thread_n-1)); /* SIMD16 | thread max */
+  else
+    OUT_BATCH(state->batch, (0 << 30) | (thread_n-1)); /* SIMD8  | thread max */
    OUT_BATCH(state->batch, global_wk_off[0]);
    OUT_BATCH(state->batch, global_wk_dim[0]);
    OUT_BATCH(state->batch, global_wk_off[1]);
diff --git a/src/intel/intel_gpgpu.h b/src/intel/intel_gpgpu.h

index aeeb734..c4c38b3 100644 (file)
--- a/src/intel/intel_gpgpu.h
+++ b/src/intel/intel_gpgpu.h
@@ -134,6 +134,7 @@ extern char* gpgpu_run_with_inline(intel_gpgpu_t*, int32_t ki, size_t sz);
  /* Will spawn all threads */
  extern void
  gpgpu_walker(intel_gpgpu_t *state,
+             uint32_t simd_sz,
               uint32_t thread_n,
               const size_t global_wk_off[3],
               const size_t global_wk_sz[3],
author	bsegovia <devnull@localhost>
	Wed, 18 Jan 2012 03:49:31 +0000 (03:49 +0000)
committer	Keith Packard <keithp@keithp.com>
	Fri, 10 Aug 2012 23:15:07 +0000 (16:15 -0700)
kernels/urng_output.bmp		patch \| blob \| history
src/cl_command_queue.c		patch \| blob \| history
src/cl_command_queue_gen7.c		patch \| blob \| history
src/cl_context.c		patch \| blob \| history
src/cl_context.h		patch \| blob \| history
src/cl_kernel.c		patch \| blob \| history
src/cl_kernel.h		patch \| blob \| history
src/intel/intel_driver.c		patch \| blob \| history
src/intel/intel_gpgpu.c		patch \| blob \| history
src/intel/intel_gpgpu.h		patch \| blob \| history