static INLINE size_t
cl_kernel_compute_batch_sz(cl_kernel k)
{
- size_t sz = 256 + 16;
+ size_t sz = 256 + 32;
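  /* Presumably headroom for the Gen7-only commands emitted later in this
   * patch (the two LOAD_REGISTER_IMM L3 writes plus MEDIA_STATE_FLUSH);
   * the exact budget is an assumption. */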
return sz;
}
char *user = NULL; /* User defined constants first */
char *data = NULL; /* Complete constant buffer to upload */
genx_gpgpu_kernel_t kernel;
- const size_t local_mem_sz = cl_kernel_local_memory_sz(ker);
size_t local_sz, batch_sz, cst_sz = ker->patch.curbe.sz;
size_t i, thread_n, id_offset;
cl_int err = CL_SUCCESS;
kernel.size = 0;
kernel.bo = ker->bo;
kernel.barrierID = 0;
+ kernel.use_barrier = ker->patch.exec_env.has_barriers;
+ kernel.slm_sz = cl_kernel_local_memory_sz(ker);
/* All arguments must have been set */
TRY (cl_kernel_check_args, ker);
/* Check that the local work sizes are OK */
TRY (cl_kernel_work_group_sz, ker, local_wk_sz, 3, &local_sz);
- thread_n = local_sz / 16; /* SIMD16 only */
+ kernel.thread_n = thread_n = local_sz / 16; /* SIMD16 only */
/* Fill the constant buffer. Basically, we have to build one set of
 * constants for each thread. The constants also include the local IDs,
 * which we append after the 32-byte-aligned user payload.
 */
if (cst_sz > 0) {
assert(ker->cst_buffer);
- user = cl_kernel_create_cst_buffer(ker, global_wk_sz, local_wk_sz);
+ user = cl_kernel_create_cst_buffer(ker,
+ global_wk_off,
+ global_wk_sz,
+ local_wk_sz,
+ 3,
+ thread_n);
}
id_offset = cst_sz = ALIGN(cst_sz, 32); /* Align the user data on 32 bytes */
kernel.cst_sz = cst_sz += 3 * 32; /* Add local IDs (three rows of 16 shorts) */
TRY_ALLOC (data, (char*) cl_calloc(thread_n, cst_sz));
for (i = 0; i < thread_n; ++i)
if (queue->last_batch != NULL)
drm_intel_bo_unreference(queue->last_batch);
queue->last_batch = NULL;
- cl_command_queue_bind_surface(queue, ker, NULL, &private_bo, &scratch_bo, local_mem_sz);
+ cl_command_queue_bind_surface(queue, ker, NULL, &private_bo, &scratch_bo, 0);
gpgpu_states_setup(gpgpu, &kernel, 1);
/* We always have constants with Gen7 (the local IDs are used) */
}
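/* A sketch (an assumption, not code from this patch) of the elided per-thread
 * loop above: copy the user constants into each thread's slice, then append
 * the SIMD16 local IDs as three rows of 16 ushorts (x, y, z), which is what
 * `cst_sz += 3 * 32` budgets for. The lane-to-id mapping shown is only one
 * plausible choice. */
static void
fill_thread_curbe_sketch(char *data, const char *user, size_t user_sz,
                         size_t cst_sz, size_t id_offset,
                         size_t thread_n, const size_t *local_wk_sz)
{
  size_t i, lane;
  for (i = 0; i < thread_n; ++i) {
    char *const dst = data + i * cst_sz;          /* this thread's slice */
    uint16_t *const ids = (uint16_t *) (dst + id_offset);
    if (user != NULL)
      memcpy(dst, user, user_sz);                 /* user constants first */
    for (lane = 0; lane < 16; ++lane) {           /* 16 work items per thread */
      const size_t curr = i * 16 + lane;          /* linear local index */
      ids[lane]      = curr % local_wk_sz[0];                     /* id.x */
      ids[lane + 16] = (curr / local_wk_sz[0]) % local_wk_sz[1];  /* id.y */
      ids[lane + 32] = curr / (local_wk_sz[0] * local_wk_sz[1]);  /* id.z */
    }
  }
}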
LOCAL char*
-cl_kernel_create_cst_buffer(cl_kernel k, const size_t *global_wk_sz, const size_t *local_wk_sz)
+cl_kernel_create_cst_buffer(cl_kernel k,
+ const size_t *global_wk_off,
+ const size_t *global_wk_sz,
+ const size_t *local_wk_sz,
+ cl_uint wk_dim,
+ cl_uint thread_n)
{
cl_curbe_patch_info_t *info = NULL;
const size_t sz = k->patch.curbe.sz;
uint64_t key;      /* curbe key; type assumed to match cl_curbe_key() */
char *data = NULL; /* patched copy of the constant buffer */
TRY_ALLOC_NO_ERR (data, (char *) cl_calloc(sz, 1));
memcpy(data, k->cst_buffer, sz);
+ /* Global work group offset */
+ key = cl_curbe_key(DATA_PARAMETER_GLOBAL_WORK_OFFSET, 0, 0);
+ if ((info = cl_kernel_get_curbe_info(k, key)) != NULL)
+ memcpy(data+info->offsets[0], global_wk_off, sizeof(uint32_t));
+ key = cl_curbe_key(DATA_PARAMETER_GLOBAL_WORK_OFFSET, 0, 4);
+ if ((info = cl_kernel_get_curbe_info(k, key)) != NULL)
+ memcpy(data+info->offsets[0], global_wk_off+1, sizeof(uint32_t));
+ key = cl_curbe_key(DATA_PARAMETER_GLOBAL_WORK_OFFSET, 0, 8);
+ if ((info = cl_kernel_get_curbe_info(k, key)) != NULL)
+ memcpy(data+info->offsets[0], global_wk_off+2, sizeof(uint32_t));
+
/* Global work group size */
key = cl_curbe_key(DATA_PARAMETER_GLOBAL_WORK_SIZE, 0, 0);
if ((info = cl_kernel_get_curbe_info(k, key)) != NULL)
memcpy(data+info->offsets[0], global_wk_sz, sizeof(uint32_t));
/* ... likewise for offsets 4 and 8, then for DATA_PARAMETER_LOCAL_WORK_SIZE ... */
key = cl_curbe_key(DATA_PARAMETER_LOCAL_WORK_SIZE, 0, 8);
if ((info = cl_kernel_get_curbe_info(k, key)) != NULL)
memcpy(data+info->offsets[0], local_wk_sz+2, sizeof(uint32_t));
+ /* HW thread number (Gen7+) */
+ key = cl_curbe_key(DATA_PARAMETER_NUM_HARDWARE_THREADS, 0, 0);
+ if ((info = cl_kernel_get_curbe_info(k, key)) != NULL)
+ memcpy(data+info->offsets[0], &thread_n, sizeof(uint32_t));
+
exit:
return data;
error:
cl_free(data); /* assumed cleanup path, mirroring the cl_calloc above */
data = NULL;
goto exit;
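/* The key/lookup/memcpy triplets above repeat once per dimension. A
 * hypothetical helper (not part of this patch; the uint64_t key type is
 * an assumption) could factor the pattern: */
static void
cl_kernel_patch_curbe_vec3_sketch(cl_kernel k, char *data,
                                  uint32_t type, const size_t *v)
{
  uint32_t dim;
  for (dim = 0; dim < 3; ++dim) {
    const uint64_t key = cl_curbe_key(type, 0, dim * 4);
    cl_curbe_patch_info_t *info = cl_kernel_get_curbe_info(k, key);
    if (info != NULL) /* patch only what the kernel actually reads */
      memcpy(data + info->offsets[0], v + dim, sizeof(uint32_t));
  }
}
/* e.g. cl_kernel_patch_curbe_vec3_sketch(k, data,
 *        DATA_PARAMETER_GLOBAL_WORK_OFFSET, global_wk_off); */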
struct {
uint32_t group_threads_num:8; /* 0..64, 0 - no barrier use */
uint32_t barrier_return_byte:8;
- uint32_t shared_local_mem_size:5; /* 0..16 - 0K..64K */
+ uint32_t slm_sz:5; /* 0..16 - 0K..64K */
uint32_t barrier_enable:1;
uint32_t rounding_mode:2;
uint32_t barrier_return_grf_offset:8;
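/* How the 5-bit slm_sz field appears to be encoded, inferred from the
 * "0..16 - 0K..64K" note above and the rounding chain in gpgpu_build_idrt
 * below: the allocation in 4KB units, rounded up to a power of two. */
static uint32_t
slm_sz_to_field_sketch(size_t sz)
{
  size_t rounded = 4096;
  if (sz == 0)
    return 0;                        /* no SLM used */
  while (rounded < sz)
    rounded <<= 1;                   /* 4KB, 8KB, 16KB, 32KB or 64KB */
  assert(rounded <= 64 * 1024);      /* larger requests are invalid */
  return (uint32_t) (rounded >> 12); /* 4KB -> 1 ... 64KB -> 16 */
}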
{
intel_driver_t *drv;
intel_batchbuffer_t *batch;
+ genx_gpgpu_kernel_t *ker;
struct {
dri_bo *bo;
intel_batchbuffer_alloc_space(state->batch, 0);
memset(vfe, 0, sizeof(struct gen6_vfe_state_inline));
- vfe->vfe1.fast_preempt = 1;
- vfe->vfe1.gpgpu_mode = state->drv->gen_ver > 6 ? 1 : 0;
+ vfe->vfe1.fast_preempt = 0;
+ vfe->vfe1.gpgpu_mode = state->drv->gen_ver >= 7 ? 1 : 0;
vfe->vfe1.bypass_gateway_ctl = 1;
vfe->vfe1.reset_gateway_timer = 1;
vfe->vfe1.urb_entries = state->urb.num_vfe_entries;
vfe->vfe1.max_threads = state->max_threads - 1;
/* vfe->vfe3.curbe_size = 63; */
/* vfe->vfe3.urbe_size = 13; */
- vfe->vfe4.scoreboard_enable = 1;
+ vfe->vfe4.scoreboard_enable = 0;
intel_batchbuffer_alloc_space(state->batch, sizeof(gen6_vfe_state_inline_t));
ADVANCE_BATCH(state->batch);
}
ADVANCE_BATCH(state->batch);
}
+static const uint32_t Gen7L3CacheConfigReg2DataTable[] =
+{
+ // SLM URB DC RO I/S C T
+ 0x00080040, //{ 0, 256, 0, 256, 0, 0, 0, }
+ 0x02040040, //{ 0, 256, 128, 128, 0, 0, 0, }
+ 0x00800040, //{ 0, 256, 32, 0, 64, 32, 128, }
+ 0x01000038, //{ 0, 224, 64, 0, 64, 32, 128, }
+ 0x02000030, //{ 0, 224, 128, 0, 64, 32, 64, }
+ 0x01000038, //{ 0, 224, 64, 0, 128, 32, 64, }
+ 0x00000038, //{ 0, 224, 0, 0, 128, 32, 128, }
+ 0x00000040, //{ 0, 256, 0, 0, 128, 0, 128, }
+ 0x0A140091, //{ 128, 128, 128, 128, 0, 0, 0, }
+ 0x09100091, //{ 128, 128, 64, 0, 64, 64, 64, }
+ 0x08900091, //{ 128, 128, 32, 0, 64, 32, 128, }
+ 0x08900091 //{ 128, 128, 32, 0, 128, 32, 64, }
+};
+
+static const uint32_t Gen7L3CacheConfigReg3DataTable[] =
+{
+ // SLM URB DC RO I/S C T
+ 0x00000000, //{ 0, 256, 0, 256, 0, 0, 0, }
+ 0x00000000, //{ 0, 256, 128, 128, 0, 0, 0, }
+ 0x00080410, //{ 0, 256, 32, 0, 64, 32, 128, }
+ 0x00080410, //{ 0, 224, 64, 0, 64, 32, 128, }
+ 0x00040410, //{ 0, 224, 128, 0, 64, 32, 64, }
+ 0x00040420, //{ 0, 224, 64, 0, 128, 32, 64, }
+ 0x00080420, //{ 0, 224, 0, 0, 128, 32, 128, }
+ 0x00080020, //{ 0, 256, 0, 0, 128, 0, 128, }
+ 0x00204080, //{ 128, 128, 128, 128, 0, 0, 0, }
+ 0x00244890, //{ 128, 128, 64, 0, 64, 64, 64, }
+ 0x00284490, //{ 128, 128, 32, 0, 64, 32, 128, }
+ 0x002444A0 //{ 128, 128, 32, 0, 128, 32, 64, }
+};
+
+// L3 cache configuration registers (Gen7)
+#define L3_CNTL_REG2_ADDRESS_OFFSET ( 0xB020 )
+#define L3_CNTL_REG3_ADDRESS_OFFSET ( 0xB024 )
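/* Table column legend (from the comments above): SLM, URB, DC (data cache),
 * RO (read-only), I/S (instruction/state), C (constant), T (texture), all
 * in KB. Only two rows are referenced below; a hypothetical enum makes the
 * choice explicit: */
enum gen7_l3_config_idx_sketch {
  GEN7_L3_CONFIG_NO_SLM = 4, /* { 0, 224, 128, 0, 64, 32, 64 } */
  GEN7_L3_CONFIG_SLM    = 8  /* { 128, 128, 128, 128, 0, 0, 0 } */
};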
+
+LOCAL void
+intel_gpgpu_set_L3(intel_gpgpu_t *state, uint32_t use_barrier)
+{
+ BEGIN_BATCH(state->batch, 6);
+ OUT_BATCH(state->batch, CMD_LOAD_REGISTER_IMM | 1); /* length - 2 */
+ OUT_BATCH(state->batch, L3_CNTL_REG2_ADDRESS_OFFSET);
+ if (use_barrier)
+ OUT_BATCH(state->batch, Gen7L3CacheConfigReg2DataTable[8]);
+ else
+ OUT_BATCH(state->batch, Gen7L3CacheConfigReg2DataTable[4]);
+
+ OUT_BATCH(state->batch, CMD_LOAD_REGISTER_IMM | 1); /* length - 2 */
+ OUT_BATCH(state->batch, L3_CNTL_REG3_ADDRESS_OFFSET);
+ if (use_barrier)
+ OUT_BATCH(state->batch, Gen7L3CacheConfigReg3DataTable[8]);
+ else
+ OUT_BATCH(state->batch, Gen7L3CacheConfigReg3DataTable[4]);
+ ADVANCE_BATCH(state->batch);
+
+ intel_batchbuffer_emit_mi_flush(state->batch);
+}
+
LOCAL void
gpgpu_batch_start(intel_gpgpu_t *state)
{
intel_batchbuffer_start_atomic(state->batch, 256);
intel_batchbuffer_emit_mi_flush(state->batch);
+ if (state->drv->gen_ver >= 7)
+ intel_gpgpu_set_L3(state, state->ker->use_barrier);
gpgpu_select_pipeline(state);
gpgpu_set_base_address(state);
gpgpu_load_vfe_state(state);
dri_bo_unmap(state->binding_table_b.bo);
}
+#define KB 1024
+
static void
gpgpu_build_idrt(intel_gpgpu_t *state,
genx_gpgpu_kernel_t *kernel,
desc->desc3.binding_table_pointer = state->binding_table_b.bo->offset >> 5;
desc->desc4.curbe_read_len = kernel[i].cst_sz / 32;
desc->desc4.curbe_read_offset = 0;
- desc->desc5.group_threads_num = kernel[i].barrierID; /* BarrierID on GEN6 */
- /* desc->desc5 = 0; - no barriers, groups, etc. */
/* desc->desc6 = 0; - mbz */
/* desc->desc7 = 0; - mbz */
+ /* Barriers / SLM are automatically handled on Gen7+ */
+ if (state->drv->gen_ver >= 7) {
+ size_t slm_sz = kernel[i].slm_sz;
+ desc->desc5.group_threads_num = kernel[i].use_barrier ? kernel[i].thread_n : 0;
+ desc->desc5.barrier_enable = kernel[i].use_barrier;
+ if (slm_sz > 0) {
+ if (slm_sz <= 4 * KB)
+ slm_sz = 4 * KB; //4KB
+ else if (slm_sz <= 8 * KB)
+ slm_sz = 8 * KB; //8KB
+ else if (slm_sz <= 16 * KB)
+ slm_sz = 16 * KB; //16KB
+ else if (slm_sz <= 32 * KB)
+ slm_sz = 32 * KB; //32KB
+ else if (slm_sz <= 64 * KB)
+ slm_sz = 64 * KB; //64KB
+ slm_sz = slm_sz >> 12;
+ }
+ desc->desc5.slm_sz = slm_sz;
+ }
+ else
+ desc->desc5.group_threads_num = kernel[i].barrierID; /* BarrierID on GEN6 */
+
dri_bo_emit_reloc(bo,
I915_GEM_DOMAIN_INSTRUCTION, 0,
0,
LOCAL void
gpgpu_states_setup(intel_gpgpu_t *state, genx_gpgpu_kernel_t *kernel, uint32_t ker_n)
{
+ state->ker = kernel;
gpgpu_build_sampler_table(state);
gpgpu_build_binding_table(state);
gpgpu_build_idrt(state, kernel, ker_n);
{
BEGIN_BATCH(state->batch, 11);
OUT_BATCH(state->batch, CMD_GPGPU_WALKER | 9);
- OUT_BATCH(state->batch, 0); /* kernel index */
+ OUT_BATCH(state->batch, 0); /* kernel index == 0 */
OUT_BATCH(state->batch, (1 << 30) | (thread_n-1)); /* SIMD16 | thread max */
OUT_BATCH(state->batch, global_wk_off[0]);
OUT_BATCH(state->batch, global_wk_sz[0]-1);
OUT_BATCH(state->batch, ~0x0);
OUT_BATCH(state->batch, ~0x0);
ADVANCE_BATCH(state->batch);
+
+ BEGIN_BATCH(state->batch, 2);
+ OUT_BATCH(state->batch, CMD_MEDIA_STATE_FLUSH | 0);
+ OUT_BATCH(state->batch, 0); /* kernel index == 0 */
+ ADVANCE_BATCH(state->batch);
}
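/* For orientation, the expected per-enqueue flow, assuming the walker
 * emission above lives in a helper and a batch-end helper exists (names
 * outside this excerpt are hypothetical): */
#if 0
gpgpu_states_setup(state, &kernel, 1);  /* sampler/binding tables + IDRT */
gpgpu_batch_start(state);               /* MI flush, L3 config on Gen7+ */
gpgpu_walker_sketch(state /* ... */);   /* GPGPU_WALKER + MEDIA_STATE_FLUSH */
gpgpu_batch_end_sketch(state);          /* hypothetical */
#endif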
LOCAL int32_t