static INLINE size_t
cl_kernel_compute_batch_sz(cl_kernel k)
{
- size_t sz = 256 + 16;
+ size_t sz = 256 + 32;
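  /* Presumably headroom for the Gen7-only commands emitted later in this
   * patch (the two LOAD_REGISTER_IMM L3 writes plus MEDIA_STATE_FLUSH);
   * the exact budget is an assumption. */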
return sz;
}
char *user = NULL; /* User defined constants first */
char *data = NULL; /* Complete constant buffer to upload */
genx_gpgpu_kernel_t kernel;
- const size_t local_mem_sz = cl_kernel_local_memory_sz(ker);
size_t local_sz, batch_sz, cst_sz = ker->patch.curbe.sz;
size_t i, thread_n, id_offset;
cl_int err = CL_SUCCESS;
kernel.size = 0;
kernel.bo = ker->bo;
kernel.barrierID = 0;
+ kernel.use_barrier = ker->patch.exec_env.has_barriers;
+ kernel.slm_sz = cl_kernel_local_memory_sz(ker);
/* All arguments must have been set */
TRY (cl_kernel_check_args, ker);
/* Check that the local work sizes are OK */
TRY (cl_kernel_work_group_sz, ker, local_wk_sz, 3, &local_sz);
- thread_n = local_sz / 16; /* SIMD16 only */
+ kernel.thread_n = thread_n = local_sz / 16; /* SIMD16 only */
/* Fill the constant buffer. Basically, we have to build one set of
 * constants for each thread. The constants also include the local IDs,
 * which we append after the 32-byte-aligned user payload.
 */
if (cst_sz > 0) {
assert(ker->cst_buffer);
- user = cl_kernel_create_cst_buffer(ker, global_wk_sz, local_wk_sz);
+ user = cl_kernel_create_cst_buffer(ker,
+ global_wk_off,
+ global_wk_sz,
+ local_wk_sz,
+ 3,
+ thread_n);
}
id_offset = cst_sz = ALIGN(cst_sz, 32); /* Align the user data on 32 bytes */
kernel.cst_sz = cst_sz += 3 * 32; /* Add local IDs (three rows of 16 shorts) */
TRY_ALLOC (data, (char*) cl_calloc(thread_n, cst_sz));
for (i = 0; i < thread_n; ++i)
if (queue->last_batch != NULL)
drm_intel_bo_unreference(queue->last_batch);
queue->last_batch = NULL;
- cl_command_queue_bind_surface(queue, ker, NULL, &private_bo, &scratch_bo, local_mem_sz);
+ cl_command_queue_bind_surface(queue, ker, NULL, &private_bo, &scratch_bo, 0);
gpgpu_states_setup(gpgpu, &kernel, 1);
/* We always have constants with Gen7 (the local IDs are used) */
}
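/* A sketch (an assumption, not code from this patch) of the elided per-thread
 * loop above: copy the user constants into each thread's slice, then append
 * the SIMD16 local IDs as three rows of 16 ushorts (x, y, z), which is what
 * `cst_sz += 3 * 32` budgets for. The lane-to-id mapping shown is only one
 * plausible choice. */
static void
fill_thread_curbe_sketch(char *data, const char *user, size_t user_sz,
                         size_t cst_sz, size_t id_offset,
                         size_t thread_n, const size_t *local_wk_sz)
{
  size_t i, lane;
  for (i = 0; i < thread_n; ++i) {
    char *const dst = data + i * cst_sz;          /* this thread's slice */
    uint16_t *const ids = (uint16_t *) (dst + id_offset);
    if (user != NULL)
      memcpy(dst, user, user_sz);                 /* user constants first */
    for (lane = 0; lane < 16; ++lane) {           /* 16 work items per thread */
      const size_t curr = i * 16 + lane;          /* linear local index */
      ids[lane]      = curr % local_wk_sz[0];                     /* id.x */
      ids[lane + 16] = (curr / local_wk_sz[0]) % local_wk_sz[1];  /* id.y */
      ids[lane + 32] = curr / (local_wk_sz[0] * local_wk_sz[1]);  /* id.z */
    }
  }
}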
LOCAL char*
-cl_kernel_create_cst_buffer(cl_kernel k, const size_t *global_wk_sz, const size_t *local_wk_sz)
+cl_kernel_create_cst_buffer(cl_kernel k,
+ const size_t *global_wk_off,
+ const size_t *global_wk_sz,
+ const size_t *local_wk_sz,
+ cl_uint wk_dim,
+ cl_uint thread_n)
{
cl_curbe_patch_info_t *info = NULL;
const size_t sz = k->patch.curbe.sz;
uint64_t key;      /* curbe key; type assumed to match cl_curbe_key() */
char *data = NULL; /* patched copy of the constant buffer */
TRY_ALLOC_NO_ERR (data, (char *) cl_calloc(sz, 1));
memcpy(data, k->cst_buffer, sz);
+ /* Global work group offset */
+ key = cl_curbe_key(DATA_PARAMETER_GLOBAL_WORK_OFFSET, 0, 0);
+ if ((info = cl_kernel_get_curbe_info(k, key)) != NULL)
+ memcpy(data+info->offsets[0], global_wk_off, sizeof(uint32_t));
+ key = cl_curbe_key(DATA_PARAMETER_GLOBAL_WORK_OFFSET, 0, 4);
+ if ((info = cl_kernel_get_curbe_info(k, key)) != NULL)
+ memcpy(data+info->offsets[0], global_wk_off+1, sizeof(uint32_t));
+ key = cl_curbe_key(DATA_PARAMETER_GLOBAL_WORK_OFFSET, 0, 8);
+ if ((info = cl_kernel_get_curbe_info(k, key)) != NULL)
+ memcpy(data+info->offsets[0], global_wk_off+2, sizeof(uint32_t));
+
/* Global work group size */
key = cl_curbe_key(DATA_PARAMETER_GLOBAL_WORK_SIZE, 0, 0);
if ((info = cl_kernel_get_curbe_info(k, key)) != NULL)
memcpy(data+info->offsets[0], global_wk_sz, sizeof(uint32_t));
/* ... likewise for offsets 4 and 8, then for DATA_PARAMETER_LOCAL_WORK_SIZE ... */
key = cl_curbe_key(DATA_PARAMETER_LOCAL_WORK_SIZE, 0, 8);
if ((info = cl_kernel_get_curbe_info(k, key)) != NULL)
memcpy(data+info->offsets[0], local_wk_sz+2, sizeof(uint32_t));
+ /* HW thread number (Gen7+) */
+ key = cl_curbe_key(DATA_PARAMETER_NUM_HARDWARE_THREADS, 0, 0);
+ if ((info = cl_kernel_get_curbe_info(k, key)) != NULL)
+ memcpy(data+info->offsets[0], &thread_n, sizeof(uint32_t));
+
exit:
return data;
error:
cl_free(data); /* assumed cleanup path, mirroring the cl_calloc above */
data = NULL;
goto exit;
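/* The key/lookup/memcpy triplets above repeat once per dimension. A
 * hypothetical helper (not part of this patch; the uint64_t key type is
 * an assumption) could factor the pattern: */
static void
cl_kernel_patch_curbe_vec3_sketch(cl_kernel k, char *data,
                                  uint32_t type, const size_t *v)
{
  uint32_t dim;
  for (dim = 0; dim < 3; ++dim) {
    const uint64_t key = cl_curbe_key(type, 0, dim * 4);
    cl_curbe_patch_info_t *info = cl_kernel_get_curbe_info(k, key);
    if (info != NULL) /* patch only what the kernel actually reads */
      memcpy(data + info->offsets[0], v + dim, sizeof(uint32_t));
  }
}
/* e.g. cl_kernel_patch_curbe_vec3_sketch(k, data,
 *        DATA_PARAMETER_GLOBAL_WORK_OFFSET, global_wk_off); */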
struct {
uint32_t group_threads_num:8; /* 0..64, 0 - no barrier use */
uint32_t barrier_return_byte:8;
- uint32_t shared_local_mem_size:5; /* 0..16 - 0K..64K */
+ uint32_t slm_sz:5; /* 0..16 - 0K..64K */
uint32_t barrier_enable:1;
uint32_t rounding_mode:2;
uint32_t barrier_return_grf_offset:8;
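/* How the 5-bit slm_sz field appears to be encoded, inferred from the
 * "0..16 - 0K..64K" note above and the rounding chain in gpgpu_build_idrt
 * below: the allocation in 4KB units, rounded up to a power of two. */
static uint32_t
slm_sz_to_field_sketch(size_t sz)
{
  size_t rounded = 4096;
  if (sz == 0)
    return 0;                        /* no SLM used */
  while (rounded < sz)
    rounded <<= 1;                   /* 4KB, 8KB, 16KB, 32KB or 64KB */
  assert(rounded <= 64 * 1024);      /* larger requests are invalid */
  return (uint32_t) (rounded >> 12); /* 4KB -> 1 ... 64KB -> 16 */
}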
{
intel_driver_t *drv;
intel_batchbuffer_t *batch;
+ genx_gpgpu_kernel_t *ker;
struct {
dri_bo *bo;
intel_batchbuffer_alloc_space(state->batch, 0);
memset(vfe, 0, sizeof(struct gen6_vfe_state_inline));
- vfe->vfe1.fast_preempt = 1;
- vfe->vfe1.gpgpu_mode = state->drv->gen_ver > 6 ? 1 : 0;
+ vfe->vfe1.fast_preempt = 0;
+ vfe->vfe1.gpgpu_mode = state->drv->gen_ver >= 7 ? 1 : 0;
vfe->vfe1.bypass_gateway_ctl = 1;
vfe->vfe1.reset_gateway_timer = 1;
vfe->vfe1.urb_entries = state->urb.num_vfe_entries;
vfe->vfe1.max_threads = state->max_threads - 1;
/* vfe->vfe3.curbe_size = 63; */
/* vfe->vfe3.urbe_size = 13; */
- vfe->vfe4.scoreboard_enable = 1;
+ vfe->vfe4.scoreboard_enable = 0;
intel_batchbuffer_alloc_space(state->batch, sizeof(gen6_vfe_state_inline_t));
ADVANCE_BATCH(state->batch);
}
ADVANCE_BATCH(state->batch);
}
+static const uint32_t Gen7L3CacheConfigReg2DataTable[] =
+{
+ // SLM URB DC RO I/S C T
+ 0x00080040, //{ 0, 256, 0, 256, 0, 0, 0, }
+ 0x02040040, //{ 0, 256, 128, 128, 0, 0, 0, }
+ 0x00800040, //{ 0, 256, 32, 0, 64, 32, 128, }
+ 0x01000038, //{ 0, 224, 64, 0, 64, 32, 128, }
+ 0x02000030, //{ 0, 224, 128, 0, 64, 32, 64, }
+ 0x01000038, //{ 0, 224, 64, 0, 128, 32, 64, }
+ 0x00000038, //{ 0, 224, 0, 0, 128, 32, 128, }
+ 0x00000040, //{ 0, 256, 0, 0, 128, 0, 128, }
+ 0x0A140091, //{ 128, 128, 128, 128, 0, 0, 0, }
+ 0x09100091, //{ 128, 128, 64, 0, 64, 64, 64, }
+ 0x08900091, //{ 128, 128, 32, 0, 64, 32, 128, }
+ 0x08900091 //{ 128, 128, 32, 0, 128, 32, 64, }
+};
+
+static const uint32_t Gen7L3CacheConfigReg3DataTable[] =
+{
+ // SLM URB DC RO I/S C T
+ 0x00000000, //{ 0, 256, 0, 256, 0, 0, 0, }
+ 0x00000000, //{ 0, 256, 128, 128, 0, 0, 0, }
+ 0x00080410, //{ 0, 256, 32, 0, 64, 32, 128, }
+ 0x00080410, //{ 0, 224, 64, 0, 64, 32, 128, }
+ 0x00040410, //{ 0, 224, 128, 0, 64, 32, 64, }
+ 0x00040420, //{ 0, 224, 64, 0, 128, 32, 64, }
+ 0x00080420, //{ 0, 224, 0, 0, 128, 32, 128, }
+ 0x00080020, //{ 0, 256, 0, 0, 128, 0, 128, }
+ 0x00204080, //{ 128, 128, 128, 128, 0, 0, 0, }
+ 0x00244890, //{ 128, 128, 64, 0, 64, 64, 64, }
+ 0x00284490, //{ 128, 128, 32, 0, 64, 32, 128, }
+ 0x002444A0 //{ 128, 128, 32, 0, 128, 32, 64, }
+};
+
+// L3 cache configuration registers (Gen7)
+#define L3_CNTL_REG2_ADDRESS_OFFSET ( 0xB020 )
+#define L3_CNTL_REG3_ADDRESS_OFFSET ( 0xB024 )
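/* Table column legend (from the comments above): SLM, URB, DC (data cache),
 * RO (read-only), I/S (instruction/state), C (constant), T (texture), all
 * in KB. Only two rows are referenced below; a hypothetical enum makes the
 * choice explicit: */
enum gen7_l3_config_idx_sketch {
  GEN7_L3_CONFIG_NO_SLM = 4, /* { 0, 224, 128, 0, 64, 32, 64 } */
  GEN7_L3_CONFIG_SLM    = 8  /* { 128, 128, 128, 128, 0, 0, 0 } */
};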
+
+LOCAL void
+intel_gpgpu_set_L3(intel_gpgpu_t *state, uint32_t use_barrier)
+{
+ BEGIN_BATCH(state->batch, 6);
+ OUT_BATCH(state->batch, CMD_LOAD_REGISTER_IMM | 1); /* length - 2 */
+ OUT_BATCH(state->batch, L3_CNTL_REG2_ADDRESS_OFFSET);
+ if (use_barrier)
+ OUT_BATCH(state->batch, Gen7L3CacheConfigReg2DataTable[8]);
+ else
+ OUT_BATCH(state->batch, Gen7L3CacheConfigReg2DataTable[4]);
+
+ OUT_BATCH(state->batch, CMD_LOAD_REGISTER_IMM | 1); /* length - 2 */
+ OUT_BATCH(state->batch, L3_CNTL_REG3_ADDRESS_OFFSET);
+ if (use_barrier)
+ OUT_BATCH(state->batch, Gen7L3CacheConfigReg3DataTable[8]);
+ else
+ OUT_BATCH(state->batch, Gen7L3CacheConfigReg3DataTable[4]);
+ ADVANCE_BATCH(state->batch);
+
+ intel_batchbuffer_emit_mi_flush(state->batch);
+}
+
LOCAL void
gpgpu_batch_start(intel_gpgpu_t *state)
{
intel_batchbuffer_start_atomic(state->batch, 256);
intel_batchbuffer_emit_mi_flush(state->batch);
+ if (state->drv->gen_ver >= 7)
+ intel_gpgpu_set_L3(state, state->ker->use_barrier);
gpgpu_select_pipeline(state);
gpgpu_set_base_address(state);
gpgpu_load_vfe_state(state);
dri_bo_unmap(state->binding_table_b.bo);
}
+#define KB 1024
+
static void
gpgpu_build_idrt(intel_gpgpu_t *state,
genx_gpgpu_kernel_t *kernel,
desc->desc3.binding_table_pointer = state->binding_table_b.bo->offset >> 5;
desc->desc4.curbe_read_len = kernel[i].cst_sz / 32;
desc->desc4.curbe_read_offset = 0;
- desc->desc5.group_threads_num = kernel[i].barrierID; /* BarrierID on GEN6 */
- /* desc->desc5 = 0; - no barriers, groups, etc. */
/* desc->desc6 = 0; - mbz */
/* desc->desc7 = 0; - mbz */
+ /* Barriers / SLM are automatically handled on Gen7+ */
+ if (state->drv->gen_ver >= 7) {
+ size_t slm_sz = kernel[i].slm_sz;
+ desc->desc5.group_threads_num = kernel[i].use_barrier ? kernel[i].thread_n : 0;
+ desc->desc5.barrier_enable = kernel[i].use_barrier;
+ if (slm_sz > 0) {
+ if (slm_sz <= 4 * KB)
+ slm_sz = 4 * KB; //4KB
+ else if (slm_sz <= 8 * KB)
+ slm_sz = 8 * KB; //8KB
+ else if (slm_sz <= 16 * KB)
+ slm_sz = 16 * KB; //16KB
+ else if (slm_sz <= 32 * KB)
+ slm_sz = 32 * KB; //32KB
+ else if (slm_sz <= 64 * KB)
+ slm_sz = 64 * KB; //64KB
+ slm_sz = slm_sz >> 12;
+ }
+ desc->desc5.slm_sz = slm_sz;
+ }
+ else
+ desc->desc5.group_threads_num = kernel[i].barrierID; /* BarrierID on GEN6 */
+
dri_bo_emit_reloc(bo,
I915_GEM_DOMAIN_INSTRUCTION, 0,
0,
LOCAL void
gpgpu_states_setup(intel_gpgpu_t *state, genx_gpgpu_kernel_t *kernel, uint32_t ker_n)
{
+ state->ker = kernel;
gpgpu_build_sampler_table(state);
gpgpu_build_binding_table(state);
gpgpu_build_idrt(state, kernel, ker_n);
{
BEGIN_BATCH(state->batch, 11);
OUT_BATCH(state->batch, CMD_GPGPU_WALKER | 9);
- OUT_BATCH(state->batch, 0); /* kernel index */
+ OUT_BATCH(state->batch, 0); /* kernel index == 0 */
OUT_BATCH(state->batch, (1 << 30) | (thread_n-1)); /* SIMD16 | thread max */
OUT_BATCH(state->batch, global_wk_off[0]);
OUT_BATCH(state->batch, global_wk_sz[0]-1);
OUT_BATCH(state->batch, ~0x0);
OUT_BATCH(state->batch, ~0x0);
ADVANCE_BATCH(state->batch);
+
+ BEGIN_BATCH(state->batch, 2);
+ OUT_BATCH(state->batch, CMD_MEDIA_STATE_FLUSH | 0);
+ OUT_BATCH(state->batch, 0); /* kernel index == 0 */
+ ADVANCE_BATCH(state->batch);
}
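/* For orientation, the expected per-enqueue flow, assuming the walker
 * emission above lives in a helper and a batch-end helper exists (names
 * outside this excerpt are hypothetical): */
#if 0
gpgpu_states_setup(state, &kernel, 1);  /* sampler/binding tables + IDRT */
gpgpu_batch_start(state);               /* MI flush, L3 config on Gen7+ */
gpgpu_walker_sketch(state /* ... */);   /* GPGPU_WALKER + MEDIA_STATE_FLUSH */
gpgpu_batch_end_sketch(state);          /* hypothetical */
#endif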
LOCAL int32_t