From 9196b68dd5e7c0b2664d9f8339ee52ed454200c1 Mon Sep 17 00:00:00 2001 From: bsegovia Date: Thu, 4 Aug 2011 23:26:44 +0000 Subject: [PATCH] All unit tests now pass --- setup_fulsim.sh | 5 +++ src/cl_command_queue.c | 2 +- src/cl_command_queue_gen6.c | 8 +++- src/cl_command_queue_gen7.c | 18 +++++--- src/cl_kernel.c | 23 +++++++++- src/cl_kernel.h | 9 +++- src/intel/intel_defines.h | 2 + src/intel/intel_gpgpu.c | 106 +++++++++++++++++++++++++++++++++++++++++--- src/intel/intel_gpgpu.h | 8 ++-- 9 files changed, 160 insertions(+), 21 deletions(-) create mode 100644 setup_fulsim.sh diff --git a/setup_fulsim.sh b/setup_fulsim.sh new file mode 100644 index 0000000..9e355a0 --- /dev/null +++ b/setup_fulsim.sh @@ -0,0 +1,5 @@ +export INTEL_DEVID_OVERRIDE=0x0166 # or, 0x0112 +export DEVICE=ivb_m_gt2 # snb_gt2 for SNB GT2 desktop +export OCL_FULSIM_RUN=1 +export OCL_FULSIM_DEBUG_MODE=$1 + diff --git a/src/cl_command_queue.c b/src/cl_command_queue.c index 904df7e..b4e2735 100644 --- a/src/cl_command_queue.c +++ b/src/cl_command_queue.c @@ -236,7 +236,7 @@ cl_run_fulsim(void) system("wine AubLoad.exe dump.aub -device sbrB0"); else system("wine AubLoad.exe dump.aub -device sbrB0 -debug"); -#elif EMULATE_GEN == 7 +#elif EMULATE_GEN == 7 /* IVB */ if (debug_mode == NULL || strcmp(debug_mode, "1")) system("wine AubLoad.exe dump.aub -device ivb2"); else diff --git a/src/cl_command_queue_gen6.c b/src/cl_command_queue_gen6.c index 882d6bc..763b757 100644 --- a/src/cl_command_queue_gen6.c +++ b/src/cl_command_queue_gen6.c @@ -112,6 +112,8 @@ cl_command_queue_ND_range_gen6(cl_command_queue queue, kernels[i].size = 0, kernels[i].bo = ker->bo; kernels[i].barrierID = i; + kernels[i].use_barrier = 0; /* unused in gen6 */ + kernels[i].thread_n = 0; /* unused in gen6 */ } /* All arguments must have been set */ @@ -157,7 +159,11 @@ cl_command_queue_ND_range_gen6(cl_command_queue queue, if (cst_sz > 0) { char *data = NULL; assert(ker->cst_buffer); - data = cl_kernel_create_cst_buffer(ker, global_wk_sz, local_wk_sz); + data = cl_kernel_create_cst_buffer(ker, + global_wk_off, + global_wk_sz, + local_wk_sz, + 0, 0); /* unused on Gen6 */ gpgpu_upload_constants(gpgpu, data, cst_sz); cl_free(data); } diff --git a/src/cl_command_queue_gen7.c b/src/cl_command_queue_gen7.c index 7fa7e21..9a65d98 100644 --- a/src/cl_command_queue_gen7.c +++ b/src/cl_command_queue_gen7.c @@ -36,7 +36,7 @@ static INLINE size_t cl_kernel_compute_batch_sz(cl_kernel k) { - size_t sz = 256 + 16; + size_t sz = 256 + 32; return sz; } @@ -96,7 +96,6 @@ cl_command_queue_ND_range_gen7(cl_command_queue queue, char *user = NULL; /* User defined constants first */ char *data = NULL; /* Complete constant buffer to upload */ genx_gpgpu_kernel_t kernel; - const size_t local_mem_sz = cl_kernel_local_memory_sz(ker); size_t local_sz, batch_sz, cst_sz = ker->patch.curbe.sz; size_t i, thread_n, id_offset; cl_int err = CL_SUCCESS; @@ -108,13 +107,15 @@ cl_command_queue_ND_range_gen7(cl_command_queue queue, kernel.size = 0, kernel.bo = ker->bo; kernel.barrierID = 0; + kernel.use_barrier = ker->patch.exec_env.has_barriers; + kernel.slm_sz = cl_kernel_local_memory_sz(ker); /* All arguments must have been set */ TRY (cl_kernel_check_args, ker); /* Check that the local work sizes are OK */ TRY (cl_kernel_work_group_sz, ker, local_wk_sz, 3, &local_sz); - thread_n = local_sz / 16; /* SIMD16 only */ + kernel.thread_n = thread_n = local_sz / 16; /* SIMD16 only */ /* Fill the constant buffer. Basically, we have to build one set of * constants for each thread. 
The constants also includes the local ids we @@ -122,9 +123,14 @@ cl_command_queue_ND_range_gen7(cl_command_queue queue, */ if (cst_sz > 0) { assert(ker->cst_buffer); - user = cl_kernel_create_cst_buffer(ker, global_wk_sz, local_wk_sz); + user = cl_kernel_create_cst_buffer(ker, + global_wk_off, + global_wk_sz, + local_wk_sz, + 3, + thread_n); } - id_offset = cst_sz = ALIGN(cst_sz, 32); /* Align the user data on 32 bytes */ + id_offset = cst_sz = ALIGN(cst_sz, 32); /* Align the user data on 32 bytes */ kernel.cst_sz = cst_sz += 3 * 32; /* Add local IDs (16 words) */ TRY_ALLOC (data, (char*) cl_calloc(thread_n, cst_sz)); for (i = 0; i < thread_n; ++i) @@ -136,7 +142,7 @@ cl_command_queue_ND_range_gen7(cl_command_queue queue, if (queue->last_batch != NULL) drm_intel_bo_unreference(queue->last_batch); queue->last_batch = NULL; - cl_command_queue_bind_surface(queue, ker, NULL, &private_bo, &scratch_bo, local_mem_sz); + cl_command_queue_bind_surface(queue, ker, NULL, &private_bo, &scratch_bo, 0); gpgpu_states_setup(gpgpu, &kernel, 1); /* We always have constant with Gen7 (local_ids are used) */ diff --git a/src/cl_kernel.c b/src/cl_kernel.c index 1efb159..ac12d80 100644 --- a/src/cl_kernel.c +++ b/src/cl_kernel.c @@ -737,7 +737,12 @@ cl_kernel_local_memory_sz(cl_kernel k) } LOCAL char* -cl_kernel_create_cst_buffer(cl_kernel k, const size_t *global_wk_sz, const size_t *local_wk_sz) +cl_kernel_create_cst_buffer(cl_kernel k, + const size_t *global_wk_off, + const size_t *global_wk_sz, + const size_t *local_wk_sz, + cl_uint wk_dim, + cl_uint thread_n) { cl_curbe_patch_info_t *info = NULL; const size_t sz = k->patch.curbe.sz; @@ -747,6 +752,17 @@ cl_kernel_create_cst_buffer(cl_kernel k, const size_t *global_wk_sz, const size_ TRY_ALLOC_NO_ERR (data, (char *) cl_calloc(sz, 1)); memcpy(data, k->cst_buffer, sz); + /* Global work group offset */ + key = cl_curbe_key(DATA_PARAMETER_GLOBAL_WORK_OFFSET, 0, 0); + if ((info = cl_kernel_get_curbe_info(k, key)) != NULL) + memcpy(data+info->offsets[0], global_wk_off, sizeof(uint32_t)); + key = cl_curbe_key(DATA_PARAMETER_GLOBAL_WORK_OFFSET, 0, 4); + if ((info = cl_kernel_get_curbe_info(k, key)) != NULL) + memcpy(data+info->offsets[0], global_wk_off+1, sizeof(uint32_t)); + key = cl_curbe_key(DATA_PARAMETER_GLOBAL_WORK_OFFSET, 0, 8); + if ((info = cl_kernel_get_curbe_info(k, key)) != NULL) + memcpy(data+info->offsets[0], global_wk_off+2, sizeof(uint32_t)); + /* Global work group size */ key = cl_curbe_key(DATA_PARAMETER_GLOBAL_WORK_SIZE, 0, 0); if ((info = cl_kernel_get_curbe_info(k, key)) != NULL) @@ -769,6 +785,11 @@ cl_kernel_create_cst_buffer(cl_kernel k, const size_t *global_wk_sz, const size_ if ((info = cl_kernel_get_curbe_info(k, key)) != NULL) memcpy(data+info->offsets[0], local_wk_sz+2, sizeof(uint32_t)); + /* HW thread number (Gen7+) */ + key = cl_curbe_key(DATA_PARAMETER_NUM_HARDWARE_THREADS, 0, 0); + if ((info = cl_kernel_get_curbe_info(k, key)) != NULL) + memcpy(data+info->offsets[0], &thread_n, sizeof(uint32_t)); + exit: return data; error: diff --git a/src/cl_kernel.h b/src/cl_kernel.h index 1c9b08e..9c1cd5f 100644 --- a/src/cl_kernel.h +++ b/src/cl_kernel.h @@ -251,7 +251,7 @@ struct _cl_kernel { volatile int ref_n; /* We reference count this object */ struct _drm_intel_bo *bo; /* The code itself */ struct _drm_intel_bo *const_bo;/* Buffer for all __constants values in the OCL program */ - cl_program program; /* Owns this structure (and pointers) */ + cl_program program; /* Owns this structure (and pointers) */ cl_arg_info_t *arg_info; /* List of 
arguments */ cl_curbe_patch_info_t *curbe_info; /* List of patch locations for the curbe */ char *name; /* User defined name */ @@ -320,7 +320,12 @@ cl_curbe_key(uint32_t type, uint32_t index, uint32_t src_offset) /* Allocate, fill and return the CURBE */ extern char* -cl_kernel_create_cst_buffer(cl_kernel k, const size_t *global_wk_sz, const size_t *local_wk_sz); +cl_kernel_create_cst_buffer(cl_kernel k, + const size_t *global_wk_off, + const size_t *global_wk_sz, + const size_t *local_wk_sz, + cl_uint wk_dim, + cl_uint thread_n); /* Compute and check the work group size from the user provided local size */ extern cl_int diff --git a/src/intel/intel_defines.h b/src/intel/intel_defines.h index 24aabf5..753bfc1 100644 --- a/src/intel/intel_defines.h +++ b/src/intel/intel_defines.h @@ -33,6 +33,8 @@ #define CMD_MEDIA_STATE_FLUSH CMD(2, 0, 4) #define CMD_GPGPU_WALKER CMD(2, 1, 5) +#define CMD_LOAD_REGISTER_IMM (0x22 << 23) + #define CMD_STATE_BASE_ADDRESS CMD(0, 1, 1) #define CMD_STATE_SIP CMD(0, 1, 2) #define CMD_PIPELINE_SELECT CMD(1, 1, 4) diff --git a/src/intel/intel_gpgpu.c b/src/intel/intel_gpgpu.c index 2562239..002992e 100644 --- a/src/intel/intel_gpgpu.c +++ b/src/intel/intel_gpgpu.c @@ -280,7 +280,7 @@ typedef struct gen6_interface_descriptor struct { uint32_t group_threads_num:8; /* 0..64, 0 - no barrier use */ uint32_t barrier_return_byte:8; - uint32_t shared_local_mem_size:5; /* 0..16 - 0K..64K */ + uint32_t slm_sz:5; /* 0..16 - 0K..64K */ uint32_t barrier_enable:1; uint32_t rounding_mode:2; uint32_t barrier_return_grf_offset:8; @@ -310,6 +310,7 @@ struct intel_gpgpu { intel_driver_t *drv; intel_batchbuffer_t *batch; + genx_gpgpu_kernel_t *ker; struct { dri_bo *bo; @@ -431,8 +432,8 @@ gpgpu_load_vfe_state(intel_gpgpu_t *state) intel_batchbuffer_alloc_space(state->batch,0); memset(vfe, 0, sizeof(struct gen6_vfe_state_inline)); - vfe->vfe1.fast_preempt = 1; - vfe->vfe1.gpgpu_mode = state->drv->gen_ver > 6 ? 1 : 0; + vfe->vfe1.fast_preempt = 0; + vfe->vfe1.gpgpu_mode = state->drv->gen_ver >= 7 ? 
1 : 0; vfe->vfe1.bypass_gateway_ctl = 1; vfe->vfe1.reset_gateway_timer = 1; vfe->vfe1.urb_entries = state->urb.num_vfe_entries; @@ -441,7 +442,7 @@ gpgpu_load_vfe_state(intel_gpgpu_t *state) vfe->vfe1.max_threads = state->max_threads - 1; /* vfe->vfe3.curbe_size = 63; */ /* vfe->vfe3.urbe_size = 13; */ - vfe->vfe4.scoreboard_enable = 1; + vfe->vfe4.scoreboard_enable = 0; intel_batchbuffer_alloc_space(state->batch, sizeof(gen6_vfe_state_inline_t)); ADVANCE_BATCH(state->batch); } @@ -470,11 +471,73 @@ gpgpu_load_idrt(intel_gpgpu_t *state) ADVANCE_BATCH(state->batch); } +static const uint32_t Gen7L3CacheConfigReg2DataTable[] = +{ + // SLM URB DC RO I/S C T + 0x00080040, //{ 0, 256, 0, 256, 0, 0, 0, } + 0x02040040, //{ 0, 256, 128, 128, 0, 0, 0, } + 0x00800040, //{ 0, 256, 32, 0, 64, 32, 128, } + 0x01000038, //{ 0, 224, 64, 0, 64, 32, 128, } + 0x02000030, //{ 0, 224, 128, 0, 64, 32, 64, } + 0x01000038, //{ 0, 224, 64, 0, 128, 32, 64, } + 0x00000038, //{ 0, 224, 0, 0, 128, 32, 128, } + 0x00000040, //{ 0, 256, 0, 0, 128, 0, 128, } + 0x0A140091, //{ 128, 128, 128, 128, 0, 0, 0, } + 0x09100091, //{ 128, 128, 64, 0, 64, 64, 64, } + 0x08900091, //{ 128, 128, 32, 0, 64, 32, 128, } + 0x08900091 //{ 128, 128, 32, 0, 128, 32, 64, } +}; + +static const uint32_t Gen7L3CacheConfigReg3DataTable[] = +{ + // SLM URB DC RO I/S C T + 0x00000000, //{ 0, 256, 0, 256, 0, 0, 0, } + 0x00000000, //{ 0, 256, 128, 128, 0, 0, 0, } + 0x00080410, //{ 0, 256, 32, 0, 64, 32, 128, } + 0x00080410, //{ 0, 224, 64, 0, 64, 32, 128, } + 0x00040410, //{ 0, 224, 128, 0, 64, 32, 64, } + 0x00040420, //{ 0, 224, 64, 0, 128, 32, 64, } + 0x00080420, //{ 0, 224, 0, 0, 128, 32, 128, } + 0x00080020, //{ 0, 256, 0, 0, 128, 0, 128, } + 0x00204080, //{ 128, 128, 128, 128, 0, 0, 0, } + 0x00244890, //{ 128, 128, 64, 0, 64, 64, 64, } + 0x00284490, //{ 128, 128, 32, 0, 64, 32, 128, } + 0x002444A0 //{ 128, 128, 32, 0, 128, 32, 64, } +}; + +// L3 cache stuff +#define L3_CNTL_REG2_ADDRESS_OFFSET ( 0xB020 ) +#define L3_CNTL_REG3_ADDRESS_OFFSET ( 0xB024 ) + +LOCAL void +intel_gpgpu_set_L3(intel_gpgpu_t *state, uint32_t use_barrier) +{ + BEGIN_BATCH(state->batch, 6); + OUT_BATCH(state->batch, CMD_LOAD_REGISTER_IMM | 1); /* length - 2 */ + OUT_BATCH(state->batch, L3_CNTL_REG2_ADDRESS_OFFSET); + if (use_barrier) + OUT_BATCH(state->batch, Gen7L3CacheConfigReg2DataTable[8]); + else + OUT_BATCH(state->batch, Gen7L3CacheConfigReg2DataTable[4]); + + OUT_BATCH(state->batch, CMD_LOAD_REGISTER_IMM | 1); /* length - 2 */ + OUT_BATCH(state->batch, L3_CNTL_REG3_ADDRESS_OFFSET); + if (use_barrier) + OUT_BATCH(state->batch, Gen7L3CacheConfigReg3DataTable[8]); + else + OUT_BATCH(state->batch, Gen7L3CacheConfigReg3DataTable[4]); + ADVANCE_BATCH(state->batch); + + intel_batchbuffer_emit_mi_flush(state->batch); +} + LOCAL void gpgpu_batch_start(intel_gpgpu_t *state) { intel_batchbuffer_start_atomic(state->batch, 256); intel_batchbuffer_emit_mi_flush(state->batch); + if (state->drv->gen_ver >= 7) + intel_gpgpu_set_L3(state, state->ker->use_barrier); gpgpu_select_pipeline(state); gpgpu_set_base_address(state); gpgpu_load_vfe_state(state); @@ -883,6 +946,8 @@ gpgpu_build_binding_table(intel_gpgpu_t *state) dri_bo_unmap(state->binding_table_b.bo); } +#define KB 1024 + static void gpgpu_build_idrt(intel_gpgpu_t *state, genx_gpgpu_kernel_t *kernel, @@ -907,11 +972,32 @@ gpgpu_build_idrt(intel_gpgpu_t *state, desc->desc3.binding_table_pointer = state->binding_table_b.bo->offset >> 5; desc->desc4.curbe_read_len = kernel[i].cst_sz / 32; desc->desc4.curbe_read_offset = 0; - 
desc->desc5.group_threads_num = kernel[i].barrierID; /* BarrierID on GEN6 */ - /* desc->desc5 = 0; - no barriers, groups, etc. */ /* desc->desc6 = 0; - mbz */ /* desc->desc7 = 0; - mbz */ + /* Barriers / SLM are automatically handled on Gen7+ */ + if (state->drv->gen_ver >= 7) { + size_t slm_sz = kernel[i].slm_sz; + desc->desc5.group_threads_num = kernel[i].use_barrier ? kernel[i].thread_n : 0; + desc->desc5.barrier_enable = kernel[i].use_barrier; + if (slm_sz > 0) { + if (slm_sz <= 4 * KB) + slm_sz = 4 * KB; //4KB + else if (slm_sz <= 8 * KB) + slm_sz = 8 * KB; //8KB + else if (slm_sz <= 16 * KB) + slm_sz = 16 * KB; //16KB + else if (slm_sz <= 32 * KB) + slm_sz = 32 * KB; //32KB + else if (slm_sz <= 64 * KB) + slm_sz = 64 * KB; //64KB + slm_sz = slm_sz >> 12; + } + desc->desc5.slm_sz = slm_sz; + } + else + desc->desc5.group_threads_num = kernel[i].barrierID; /* BarrierID on GEN6 */ + dri_bo_emit_reloc(bo, I915_GEM_DOMAIN_INSTRUCTION, 0, 0, @@ -950,6 +1036,7 @@ gpgpu_upload_constants(intel_gpgpu_t *state, void* data, uint32_t size) LOCAL void gpgpu_states_setup(intel_gpgpu_t *state, genx_gpgpu_kernel_t *kernel, uint32_t ker_n) { + state->ker = kernel; gpgpu_build_sampler_table(state); gpgpu_build_binding_table(state); gpgpu_build_idrt(state, kernel, ker_n); @@ -1014,7 +1101,7 @@ gpgpu_walker(intel_gpgpu_t *state, { BEGIN_BATCH(state->batch, 11); OUT_BATCH(state->batch, CMD_GPGPU_WALKER | 9); - OUT_BATCH(state->batch, 0); /* kernel index */ + OUT_BATCH(state->batch, 0); /* kernel index == 0 */ OUT_BATCH(state->batch, (1 << 30) | (thread_n-1)); /* SIMD16 | thread max */ OUT_BATCH(state->batch, global_wk_off[0]); OUT_BATCH(state->batch, global_wk_sz[0]-1); @@ -1025,6 +1112,11 @@ gpgpu_walker(intel_gpgpu_t *state, OUT_BATCH(state->batch, ~0x0); OUT_BATCH(state->batch, ~0x0); ADVANCE_BATCH(state->batch); + + BEGIN_BATCH(state->batch, 2); + OUT_BATCH(state->batch, CMD_MEDIA_STATE_FLUSH | 0); + OUT_BATCH(state->batch, 0); /* kernel index == 0 */ + ADVANCE_BATCH(state->batch); } LOCAL int32_t diff --git a/src/intel/intel_gpgpu.h b/src/intel/intel_gpgpu.h index ceb7982..1cd5eb0 100644 --- a/src/intel/intel_gpgpu.h +++ b/src/intel/intel_gpgpu.h @@ -36,15 +36,17 @@ enum gen6_cache_control { #define MAX_SAMPLERS 16 /* Use this structure to bind kernels in the gpgpu state */ -typedef struct genx_gpgpu_kernel -{ +typedef struct genx_gpgpu_kernel { const char *name; /* kernel name and bo name */ uint32_t grf_blocks; /* register blocks kernel wants (in 8 reg blocks) */ - uint32_t cst_sz; /* indicates if kernel needs constants */ + uint32_t cst_sz; /* total size of all constants */ const uint32_t *bin; /* binary code of the kernel */ int32_t size; /* kernel code size */ struct _drm_intel_bo *bo;/* kernel code in the proper addr space */ int32_t barrierID; /* barrierID for _this_ kernel */ + uint32_t use_barrier:1; /* For gen7 (automatic barrier management) */ + uint32_t thread_n:15; /* For gen7 (automatic barrier management) */ + uint32_t slm_sz:16; /* For gen7 (automatic SLM allocation) */ } genx_gpgpu_kernel_t; /* Convenient abstraction of the device */ -- 2.7.4
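
Notes on the patch:

intel_gpgpu_set_L3() programs L3CNTLREG2 (0xB020) and L3CNTLREG3 (0xB024)
with MI_LOAD_REGISTER_IMM; the DW0 length field of that packet holds the
total dword count minus two, hence the "| 1" on the 3-dword form. The code
only ever selects two rows of the config tables: row 8, the first
SLM-enabled partitioning (128KB SLM / 128KB URB / 128KB DC / 128KB RO per
the table comments), when the kernel uses barriers, and row 4, an SLM-less
split (224KB URB / 128KB DC), otherwise. A sketch of that selection under
those assumptions (gen7_l3_config is a hypothetical helper; the patch
open-codes the choice):

  /* Rows of Gen7L3CacheConfigReg{2,3}DataTable used by the patch. */
  enum { GEN7_L3_CFG_NO_SLM = 4, GEN7_L3_CFG_SLM = 8 };

  static void
  gen7_l3_config(uint32_t use_barrier, uint32_t *reg2, uint32_t *reg3)
  {
    const int row = use_barrier ? GEN7_L3_CFG_SLM : GEN7_L3_CFG_NO_SLM;
    *reg2 = Gen7L3CacheConfigReg2DataTable[row];
    *reg3 = Gen7L3CacheConfigReg3DataTable[row];
  }

The patch follows the two register writes with an MI flush before the rest
of the batch is built. Note also that when use_barrier is set,
gpgpu_build_idrt() puts thread_n (the HW thread count of the group,
local_sz / 16 for SIMD16) in group_threads_num, whereas Gen6 keeps using
that field for the barrierID.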
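
The Gen7 interface descriptor encodes shared local memory in a 5-bit field
(see the slm_sz rename in gen6_interface_descriptor): the byte size is
rounded up to the next power-of-two bucket between 4KB and 64KB, then
divided by 4KB. A minimal standalone sketch of the cascade that
gpgpu_build_idrt() open-codes; the helper name gen7_slm_encode is
hypothetical:

  #include <stddef.h>
  #include <stdint.h>

  #define KB 1024

  /* Encode an SLM byte size for the 5-bit slm_sz descriptor field:
   * 0 -> 0 (no SLM), 4KB -> 1, 8KB -> 2, 16KB -> 4, 32KB -> 8,
   * 64KB -> 16, matching the "0..16 - 0K..64K" comment. */
  static uint32_t
  gen7_slm_encode(size_t slm_sz)
  {
    if (slm_sz == 0)
      return 0;
    if (slm_sz <= 4 * KB)       slm_sz = 4 * KB;
    else if (slm_sz <= 8 * KB)  slm_sz = 8 * KB;
    else if (slm_sz <= 16 * KB) slm_sz = 16 * KB;
    else if (slm_sz <= 32 * KB) slm_sz = 32 * KB;
    else                        slm_sz = 64 * KB;
    return (uint32_t) (slm_sz >> 12);
  }

Only the values 0, 1, 2, 4, 8 and 16 are produced. One caveat worth noting:
genx_gpgpu_kernel_t stores the raw byte size in a 16-bit slm_sz bitfield,
so a kernel needing exactly 64KB (65536 needs 17 bits) would truncate
there, while anything below 64KB rounds and encodes correctly.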
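
On Gen7 the CURBE is built per HW thread: each thread gets a cst_sz slice
holding the user constants (aligned to 32 bytes, with id_offset recording
that aligned size) followed by 3 x 32 bytes of local IDs -- one GRF per
dimension, i.e. 16 uint16_t lanes for SIMD16. The fill loop body falls
outside the hunk; a sketch of what it could look like, assuming a
row-major decomposition of the linear work-item index
(curbe_fill_local_ids and the decomposition order are assumptions, not
code from this patch):

  #include <stdint.h>
  #include <string.h>

  /* Fill the three local-ID GRFs of one SIMD16 thread. Lane c of
   * thread t covers work item t*16 + c of the work group. */
  static void
  curbe_fill_local_ids(char *slice, size_t id_offset, uint32_t thread_id,
                       const size_t *local_wk_sz)
  {
    uint16_t ids[3][16]; /* local sizes fit in 16 bits */
    uint32_t c;
    for (c = 0; c < 16; ++c) {
      const uint32_t linear = thread_id * 16 + c;
      ids[0][c] = (uint16_t) (linear % local_wk_sz[0]);
      ids[1][c] = (uint16_t) ((linear / local_wk_sz[0]) % local_wk_sz[1]);
      ids[2][c] = (uint16_t) (linear / (local_wk_sz[0] * local_wk_sz[1]));
    }
    memcpy(slice + id_offset,      ids[0], 32);
    memcpy(slice + id_offset + 32, ids[1], 32);
    memcpy(slice + id_offset + 64, ids[2], 32);
  }

With the cl_calloc(thread_n, cst_sz) allocation shown above, the call site
would be curbe_fill_local_ids(data + i * cst_sz, id_offset, i,
local_wk_sz) inside the for (i = 0; i < thread_n; ++i) loop.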
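
cl_kernel_create_cst_buffer() patches each vector parameter through three
separate curbe keys, one uint32 slot per dimension at src_offset 0, 4 and
8. Copying sizeof(uint32_t) bytes from global_wk_off + n takes the low 32
bits of a size_t, which matches the 32-bit slots on a little-endian
target. The repeated lookup pattern could be folded into a loop; a
hypothetical refactoring sketch, as it would live in cl_kernel.c next to
the code above (cst_patch_vec3 is not part of the patch):

  static void
  cst_patch_vec3(cl_kernel k, char *data, uint32_t type, const size_t *v)
  {
    uint32_t dim;
    for (dim = 0; dim < 3; ++dim) {
      cl_curbe_patch_info_t *info =
        cl_kernel_get_curbe_info(k, cl_curbe_key(type, 0, dim * 4));
      if (info != NULL) {
        const uint32_t value = (uint32_t) v[dim]; /* low 32 bits */
        memcpy(data + info->offsets[0], &value, sizeof(uint32_t));
      }
    }
  }

The three blocks for DATA_PARAMETER_GLOBAL_WORK_OFFSET,
DATA_PARAMETER_GLOBAL_WORK_SIZE and DATA_PARAMETER_LOCAL_WORK_SIZE would
then collapse to one call each, e.g. cst_patch_vec3(k, data,
DATA_PARAMETER_GLOBAL_WORK_OFFSET, global_wk_off).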
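
Finally, on the batch budget: the DW0 length field of these pipeline
packets holds the total dword count minus two, so the 11-dword
GPGPU_WALKER is emitted with "| 9" and the new 2-dword MEDIA_STATE_FLUSH
with "| 0". The trailing flush adds 8 bytes per enqueue, which the
cl_kernel_compute_batch_sz bump from 256 + 16 to 256 + 32 covers with
slack. A small sketch of the bias (the macro names are hypothetical;
CMD_GPGPU_WALKER and CMD_MEDIA_STATE_FLUSH come from intel_defines.h):

  /* DW0 length bias: total packet dwords minus two. */
  #define CMD_LEN(total_dwords)  ((total_dwords) - 2)
  #define GPGPU_WALKER_DW0       (CMD_GPGPU_WALKER | CMD_LEN(11))     /* | 9 */
  #define MEDIA_STATE_FLUSH_DW0  (CMD_MEDIA_STATE_FLUSH | CMD_LEN(2)) /* | 0 */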