From a7e1411408134bdf107ccf9147ee6e02afe94645 Mon Sep 17 00:00:00 2001 From: bsegovia Date: Tue, 29 Nov 2011 04:37:51 +0000 Subject: [PATCH] Ported the code to HSW --- CMakeLists.txt | 13 ++++++++++--- src/cl_command_queue.c | 7 +++++++ src/cl_device_id.c | 37 +++++++++++++++++++++++++++++++------ src/cl_kernel.c | 29 +++++++++++++++++++++-------- src/cl_kernel.h | 26 ++++++++++++++++++-------- src/cl_program.c | 10 +++++++++- src/intel/intel_driver.c | 8 ++++++-- src/intel/intel_gpgpu.c | 18 ++++++++++++------ 8 files changed, 114 insertions(+), 34 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 07ed3b9..5e974c6 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -13,9 +13,10 @@ INCLUDE_DIRECTORIES(${CMAKE_CURRENT_BINARY_DIR} ${CMAKE_CURRENT_SOURCE_DIR}) SET(CMAKE_VERBOSE_MAKEFILE "false") SET(CMAKE_BUILD_TYPE "Debug" CACHE STRING "Build type") -SET(USE_INTEL_COMPILER CACHE BOOL "false") -SET(EMULATE_IVB CACHE BOOL "false") -SET(EMULATE_SNB CACHE BOOL "false") +SET(EMULATE_IVB false CACHE BOOL "To emulate IVB") +SET(EMULATE_SNB false CACHE BOOL "To emulate SNB") +SET(EMULATE_HSW false CACHE BOOL "To emulate HSW") +SET(USE_OLD_COMPILER false CACHE BOOL "To use the old compiler (required for SNB)") ADD_DEFINITIONS(-D__$(USER)__) IF (EMULATE_HSW) @@ -32,6 +33,12 @@ ELSE (EMULATE_IVB) ADD_DEFINITIONS(-DEMULATE_GEN=0) ENDIF (EMULATE_HSW) +IF (USE_OLD_COMPILER) + ADD_DEFINITIONS (-DUSE_OLD_COMPILER=1) +ELSE (USE_OLD_COMPILER) + ADD_DEFINITIONS (-DUSE_OLD_COMPILER=0) +ENDIF (USE_OLD_COMPILER) + IF (USE_FULSIM) ADD_DEFINITIONS(-DUSE_FULSIM=1) ELSE (USE_FULSIM) diff --git a/src/cl_command_queue.c b/src/cl_command_queue.c index 6976bcc..466bd6a 100644 --- a/src/cl_command_queue.c +++ b/src/cl_command_queue.c @@ -273,6 +273,13 @@ cl_run_fulsim(void) system("wine AubLoad.exe dump.aub -device ivbB0"); else system("wine AubLoad.exe dump.aub -device ivbB0 -debug"); +#elif EMULATE_GEN == 75 /* HSW */ + if (debug_mode == NULL || strcmp(debug_mode, "1")) + system("wine AubLoad.exe dump.aub -device hsw.h.a0"); + else + system("wine AubLoad.exe dump.aub -device hsw.h.a0 -debug"); +#else +#error "Unknown device" #endif } diff --git a/src/cl_device_id.c b/src/cl_device_id.c index b20b132..3c66f70 100644 --- a/src/cl_device_id.c +++ b/src/cl_device_id.c @@ -75,15 +75,36 @@ static struct _cl_device_id intel_ivb_gt1_device = { #include "cl_gen7_device.h" }; +/* XXX we clone IVB for HSW now */ +static struct _cl_device_id intel_hsw_device = { + .max_compute_unit = 64, + .max_work_item_sizes = {512, 512, 512}, + .max_work_group_size = 512, + .max_clock_frequency = 1000, + .wg_sz = 512, + .compile_wg_sz = {0}, + +#include "cl_gen75_device.h" +}; + LOCAL cl_device_id cl_get_gt_device(void) { cl_device_id ret = NULL; const int device_id = cl_intel_get_device_id(); - if (device_id == PCI_CHIP_IVYBRIDGE_GT1 || - device_id == PCI_CHIP_IVYBRIDGE_M_GT1 || - device_id == PCI_CHIP_IVYBRIDGE_S_GT1) { + /* XXX we pick IVB for HSW now */ + if (device_id == PCI_CHIP_HASWELL_M || + device_id == PCI_CHIP_HASWELL_L || + device_id == PCI_CHIP_HASWELL_M0 || + device_id == PCI_CHIP_HASWELL_D0) { + intel_hsw_device.vendor_id = device_id; + intel_hsw_device.platform = intel_platform; + ret = &intel_hsw_device; + } + else if (device_id == PCI_CHIP_IVYBRIDGE_GT1 || + device_id == PCI_CHIP_IVYBRIDGE_M_GT1 || + device_id == PCI_CHIP_IVYBRIDGE_S_GT1) { intel_ivb_gt1_device.vendor_id = device_id; intel_ivb_gt1_device.platform = intel_platform; ret = &intel_ivb_gt1_device; @@ -174,7 +195,8 @@ cl_get_device_info(cl_device_id device, if (UNLIKELY(device != &intel_snb_gt1_device && device != &intel_snb_gt2_device && device != &intel_ivb_gt1_device && - device != &intel_ivb_gt2_device)) + device != &intel_ivb_gt2_device && + device != &intel_hsw_device)) return CL_INVALID_DEVICE; if (UNLIKELY(param_value == NULL)) return CL_INVALID_VALUE; @@ -249,14 +271,17 @@ cl_device_get_version(cl_device_id device, cl_int *ver) if (UNLIKELY(device != &intel_snb_gt1_device && device != &intel_snb_gt2_device && device != &intel_ivb_gt1_device && - device != &intel_ivb_gt2_device)) + device != &intel_ivb_gt2_device && + device != &intel_hsw_device)) return CL_INVALID_DEVICE; if (ver == NULL) return CL_SUCCESS; if (device == &intel_snb_gt1_device || device == &intel_snb_gt2_device) *ver = 6; - else + else if (device == &intel_ivb_gt1_device || device == &intel_ivb_gt2_device) *ver = 7; + else + *ver = 75; return CL_SUCCESS; } #undef DECL_FIELD diff --git a/src/cl_kernel.c b/src/cl_kernel.c index 378e5c4..4b67d7d 100644 --- a/src/cl_kernel.c +++ b/src/cl_kernel.c @@ -350,18 +350,26 @@ cl_kernel_setup_patch_list(cl_kernel k, const char *patch, size_t sz) case PATCH_TOKEN_CONSTANT_MEMORY_KERNEL_ARGUMENT: case PATCH_TOKEN_GLOBAL_MEMORY_KERNEL_ARGUMENT: { - cl_global_memory_object_arg_t *from = (cl_global_memory_object_arg_t *) patch; TRY_ALLOC (arg_info, CALLOC(cl_arg_info_t)); - arg_info->arg_index = from->index; - arg_info->offset = from->offset; - if (item->token == PATCH_TOKEN_GLOBAL_MEMORY_KERNEL_ARGUMENT) + if (item->token == PATCH_TOKEN_GLOBAL_MEMORY_KERNEL_ARGUMENT) { + cl_global_memory_object_arg_t *from = (cl_global_memory_object_arg_t *) patch; + arg_info->arg_index = from->index; + arg_info->offset = from->offset; arg_info->type = OCLRT_ARG_TYPE_BUFFER; - else if (item->token == PATCH_TOKEN_CONSTANT_MEMORY_KERNEL_ARGUMENT) + } + else if (item->token == PATCH_TOKEN_CONSTANT_MEMORY_KERNEL_ARGUMENT) { + cl_global_memory_object_arg_t *from = (cl_global_memory_object_arg_t *) patch; + arg_info->arg_index = from->index; + arg_info->offset = from->offset; arg_info->type = OCLRT_ARG_TYPE_CONST; - else if (item->token == PATCH_TOKEN_IMAGE_MEMORY_KERNEL_ARGUMENT) + } + else if (item->token == PATCH_TOKEN_IMAGE_MEMORY_KERNEL_ARGUMENT) { + cl_image_memory_object_arg_t *from = (cl_image_memory_object_arg_t *) patch; + arg_info->arg_index = from->index; + arg_info->offset = from->offset; arg_info->type = OCLRT_ARG_TYPE_IMAGE; - else + } else assert(0); arg_info->sz = sizeof(cl_mem); @@ -400,7 +408,12 @@ cl_kernel_setup_patch_list(cl_kernel k, const char *patch, size_t sz) case DATA_PARAMETER_IMAGE_CHANNEL_ORDER: case DATA_PARAMETER_NUM_HARDWARE_THREADS: { - curbe_key = cl_curbe_key(data->type, data->index, data->src_offset); +#if USE_OLD_COMPILER == 0 + if (data->type == DATA_PARAMETER_SUM_OF_LOCAL_MEMORY_ARGUMENT_SIZES) + curbe_key = cl_curbe_key(data->type, data->index, 0); + else +#endif + curbe_key = cl_curbe_key(data->type, data->index, data->src_offset); curbe_info = cl_kernel_get_curbe_info_list(k, curbe_key); if (curbe_info != NULL) curbe_info->offsets[++curbe_info->last] = data->offset; diff --git a/src/cl_kernel.h b/src/cl_kernel.h index 704d1c5..cd4d00e 100644 --- a/src/cl_kernel.h +++ b/src/cl_kernel.h @@ -31,6 +31,13 @@ /* XXX Structures extracted from the WINDOWS CODE BASE */ /***************************************************************************/ +// Some fields went from 1 to 4 bytes with the new compiler +#if USE_OLD_COMPILER +typedef uint8_t cl_compiler_boolean_t; +#else +typedef uint32_t cl_compiler_boolean_t; +#endif /* USE_OLD_COMPILER */ + typedef struct cl_program_header { uint32_t magic; uint32_t version; @@ -41,11 +48,11 @@ typedef struct cl_program_header { typedef struct cl_arg_info { uint32_t arg_index; uint32_t type; - uint8_t is_null; + cl_compiler_boolean_t is_null; uint32_t offset; uint32_t sz; void *obj; - uint8_t is_patched; + cl_compiler_boolean_t is_patched; struct cl_arg_info *next; } cl_arg_info_t; @@ -57,8 +64,8 @@ typedef struct cl_curbe_patch_info { uint32_t arg_index; uint32_t sz; uint32_t src_offset; - uint8_t is_patched; - uint8_t is_local; + cl_compiler_boolean_t is_patched; + cl_compiler_boolean_t is_local; struct cl_curbe_patch_info *next; } cl_curbe_patch_info_t; @@ -68,13 +75,13 @@ typedef struct cl_kernel_header { uint32_t patch_list_sz; } cl_kernel_header_t; -typedef struct cl_kernel_header7_5 { +typedef struct cl_kernel_header75 { cl_kernel_header_t header; uint32_t kernel_heap_sz; uint32_t general_state_heap_sz; uint32_t dynamic_state_heap_sz; uint32_t surface_state_heap_sz; -} cl_kernel_header7_5_t; +} cl_kernel_header75_t; typedef struct cl_kernel_header7 { cl_kernel_header_t header; @@ -104,11 +111,14 @@ typedef struct cl_global_memory_object_arg { uint32_t offset; } cl_global_memory_object_arg_t; -typedef struct cl_patch_image_memory_object_arg { +#if USE_OLD_COMPILER == 0 +typedef struct cl_image_memory_object_arg { cl_patch_item_header_t header; uint32_t index; + uint32_t image_type; uint32_t offset; -} cl_patch_image_memory_object_arg_t; +} cl_image_memory_object_arg_t; +#endif typedef struct cl_patch_constant_memory_object_arg { uint32_t index; diff --git a/src/cl_program.c b/src/cl_program.c index 7cf39df..5916185 100644 --- a/src/cl_program.c +++ b/src/cl_program.c @@ -32,7 +32,11 @@ #include #include -static int icbe_ver = 1001L; +#if USE_OLD_COMPILER +static const int icbe_ver = 1001; +#else +static const int icbe_ver = 1002; +#endif #define DECL_LOAD_HEADER(GEN) \ static const char* \ @@ -54,6 +58,7 @@ JOIN(cl_kernel_load_header,GEN)(cl_kernel ker, \ DECL_LOAD_HEADER(6) DECL_LOAD_HEADER(7) +DECL_LOAD_HEADER(75) #undef DECL_LOAD_HEADER @@ -82,6 +87,9 @@ cl_program_decode(cl_program p) /* Format changes from generation to generation */ TRY_ALLOC (p->ker[i], cl_kernel_new()); switch (header->device) { + case IGFX_GEN7_5_CORE: + ker = cl_kernel_load_header75(p->ker[i], ker, &name_sz, &ker_sz); + break; case IGFX_GEN7_CORE: ker = cl_kernel_load_header7(p->ker[i], ker, &name_sz, &ker_sz); break; diff --git a/src/intel/intel_driver.c b/src/intel/intel_driver.c index dd48222..305cd4b 100644 --- a/src/intel/intel_driver.c +++ b/src/intel/intel_driver.c @@ -109,14 +109,18 @@ intel_driver_init(intel_driver_t *driver, int dev_fd) #if EMULATE_GEN driver->gen_ver = EMULATE_GEN; - if (EMULATE_GEN == 7) + if (EMULATE_GEN == 75) + driver->device_id = PCI_CHIP_HASWELL_L; /* we pick L for HSW */ + else if (EMULATE_GEN == 7) driver->device_id = PCI_CHIP_IVYBRIDGE_GT2; /* we pick GT2 for IVB */ else if (EMULATE_GEN == 6) driver->device_id = PCI_CHIP_SANDYBRIDGE_GT2; /* we pick GT2 for SNB */ else FATAL ("Unsupported Gen for emulation"); #else - if (IS_GEN7(driver->device_id)) + if (IS_GEN75(driver->device_id)) + driver->gen_ver = 75; + else if (IS_GEN7(driver->device_id)) driver->gen_ver = 7; else if (IS_GEN6(driver->device_id)) driver->gen_ver = 6; diff --git a/src/intel/intel_gpgpu.c b/src/intel/intel_gpgpu.c index 8bdb52a..3246a7e 100644 --- a/src/intel/intel_gpgpu.c +++ b/src/intel/intel_gpgpu.c @@ -171,14 +171,16 @@ gpgpu_load_vfe_state(intel_gpgpu_t *state) intel_batchbuffer_alloc_space(state->batch,0); memset(vfe, 0, sizeof(struct gen6_vfe_state_inline)); - vfe->vfe1.gpgpu_mode = state->drv->gen_ver >= 7 ? 1 : 0; + vfe->vfe1.gpgpu_mode = + (state->drv->gen_ver == 7 || state->drv->gen_ver == 75) ? 1 : 0; vfe->vfe1.bypass_gateway_ctl = 1; vfe->vfe1.reset_gateway_timer = 1; vfe->vfe1.max_threads = state->max_threads - 1; vfe->vfe1.urb_entries = 64; vfe->vfe3.curbe_size = 63; vfe->vfe3.urbe_size = 13; - vfe->vfe4.scoreboard_mask = state->drv->gen_ver >= 7 ? 0 : 0x80000000; + vfe->vfe4.scoreboard_mask = + (state->drv->gen_ver == 7 || state->drv->gen_ver == 75) ? 0 : 0x80000000; intel_batchbuffer_alloc_space(state->batch, sizeof(gen6_vfe_state_inline_t)); ADVANCE_BATCH(state->batch); } @@ -339,7 +341,7 @@ gpgpu_batch_start(intel_gpgpu_t *state) { intel_batchbuffer_start_atomic(state->batch, 256); gpgpu_pipe_control(state); - if (state->drv->gen_ver >= 7) + if (state->drv->gen_ver == 7 || state->drv->gen_ver == 75) intel_gpgpu_set_L3(state, state->ker->use_barrier); gpgpu_select_pipeline(state); gpgpu_set_base_address(state); @@ -589,8 +591,10 @@ gpgpu_bind_buf(intel_gpgpu_t *state, assert(index < MAX_SURFACES); if(state->drv->gen_ver == 6) gpgpu_bind_buf_gen6(state, index, obj_bo, size, cchint); - else if (state->drv->gen_ver == 7) + else if (state->drv->gen_ver == 7 || state->drv->gen_ver == 75) gpgpu_bind_buf_gen7(state, index, obj_bo, size, cchint); + else + NOT_IMPLEMENTED; } LOCAL void @@ -606,8 +610,10 @@ gpgpu_bind_image2D(intel_gpgpu_t *state, assert(index < MAX_SURFACES); if(state->drv->gen_ver == 6) gpgpu_bind_image2D_gen6(state, index, obj_bo, format, w, h, bpp, cchint); - else if (state->drv->gen_ver == 7) + else if (state->drv->gen_ver == 7 || state->drv->gen_ver == 75) gpgpu_bind_image2D_gen7(state, index, obj_bo, format, w, h, bpp, cchint); + else + NOT_IMPLEMENTED; } static void @@ -634,7 +640,7 @@ gpgpu_build_idrt(intel_gpgpu_t *state, desc->desc4.curbe_read_offset = 0; /* Barriers / SLM are automatically handled on Gen7+ */ - if (state->drv->gen_ver >= 7) { + if (state->drv->gen_ver == 7 || state->drv->gen_ver == 75) { size_t slm_sz = kernel[i].slm_sz; desc->desc5.group_threads_num = kernel[i].use_barrier ? kernel[i].thread_n : 0; desc->desc5.barrier_enable = kernel[i].use_barrier; -- 2.7.4