Ported the code to HSW
author bsegovia <devnull@localhost>
Tue, 29 Nov 2011 04:37:51 +0000 (04:37 +0000)
committer Keith Packard <keithp@keithp.com>
Fri, 10 Aug 2012 23:15:05 +0000 (16:15 -0700)
CMakeLists.txt
src/cl_command_queue.c
src/cl_device_id.c
src/cl_kernel.c
src/cl_kernel.h
src/cl_program.c
src/intel/intel_driver.c
src/intel/intel_gpgpu.c

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 07ed3b9..5e974c6 100644
@@ -13,9 +13,10 @@ INCLUDE_DIRECTORIES(${CMAKE_CURRENT_BINARY_DIR} ${CMAKE_CURRENT_SOURCE_DIR})
 
 SET(CMAKE_VERBOSE_MAKEFILE "false")
 SET(CMAKE_BUILD_TYPE "Debug" CACHE STRING "Build type")
-SET(USE_INTEL_COMPILER CACHE BOOL "false")
-SET(EMULATE_IVB CACHE BOOL "false")
-SET(EMULATE_SNB CACHE BOOL "false")
+SET(EMULATE_IVB false CACHE BOOL "To emulate IVB")
+SET(EMULATE_SNB false CACHE BOOL "To emulate SNB")
+SET(EMULATE_HSW false CACHE BOOL "To emulate HSW")
+SET(USE_OLD_COMPILER false CACHE BOOL "To use the old compiler (required for SNB)")
 ADD_DEFINITIONS(-D__$(USER)__)
 
 IF (EMULATE_HSW)
@@ -32,6 +33,12 @@ ELSE (EMULATE_IVB)
   ADD_DEFINITIONS(-DEMULATE_GEN=0)
 ENDIF (EMULATE_HSW)
 
+IF (USE_OLD_COMPILER)
+  ADD_DEFINITIONS (-DUSE_OLD_COMPILER=1)
+ELSE (USE_OLD_COMPILER)
+  ADD_DEFINITIONS (-DUSE_OLD_COMPILER=0)
+ENDIF (USE_OLD_COMPILER)
+
 IF (USE_FULSIM)
   ADD_DEFINITIONS(-DUSE_FULSIM=1)
 ELSE (USE_FULSIM)
diff --git a/src/cl_command_queue.c b/src/cl_command_queue.c
index 6976bcc..466bd6a 100644
@@ -273,6 +273,13 @@ cl_run_fulsim(void)
     system("wine AubLoad.exe dump.aub -device ivbB0");
   else
     system("wine AubLoad.exe dump.aub -device ivbB0 -debug");
+#elif EMULATE_GEN == 75 /* HSW */
+  if (debug_mode == NULL || strcmp(debug_mode, "1"))
+    system("wine AubLoad.exe dump.aub -device hsw.h.a0");
+  else
+    system("wine AubLoad.exe dump.aub -device hsw.h.a0 -debug");
+#else
+#error "Unknown device"
 #endif
 }
 
diff --git a/src/cl_device_id.c b/src/cl_device_id.c
index b20b132..3c66f70 100644
@@ -75,15 +75,36 @@ static struct _cl_device_id intel_ivb_gt1_device = {
 #include "cl_gen7_device.h"
 };
 
+/* XXX we clone IVB for HSW now */
+static struct _cl_device_id intel_hsw_device = {
+  .max_compute_unit = 64,
+  .max_work_item_sizes = {512, 512, 512},
+  .max_work_group_size = 512,
+  .max_clock_frequency = 1000,
+  .wg_sz = 512,
+  .compile_wg_sz = {0},        
+
+#include "cl_gen75_device.h"
+};
+
 LOCAL cl_device_id
 cl_get_gt_device(void)
 {
   cl_device_id ret = NULL;
   const int device_id = cl_intel_get_device_id();
 
-  if (device_id == PCI_CHIP_IVYBRIDGE_GT1   ||
-      device_id == PCI_CHIP_IVYBRIDGE_M_GT1 ||
-      device_id == PCI_CHIP_IVYBRIDGE_S_GT1) {
+  /* XXX we pick IVB for HSW now */
+  if (device_id == PCI_CHIP_HASWELL_M   ||
+      device_id == PCI_CHIP_HASWELL_L   ||
+      device_id == PCI_CHIP_HASWELL_M0  ||
+      device_id == PCI_CHIP_HASWELL_D0) {
+    intel_hsw_device.vendor_id = device_id;
+    intel_hsw_device.platform = intel_platform;
+    ret = &intel_hsw_device;
+  }
+  else if (device_id == PCI_CHIP_IVYBRIDGE_GT1   ||
+           device_id == PCI_CHIP_IVYBRIDGE_M_GT1 ||
+           device_id == PCI_CHIP_IVYBRIDGE_S_GT1) {
     intel_ivb_gt1_device.vendor_id = device_id;
     intel_ivb_gt1_device.platform = intel_platform;
     ret = &intel_ivb_gt1_device;
@@ -174,7 +195,8 @@ cl_get_device_info(cl_device_id     device,
   if (UNLIKELY(device != &intel_snb_gt1_device &&
                device != &intel_snb_gt2_device &&
                device != &intel_ivb_gt1_device &&
-               device != &intel_ivb_gt2_device))
+               device != &intel_ivb_gt2_device &&
+               device != &intel_hsw_device))
     return CL_INVALID_DEVICE;
   if (UNLIKELY(param_value == NULL))
     return CL_INVALID_VALUE;
@@ -249,14 +271,17 @@ cl_device_get_version(cl_device_id device, cl_int *ver)
   if (UNLIKELY(device != &intel_snb_gt1_device &&
                device != &intel_snb_gt2_device &&
                device != &intel_ivb_gt1_device &&
-               device != &intel_ivb_gt2_device))
+               device != &intel_ivb_gt2_device &&
+               device != &intel_hsw_device))
     return CL_INVALID_DEVICE;
   if (ver == NULL)
     return CL_SUCCESS;
   if (device == &intel_snb_gt1_device || device == &intel_snb_gt2_device)
     *ver = 6;
-  else
+  else if (device == &intel_ivb_gt1_device || device == &intel_ivb_gt2_device)
     *ver = 7;
+  else
+    *ver = 75;
   return CL_SUCCESS;
 }
 #undef DECL_FIELD
diff --git a/src/cl_kernel.c b/src/cl_kernel.c
index 378e5c4..4b67d7d 100644
@@ -350,18 +350,26 @@ cl_kernel_setup_patch_list(cl_kernel k, const char *patch, size_t sz)
       case PATCH_TOKEN_CONSTANT_MEMORY_KERNEL_ARGUMENT:
       case PATCH_TOKEN_GLOBAL_MEMORY_KERNEL_ARGUMENT:
       {
-        cl_global_memory_object_arg_t *from = (cl_global_memory_object_arg_t *) patch;
 
         TRY_ALLOC (arg_info, CALLOC(cl_arg_info_t));
-        arg_info->arg_index = from->index;
-        arg_info->offset = from->offset;
-        if (item->token == PATCH_TOKEN_GLOBAL_MEMORY_KERNEL_ARGUMENT)
+        if (item->token == PATCH_TOKEN_GLOBAL_MEMORY_KERNEL_ARGUMENT) {
+          cl_global_memory_object_arg_t *from = (cl_global_memory_object_arg_t *) patch;
+          arg_info->arg_index = from->index;
+          arg_info->offset = from->offset;
           arg_info->type = OCLRT_ARG_TYPE_BUFFER;
-        else if (item->token == PATCH_TOKEN_CONSTANT_MEMORY_KERNEL_ARGUMENT)
+        }
+        else if (item->token == PATCH_TOKEN_CONSTANT_MEMORY_KERNEL_ARGUMENT) {
+          cl_global_memory_object_arg_t *from = (cl_global_memory_object_arg_t *) patch;
+          arg_info->arg_index = from->index;
+          arg_info->offset = from->offset;
           arg_info->type = OCLRT_ARG_TYPE_CONST;
-        else if (item->token == PATCH_TOKEN_IMAGE_MEMORY_KERNEL_ARGUMENT)
+        }
+        else if (item->token == PATCH_TOKEN_IMAGE_MEMORY_KERNEL_ARGUMENT) {
+          cl_image_memory_object_arg_t *from = (cl_image_memory_object_arg_t *) patch;
+          arg_info->arg_index = from->index;
+          arg_info->offset = from->offset;
           arg_info->type = OCLRT_ARG_TYPE_IMAGE;
-        else
+        } else
           assert(0);
 
         arg_info->sz = sizeof(cl_mem);
@@ -400,7 +408,12 @@ cl_kernel_setup_patch_list(cl_kernel k, const char *patch, size_t sz)
           case DATA_PARAMETER_IMAGE_CHANNEL_ORDER:
           case DATA_PARAMETER_NUM_HARDWARE_THREADS:
           {
-            curbe_key = cl_curbe_key(data->type, data->index, data->src_offset);
+#if USE_OLD_COMPILER == 0
+            if (data->type == DATA_PARAMETER_SUM_OF_LOCAL_MEMORY_ARGUMENT_SIZES)
+              curbe_key = cl_curbe_key(data->type, data->index, 0);
+            else
+#endif
+              curbe_key = cl_curbe_key(data->type, data->index, data->src_offset);
             curbe_info = cl_kernel_get_curbe_info_list(k, curbe_key);
             if (curbe_info != NULL)
               curbe_info->offsets[++curbe_info->last] = data->offset;
diff --git a/src/cl_kernel.h b/src/cl_kernel.h
index 704d1c5..cd4d00e 100644
 /* XXX Structures extracted from the WINDOWS CODE BASE                     */
 /***************************************************************************/
 
+// Some fields went from 1 to 4 bytes with the new compiler
+#if USE_OLD_COMPILER
+typedef uint8_t cl_compiler_boolean_t;
+#else
+typedef uint32_t cl_compiler_boolean_t;
+#endif /* USE_OLD_COMPILER */
+
 typedef struct cl_program_header {
   uint32_t magic;
   uint32_t version;
@@ -41,11 +48,11 @@ typedef struct cl_program_header {
 typedef struct cl_arg_info {
   uint32_t arg_index;
   uint32_t type;
-  uint8_t is_null;
+  cl_compiler_boolean_t is_null;
   uint32_t offset;
   uint32_t sz;
   void *obj;
-  uint8_t is_patched;
+  cl_compiler_boolean_t is_patched;
   struct cl_arg_info *next;
 } cl_arg_info_t;
 
@@ -57,8 +64,8 @@ typedef struct cl_curbe_patch_info {
   uint32_t arg_index;
   uint32_t sz;
   uint32_t src_offset;
-  uint8_t is_patched;
-  uint8_t is_local;
+  cl_compiler_boolean_t is_patched;
+  cl_compiler_boolean_t is_local;
   struct cl_curbe_patch_info *next;
 } cl_curbe_patch_info_t;
 
@@ -68,13 +75,13 @@ typedef struct cl_kernel_header {
   uint32_t patch_list_sz;
 } cl_kernel_header_t;
 
-typedef struct cl_kernel_header7_5 {
+typedef struct cl_kernel_header75 {
   cl_kernel_header_t header;
   uint32_t kernel_heap_sz;
   uint32_t general_state_heap_sz;
   uint32_t dynamic_state_heap_sz;
   uint32_t surface_state_heap_sz;
-} cl_kernel_header7_5_t;
+} cl_kernel_header75_t;
 
 typedef struct cl_kernel_header7 {
   cl_kernel_header_t header;
@@ -104,11 +111,14 @@ typedef struct cl_global_memory_object_arg {
   uint32_t offset;
 } cl_global_memory_object_arg_t;
 
-typedef struct cl_patch_image_memory_object_arg {
+#if USE_OLD_COMPILER == 0
+typedef struct cl_image_memory_object_arg {
   cl_patch_item_header_t header;
   uint32_t index;
+  uint32_t image_type;
   uint32_t offset;
-} cl_patch_image_memory_object_arg_t;
+} cl_image_memory_object_arg_t;
+#endif
 
 typedef struct cl_patch_constant_memory_object_arg {
   uint32_t index;
diff --git a/src/cl_program.c b/src/cl_program.c
index 7cf39df..5916185 100644
 #include <string.h>
 #include <assert.h>
 
-static int icbe_ver = 1001L;
+#if USE_OLD_COMPILER
+static const int icbe_ver = 1001;
+#else
+static const int icbe_ver = 1002;
+#endif
 
 #define DECL_LOAD_HEADER(GEN)                                           \
 static const char*                                                      \
@@ -54,6 +58,7 @@ JOIN(cl_kernel_load_header,GEN)(cl_kernel ker,                          \
 
 DECL_LOAD_HEADER(6)
 DECL_LOAD_HEADER(7)
+DECL_LOAD_HEADER(75)
 
 #undef DECL_LOAD_HEADER
 
@@ -82,6 +87,9 @@ cl_program_decode(cl_program p)
     /* Format changes from generation to generation */
     TRY_ALLOC (p->ker[i], cl_kernel_new());
     switch (header->device) {
+      case IGFX_GEN7_5_CORE:
+        ker = cl_kernel_load_header75(p->ker[i], ker, &name_sz, &ker_sz);
+      break;
       case IGFX_GEN7_CORE:
         ker = cl_kernel_load_header7(p->ker[i], ker, &name_sz, &ker_sz);
       break;
diff --git a/src/intel/intel_driver.c b/src/intel/intel_driver.c
index dd48222..305cd4b 100644
@@ -109,14 +109,18 @@ intel_driver_init(intel_driver_t *driver, int dev_fd)
 
 #if EMULATE_GEN
   driver->gen_ver = EMULATE_GEN;
-  if (EMULATE_GEN == 7)
+  if (EMULATE_GEN == 75)
+    driver->device_id = PCI_CHIP_HASWELL_L;       /* we pick L for HSW */
+  else if (EMULATE_GEN == 7)
     driver->device_id = PCI_CHIP_IVYBRIDGE_GT2; /* we pick GT2 for IVB */
   else if (EMULATE_GEN == 6)
     driver->device_id = PCI_CHIP_SANDYBRIDGE_GT2; /* we pick GT2 for SNB */
   else
     FATAL ("Unsupported Gen for emulation");
 #else
-  if (IS_GEN7(driver->device_id))
+  if (IS_GEN75(driver->device_id))
+    driver->gen_ver = 75;
+  else if (IS_GEN7(driver->device_id))
     driver->gen_ver = 7;
   else if (IS_GEN6(driver->device_id))
     driver->gen_ver = 6;
diff --git a/src/intel/intel_gpgpu.c b/src/intel/intel_gpgpu.c
index 8bdb52a..3246a7e 100644
@@ -171,14 +171,16 @@ gpgpu_load_vfe_state(intel_gpgpu_t *state)
     intel_batchbuffer_alloc_space(state->batch,0);
 
   memset(vfe, 0, sizeof(struct gen6_vfe_state_inline));
-  vfe->vfe1.gpgpu_mode = state->drv->gen_ver >= 7 ? 1 : 0;
+  vfe->vfe1.gpgpu_mode =
+    (state->drv->gen_ver == 7 || state->drv->gen_ver == 75) ? 1 : 0;
   vfe->vfe1.bypass_gateway_ctl = 1;
   vfe->vfe1.reset_gateway_timer = 1;
   vfe->vfe1.max_threads = state->max_threads - 1;
   vfe->vfe1.urb_entries = 64;
   vfe->vfe3.curbe_size = 63;
   vfe->vfe3.urbe_size = 13;
-  vfe->vfe4.scoreboard_mask = state->drv->gen_ver >= 7 ? 0 : 0x80000000;
+  vfe->vfe4.scoreboard_mask =
+    (state->drv->gen_ver == 7 || state->drv->gen_ver == 75) ? 0 : 0x80000000;
   intel_batchbuffer_alloc_space(state->batch, sizeof(gen6_vfe_state_inline_t));
   ADVANCE_BATCH(state->batch);
 }
@@ -339,7 +341,7 @@ gpgpu_batch_start(intel_gpgpu_t *state)
 {
   intel_batchbuffer_start_atomic(state->batch, 256);
   gpgpu_pipe_control(state);
-  if (state->drv->gen_ver >= 7)
+  if (state->drv->gen_ver == 7 || state->drv->gen_ver == 75)
     intel_gpgpu_set_L3(state, state->ker->use_barrier);
   gpgpu_select_pipeline(state);
   gpgpu_set_base_address(state);
@@ -589,8 +591,10 @@ gpgpu_bind_buf(intel_gpgpu_t *state,
   assert(index < MAX_SURFACES);
   if(state->drv->gen_ver == 6)
     gpgpu_bind_buf_gen6(state, index, obj_bo, size, cchint);
-  else if (state->drv->gen_ver == 7)
+  else if (state->drv->gen_ver == 7 || state->drv->gen_ver == 75)
     gpgpu_bind_buf_gen7(state, index, obj_bo, size, cchint);
+  else
+    NOT_IMPLEMENTED;
 }
 
 LOCAL void
@@ -606,8 +610,10 @@ gpgpu_bind_image2D(intel_gpgpu_t *state,
   assert(index < MAX_SURFACES);
   if(state->drv->gen_ver == 6)
     gpgpu_bind_image2D_gen6(state, index, obj_bo, format, w, h, bpp, cchint);
-  else if (state->drv->gen_ver == 7)
+  else if (state->drv->gen_ver == 7 || state->drv->gen_ver == 75)
     gpgpu_bind_image2D_gen7(state, index, obj_bo, format, w, h, bpp, cchint);
+  else
+    NOT_IMPLEMENTED;
 }
 
 static void
@@ -634,7 +640,7 @@ gpgpu_build_idrt(intel_gpgpu_t *state,
     desc->desc4.curbe_read_offset = 0;
 
     /* Barriers / SLM are automatically handled on Gen7+ */
-    if (state->drv->gen_ver >= 7) {
+    if (state->drv->gen_ver == 7 || state->drv->gen_ver == 75) {
       size_t slm_sz = kernel[i].slm_sz;
       desc->desc5.group_threads_num = kernel[i].use_barrier ? kernel[i].thread_n : 0;
       desc->desc5.barrier_enable = kernel[i].use_barrier;