Started to support Gen7: implemented the new surface state for Gen7
author bsegovia <devnull@localhost>
Thu, 28 Jul 2011 05:37:24 +0000 (05:37 +0000)
committer Keith Packard <keithp@keithp.com>
Fri, 10 Aug 2012 23:14:36 +0000 (16:14 -0700)
16 files changed:
CMakeLists.txt
kernels/copybuffer/CopyBuffer0.krn
kernels/copybuffer/CopyBuffer_0.bin
kernels/copybuffer/CopyBuffer_0.gen6.bin [new file with mode: 0644]
kernels/copybuffer/CopyBuffer_0.ghal3d
src/cl_command_queue.c
src/cl_device_id.c
src/cl_gen6_device.h
src/cl_gen7_device.h [new file with mode: 0644]
src/cl_gt_device.h [new file with mode: 0644]
src/cl_kernel.c
src/intel/cl_device_data.h
src/intel/genx_defines.h
src/intel/genx_gpgpu.c
src/intel/genx_gpgpu.h
src/intel/intel_driver.c

index 325444e..977bc77 100644 (file)
@@ -14,14 +14,28 @@ INCLUDE_DIRECTORIES(${CMAKE_CURRENT_BINARY_DIR} ${CMAKE_CURRENT_SOURCE_DIR})
 SET(CMAKE_VERBOSE_MAKEFILE "false")
 SET(CMAKE_BUILD_TYPE "Debug" CACHE STRING "Build type")
 SET(USE_INTEL_COMPILER CACHE BOOL "false")
-SET(USE_FULSIM CACHE BOOL "false")
+SET(EMULATE_IVB CACHE BOOL "false")
+SET(EMULATE_SNB CACHE BOOL "false")
 ADD_DEFINITIONS(-D__$(USER)__)
 
+IF (EMULATE_IVB)
+  SET (USE_FULSIM "true")
+  ADD_DEFINITIONS(-DEMULATE_GEN=7)
+ELSEIF (EMULATE_SNB)
+  SET (USE_FULSIM "true")
+  ADD_DEFINITIONS(-DEMULATE_GEN=6)
+ELSE (EMULATE_IVB)
+  SET (USE_FULSIM "false")
+  ADD_DEFINITIONS(-DEMULATE_GEN=0)
+ENDIF (EMULATE_IVB)
+
 IF (USE_FULSIM)
   ADD_DEFINITIONS(-DUSE_FULSIM=1)
 ELSE (USE_FULSIM)
   ADD_DEFINITIONS(-DUSE_FULSIM=0)
 ENDIF (USE_FULSIM)
+
+
 SET(CMAKE_CXX_FLAGS "-Wall -Wno-invalid-offsetof -mfpmath=sse --no-exceptions --no-rtti -Wcast-align -std=c++0x")
 SET(CMAKE_C_FLAGS "-Wall -mfpmath=sse -msse2 -Wcast-align")
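
The EMULATE_IVB/EMULATE_SNB options above only forward a single EMULATE_GEN define to the compiler; the driver code keys off that value (see the intel_driver.c hunk further down). A minimal sketch, not part of the patch and using a hypothetical helper name, of how such a compile-time switch is consumed:

/* Minimal sketch (illustration only) of consuming the EMULATE_GEN definition
 * set by the build options above. emulated_gen() is a hypothetical helper. */
#include <stdio.h>

#ifndef EMULATE_GEN
#define EMULATE_GEN 0 /* 0 means: probe the real device at run time */
#endif

static int emulated_gen(void)
{
  return EMULATE_GEN; /* 7 with -DEMULATE_IVB=true, 6 with -DEMULATE_SNB=true */
}

int main(void)
{
  printf("emulated gen: %d\n", emulated_gen());
  return 0;
}
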
 
index b639fdc..487d112 100644 (file)
Binary files a/kernels/copybuffer/CopyBuffer0.krn and b/kernels/copybuffer/CopyBuffer0.krn differ
index 85e544b..9619479 100644 (file)
Binary files a/kernels/copybuffer/CopyBuffer_0.bin and b/kernels/copybuffer/CopyBuffer_0.bin differ
diff --git a/kernels/copybuffer/CopyBuffer_0.gen6.bin b/kernels/copybuffer/CopyBuffer_0.gen6.bin
new file mode 100644 (file)
index 0000000..85e544b
Binary files /dev/null and b/kernels/copybuffer/CopyBuffer_0.gen6.bin differ
index cdf544f..bef18fe 100644 (file)
@@ -1,6 +1,6 @@
        SHADER CopyBuffer
 
-       VERSION_2_1
+       VERSION_3_0
 
        DCL_THREADGROUP VARIABLE;
 
index a03526c..f7a2740 100644 (file)
@@ -131,8 +131,6 @@ cl_command_queue_bind_surface(cl_command_queue queue,
   for (i = 0; i < k->arg_info_n; ++i) {
     if (k->arg_info[i].type != OCLRT_ARG_TYPE_BUFFER)
       continue;
-
-    /* XXX 64 comes from the patch list format. May change */
     assert(k->arg_info[i].offset % SURFACE_SZ == 0);
     index = k->arg_info[i].offset / SURFACE_SZ;
     mem = (cl_mem) k->args[k->arg_info[i].arg_index];
@@ -362,8 +360,8 @@ error:
 static char*
 cl_kernel_create_cst_buffer(cl_kernel k, 
                             cl_uint work_dim,
-                            const size_t *global_work_size,
-                            const size_t *local_work_size)
+                            const size_t *global_wk_sz,
+                            const size_t *local_wk_sz)
 {
   cl_curbe_patch_info_t *info = NULL;
   const size_t sz = k->patch.curbe.sz;
@@ -376,24 +374,24 @@ cl_kernel_create_cst_buffer(cl_kernel k,
   /* Global work group size */
   key = cl_curbe_key(DATA_PARAMETER_GLOBAL_WORK_SIZE, 0, 0);
   if ((info = cl_kernel_get_curbe_info(k, key)) != NULL)
-    memcpy(data+info->offsets[0], global_work_size,   sizeof(uint32_t));
+    memcpy(data+info->offsets[0], global_wk_sz,   sizeof(uint32_t));
   key = cl_curbe_key(DATA_PARAMETER_GLOBAL_WORK_SIZE, 0, 4);
   if ((info = cl_kernel_get_curbe_info(k, key)) != NULL)
-    memcpy(data+info->offsets[0], global_work_size+1, sizeof(uint32_t));
+    memcpy(data+info->offsets[0], global_wk_sz+1, sizeof(uint32_t));
   key = cl_curbe_key(DATA_PARAMETER_GLOBAL_WORK_SIZE, 0, 8);
   if ((info = cl_kernel_get_curbe_info(k, key)) != NULL)
-    memcpy(data+info->offsets[0], global_work_size+2, sizeof(uint32_t));
+    memcpy(data+info->offsets[0], global_wk_sz+2, sizeof(uint32_t));
 
   /* Local work group size */
   key = cl_curbe_key(DATA_PARAMETER_LOCAL_WORK_SIZE, 0, 0);
   if ((info = cl_kernel_get_curbe_info(k, key)) != NULL)
-    memcpy(data+info->offsets[0], local_work_size,   sizeof(uint32_t));
+    memcpy(data+info->offsets[0], local_wk_sz,   sizeof(uint32_t));
   key = cl_curbe_key(DATA_PARAMETER_LOCAL_WORK_SIZE, 0, 4);
   if ((info = cl_kernel_get_curbe_info(k, key)) != NULL)
-    memcpy(data+info->offsets[0], local_work_size+1, sizeof(uint32_t));
+    memcpy(data+info->offsets[0], local_wk_sz+1, sizeof(uint32_t));
   key = cl_curbe_key(DATA_PARAMETER_LOCAL_WORK_SIZE, 0, 8);
   if ((info = cl_kernel_get_curbe_info(k, key)) != NULL)
-    memcpy(data+info->offsets[0], local_work_size+2, sizeof(uint32_t));
+    memcpy(data+info->offsets[0], local_wk_sz+2, sizeof(uint32_t));
 
 exit:
   return data;
@@ -411,10 +409,17 @@ cl_run_fulsim(void)
   const char *debug_mode = getenv("OCL_FULSIM_DEBUG_MODE");
   if (run_it == NULL || strcmp(run_it, "1"))
     return;
+#if EMULATE_GEN == 6 /* SNB */
   if (debug_mode == NULL || strcmp(debug_mode, "1"))
     system("wine AubLoad.exe dump.aub -device sbrB0");
   else
     system("wine AubLoad.exe dump.aub -device sbrB0 -debug");
+#elif EMULATE_GEN == 7
+  if (debug_mode == NULL || strcmp(debug_mode, "1"))
+    system("wine AubLoad.exe dump.aub -device ivb2");
+  else
+    system("wine AubLoad.exe dump.aub -device ivb2 -debug");
+#endif
 }
 #endif /* USE_FULSIM */
 
@@ -423,8 +428,8 @@ cl_command_queue_ND_kernel(cl_command_queue queue,
                            cl_kernel ker,
                            cl_uint work_dim,
                            const size_t *global_work_offset,
-                           const size_t *global_work_size,
-                           const size_t *local_work_size)
+                           const size_t *global_wk_sz,
+                           const size_t *local_wk_sz)
 {
   cl_context ctx = queue->ctx;
   genx_gpgpu_state_t *gpgpu = queue->gpgpu;
@@ -458,13 +463,13 @@ cl_command_queue_ND_kernel(cl_command_queue queue,
   /* Total number of elements in the work group */
   for (i = 0; i < work_dim; ++i)
     if ((&ker->patch.exec_env.required_wgr_sz_x)[i] &&
-        (&ker->patch.exec_env.required_wgr_sz_x)[i] != local_work_size[i]) {
+        (&ker->patch.exec_env.required_wgr_sz_x)[i] != local_wk_sz[i]) {
       err = CL_INVALID_WORK_ITEM_SIZE;
       goto error;
     }
-  wrk_grp_sz = local_work_size[0];
+  wrk_grp_sz = local_wk_sz[0];
   for (i = 1; i < work_dim; ++i)
-    wrk_grp_sz *= local_work_size[i];
+    wrk_grp_sz *= local_wk_sz[i];
   FATAL_IF (wrk_grp_sz % 16, "Work group size must be a multiple of 16");
   if (wrk_grp_sz > ctx->device->max_work_group_size) {
     err = CL_INVALID_WORK_ITEM_SIZE;
@@ -472,9 +477,9 @@ cl_command_queue_ND_kernel(cl_command_queue queue,
   }
 
   /* Directly from the user defined values */
-  header.local_sz[0] = local_work_size[0];
-  header.local_sz[1] = local_work_size[1];
-  header.local_sz[2] = local_work_size[2];
+  header.local_sz[0] = local_wk_sz[0];
+  header.local_sz[1] = local_wk_sz[1];
+  header.local_sz[2] = local_wk_sz[2];
   offset[0] = header.grp_n[0] = 0;
   offset[1] = header.grp_n[1] = 0;
   offset[2] = header.grp_n[2] = 0;
@@ -483,7 +488,7 @@ cl_command_queue_ND_kernel(cl_command_queue queue,
   /* offsets are evenly divided by the local sizes */
   if (global_work_offset)
     for (i = 0; i < work_dim; ++i)
-      offset[i] = global_work_offset[i]/local_work_size[i];
+      offset[i] = global_work_offset[i]/local_wk_sz[i];
 
   /* Compute the local size per wg and the offsets for each local buffer */
   cl_kernel_handle_local_memory(ker, &header);
@@ -506,20 +511,17 @@ cl_command_queue_ND_kernel(cl_command_queue queue,
 
   /* Fill the constant buffer */
   if (cst_sz > 0) {
-    char *completed_cst = NULL;
+    char *data = NULL;
     assert(ker->cst_buffer);
-    completed_cst = cl_kernel_create_cst_buffer(ker,
-                                                work_dim,
-                                                global_work_size,
-                                                local_work_size);
-    gpgpu_upload_constants(gpgpu, completed_cst, cst_sz);
-    cl_free(completed_cst);
+    data = cl_kernel_create_cst_buffer(ker,work_dim,global_wk_sz,local_wk_sz);
+    gpgpu_upload_constants(gpgpu, data, cst_sz);
+    cl_free(data);
   }
 
   wrk_grp_n = 1;
   for (i = 0; i < work_dim; ++i) {
     TRY_ALLOC (ids[i], (cl_local_id_t*) cl_malloc(wrk_grp_sz*sizeof(uint16_t)));
-    grp_end[i] = offset[i] + global_work_size[i] / local_work_size[i];
+    grp_end[i] = offset[i] + global_wk_sz[i] / local_wk_sz[i];
     wrk_grp_n *= grp_end[i]-offset[i];
   }
   thread_n = wrk_grp_sz / 16;
@@ -528,16 +530,16 @@ cl_command_queue_ND_kernel(cl_command_queue queue,
   /* Start a new batch buffer */
   gpgpu_batch_reset(gpgpu, batch_sz);
   gpgpu_batch_start(gpgpu);
-
+#if 1
   /* Push all media objects. We implement three paths to make it (a bit) faster.
    * Local IDs are shared from work group to work group. We allocate once the
    * buffers and reuse them
    */
   if (work_dim == 3) {
     curr = 0;
-    for (i = 0; i < local_work_size[0]; ++i)
-    for (j = 0; j < local_work_size[1]; ++j)
-    for (k = 0; k < local_work_size[2]; ++k, ++curr) {
+    for (i = 0; i < local_wk_sz[0]; ++i)
+    for (j = 0; j < local_wk_sz[1]; ++j)
+    for (k = 0; k < local_wk_sz[2]; ++k, ++curr) {
       ((uint16_t*) ids[0])[curr] = i;
       ((uint16_t*) ids[1])[curr] = j;
       ((uint16_t*) ids[2])[curr] = k;
@@ -553,8 +555,8 @@ cl_command_queue_ND_kernel(cl_command_queue queue,
   }
   else if (work_dim == 2) {
     curr = 0;
-    for (i = 0; i < local_work_size[0]; ++i)
-    for (j = 0; j < local_work_size[1]; ++j, ++curr) {
+    for (i = 0; i < local_wk_sz[0]; ++i)
+    for (j = 0; j < local_wk_sz[1]; ++j, ++curr) {
       ((uint16_t*) ids[0])[curr] = i;
       ((uint16_t*) ids[1])[curr] = j;
     }
@@ -567,7 +569,7 @@ cl_command_queue_ND_kernel(cl_command_queue queue,
     }
   }
   else {
-    for (i = 0; i < local_work_size[0]; ++i)
+    for (i = 0; i < local_wk_sz[0]; ++i)
       ((uint16_t*) ids[0])[i] = i;
     for (header.grp_n[0] = offset[0]; header.grp_n[0] < grp_end[0]; ++header.grp_n[0]) {
       if (ker->patch.exec_env.has_barriers)
@@ -576,7 +578,7 @@ cl_command_queue_ND_kernel(cl_command_queue queue,
       barrierID = (barrierID + 1) % 16;
     }
   }
-
+#endif
   gpgpu_batch_end(gpgpu, 0);
   gpgpu_flush(gpgpu);
 
@@ -616,9 +618,9 @@ cl_command_queue_set_fulsim_buffer(cl_command_queue queue, cl_mem mem)
 #if USE_FULSIM
   cl_context ctx = queue->ctx;
   drm_intel_bufmgr *bufmgr = cl_context_get_intel_bufmgr(ctx);
-
   drm_intel_aub_set_bo_to_dump(bufmgr, mem->bo);
 #endif /* USE_FULSIM */
+
   queue->fulsim_out = mem;
   if (queue->fulsim_out != NULL) {
     cl_mem_delete(queue->fulsim_out);
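
For reference, the work-group arithmetic used by cl_command_queue_ND_kernel above can be checked in isolation. The following standalone sketch (hypothetical sizes, not part of the patch) reproduces the same computation: the work-group size is the product of the local sizes, it must be a multiple of 16 because each hardware thread covers 16 work items, and the group count per dimension is the global size divided by the local size.

/* Standalone sketch (illustration only) of the ND-range bookkeeping above. */
#include <assert.h>
#include <stdint.h>
#include <stdio.h>

int main(void)
{
  const size_t global_wk_sz[3] = {256, 64, 1}; /* hypothetical enqueue arguments */
  const size_t local_wk_sz[3]  = {16, 4, 1};
  const uint32_t work_dim = 3;

  size_t wrk_grp_sz = 1, wrk_grp_n = 1;
  for (uint32_t i = 0; i < work_dim; ++i) {
    wrk_grp_sz *= local_wk_sz[i];                    /* work items per work group */
    wrk_grp_n  *= global_wk_sz[i] / local_wk_sz[i];  /* number of work groups */
  }
  assert(wrk_grp_sz % 16 == 0);       /* one hardware thread runs 16 work items */
  const size_t thread_n = wrk_grp_sz / 16;

  printf("%zu items/group, %zu groups, %zu threads/group\n",
         wrk_grp_sz, wrk_grp_n, thread_n);
  return 0;
}
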
index a846adf..25106ae 100644 (file)
 #include <stdio.h>
 #include <string.h>
 
-static struct _cl_device_id intel_gt2_device = {
+static struct _cl_device_id intel_snb_gt2_device = {
   .max_compute_unit = 60,
   .max_work_item_sizes = {512, 512, 512},
   .max_work_group_size = 512,
   .max_clock_frequency = 1350,
 
-  /* Common fields between GT1 and GT2 */
   #include "cl_gen6_device.h"
 };
 
-static struct _cl_device_id intel_gt1_device = {
+static struct _cl_device_id intel_snb_gt1_device = {
   .max_compute_unit = 24,
   .max_work_item_sizes = {256, 256, 256},
   .max_work_group_size = 256,
   .max_clock_frequency = 1000,
 
-  /* Common fields between GT1 and GT2 */
   #include "cl_gen6_device.h"
 };
 
+static struct _cl_device_id intel_ivb_gt2_device = {
+  .max_compute_unit = 128,
+  .max_work_item_sizes = {512, 512, 512},
+  .max_work_group_size = 512,
+  .max_clock_frequency = 1000,
+
+  #include "cl_gen7_device.h"
+};
+
+static struct _cl_device_id intel_ivb_gt1_device = {
+  .max_compute_unit = 64,
+  .max_work_item_sizes = {512, 512, 512},
+  .max_work_group_size = 512,
+  .max_clock_frequency = 1000,
+
+  #include "cl_gen7_device.h"
+};
+
 LOCAL cl_device_id
 cl_get_gt_device(void)
 {
   cl_device_id ret = NULL;
   int device_id = cl_intel_get_device_id();
 
-  if (device_id == PCI_CHIP_SANDYBRIDGE_GT1   ||
+  if (device_id == PCI_CHIP_IVYBRIDGE_GT1   ||
+      device_id == PCI_CHIP_IVYBRIDGE_M_GT1 ||
+      device_id == PCI_CHIP_IVYBRIDGE_S_GT1) {
+    intel_ivb_gt1_device.vendor_id = device_id;
+    intel_ivb_gt1_device.platform = intel_platform;
+    ret = &intel_ivb_gt1_device;
+  }
+  else if (device_id == PCI_CHIP_IVYBRIDGE_GT2   ||
+      device_id == PCI_CHIP_IVYBRIDGE_M_GT2) {
+    intel_ivb_gt2_device.vendor_id = device_id;
+    intel_ivb_gt2_device.platform = intel_platform;
+    ret = &intel_ivb_gt2_device;
+  }
+  else if (device_id == PCI_CHIP_SANDYBRIDGE_GT1   ||
       device_id == PCI_CHIP_SANDYBRIDGE_M_GT1 ||
       device_id == PCI_CHIP_SANDYBRIDGE_S_GT) {
-    intel_gt1_device.vendor_id = device_id;
-    intel_gt1_device.platform = intel_platform;
-    ret = &intel_gt1_device;
+    intel_snb_gt1_device.vendor_id = device_id;
+    intel_snb_gt1_device.platform = intel_platform;
+    ret = &intel_snb_gt1_device;
   }
   else if (device_id == PCI_CHIP_SANDYBRIDGE_GT2      ||
            device_id == PCI_CHIP_SANDYBRIDGE_M_GT2    ||
            device_id == PCI_CHIP_SANDYBRIDGE_GT2_PLUS ||
            device_id == PCI_CHIP_SANDYBRIDGE_M_GT2_PLUS) {
-    intel_gt2_device.vendor_id = device_id;
-    intel_gt2_device.platform = intel_platform;
-    ret = &intel_gt2_device;
+    intel_snb_gt2_device.vendor_id = device_id;
+    intel_snb_gt2_device.platform = intel_platform;
+    ret = &intel_snb_gt2_device;
   }
   return ret;
 }
@@ -130,7 +159,10 @@ cl_get_device_info(cl_device_id     device,
                    void *           param_value,
                    size_t *         param_value_size_ret)
 {
-  if (UNLIKELY(device != &intel_gt1_device && device != &intel_gt2_device))
+  if (UNLIKELY(device != &intel_snb_gt1_device &&
+               device != &intel_snb_gt2_device &&
+               device != &intel_ivb_gt1_device &&
+               device != &intel_ivb_gt2_device))
     return CL_INVALID_DEVICE;
   if (UNLIKELY(param_value == NULL))
     return CL_INVALID_VALUE;
index 32c01c2..b09121f 100644 (file)
  * Author: Benjamin Segovia <benjamin.segovia@intel.com>
  */
 
-/* Common fields for both GT1 and GT2 devices. Fields which are not shared are
- * set in cl_device_id_object.c which basically deals with OpenCL devices
+/* Common fields for both SNB devices (either GT1 or GT2)
  */
-.device_type = CL_DEVICE_TYPE_GPU,
-.vendor_id = 0, /* == device_id (set when requested) */
-.max_work_item_dimensions = 3,
-.preferred_vector_width_char = 16,
-.preferred_vector_width_short = 16,
-.preferred_vector_width_int = 16,
-.preferred_vector_width_long = 16,
-.preferred_vector_width_float = 16,
-.preferred_vector_width_double = 0,
-.preferred_vector_width_half = 0,
-.native_vector_width_char = 16,
-.native_vector_width_short = 16,
-.native_vector_width_int = 16,
-.native_vector_width_long = 16,
-.native_vector_width_float = 16,
-.native_vector_width_double = 16,
-.native_vector_width_half = 16,
-.address_bits = 32,
-.max_mem_alloc_size = 128 * 1024 * 1024,
-.image_support = CL_FALSE,
-.max_read_image_args = 0,
-.max_write_image_args = 0,
-.image2d_max_width = 0,
-.image2d_max_height = 0,
-.image3d_max_width = 0,
-.image3d_max_height = 0,
-.image3d_max_depth = 0,
-.max_samplers = 0,
-.max_parameter_size = 256, /* Gen6 */
-.mem_base_addr_align = sizeof(cl_uint) * 8,
-.min_data_type_align_size = sizeof(cl_uint),
-.single_fp_config = 0, /* XXX */
-.global_mem_cache_type = CL_READ_WRITE_CACHE,
+.max_parameter_size = 256,
 .global_mem_cache_line_size = 128, /* XXX */
 .global_mem_cache_size = 8 << 10, /* XXX */
-.global_mem_size = 4,
-.max_constant_buffer_size = 64 << 10,
-.max_constant_args = 8,
-.local_mem_type = CL_GLOBAL, /* Gen6 */
-.local_mem_size = 16 << 10,  /* Gen6 */
-.error_correction_support = CL_FALSE,
-.host_unified_memory = CL_FALSE,
-.profiling_timer_resolution = 80, /* ns */
-.endian_little = CL_TRUE,
-.available = CL_TRUE,
-.compiler_available = CL_FALSE, /* XXX */
-.execution_capabilities = CL_EXEC_KERNEL,
-.queue_properties = CL_QUEUE_PROFILING_ENABLE,
-.platform = NULL, /* == intel_platform (set when requested) */
+.local_mem_type = CL_GLOBAL,
+.local_mem_size = 16 << 10,
 .gfx_id = IGFX_GEN6_CORE,
 
-#define DECL_INFO_STRING(FIELD, STRING) \
-    .FIELD = STRING,                    \
-    .JOIN(FIELD,_sz) = sizeof(STRING) + 1,
-DECL_INFO_STRING(name, "Intel HD Graphics Family")
-DECL_INFO_STRING(vendor, "Intel")
-DECL_INFO_STRING(version, "OpenCL 1.10")
-DECL_INFO_STRING(profile, "FULL_PROFILE")
-DECL_INFO_STRING(opencl_c_version, "OpenCL 1.10")
-DECL_INFO_STRING(extensions, "")
-#undef DECL_INFO_STRING
+#include "cl_gt_device.h"
 
diff --git a/src/cl_gen7_device.h b/src/cl_gen7_device.h
new file mode 100644 (file)
index 0000000..75c4e3f
--- /dev/null
@@ -0,0 +1,30 @@
+/* 
+ * Copyright © 2012 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * Author: Benjamin Segovia <benjamin.segovia@intel.com>
+ */
+
+/* Common fields for both IVB devices (either GT1 or GT2)
+ */
+.max_parameter_size = 256, 
+.global_mem_cache_line_size = 128, /* XXX */
+.global_mem_cache_size = 8 << 10, /* XXX */
+.local_mem_type = CL_GLOBAL,
+.local_mem_size = 64 << 10,
+.gfx_id = IGFX_GEN7_CORE,
+
+#include "cl_gt_device.h"
+
diff --git a/src/cl_gt_device.h b/src/cl_gt_device.h
new file mode 100644 (file)
index 0000000..d66d6ea
--- /dev/null
@@ -0,0 +1,77 @@
+/* 
+ * Copyright © 2012 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * Author: Benjamin Segovia <benjamin.segovia@intel.com>
+ */
+
+/* Common fields for all GT devices (IVB / SNB) */
+.device_type = CL_DEVICE_TYPE_GPU,
+.vendor_id = 0, /* == device_id (set when requested) */
+.max_work_item_dimensions = 3,
+.preferred_vector_width_char = 16,
+.preferred_vector_width_short = 16,
+.preferred_vector_width_int = 16,
+.preferred_vector_width_long = 16,
+.preferred_vector_width_float = 16,
+.preferred_vector_width_double = 0,
+.preferred_vector_width_half = 0,
+.native_vector_width_char = 16,
+.native_vector_width_short = 16,
+.native_vector_width_int = 16,
+.native_vector_width_long = 16,
+.native_vector_width_float = 16,
+.native_vector_width_double = 16,
+.native_vector_width_half = 16,
+.address_bits = 32,
+.max_mem_alloc_size = 128 * 1024 * 1024,
+.image_support = CL_FALSE,
+.max_read_image_args = 0,
+.max_write_image_args = 0,
+.image2d_max_width = 0,
+.image2d_max_height = 0,
+.image3d_max_width = 0,
+.image3d_max_height = 0,
+.image3d_max_depth = 0,
+.max_samplers = 0,
+.mem_base_addr_align = sizeof(cl_uint) * 8,
+.min_data_type_align_size = sizeof(cl_uint),
+.single_fp_config = 0, /* XXX */
+.global_mem_cache_type = CL_READ_WRITE_CACHE,
+.global_mem_size = 4,
+.max_constant_buffer_size = 64 << 10,
+.max_constant_args = 8,
+.error_correction_support = CL_FALSE,
+.host_unified_memory = CL_FALSE,
+.profiling_timer_resolution = 80, /* ns */
+.endian_little = CL_TRUE,
+.available = CL_TRUE,
+.compiler_available = CL_FALSE, /* XXX */
+.execution_capabilities = CL_EXEC_KERNEL,
+.queue_properties = CL_QUEUE_PROFILING_ENABLE,
+.platform = NULL, /* == intel_platform (set when requested) */
+
+#define DECL_INFO_STRING(FIELD, STRING) \
+    .FIELD = STRING,                    \
+    .JOIN(FIELD,_sz) = sizeof(STRING) + 1,
+DECL_INFO_STRING(name, "Intel HD Graphics Family")
+DECL_INFO_STRING(vendor, "Intel")
+DECL_INFO_STRING(version, "OpenCL 1.10")
+DECL_INFO_STRING(profile, "FULL_PROFILE")
+DECL_INFO_STRING(opencl_c_version, "OpenCL 1.10")
+DECL_INFO_STRING(extensions, "")
+#undef DECL_INFO_STRING
+
+
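
The DECL_INFO_STRING macro above stores both the string and the size reported through clGetDeviceInfo. A standalone sketch of what one expansion amounts to, assuming JOIN is the usual token-pasting helper (its definition lies outside this patch) and using a hypothetical cut-down struct:

/* Sketch (illustration only) of a DECL_INFO_STRING expansion. */
#include <stddef.h>
#include <stdio.h>

struct fake_device {      /* hypothetical stand-in for struct _cl_device_id */
  const char *vendor;
  size_t vendor_sz;
};

#define JOIN(a, b) a##b
#define DECL_INFO_STRING(FIELD, STRING) \
    .FIELD = STRING,                    \
    .JOIN(FIELD,_sz) = sizeof(STRING) + 1,

static struct fake_device dev = {
  DECL_INFO_STRING(vendor, "Intel") /* expands to .vendor = "Intel", .vendor_sz = 7, */
};
#undef DECL_INFO_STRING

int main(void)
{
  printf("%s (%zu bytes reported)\n", dev.vendor, dev.vendor_sz);
  return 0;
}
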
index 5c07b9b..20c0f42 100644 (file)
@@ -347,6 +347,10 @@ cl_kernel_setup_patch_list(cl_kernel k, const char *patch, size_t sz)
       ASSOC_ITEM (EXECUTION_ENVIRONMENT, exec_env, exec_env);
       ASSOC_ITEM (THREAD_PAYLOAD, thread_payload, thread_payload);
 
+      case PATCH_TOKEN_DATA_PARAMETER_STREAM:
+        info->curbe.sz = *(uint32_t *) patch;
+        info->curbe.offset = 0;
+      break;
       case PATCH_TOKEN_CONSTANT_MEMORY_KERNEL_ARGUMENT:
       case PATCH_TOKEN_GLOBAL_MEMORY_KERNEL_ARGUMENT:
       {
@@ -382,6 +386,7 @@ cl_kernel_setup_patch_list(cl_kernel k, const char *patch, size_t sz)
           case DATA_PARAMETER_SUM_OF_LOCAL_MEMORY_ARGUMENT_SIZES:
           case DATA_PARAMETER_LOCAL_WORK_SIZE:
           case DATA_PARAMETER_GLOBAL_WORK_SIZE:
+          case DATA_PARAMETER_GLOBAL_WORK_OFFSET:
           case DATA_PARAMETER_NUM_WORK_GROUPS:
           case DATA_PARAMETER_WORK_DIMENSIONS:
           case DATA_PARAMETER_IMAGE_WIDTH:
@@ -389,6 +394,7 @@ cl_kernel_setup_patch_list(cl_kernel k, const char *patch, size_t sz)
           case DATA_PARAMETER_IMAGE_DEPTH:
           case DATA_PARAMETER_IMAGE_CHANNEL_DATA_TYPE:
           case DATA_PARAMETER_IMAGE_CHANNEL_ORDER:
+          case DATA_PARAMETER_NUM_HARDWARE_THREADS:
           {
             curbe_key = cl_curbe_key(data->type, data->index, data->src_offset);
             curbe_info = cl_kernel_get_curbe_info_list(k, curbe_key);
index b2acee9..b7faef1 100644 (file)
     devid == PCI_CHIP_SANDYBRIDGE_M_GT2_PLUS || \
     devid == PCI_CHIP_SANDYBRIDGE_S_GT)
 
+#define PCI_CHIP_IVYBRIDGE_GT1          0x0152  /* Desktop */
+#define PCI_CHIP_IVYBRIDGE_GT2          0x0162
+#define PCI_CHIP_IVYBRIDGE_M_GT1        0x0156  /* Mobile */
+#define PCI_CHIP_IVYBRIDGE_M_GT2        0x0166
+#define PCI_CHIP_IVYBRIDGE_S_GT1        0x015a  /* Server */
+
+#define IS_IVB_GT1(devid)               \
+  (devid == PCI_CHIP_IVYBRIDGE_GT1 ||   \
+   devid == PCI_CHIP_IVYBRIDGE_M_GT1 || \
+   devid == PCI_CHIP_IVYBRIDGE_S_GT1)
+
+#define IS_IVB_GT2(devid)               \
+  (devid == PCI_CHIP_IVYBRIDGE_GT2 ||   \
+   devid == PCI_CHIP_IVYBRIDGE_M_GT2)
+
+#define IS_IVYBRIDGE(devid) (IS_IVB_GT1(devid) || IS_IVB_GT2(devid))
+#define IS_GEN7(devid)      IS_IVYBRIDGE(devid)
+
 #ifdef __cplusplus
 extern "C" {
 #endif /* __cplusplus */
index 819fcbd..af0e3db 100644 (file)
 #define I965_SURFACEFORMAT_R16G16B16_SNORM                0x19D 
 #define I965_SURFACEFORMAT_R16G16B16_SSCALED              0x19E 
 #define I965_SURFACEFORMAT_R16G16B16_USCALED              0x19F
+#define I965_SURFACEFORMAT_RAW                            0x1FF
 
 #define I965_CULLMODE_BOTH      0
 #define I965_CULLMODE_NONE      1
index 1685b0f..adf73f9 100644 (file)
@@ -107,8 +107,77 @@ typedef struct gen6_surface_state
     uint32_t vertical_alignment:1;
     uint32_t x_offset:7;
   } ss5;
+
+  uint32_t ss6; /* unused */
+  uint32_t ss7; /* unused */
 } gen6_surface_state_t;
 
+typedef struct gen7_surface_state
+{
+  struct {
+    uint32_t cube_pos_z:1;
+    uint32_t cube_neg_z:1;
+    uint32_t cube_pos_y:1;
+    uint32_t cube_neg_y:1;
+    uint32_t cube_pos_x:1;
+    uint32_t cube_neg_x:1;
+    uint32_t media_boundary_pixel_mode:2;
+    uint32_t render_cache_rw_mode:1;
+    uint32_t pad1:1;
+    uint32_t surface_array_spacing:1;
+    uint32_t vertical_line_stride_offset:1;
+    uint32_t vertical_line_stride:1;
+    uint32_t tile_walk:1;
+    uint32_t tiled_surface:1;
+    uint32_t horizontal_alignment:1;
+    uint32_t vertical_alignment:2;
+    uint32_t surface_format:9;
+    uint32_t pad0:1;
+    uint32_t surface_array:1;
+    uint32_t surface_type:3;
+  } ss0;
+
+  struct {
+    uint32_t base_addr;
+  } ss1;
+
+  struct {
+    uint32_t width:14;
+    uint32_t pad1:2;
+    uint32_t height:14;
+    uint32_t pad0:2;
+  } ss2;
+
+  struct {
+    uint32_t pitch:18;
+    uint32_t pad0:3;
+    uint32_t depth:11;
+  } ss3;
+
+  uint32_t ss4;
+
+  struct {
+    uint32_t mip_count:4;
+    uint32_t surface_min_load:4;
+    uint32_t pad2:6;
+    uint32_t coherence_type:1;
+    uint32_t stateless_force_write_thru:1;
+    uint32_t surface_object_control_state:4;
+    uint32_t y_offset:4;
+    uint32_t pad0:1;
+    uint32_t x_offset:7;
+  } ss5;
+
+  uint32_t ss6; /* unused */
+  uint32_t ss7; /* unused */
+
+} gen7_surface_state_t;
+
+#define GEN7_CACHED_IN_LLC 3
+
+STATIC_ASSERT(sizeof(gen6_surface_state_t) == sizeof(gen7_surface_state_t));
+static const size_t surface_state_sz = sizeof(gen6_surface_state_t);
+
 typedef struct gen6_vfe_state_inline
 {
   struct {
@@ -343,13 +412,16 @@ gpgpu_set_base_address(genx_gpgpu_state_t *state)
   OUT_BATCH(state->batch, 0 | (def_cc << 8) | BASE_ADDRESS_MODIFY); /* Instruction Base Addr  */
   /* If we output an AUB file, we limit the total size to 64MB */
 #if USE_FULSIM
-  OUT_BATCH(state->batch, 0x04000000 | BASE_ADDRESS_MODIFY); /* General State Access Upper Bound - Ignore Check */
+  OUT_BATCH(state->batch, 0x04000000 | BASE_ADDRESS_MODIFY); /* General State Access Upper Bound */
+  OUT_BATCH(state->batch, 0x04000000 | BASE_ADDRESS_MODIFY); /* Dynamic State Access Upper Bound */
+  OUT_BATCH(state->batch, 0x04000000 | BASE_ADDRESS_MODIFY); /* Indirect Obj Access Upper Bound */
+  OUT_BATCH(state->batch, 0x04000000 | BASE_ADDRESS_MODIFY); /* Instruction Access Upper Bound */
 #else
   OUT_BATCH(state->batch, 0 | BASE_ADDRESS_MODIFY);
+  OUT_BATCH(state->batch, 0 | BASE_ADDRESS_MODIFY);
+  OUT_BATCH(state->batch, 0 | BASE_ADDRESS_MODIFY);
+  OUT_BATCH(state->batch, 0 | BASE_ADDRESS_MODIFY);
 #endif
-  OUT_BATCH(state->batch, 0 | BASE_ADDRESS_MODIFY); /* Dynamic State Access Upper Bound - Ignore Check */
-  OUT_BATCH(state->batch, 0 | BASE_ADDRESS_MODIFY); /* Indirect Obj Access Upper Bound - Ignore Check */
-  OUT_BATCH(state->batch, 0 | BASE_ADDRESS_MODIFY); /* Instruction Access Upper Bound - Ignore Check */
   ADVANCE_BATCH(state->batch);
 }
 
@@ -383,11 +455,10 @@ gpgpu_load_constant_buffer(genx_gpgpu_state_t *state)
   BEGIN_BATCH(state->batch, 4);
   OUT_BATCH(state->batch, CMD(2,0,1) | (4 - 2));  /* length-2 */
   OUT_BATCH(state->batch, 0);                     /* mbz */
-  OUT_BATCH(state->batch, state->urb.size_cs_entry*
-                          state->urb.num_cs_entries*32);
-  OUT_RELOC(state->batch, state->curbe_b.bo,
-            I915_GEM_DOMAIN_INSTRUCTION, 0,
-            0);
+  OUT_BATCH(state->batch,
+            state->urb.size_cs_entry*
+            state->urb.num_cs_entries*32);
+  OUT_RELOC(state->batch, state->curbe_b.bo, I915_GEM_DOMAIN_INSTRUCTION, 0, 0);
   ADVANCE_BATCH(state->batch);
 }
 
@@ -398,9 +469,7 @@ gpgpu_load_idrt(genx_gpgpu_state_t *state)
   OUT_BATCH(state->batch, CMD(2,0,2) | (4 - 2)); /* length-2 */
   OUT_BATCH(state->batch, 0);                    /* mbz */
   OUT_BATCH(state->batch, state->idrt_b.num*32);
-  OUT_RELOC(state->batch, state->idrt_b.bo,
-            I915_GEM_DOMAIN_INSTRUCTION, 0,
-            0);
+  OUT_RELOC(state->batch, state->idrt_b.bo, I915_GEM_DOMAIN_INSTRUCTION, 0, 0);
   ADVANCE_BATCH(state->batch);
 }
 
@@ -469,7 +538,7 @@ gpgpu_flush(genx_gpgpu_state_t *state)
 
 LOCAL void
 gpgpu_state_init(genx_gpgpu_state_t *state,
-                 uint32_t max_thr
+                 uint32_t max_threads,
                  uint32_t size_vfe_entry,
                  uint32_t num_vfe_entries,
                  uint32_t size_cs_entry,
@@ -478,8 +547,6 @@ gpgpu_state_init(genx_gpgpu_state_t *state,
   dri_bo *bo;
   int32_t i;
 
-  assert(max_thr > 0 && max_thr < MAX_THREADS);
-
   /* URB */
   state->urb.vfe_start = 0;
   state->urb.num_vfe_entries = num_vfe_entries;
@@ -487,6 +554,7 @@ gpgpu_state_init(genx_gpgpu_state_t *state,
   state->urb.num_cs_entries = num_cs_entries;
   state->urb.size_cs_entry = size_cs_entry;
   state->urb.cs_start = state->urb.vfe_start + state->urb.num_vfe_entries * state->urb.size_vfe_entry;
+  state->max_threads = max_threads;
 
   /* constant buffer */
   if(state->curbe_b.bo)
@@ -494,7 +562,7 @@ gpgpu_state_init(genx_gpgpu_state_t *state,
   uint32_t size_cb = state->urb.num_cs_entries * state->urb.size_cs_entry * (512/8);
   size_cb = (size_cb + (4096 - 1)) & (~(4096-1)); /* roundup to 4K */
   bo = dri_bo_alloc(state->drv->bufmgr,
-                    "constant buffer",
+                    "CONSTANT_BUFFER",
                     size_cb,
                     64);
   assert(bo);
@@ -511,13 +579,13 @@ gpgpu_state_init(genx_gpgpu_state_t *state,
   if(state->binding_table_b.bo)
     dri_bo_unreference(state->binding_table_b.bo);
   bo = dri_bo_alloc(state->drv->bufmgr, 
-                    "binding table",
+                    "SS_SURF_BIND",
                     MAX_SURFACES * sizeof(uint32_t),
                     32);
   assert(bo);
   state->binding_table_b.bo = bo;
 
-  /* interface descriptor remapping table */
+  /* IDRT */
   if(state->idrt_b.bo)
     dri_bo_unreference(state->idrt_b.bo);
   bo = dri_bo_alloc(state->drv->bufmgr, 
@@ -565,14 +633,11 @@ gpgpu_bind_surf_2d(genx_gpgpu_state_t *state,
     state->surface_state_b[index].bo = NULL;
   }
 
-  bo = dri_bo_alloc(state->drv->bufmgr,
-                    "surface state", 
-                    sizeof(gen6_surface_state_t),
-                    32);
+  bo = dri_bo_alloc(state->drv->bufmgr, "surface state", surface_state_sz, 32);
   assert(bo);
   dri_bo_map(bo, 1);
   assert(bo->virtual);
-  ss = (gen6_surface_state_t *)bo->virtual;
+  ss = (gen6_surface_state_t*) bo->virtual;
   memset(ss, 0, sizeof(*ss));
   ss->ss0.surface_type = I965_SURFACE_2D;
   ss->ss0.surface_format = format;
@@ -584,9 +649,8 @@ gpgpu_bind_surf_2d(genx_gpgpu_state_t *state,
   ss->ss3.pitch = (w*4) - 1; /* TEMP patch */
 
   /* TODO: parse GFDT bit as well */
-  if(state->drv->gen_ver == 6) {
+  if(state->drv->gen_ver == 6)
     ss->ss5.cache_control = cchint;
-  }
 
   if (is_dst) {
     write_domain = I915_GEM_DOMAIN_RENDER;
@@ -704,9 +768,6 @@ gpgpu_bind_buf(genx_gpgpu_state_t *state,
                uint32_t size,
                uint32_t cchint)
 {
-  uint32_t size_ss = ((size + 0xf) >> 4)-1; /* ceil(size/16) - 1 */
-
-  gen6_surface_state_t *ss;
   dri_bo *bo;
   uint32_t write_domain, read_domain;
 
@@ -718,40 +779,51 @@ gpgpu_bind_buf(genx_gpgpu_state_t *state,
     state->surface_state_b[index].bo = NULL;
   }
 
-  bo = dri_bo_alloc(state->drv->bufmgr, 
-      "surface state", 
-      sizeof(gen6_surface_state_t), 32);
+  bo = dri_bo_alloc(state->drv->bufmgr, "SS_SURFACE", surface_state_sz, 32);
   assert(bo);
   dri_bo_map(bo, 1);
   assert(bo->virtual);
-  ss = (gen6_surface_state_t *)bo->virtual;
-  memset(ss, 0, sizeof(*ss));
-
-  ss->ss0.surface_type = I965_SURFACE_BUFFER;
-  ss->ss0.surface_format = I965_SURFACEFORMAT_R32G32B32A32_FLOAT;
-  ss->ss0.vert_line_stride = 0;
-  ss->ss0.vert_line_stride_ofs = 0;
-  ss->ss1.base_addr = obj_bo->offset + offset;
-  ss->ss2.width = (size_ss & 0x7f); /* bits 6:0 of size_ss */
-  ss->ss2.height = (size_ss >> 7) & 0x1fff; /* bits 19:7 of size_ss */
-  ss->ss3.pitch = 16-1;
-  ss->ss3.depth = (size_ss >> 20); /* bits 26:20 of size_ss */
-
-  /* TODO: parse GFDT bit as well */
-  if(state->drv->gen_ver==6)
-    ss->ss5.cache_control = cchint;
-
   write_domain = I915_GEM_DOMAIN_RENDER;
   read_domain = I915_GEM_DOMAIN_RENDER;
 
-  dri_bo_emit_reloc(bo,
-                    read_domain,
-                    write_domain,
-                    offset,
-                    offsetof(gen6_surface_state_t, ss1),
-                    obj_bo);
-  dri_bo_unmap(bo);
+  if(state->drv->gen_ver == 6) {
+    gen6_surface_state_t *ss = (gen6_surface_state_t *) bo->virtual;
+    const uint32_t size_ss = ((size+0xf) >> 4) - 1; /* ceil(size/16) - 1 */
+    memset(ss, 0, sizeof(*ss));
+    ss->ss0.surface_type = I965_SURFACE_BUFFER;
+    ss->ss0.surface_format = I965_SURFACEFORMAT_R32G32B32A32_FLOAT;
+    ss->ss1.base_addr = obj_bo->offset + offset;
+    ss->ss2.width = size_ss & 0x7f; /* bits 6:0 of size_ss */
+    ss->ss2.height = (size_ss >> 7) & 0x1fff; /* bits 19:7 of size_ss */
+    ss->ss3.pitch = 0xf;
+    ss->ss3.depth = size_ss >> 20; /* bits 26:20 of size_ss */
+    ss->ss5.cache_control = cchint;
+    dri_bo_emit_reloc(bo,
+                      read_domain,
+                      write_domain,
+                      offset,
+                      offsetof(gen6_surface_state_t, ss1),
+                      obj_bo);
+  } else if (state->drv->gen_ver == 7) {
+    gen7_surface_state_t *ss = (gen7_surface_state_t *) bo->virtual;
+    const uint32_t size_ss = size - 1;
+    memset(ss, 0, sizeof(*ss));
+    ss->ss0.surface_type = I965_SURFACE_BUFFER;
+    ss->ss0.surface_format = I965_SURFACEFORMAT_RAW;
+    ss->ss1.base_addr = obj_bo->offset + offset;
+    ss->ss2.width  = size_ss & 0x7f; /* bits 6:0 of size_ss */
+    ss->ss2.height = (size_ss & 0x1fff80) >> 7; /* bits 20:7 of size_ss */
+    ss->ss3.depth  = (size_ss & 0xffe00000) >> 20; /* bits 27:21 of size_ss */
+    ss->ss5.surface_object_control_state = GEN7_CACHED_IN_LLC;
+    dri_bo_emit_reloc(bo,
+                      read_domain,
+                      write_domain,
+                      offset,
+                      offsetof(gen7_surface_state_t, ss1),
+                      obj_bo);
+  }
 
+  dri_bo_unmap(bo);
   assert(index < (int) MAX_SURFACES);
   state->surface_state_b[index].bo = bo;
 }
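
The Gen6 and Gen7 branches above encode the buffer size differently: Gen6 binds the buffer as R32G32B32A32_FLOAT, so the surface state carries ceil(size/16)-1 split across the width/height/depth fields with a 16-byte pitch, while Gen7 uses the new RAW format and stores size-1 in bytes. A standalone sketch of the Gen6 split (values are illustrative, not from the patch):

/* Standalone sketch (illustration only) of the Gen6 buffer-size encoding. */
#include <assert.h>
#include <stdint.h>
#include <stdio.h>

int main(void)
{
  const uint32_t size = 1 << 20;                      /* e.g. a 1MB buffer */
  const uint32_t size_ss = ((size + 0xf) >> 4) - 1;   /* ceil(size/16) - 1 */

  const uint32_t width  = size_ss & 0x7f;             /* bits 6:0  */
  const uint32_t height = (size_ss >> 7) & 0x1fff;    /* bits 19:7 */
  const uint32_t depth  = size_ss >> 20;              /* bits 26:20 */

  /* Reassembling the three fields must give back ceil(size/16) - 1. */
  assert(((depth << 20) | (height << 7) | width) == size_ss);
  printf("width=%u height=%u depth=%u\n", width, height, depth);
  return 0;
}
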
index 2186885..d263604 100644 (file)
@@ -101,7 +101,7 @@ extern void gpgpu_bind_buf(genx_gpgpu_state_t*,
 
 /* Configure state, size in 512-bit units */
 extern void gpgpu_state_init(genx_gpgpu_state_t*,
-                             uint32_t max_thr,
+                             uint32_t max_threads,
                              uint32_t size_vfe_entry,
                              uint32_t num_vfe_entries,
                              uint32_t size_cs_entry,
index e44e675..b4e8735 100644 (file)
@@ -114,12 +114,24 @@ intel_driver_init(intel_driver_t *driver, int dev_fd)
   assert(res);
   intel_driver_memman_init(driver);
 
-  if (IS_GEN6(driver->device_id))
+#if EMULATE_GEN
+  driver->gen_ver = EMULATE_GEN;
+  if (EMULATE_GEN == 7)
+    driver->device_id = PCI_CHIP_IVYBRIDGE_GT2; /* we pick GT2 for IVB */
+  else if (EMULATE_GEN == 6)
+    driver->device_id = PCI_CHIP_SANDYBRIDGE_GT2; /* we pick GT2 for SNB */
+  else
+    FATAL ("Unsupported Gen for emulation");
+#else
+  if (IS_GEN7(driver->device_id))
+    driver->gen_ver = 7;
+  else if (IS_GEN6(driver->device_id))
     driver->gen_ver = 6;
   else if(IS_IGDNG(driver->device_id))
     driver->gen_ver = 5;
   else
     driver->gen_ver = 4;
+#endif /* EMULATE_GEN */
 }
 
 LOCAL int