Implement sampler support.
author Zhigang Gong <zhigang.gong@linux.intel.com>
Sun, 6 Jan 2013 05:16:49 +0000 (13:16 +0800)
committer Zhigang Gong <zhigang.gong@linux.intel.com>
Wed, 10 Apr 2013 06:51:31 +0000 (14:51 +0800)
Accept a sampler kernel argument and pass the sampler to the driver side.
On the driver side, the gen7 driver allocates a sampler slot, initializes
one sampler state according to the cl sampler state, and then passes
the slot index back to the curbe array.

After this commit, read_imagei(image, sampler, coord) could work as
expected.

Signed-off-by: Zhigang Gong <zhigang.gong@linux.intel.com>
Reviewed-by: Lu Guanqun <guanqun.lu@intel.com>
src/cl_command_queue.c
src/cl_driver.h
src/cl_driver_defs.c
src/cl_kernel.c
src/cl_kernel.h
src/intel/intel_defines.h
src/intel/intel_gpgpu.c

index 943a332..d31f58a 100644 (file)
@@ -107,7 +107,8 @@ cl_command_queue_bind_surface(cl_command_queue queue, cl_kernel k)
     arg_type = gbe_kernel_get_arg_type(k->opaque, i);
     if (arg_type != GBE_ARG_GLOBAL_PTR &&
         arg_type != GBE_ARG_CONSTANT_PTR &&
-        arg_type != GBE_ARG_IMAGE)
+        arg_type != GBE_ARG_IMAGE &&
+        arg_type != GBE_ARG_SAMPLER)
       continue;
     offset = gbe_kernel_get_curbe_offset(k->opaque, GBE_CURBE_KERNEL_ARGUMENT, i);
     if (arg_type == GBE_ARG_IMAGE) {
@@ -116,6 +117,9 @@ cl_command_queue_bind_surface(cl_command_queue queue, cl_kernel k)
                             k->args[i].mem->intel_fmt, k->args[i].mem->w,
                             k->args[i].mem->h, k->args[i].mem->pitch,
                             k->args[i].mem->tiling);
+    } else if (arg_type == GBE_ARG_SAMPLER) {
+      uint32_t *curbe_index = (uint32_t*)(k->curbe + offset);
+      cl_gpgpu_insert_sampler(queue->gpgpu, curbe_index, k->args[i].sampler);
     } else
       cl_gpgpu_bind_buf(queue->gpgpu, k->args[i].mem->bo, offset, cc_llc_l3);
   }
index 8c86f32..43719a9 100644 (file)
@@ -47,6 +47,7 @@ typedef struct _cl_driver *cl_driver;
 typedef struct _cl_gpgpu *cl_gpgpu;
 
 typedef struct _cl_context_prop *cl_context_prop;
+typedef struct _cl_sampler *cl_sampler;
 
 /**************************************************************************
  * Driver
@@ -109,6 +110,10 @@ extern cl_gpgpu_delete_cb *cl_gpgpu_delete;
 typedef void (cl_gpgpu_bind_buf_cb)(cl_gpgpu, cl_buffer, uint32_t offset, uint32_t cchint);
 extern cl_gpgpu_bind_buf_cb *cl_gpgpu_bind_buf;
 
+/* Insert a sampler: callback taking the gpgpu state, a pointer into the curbe and the cl sampler (the gen7 implementation in this change stores the allocated sampler slot index in *curbe_index) */
+typedef void (cl_gpgpu_insert_sampler_cb)(cl_gpgpu, uint32_t *curbe_index, cl_sampler sampler);
+extern cl_gpgpu_insert_sampler_cb *cl_gpgpu_insert_sampler;
+
 /* Set a 2d texture */
 typedef void (cl_gpgpu_bind_image2D_cb)(cl_gpgpu state,
                                         uint32_t *curbe_index,
index a0d0d8c..a9c1e22 100644 (file)
@@ -57,4 +57,5 @@ LOCAL cl_gpgpu_batch_start_cb *cl_gpgpu_batch_start = NULL;
 LOCAL cl_gpgpu_batch_end_cb *cl_gpgpu_batch_end = NULL;
 LOCAL cl_gpgpu_flush_cb *cl_gpgpu_flush = NULL;
 LOCAL cl_gpgpu_walker_cb *cl_gpgpu_walker = NULL;
+LOCAL cl_gpgpu_insert_sampler_cb *cl_gpgpu_insert_sampler = NULL;
 
index 831d36b..356a8a7 100644 (file)
@@ -25,6 +25,7 @@
 #include "cl_alloc.h"
 #include "cl_utils.h"
 #include "CL/cl.h"
+#include "cl_sampler.h"
 
 #include <stdio.h>
 #include <string.h>
@@ -126,6 +127,22 @@ cl_kernel_set_arg(cl_kernel k, cl_uint index, size_t sz, const void *value)
     return CL_SUCCESS;
   }
 
+  /* For a sampler*/
+  if (arg_type == GBE_ARG_SAMPLER) {
+     cl_sampler sampler;
+     if (UNLIKELY(value == NULL))
+      return CL_INVALID_KERNEL_ARGS;
+     sampler = *(cl_sampler*)value;
+
+     if (UNLIKELY(sampler->magic != CL_MAGIC_SAMPLER_HEADER))
+       return CL_INVALID_ARG_VALUE;
+     k->args[index].local_sz = 0;
+     k->args[index].is_set = 1;
+     k->args[index].mem = NULL;
+     k->args[index].sampler = sampler;
+     return CL_SUCCESS;
+  }
+
   /* Otherwise, we just need to check that this is a buffer */
   if (UNLIKELY(value == NULL))
     return CL_INVALID_KERNEL_ARGS;
index 0347d0a..e444f3b 100644 (file)
@@ -36,6 +36,7 @@ struct _gbe_kernel;
  */
 typedef struct cl_argument {
   cl_mem mem;           /* For image and regular buffers */
+  cl_sampler sampler;   /* For sampler arguments; cl_kernel_set_arg sets mem to NULL for these */
   uint32_t local_sz:31; /* For __local size specification */
   uint32_t is_set:1;    /* All args must be set before NDRange */
 } cl_argument;
index fbf4619..19bdbed 100644 (file)
 #define GEN7_PIPE_CONTROL_OPCODE_3D_CONTROL 0x2
 #define GEN7_PIPE_CONTROL_SUBOPCODE_3D_CONTROL 0x0
 
+#define GEN_MAPFILTER_NEAREST        0x0 /* GEN_MAPFILTER_*: values stored in ss0.min_filter / ss0.mag_filter by intel_gpgpu_insert_sampler */
+#define GEN_MAPFILTER_LINEAR         0x1
+#define GEN_MAPFILTER_ANISOTROPIC    0x2
+
+#define GEN_MIPFILTER_NONE        0 /* GEN_MIPFILTER_*: values for ss0.mip_filter; this change only ever stores NONE */
+#define GEN_MIPFILTER_NEAREST     1
+#define GEN_MIPFILTER_LINEAR      3
+
+#define GEN_ADDRESS_ROUNDING_ENABLE_U_MAG      0x20 /* GEN_ADDRESS_ROUNDING_ENABLE_*: single-bit flags OR-ed into ss3.address_round */
+#define GEN_ADDRESS_ROUNDING_ENABLE_U_MIN      0x10
+#define GEN_ADDRESS_ROUNDING_ENABLE_V_MAG      0x08
+#define GEN_ADDRESS_ROUNDING_ENABLE_V_MIN      0x04
+#define GEN_ADDRESS_ROUNDING_ENABLE_R_MAG      0x02
+#define GEN_ADDRESS_ROUNDING_ENABLE_R_MIN      0x01
+
+#define GEN_TEXCOORDMODE_WRAP            0 /* GEN_TEXCOORDMODE_*: results of translate_wrap_mode, stored in ss3.r/s/t_wrap_mode */
+#define GEN_TEXCOORDMODE_MIRROR          1
+#define GEN_TEXCOORDMODE_CLAMP           2
+#define GEN_TEXCOORDMODE_CUBE            3
+#define GEN_TEXCOORDMODE_CLAMP_BORDER    4
+#define GEN_TEXCOORDMODE_MIRROR_ONCE     5
+
 #endif /* __GENX_DEFINES_H__ */
 
index 119bf31..c035076 100644 (file)
@@ -37,6 +37,7 @@
 
 #include "cl_alloc.h"
 #include "cl_utils.h"
+#include "cl_sampler.h"
 
 #define GEN_CMD_MEDIA_OBJECT  (0x71000000)
 #define MO_TS_BIT             (1 << 24)
@@ -56,6 +57,8 @@ enum { max_buf_n = 128 };
 
 enum { max_img_n = 32 };
 
+enum {max_sampler_n = 16 }; /* sampler slot count: intel_gpgpu_state_init leaves this many low bits of sampler_bitmap clear */
+
 /* Handle GPGPU state */
 struct intel_gpgpu
 {
@@ -70,6 +73,8 @@ struct intel_gpgpu
   unsigned int img_index_base;          /* base index for image surface.*/
   drm_intel_bo *binded_img[max_img_n];  /* all images binded for the call */
 
+  unsigned int sampler_bitmap;          /* sampler usage bitmap. */
+
   struct { drm_intel_bo *bo; } stack_b;
   struct { drm_intel_bo *bo; } idrt_b;
   struct { drm_intel_bo *bo; } surface_heap_b;
@@ -347,6 +352,7 @@ intel_gpgpu_state_init(intel_gpgpu_t *gpgpu,
   gpgpu->binded_n = 0;
   gpgpu->img_bitmap = 0;
   gpgpu->img_index_base = 2;
+  gpgpu->sampler_bitmap = ~((1 << max_sampler_n) - 1);
 
   /* URB */
   gpgpu->urb.num_cs_entries = 64;
@@ -481,6 +487,16 @@ intel_gpgpu_get_free_img_index(intel_gpgpu_t *gpgpu)
   return slot + gpgpu->img_index_base;
 }
 
+static int /* reserves one sampler slot in gpgpu->sampler_bitmap and returns slot - max_sampler_n */
+intel_gpgpu_get_free_sampler_index(intel_gpgpu_t *gpgpu)
+{
+  int slot;
+  assert(~gpgpu->sampler_bitmap != 0); /* at least one bit of the bitmap must still be clear */
+  slot = __fls(~gpgpu->sampler_bitmap); /* NOTE(review): __fls is defined outside this view; the (31 - slot) and (slot - max_sampler_n) arithmetic below presumably expects a leading-zero count of the clear-bit mask — confirm */
+  gpgpu->sampler_bitmap |= (1 << (31 - slot)); /* set bit (31 - slot) so the assert/search above no longer sees it as clear */
+  return slot - max_sampler_n; /* bitmap starts with every bit above the low max_sampler_n bits already set (see intel_gpgpu_state_init) */
+}
+
 static void
 intel_gpgpu_bind_image2D_gen7(intel_gpgpu_t *gpgpu,
                               uint32_t *curbe_index,
@@ -638,6 +654,88 @@ intel_gpgpu_upload_samplers(intel_gpgpu_t *gpgpu, const void *data, uint32_t n)
   }
 }
 
+int translate_wrap_mode(uint32_t cl_address_mode, int using_nearest) /* maps a CL_ADDRESS_* value to a GEN_TEXCOORDMODE_* value */
+{
+   switch( cl_address_mode ) {
+   case CL_ADDRESS_NONE: /* falls through: handled the same as CL_ADDRESS_REPEAT */
+   case CL_ADDRESS_REPEAT:
+      return GEN_TEXCOORDMODE_WRAP;
+   case CL_ADDRESS_CLAMP: /* NOTE(review): the comment below is phrased for OpenGL's GL_CLAMP and fragment shaders — confirm it holds for CL_ADDRESS_CLAMP */
+      /* GL_CLAMP is the weird mode where coordinates are clamped to
+       * [0.0, 1.0], so linear filtering of coordinates outside of
+       * [0.0, 1.0] give you half edge texel value and half border
+       * color.  The fragment shader will clamp the coordinates, and
+       * we set clamp_border here, which gets the result desired.  We
+       * just use clamp(_to_edge) for nearest, because for nearest
+       * clamping to 1.0 gives border color instead of the desired
+       * edge texels.
+       */
+      if (using_nearest) /* nonzero when the caller selected CL_FILTER_NEAREST */
+         return GEN_TEXCOORDMODE_CLAMP;
+      else
+         return GEN_TEXCOORDMODE_CLAMP_BORDER;
+   case CL_ADDRESS_CLAMP_TO_EDGE:
+      return GEN_TEXCOORDMODE_CLAMP;
+   case CL_ADDRESS_MIRRORED_REPEAT:
+      return GEN_TEXCOORDMODE_MIRROR;
+   default: /* any unrecognized value gets the same result as CL_ADDRESS_REPEAT */
+      return GEN_TEXCOORDMODE_WRAP;
+   }
+}
+
+static void /* fills one gen7 sampler state from a cl sampler and stores the slot index in *curbe_index */
+intel_gpgpu_insert_sampler(intel_gpgpu_t *gpgpu, uint32_t *curbe_index, cl_sampler cl_sampler)
+{
+  int index;
+  int using_nearest = 0; /* set to 1 only for CL_FILTER_NEAREST; consulted by translate_wrap_mode */
+  uint32_t wrap_mode;
+  gen7_sampler_state_t *sampler;
+
+  index = intel_gpgpu_get_free_sampler_index(gpgpu); /* reserve a slot in gpgpu->sampler_bitmap */
+  sampler = (gen7_sampler_state_t *)gpgpu->sampler_state_b.bo->virtual + index; /* index-th gen7_sampler_state_t entry; assumes bo->virtual is a valid CPU mapping here — TODO confirm */
+  if (!cl_sampler->normalized_coords) /* hardware field has the opposite sense of the cl flag */
+    sampler->ss3.non_normalized_coord = 1;
+  else
+    sampler->ss3.non_normalized_coord = 0;
+
+  switch (cl_sampler->filter) { /* NOTE(review): no default case; any other filter value leaves the ss0 filter fields as they were */
+  case CL_FILTER_NEAREST:
+    sampler->ss0.min_filter = GEN_MAPFILTER_NEAREST;
+    sampler->ss0.mip_filter = GEN_MIPFILTER_NONE;
+    sampler->ss0.mag_filter = GEN_MAPFILTER_NEAREST;
+    using_nearest = 1;
+    break;
+  case CL_FILTER_LINEAR:
+    sampler->ss0.min_filter = GEN_MAPFILTER_LINEAR;
+    sampler->ss0.mip_filter = GEN_MIPFILTER_NONE;
+    sampler->ss0.mag_filter = GEN_MAPFILTER_LINEAR;
+    break;
+  }
+
+  wrap_mode = translate_wrap_mode(cl_sampler->address, using_nearest); /* one mode, applied to all three coordinates below */
+  sampler->ss3.r_wrap_mode = wrap_mode;
+  sampler->ss3.s_wrap_mode = wrap_mode;
+  sampler->ss3.t_wrap_mode = wrap_mode;
+
+  sampler->ss0.lod_preclamp = 1; /* OpenGL mode */
+  sampler->ss0.default_color_mode = 0; /* OpenGL/DX10 mode */
+
+  sampler->ss0.base_level = 0;
+
+  sampler->ss1.max_lod = 0;
+  sampler->ss1.min_lod = 0;
+
+  if (sampler->ss0.min_filter != GEN_MAPFILTER_NEAREST) /* NOTE(review): address_round is only OR-ed into in this function, never cleared; presumably the state entry starts zeroed — confirm */
+     sampler->ss3.address_round |= GEN_ADDRESS_ROUNDING_ENABLE_U_MIN |
+                                   GEN_ADDRESS_ROUNDING_ENABLE_V_MIN |
+                                   GEN_ADDRESS_ROUNDING_ENABLE_R_MIN;
+  if (sampler->ss0.mag_filter != GEN_MAPFILTER_NEAREST)
+     sampler->ss3.address_round |= GEN_ADDRESS_ROUNDING_ENABLE_U_MAG |
+                                   GEN_ADDRESS_ROUNDING_ENABLE_V_MAG |
+                                   GEN_ADDRESS_ROUNDING_ENABLE_R_MAG;
+  *curbe_index = index; /* hand the slot index back through the caller's curbe pointer */
+}
+
 static void
 intel_gpgpu_states_setup(intel_gpgpu_t *gpgpu, cl_gpgpu_kernel *kernel)
 {
@@ -712,5 +810,6 @@ intel_set_gpgpu_callbacks(void)
   cl_gpgpu_batch_end = (cl_gpgpu_batch_end_cb *) intel_gpgpu_batch_end;
   cl_gpgpu_flush = (cl_gpgpu_flush_cb *) intel_gpgpu_flush;
   cl_gpgpu_walker = (cl_gpgpu_walker_cb *) intel_gpgpu_walker;
+  cl_gpgpu_insert_sampler = (cl_gpgpu_insert_sampler_cb *) intel_gpgpu_insert_sampler;
 }