lib: Add GPGPU fill

author Zhenyu Wang <zhenyuw@linux.intel.com>

Wed, 3 Dec 2014 11:05:09 +0000 (19:05 +0800)

committer Zhenyu Wang <zhenyuw@linux.intel.com>

Thu, 4 Dec 2014 02:17:16 +0000 (10:17 +0800)
author Zhenyu Wang <zhenyuw@linux.intel.com>
Wed, 3 Dec 2014 11:05:09 +0000 (19:05 +0800)
committer Zhenyu Wang <zhenyuw@linux.intel.com>
Thu, 4 Dec 2014 02:17:16 +0000 (10:17 +0800)
diff --git a/lib/gen7_media.h b/lib/gen7_media.h

index d5f9921..91294d2 100644 (file)
--- a/lib/gen7_media.h
+++ b/lib/gen7_media.h
@@ -179,6 +179,7 @@
  #define GEN7_PIPELINE_SELECT                   GFXPIPE(1, 1, 4)
  # define PIPELINE_SELECT_3D                    (0 << 0)
  # define PIPELINE_SELECT_MEDIA                 (1 << 0)
+# define PIPELINE_SELECT_GPGPU                 (2 << 0)
  
  #define GEN7_STATE_BASE_ADDRESS                        GFXPIPE(0, 1, 1)
  # define BASE_ADDRESS_MODIFY                   (1 << 0)
@@ -187,6 +188,7 @@
  #define GEN7_MEDIA_CURBE_LOAD                  GFXPIPE(2, 0, 1)
  #define GEN7_MEDIA_INTERFACE_DESCRIPTOR_LOAD   GFXPIPE(2, 0, 2)
  #define GEN7_MEDIA_OBJECT                      GFXPIPE(2, 1, 0)
+#define GEN7_GPGPU_WALKER                       GFXPIPE(2, 1, 5)
  
  struct gen7_interface_descriptor_data
  {
diff --git a/lib/intel_batchbuffer.c b/lib/intel_batchbuffer.c

index 4b3a5b8..c70f6d8 100644 (file)
--- a/lib/intel_batchbuffer.c
+++ b/lib/intel_batchbuffer.c
@@ -511,3 +511,22 @@ igt_fillfunc_t igt_get_media_fillfunc(int devid)
  
         return fill;
  }
+
+/**
+ * igt_get_gpgpu_fillfunc:
+ * @devid: pci device id
+ *
+ * Returns:
+ *
+ * The platform-specific gpgpu fill function pointer for the device specified
+ * with @devid, or NULL when no gpgpu fill function is implemented for it.
+ */
+igt_fillfunc_t igt_get_gpgpu_fillfunc(int devid)
+{
+       igt_fillfunc_t fill = NULL; /* stays NULL unless a platform matches */
+
+       if (IS_GEN7(devid)) /* gen7 is the only platform handled here */
+               fill = gen7_gpgpu_fillfunc;
+
+       return fill;
+}
diff --git a/lib/intel_batchbuffer.h b/lib/intel_batchbuffer.h

index f0e21ea..12f7be1 100644 (file)
--- a/lib/intel_batchbuffer.h
+++ b/lib/intel_batchbuffer.h
@@ -250,11 +250,11 @@ igt_render_copyfunc_t igt_get_render_copyfunc(int devid);
   * @color: fill color to use
   *
   * This is the type of the per-platform fill functions using media
- * pipeline. The platform-specific implementation can be obtained
- * by calling igt_get_media_fillfunc().
+ * or gpgpu pipeline. The platform-specific implementation can be obtained
+ * by calling igt_get_media_fillfunc() or igt_get_gpgpu_fillfunc().
   *
   * A fill function will emit a batchbuffer to the kernel which executes
- * the specified blit fill operation using the media engine.
+ * the specified blit fill operation using the media/gpgpu engine.
   */
  typedef void (*igt_fillfunc_t)(struct intel_batchbuffer *batch,
                                struct igt_buf *dst,
@@ -263,5 +263,6 @@ typedef void (*igt_fillfunc_t)(struct intel_batchbuffer *batch,
                                uint8_t color);
  
  igt_fillfunc_t igt_get_media_fillfunc(int devid);
+igt_fillfunc_t igt_get_gpgpu_fillfunc(int devid);
  
  #endif
diff --git a/lib/media_fill.h b/lib/media_fill.h

index 226489c..2a30055 100644 (file)
--- a/lib/media_fill.h
+++ b/lib/media_fill.h
@@ -32,4 +32,11 @@ gen9_media_fillfunc(struct intel_batchbuffer *batch,
                  unsigned width, unsigned height,
                  uint8_t color);
  
+void
+gen7_gpgpu_fillfunc(struct intel_batchbuffer *batch,
+                   struct igt_buf *dst,
+                   unsigned x, unsigned y,
+                   unsigned width, unsigned height,
+                   uint8_t color);
+
  #endif /* RENDE_MEDIA_FILL_H */
diff --git a/lib/media_fill_gen7.c b/lib/media_fill_gen7.c

index 5a23b7d..7113fda 100644 (file)
--- a/lib/media_fill_gen7.c
+++ b/lib/media_fill_gen7.c
@@ -8,7 +8,6 @@
  
  #include <assert.h>
  
-
  static const uint32_t media_kernel[][4] = {
         { 0x00400001, 0x20200231, 0x00000020, 0x00000000 },
         { 0x00600001, 0x20800021, 0x008d0000, 0x00000000 },
@@ -23,6 +22,23 @@ static const uint32_t media_kernel[][4] = {
         { 0x07800031, 0x20001ca8, 0x00000e00, 0x82000010 },
  };
  
+/* Assembled from shaders/gpgpu/gpgpu_fill.gxa; see shaders/gpgpu/README */
+static const uint32_t gpgpu_kernel[][4] = {
+       { 0x00400001, 0x20200231, 0x00000020, 0x00000000 },
+       { 0x00000041, 0x20400c21, 0x00000004, 0x00000010 },
+       { 0x00000001, 0x20440021, 0x00000018, 0x00000000 },
+       { 0x00600001, 0x20800021, 0x008d0000, 0x00000000 },
+       { 0x00200001, 0x20800021, 0x00450040, 0x00000000 },
+       { 0x00000001, 0x20880061, 0x00000000, 0x0000000f },
+       { 0x00800001, 0x20a00021, 0x00000020, 0x00000000 },
+       { 0x00800001, 0x20e00021, 0x00000020, 0x00000000 },
+       { 0x00800001, 0x21200021, 0x00000020, 0x00000000 },
+       { 0x00800001, 0x21600021, 0x00000020, 0x00000000 },
+       { 0x05800031, 0x24001ca8, 0x00000080, 0x120a8000 },
+       { 0x00600001, 0x2e000021, 0x008d0000, 0x00000000 },
+       { 0x07800031, 0x20001ca8, 0x00000e00, 0x82000010 },
+};
+
  static uint32_t
  batch_used(struct intel_batchbuffer *batch)
  {
@@ -160,14 +176,15 @@ gen7_fill_media_kernel(struct intel_batchbuffer *batch,
  }
  
  static uint32_t
-gen7_fill_interface_descriptor(struct intel_batchbuffer *batch, struct igt_buf *dst)
+gen7_fill_interface_descriptor(struct intel_batchbuffer *batch, struct igt_buf *dst,
+                              const uint32_t kernel[][4], size_t size)
  {
         struct gen7_interface_descriptor_data *idd;
         uint32_t offset;
         uint32_t binding_table_offset, kernel_offset;
  
         binding_table_offset = gen7_fill_binding_table(batch, dst);
-       kernel_offset = gen7_fill_media_kernel(batch, media_kernel, sizeof(media_kernel));
+       kernel_offset = gen7_fill_media_kernel(batch, kernel, size);
  
         idd = batch_alloc(batch, sizeof(*idd), 64);
         offset = batch_offset(batch, idd);
@@ -329,7 +346,9 @@ gen7_media_fillfunc(struct intel_batchbuffer *batch,
         batch->ptr = &batch->buffer[BATCH_STATE_SPLIT];
  
         curbe_buffer = gen7_fill_curbe_buffer_data(batch, color);
-       interface_descriptor = gen7_fill_interface_descriptor(batch, dst);
+       interface_descriptor = gen7_fill_interface_descriptor(batch, dst,
+                                                             media_kernel,
+                                                             sizeof(media_kernel));
         igt_assert(batch->ptr < &batch->buffer[4095]);
  
         /* media pipeline */
@@ -353,3 +372,137 @@ gen7_media_fillfunc(struct intel_batchbuffer *batch,
         gen7_render_flush(batch, batch_end);
         intel_batchbuffer_reset(batch);
  }
+
+static void
+gen7_emit_vfe_state_gpgpu(struct intel_batchbuffer *batch)
+{
+       OUT_BATCH(GEN7_MEDIA_VFE_STATE | (8 - 2)); /* 8 dwords are emitted in total */
+
+       /* scratch buffer (left as 0) */
+       OUT_BATCH(0);
+
+       /* number of threads, number of URB entries and GPGPU mode */
+       OUT_BATCH(1 << 16 | /* max number of threads */
+                 0 << 8 | /* number of URB entries */
+                 1 << 2); /* GPGPU mode */
+
+       OUT_BATCH(0);
+
+       /* URB entry size and CURBE size */
+       OUT_BATCH(0 << 16 |     /* URB entry size, in 256-bit units */
+                 1);           /* CURBE entry size, in 256-bit units */
+
+       /* scoreboard: three dwords, all zero */
+       OUT_BATCH(0);
+       OUT_BATCH(0);
+       OUT_BATCH(0);
+}
+
+static void
+gen7_emit_gpgpu_walk(struct intel_batchbuffer *batch,
+                    unsigned x, unsigned y,
+                    unsigned width, unsigned height)
+{
+       uint32_t x_dim, y_dim, tmp, right_mask;
+
+       /*
+        * Simply do SIMD16 based dispatch, so every thread uses
+        * SIMD16 channels.
+        *
+        * Define our own thread group size, e.g. 16x1 for every group, so
+        * each group has 1 thread in SIMD16 dispatch and the thread
+        * width/height/depth are all 1.
+        *
+        * Thread group X = width / 16, rounded up; thread group Y = height.
+        * NOTE(review): x and y are accepted but not used below.
+        */
+       x_dim = (width + 15) / 16;
+       y_dim = height;
+
+       tmp = width & 15;
+       if (tmp == 0)
+               right_mask = (1 << 16) - 1;
+       else
+               right_mask = (1 << tmp) - 1;
+
+       OUT_BATCH(GEN7_GPGPU_WALKER | 9); /* 11 dwords are emitted in total */
+
+       /* interface descriptor offset */
+       OUT_BATCH(0);
+
+       /* SIMD size, thread w/h/d */
+       OUT_BATCH(1 << 30 | /* SIMD16 */
+                 0 << 16 | /* depth:1 */
+                 0 << 8 | /* height:1 */
+                 0); /* width:1 */
+
+       /* thread group X */
+       OUT_BATCH(0);
+       OUT_BATCH(x_dim);
+
+       /* thread group Y */
+       OUT_BATCH(0);
+       OUT_BATCH(y_dim);
+
+       /* thread group Z */
+       OUT_BATCH(0);
+       OUT_BATCH(1);
+
+       /* right mask: low (width & 15) bits set, or all 16 bits if that is 0 */
+       OUT_BATCH(right_mask);
+
+       /* bottom mask, height 1, always 0xffffffff */
+       OUT_BATCH(0xffffffff);
+}
+
+void
+gen7_gpgpu_fillfunc(struct intel_batchbuffer *batch,
+                   struct igt_buf *dst,
+                   unsigned x, unsigned y,
+                   unsigned width, unsigned height,
+                   uint8_t color)
+{
+       uint32_t curbe_buffer, interface_descriptor;
+       uint32_t batch_end;
+
+       intel_batchbuffer_flush(batch);
+
+       /* set up state, starting at BATCH_STATE_SPLIT */
+       batch->ptr = &batch->buffer[BATCH_STATE_SPLIT];
+
+       /*
+        * The constant buffer has to be filled for every thread, but with just
+        * 1 thread per group a single set of curbe data is enough.
+        *
+        * Each thread uses its thread group ID to compute its buffer offset.
+        */
+       curbe_buffer = gen7_fill_curbe_buffer_data(batch, color);
+
+       interface_descriptor = gen7_fill_interface_descriptor(batch, dst,
+                                                             gpgpu_kernel,
+                                                             sizeof(gpgpu_kernel));
+       igt_assert(batch->ptr < &batch->buffer[4095]);
+
+       batch->ptr = batch->buffer; /* commands go at the start of the buffer */
+
+       /* GPGPU pipeline */
+       OUT_BATCH(GEN7_PIPELINE_SELECT | PIPELINE_SELECT_GPGPU);
+
+       gen7_emit_state_base_address(batch);
+
+       gen7_emit_vfe_state_gpgpu(batch);
+
+       gen7_emit_curbe_load(batch, curbe_buffer);
+
+       gen7_emit_interface_descriptor_load(batch, interface_descriptor);
+
+       gen7_emit_gpgpu_walk(batch, x, y, width, height);
+
+       OUT_BATCH(MI_BATCH_BUFFER_END);
+
+       batch_end = batch_align(batch, 8);
+       igt_assert(batch_end < BATCH_STATE_SPLIT); /* commands must end before the state area */
+
+       gen7_render_flush(batch, batch_end);
+       intel_batchbuffer_reset(batch);
+}
diff --git a/shaders/gpgpu/README b/shaders/gpgpu/README

new file mode 100644 (file)

index 0000000..3bf328a
--- /dev/null
+++ b/shaders/gpgpu/README
@@ -0,0 +1,4 @@
+
+Commands used to generate the shader on gen7
+$> m4 gpgpu_fill.gxa > gpgpu_fill.gxm
+$> intel-gen4asm -g 7 -o <output> gpgpu_fill.gxm
diff --git a/shaders/gpgpu/gpgpu_fill.gxa b/shaders/gpgpu/gpgpu_fill.gxa

new file mode 100644 (file)

index 0000000..fc309f3
--- /dev/null
+++ b/shaders/gpgpu/gpgpu_fill.gxa
@@ -0,0 +1,51 @@
+/*
+ * Registers
+ * g0 -- header
+ * g1 -- constant
+ * g2 -- calculate X/Y offset
+ * g4-g12 payload for write message
+ */
+define(`ORIG',          `g2.0<2,2,1>UD')
+define(`ORIG_X',        `g2.0<1>UD')
+define(`ORIG_Y',        `g2.4<1>UD')
+define(`COLOR',         `g1.0')
+define(`COLORUB',       `COLOR<0,1,0>UB')
+define(`COLORUD',       `COLOR<0,1,0>UD')
+define(`X',             `g0.4<0,1,0>UD')
+define(`Y',             `g0.24<0,1,0>UD')
+
+mov(4)  COLOR<1>UB      COLORUB         {align1};
+
+/* WRITE */
+/* compute the X/Y offset from the thread group ID */
+mul(1)  ORIG_X          X        0x10UD {align1};
+mov(1)  ORIG_Y          Y               {align1};
+mov(8)  g4.0<1>UD       g0.0<8,8,1>UD   {align1};
+mov(2)  g4.0<1>UD       ORIG            {align1};
+/* Normal mode: for block height 1 row and block width 16 bytes */
+mov(1)  g4.8<1>UD       0x0000000fUD    {align1};
+
+mov(16) g5.0<1>UD       COLORUD         {align1 compr};
+mov(16) g7.0<1>UD       COLORUD         {align1 compr};
+mov(16) g9.0<1>UD       COLORUD         {align1 compr};
+mov(16) g11.0<1>UD      COLORUD         {align1 compr};
+
+/*
+ * comment out the following instruction on Gen7
+ * write(0, 0, 10, 12)
+ *   10: media_block_write
+ *   12: data cache data port 1
+ */
+send(16) 4 acc0<1>UW null write(0, 0, 10, 12) mlen 9 rlen 0 {align1};
+
+/*
+ * uncomment the following instruction on Gen7
+ * write(0, 0, 10, 0)
+ *   10: media_block_write
+ *    0: render cache data port
+ */
+/* send(16) 4 acc0<1>UW null write(0, 0, 10, 0) mlen 9 rlen 0 {align1}; */
+
+/* EOT */
+mov(8)  g112.0<1>UD       g0.0<8,8,1>UD   {align1};
+send(16) 112 null<1>UW null thread_spawner(0, 0, 1) mlen 1 rlen 0 {align1 EOT};
author	Zhenyu Wang <zhenyuw@linux.intel.com>
	Wed, 3 Dec 2014 11:05:09 +0000 (19:05 +0800)
committer	Zhenyu Wang <zhenyuw@linux.intel.com>
	Thu, 4 Dec 2014 02:17:16 +0000 (10:17 +0800)
lib/gen7_media.h		patch \| blob \| history
lib/intel_batchbuffer.c		patch \| blob \| history
lib/intel_batchbuffer.h		patch \| blob \| history
lib/media_fill.h		patch \| blob \| history
lib/media_fill_gen7.c		patch \| blob \| history
shaders/gpgpu/README	[new file with mode: 0644]	patch \| blob
shaders/gpgpu/gpgpu_fill.gxa	[new file with mode: 0644]	patch \| blob