#include <assert.h>
-
static const uint32_t media_kernel[][4] = {
{ 0x00400001, 0x20200231, 0x00000020, 0x00000000 },
{ 0x00600001, 0x20800021, 0x008d0000, 0x00000000 },
{ 0x07800031, 0x20001ca8, 0x00000e00, 0x82000010 },
};
+/*
+ * Kernel binary handed to gen7_fill_interface_descriptor() by
+ * gen7_gpgpu_fillfunc(). Source: shaders/gpgpu/gpgpu_fill.gxa
+ *
+ * Each row is four 32-bit words. The table has 13 rows, the same as the
+ * number of instructions in the shader source, so presumably one row
+ * encodes one instruction -- TODO confirm against the assembler output.
+ */
+static const uint32_t gpgpu_kernel[][4] = {
+ { 0x00400001, 0x20200231, 0x00000020, 0x00000000 },
+ { 0x00000041, 0x20400c21, 0x00000004, 0x00000010 },
+ { 0x00000001, 0x20440021, 0x00000018, 0x00000000 },
+ { 0x00600001, 0x20800021, 0x008d0000, 0x00000000 },
+ { 0x00200001, 0x20800021, 0x00450040, 0x00000000 },
+ { 0x00000001, 0x20880061, 0x00000000, 0x0000000f },
+ { 0x00800001, 0x20a00021, 0x00000020, 0x00000000 },
+ { 0x00800001, 0x20e00021, 0x00000020, 0x00000000 },
+ { 0x00800001, 0x21200021, 0x00000020, 0x00000000 },
+ { 0x00800001, 0x21600021, 0x00000020, 0x00000000 },
+ { 0x05800031, 0x24001ca8, 0x00000080, 0x120a8000 },
+ { 0x00600001, 0x2e000021, 0x008d0000, 0x00000000 },
+ { 0x07800031, 0x20001ca8, 0x00000e00, 0x82000010 },
+};
+
static uint32_t
batch_used(struct intel_batchbuffer *batch)
{
}
static uint32_t
-gen7_fill_interface_descriptor(struct intel_batchbuffer *batch, struct igt_buf *dst)
+gen7_fill_interface_descriptor(struct intel_batchbuffer *batch, struct igt_buf *dst,
+ const uint32_t kernel[][4], size_t size)
{
struct gen7_interface_descriptor_data *idd;
uint32_t offset;
uint32_t binding_table_offset, kernel_offset;
binding_table_offset = gen7_fill_binding_table(batch, dst);
- kernel_offset = gen7_fill_media_kernel(batch, media_kernel, sizeof(media_kernel));
+ kernel_offset = gen7_fill_media_kernel(batch, kernel, size);
idd = batch_alloc(batch, sizeof(*idd), 64);
offset = batch_offset(batch, idd);
batch->ptr = &batch->buffer[BATCH_STATE_SPLIT];
curbe_buffer = gen7_fill_curbe_buffer_data(batch, color);
- interface_descriptor = gen7_fill_interface_descriptor(batch, dst);
+ interface_descriptor = gen7_fill_interface_descriptor(batch, dst,
+ media_kernel,
+ sizeof(media_kernel));
igt_assert(batch->ptr < &batch->buffer[4095]);
/* media pipeline */
gen7_render_flush(batch, batch_end);
intel_batchbuffer_reset(batch);
}
+/*
+ * Emit a MEDIA_VFE_STATE command set up for GPGPU mode.
+ *
+ * Eight dwords are written through OUT_BATCH: the command header (length
+ * field 8 - 2), a zero scratch-buffer dword, the thread/URB dword with the
+ * GPGPU mode bit set, one zero dword, the URB/CURBE size dword and three
+ * zero scoreboard dwords.
+ *
+ * @batch: batch buffer being built. It is not referenced by name in this
+ *         body, so presumably OUT_BATCH picks it up -- confirm against the
+ *         macro definition.
+ */
+static void
+gen7_emit_vfe_state_gpgpu(struct intel_batchbuffer *batch)
+{
+ OUT_BATCH(GEN7_MEDIA_VFE_STATE | (8 - 2));
+
+ /* scratch buffer: none */
+ OUT_BATCH(0);
+
+ /* number of threads & urb entries */
+ OUT_BATCH(1 << 16 | /* max num of threads */
+ 0 << 8 | /* num of URB entry */
+ 1 << 2); /* GPGPU mode */
+
+ OUT_BATCH(0);
+
+ /* urb entry size & curbe size */
+ OUT_BATCH(0 << 16 | /* URB entry size in 256 bits unit */
+ 1); /* CURBE entry size in 256 bits unit */
+
+ /* scoreboard: all three dwords left at zero */
+ OUT_BATCH(0);
+ OUT_BATCH(0);
+ OUT_BATCH(0);
+}
+/*
+ * Emit a GPGPU_WALKER command covering a width x height area.
+ *
+ * Eleven dwords are written through OUT_BATCH. The walk uses
+ * (width + 15) / 16 thread groups in X, height groups in Y and one in Z,
+ * with a right execution mask that enables only the valid channels of the
+ * last group in a row.
+ *
+ * @batch:  batch buffer being built (not referenced by name here;
+ *          presumably consumed by OUT_BATCH -- confirm).
+ * @x, @y:  accepted but never read in this body.
+ *          NOTE(review): both thread group dword pairs start with 0, so the
+ *          walk looks like it always begins at the origin regardless of
+ *          x/y -- confirm this is intended.
+ * @width:  area width; the kernel source describes each block as 16 bytes wide.
+ * @height: area height; one thread group per row.
+ */
+static void
+gen7_emit_gpgpu_walk(struct intel_batchbuffer *batch,
+ unsigned x, unsigned y,
+ unsigned width, unsigned height)
+{
+ uint32_t x_dim, y_dim, tmp, right_mask;
+
+ /*
+ * Simply do SIMD16 based dispatch, so every thread uses
+ * SIMD16 channels.
+ *
+ * Define our own thread group size, e.g. 16x1 for every group, so
+ * each group holds exactly 1 thread in SIMD16 dispatch. Thread
+ * width/height/depth are therefore all 1.
+ *
+ * Then thread group X = width rounded up to a multiple of 16,
+ * divided by 16
+ * thread group Y = height;
+ */
+ x_dim = (width + 15) / 16;
+ y_dim = height;
+
+ tmp = width & 15; /* channels used by the last group in a row */
+ if (tmp == 0)
+ right_mask = (1 << 16) - 1; /* width is a multiple of 16: all 16 channels */
+ else
+ right_mask = (1 << tmp) - 1; /* only the low tmp channels */
+
+ OUT_BATCH(GEN7_GPGPU_WALKER | 9); /* 11 dwords emitted in total; 9 is presumably total - 2 */
+
+ /* interface descriptor offset */
+ OUT_BATCH(0);
+
+ /* SIMD size, thread w/h/d */
+ OUT_BATCH(1 << 30 | /* SIMD16 */
+ 0 << 16 | /* depth:1 */
+ 0 << 8 | /* height:1 */
+ 0); /* width:1 */
+
+ /* thread group X (presumably starting ID, then dimension -- confirm) */
+ OUT_BATCH(0);
+ OUT_BATCH(x_dim);
+
+ /* thread group Y */
+ OUT_BATCH(0);
+ OUT_BATCH(y_dim);
+
+ /* thread group Z */
+ OUT_BATCH(0);
+ OUT_BATCH(1);
+
+ /* right mask */
+ OUT_BATCH(right_mask);
+
+ /* bottom mask, height 1, always 0xffffffff */
+ OUT_BATCH(0xffffffff);
+}
+/*
+ * Build and submit a batch that runs gpgpu_kernel through the GPGPU
+ * pipeline.
+ *
+ * The state objects (CURBE data carrying the colour, and the interface
+ * descriptor referencing gpgpu_kernel and dst) are written starting at
+ * BATCH_STATE_SPLIT. The write pointer is then rewound to the start of the
+ * buffer and the command stream is emitted: pipeline select, state base
+ * address, VFE state, CURBE load, interface descriptor load, GPGPU walker
+ * and MI_BATCH_BUFFER_END. Both regions are bounds-checked with igt_assert
+ * before the batch is passed to gen7_render_flush() and reset.
+ *
+ * @batch:  batch buffer to build into; flushed first and reset at the end.
+ * @dst:    destination buffer, passed to gen7_fill_interface_descriptor().
+ * @x, @y, @width, @height: forwarded unchanged to gen7_emit_gpgpu_walk().
+ * @color:  fill value, passed to gen7_fill_curbe_buffer_data().
+ */
+void
+gen7_gpgpu_fillfunc(struct intel_batchbuffer *batch,
+ struct igt_buf *dst,
+ unsigned x, unsigned y,
+ unsigned width, unsigned height,
+ uint8_t color)
+{
+ uint32_t curbe_buffer, interface_descriptor;
+ uint32_t batch_end;
+
+ intel_batchbuffer_flush(batch);
+
+ /* setup states */
+ batch->ptr = &batch->buffer[BATCH_STATE_SPLIT];
+
+ /*
+ * The constant buffer has to be filled for every thread, but since
+ * each thread group contains just 1 thread, a single set of CURBE
+ * data is enough.
+ *
+ * Each thread simply uses its thread group ID as the buffer offset.
+ */
+ curbe_buffer = gen7_fill_curbe_buffer_data(batch, color);
+
+ interface_descriptor = gen7_fill_interface_descriptor(batch, dst,
+ gpgpu_kernel,
+ sizeof(gpgpu_kernel));
+ igt_assert(batch->ptr < &batch->buffer[4095]); /* state area must end before element 4095 */
+
+ batch->ptr = batch->buffer; /* rewind: commands go at the start of the buffer */
+
+ /* GPGPU pipeline */
+ OUT_BATCH(GEN7_PIPELINE_SELECT | PIPELINE_SELECT_GPGPU);
+
+ gen7_emit_state_base_address(batch);
+
+ gen7_emit_vfe_state_gpgpu(batch);
+
+ gen7_emit_curbe_load(batch, curbe_buffer);
+
+ gen7_emit_interface_descriptor_load(batch, interface_descriptor);
+
+ gen7_emit_gpgpu_walk(batch, x, y, width, height);
+
+ OUT_BATCH(MI_BATCH_BUFFER_END);
+
+ batch_end = batch_align(batch, 8);
+ igt_assert(batch_end < BATCH_STATE_SPLIT); /* commands must not run into the state area */
+
+ gen7_render_flush(batch, batch_end);
+ intel_batchbuffer_reset(batch);
+}
--- /dev/null
+/*
+ * Registers
+ * g0 -- header
+ * g1 -- constant
+ * g2 -- calculate X/Y offset
+ * g4-g12 payload for write message
+ * g112 -- payload for the end-of-thread message
+ */
+define(`ORIG', `g2.0<2,2,1>UD')
+define(`ORIG_X', `g2.0<1>UD')
+define(`ORIG_Y', `g2.4<1>UD')
+define(`COLOR', `g1.0')
+define(`COLORUB', `COLOR<0,1,0>UB')
+define(`COLORUD', `COLOR<0,1,0>UD')
+define(`X', `g0.4<0,1,0>UD')
+define(`Y', `g0.24<0,1,0>UD')
+
+mov(4) COLOR<1>UB COLORUB {align1};
+
+/* WRITE */
+/* derive the X/Y offset from the thread group ID: X is scaled by 16 and Y is used as is */
+mul(1) ORIG_X X 0x10UD {align1};
+mov(1) ORIG_Y Y {align1};
+mov(8) g4.0<1>UD g0.0<8,8,1>UD {align1};
+mov(2) g4.0<1>UD ORIG {align1};
+/* Normal mode: for block height 1 row and block width 16 bytes */
+mov(1) g4.8<1>UD 0x0000000fUD {align1};
+
+mov(16) g5.0<1>UD COLORUD {align1 compr};
+mov(16) g7.0<1>UD COLORUD {align1 compr};
+mov(16) g9.0<1>UD COLORUD {align1 compr};
+mov(16) g11.0<1>UD COLORUD {align1 compr};
+
+/*
+ * comment out the following instruction on Gen7
+ * write(0, 0, 10, 12)
+ * 10: media_block_write
+ * 12: data cache data port 1
+ */
+send(16) 4 acc0<1>UW null write(0, 0, 10, 12) mlen 9 rlen 0 {align1};
+
+/*
+ * uncomment the following instruction on Gen7
+ * write(0, 0, 10, 0)
+ * 10: media_block_write
+ * 0: render cache data port
+ */
+/* send(16) 4 acc0<1>UW null write(0, 0, 10, 0) mlen 9 rlen 0 {align1}; */
+
+/* EOT */
+mov(8) g112.0<1>UD g0.0<8,8,1>UD {align1};
+send(16) 112 null<1>UW null thread_spawner(0, 0, 1) mlen 1 rlen 0 {align1 EOT};