const cl_event * event_wait_list,
cl_event * event)
{
+ size_t fixed_global_off[] = {0,0,0};
+ size_t fixed_global_sz[] = {1,1,1};
+ size_t fixed_local_sz[] = {16,1,1};
cl_int err = CL_SUCCESS;
cl_int i;
FATAL_IF(event_wait_list != NULL, "Events are not supported");
FATAL_IF(event != NULL, "Events are not supported");
+ /* The fixed-size arrays below only hold up to three dimensions */
+ if (work_dim < 1 || work_dim > 3) {
+   err = CL_INVALID_WORK_DIMENSION;
+   goto error;
+ }
+ if (local_work_size != NULL)
+ for (i = 0; i < work_dim; ++i)
+ fixed_local_sz[i] = local_work_size[i];
+ if (global_work_size != NULL)
+ for (i = 0; i < work_dim; ++i)
+ fixed_global_sz[i] = global_work_size[i];
+ if (global_work_offset != NULL)
+ for (i = 0; i < work_dim; ++i)
+ fixed_global_off[i] = global_work_offset[i];
+
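+ /* Example: work_dim == 1, global_work_size == {1024} and a NULL
+  * local_work_size yield fixed_global_off = {0,0,0}, fixed_global_sz =
+  * {1024,1,1} and fixed_local_sz = {16,1,1}: unspecified dimensions
+  * default to one work item and the local size to one SIMD16 thread.
+  */
+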
/* Do device specific checks and enqueue the kernel */
err = cl_command_queue_ND_range(command_queue,
kernel,
- work_dim,
- global_work_offset,
- global_work_size,
- local_work_size);
+ fixed_global_off,
+ fixed_global_sz,
+ fixed_local_sz);
error:
return err;
}
/* Allocate local surface needed for SLM and bind it */
- if (local_sz != 0) {
+ if (local && local_sz != 0) {
const size_t sz = 16 * local_sz; /* XXX 16 == maximum barrier number */
assert(k->patch.local_surf.offset % SURFACE_SZ == 0);
index = k->patch.local_surf.offset / SURFACE_SZ;
*local = drm_intel_bo_alloc(bufmgr, "CL local surface", sz, 64);
gpgpu_bind_buf(gpgpu, index, *local, 0, sz, cc_llc_mlc);
}
- else
+ else if (local)
*local = NULL;
/* Allocate private surface and bind it */
- if (k->patch.private_surf.size != 0) {
+ if (priv && k->patch.private_surf.size != 0) {
const size_t sz = max_thread *
k->patch.private_surf.size *
k->patch.exec_env.largest_compiled_simd_sz;
*priv = drm_intel_bo_alloc(bufmgr, "CL private surface", sz, 64);
gpgpu_bind_buf(gpgpu, index, *priv, 0, sz, cc_llc_mlc);
}
- else
+ else if (priv)
*priv = NULL;
/* Allocate scratch surface and bind it */
- if (k->patch.scratch.size != 0) {
+ if (scratch && k->patch.scratch.size != 0) {
const size_t sz = max_thread * /* XXX is it given per lane ??? */
k->patch.scratch.size *
k->patch.exec_env.largest_compiled_simd_sz;
*scratch = drm_intel_bo_alloc(bufmgr, "CL scratch surface", sz, 64);
gpgpu_bind_buf(gpgpu, index, *scratch, 0, sz, cc_llc_mlc);
}
- else
+ else if (scratch)
*scratch = NULL;
/* Now bind a bo used for synchronization */
}
#endif /* USE_FULSIM */
-extern cl_int cl_command_queue_ND_range_gen6(cl_command_queue, cl_kernel, cl_uint, const size_t*, const size_t*, const size_t*);
-extern cl_int cl_command_queue_ND_range_gen7(cl_command_queue, cl_kernel, cl_uint, const size_t *, const size_t *, const size_t *);
+extern cl_int cl_command_queue_ND_range_gen6(cl_command_queue, cl_kernel, const size_t*, const size_t*, const size_t*);
+extern cl_int cl_command_queue_ND_range_gen7(cl_command_queue, cl_kernel, const size_t *, const size_t *, const size_t *);
LOCAL cl_int
cl_command_queue_ND_range(cl_command_queue queue,
- cl_kernel ker,
- cl_uint wk_dim,
+ cl_kernel k,
const size_t *global_wk_off,
const size_t *global_wk_sz,
const size_t *local_wk_sz)
cl_int err = CL_SUCCESS;
if (ver == 6)
- TRY (cl_command_queue_ND_range_gen6, queue,
- ker,
- wk_dim,
- global_wk_off,
- global_wk_sz,
- local_wk_sz);
+ TRY (cl_command_queue_ND_range_gen6, queue, k, global_wk_off, global_wk_sz, local_wk_sz);
else if (ver == 7)
- TRY (cl_command_queue_ND_range_gen7, queue,
- ker,
- wk_dim,
- global_wk_off,
- global_wk_sz,
- local_wk_sz);
+ TRY (cl_command_queue_ND_range_gen7, queue, k, global_wk_off, global_wk_sz, local_wk_sz);
else
FATAL ("Unknown Gen Device");
/* Map ND range kernel from OCL API */
extern cl_int cl_command_queue_ND_range(cl_command_queue queue,
cl_kernel ker,
- cl_uint work_dim,
const size_t *global_work_offset,
const size_t *global_work_size,
const size_t *local_work_size);
} cl_local_id_t;
static INLINE size_t
-cl_ker_compute_batch_sz(cl_kernel k,
- size_t wrk_dim_n,
- size_t wk_grp_n,
- size_t thread_n)
+cl_kernel_compute_batch_sz(cl_kernel k, size_t wk_grp_n, size_t thread_n)
{
size_t sz = 256; /* upper bound of the complete prelude */
size_t media_obj_sz = 6 * 4; /* size of one MEDIA OBJECT */
media_obj_sz += sizeof(cl_inline_header_t); /* header for all threads */
- media_obj_sz += wrk_dim_n * sizeof(cl_local_id_t);/* for each dimension */
+ media_obj_sz += 3 * sizeof(cl_local_id_t); /* for each dimension */
if (k->patch.exec_env.has_barriers)
media_obj_sz += 4 * 4; /* one barrier update per object */
sz += media_obj_sz * wk_grp_n * thread_n;
}
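+/* Illustrative sizing, assuming a 32-byte cl_inline_header_t and 32-byte
+ * cl_local_id_t rows: one MEDIA OBJECT then takes 6*4 + 32 + 3*32 = 152
+ * bytes (168 with a barrier update), so a launch of 64 work groups with
+ * 4 threads each needs roughly 256 + 152*64*4 = 39168 bytes of batch.
+ */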
static INLINE void
-cl_command_queue_enqueue_wk_grp3(cl_command_queue queue,
- cl_local_id_t **ids,
- const cl_inline_header_t *header,
- uint32_t thread_n,
- uint32_t barrierID)
+cl_command_queue_enqueue_wk_grp(cl_command_queue queue,
+ cl_local_id_t **ids,
+ const cl_inline_header_t *header,
+ uint32_t thread_n,
+ uint32_t barrierID)
{
intel_gpgpu_t *gpgpu = queue->gpgpu;
uint32_t i;
}
}
-static INLINE void
-cl_command_queue_enqueue_wk_grp2(cl_command_queue queue,
- cl_local_id_t **ids,
- const cl_inline_header_t *header,
- uint32_t thread_n,
- uint32_t barrierID)
-{
- intel_gpgpu_t *gpgpu = queue->gpgpu;
- uint32_t i;
- for (i = 0; i < thread_n; ++i) {
- const size_t sz = sizeof(cl_inline_header_t) + 2*sizeof(cl_local_id_t);
- char *data = gpgpu_run_with_inline(gpgpu, barrierID, sz);
- size_t offset = 0;
- assert(data);
- *((cl_inline_header_t *) (data + offset)) = *header;
- offset += sizeof(cl_inline_header_t);
- *((cl_local_id_t *) (data + offset)) = ids[0][i];
- offset += sizeof(cl_local_id_t);
- *((cl_local_id_t *) (data + offset)) = ids[1][i];
- }
-}
-
-static INLINE void
-cl_command_queue_enqueue_wk_grp1(cl_command_queue queue,
- cl_local_id_t **ids,
- const cl_inline_header_t *header,
- uint32_t thread_n,
- uint32_t barrierID)
-{
- intel_gpgpu_t *gpgpu = queue->gpgpu;
- uint32_t i;
- for (i = 0; i < thread_n; ++i) {
- const size_t sz = sizeof(cl_inline_header_t) + sizeof(cl_local_id_t);
- char *data = gpgpu_run_with_inline(gpgpu, barrierID, sz);
- size_t offset = 0;
- assert(data);
- *((cl_inline_header_t *) (data + offset)) = *header;
- offset += sizeof(cl_inline_header_t);
- *((cl_local_id_t *) (data + offset)) = ids[0][i];
- }
-}
-
LOCAL cl_int
cl_command_queue_ND_range_gen6(cl_command_queue queue,
cl_kernel ker,
- cl_uint wk_dim,
- const size_t *global_work_offset,
+ const size_t *global_wk_off,
const size_t *global_wk_sz,
const size_t *local_wk_sz)
{
TRY (cl_kernel_check_args, ker);
/* Check that the local work sizes are OK */
- TRY (cl_kernel_work_group_sz, ker, local_wk_sz, wk_dim, &wk_grp_sz);
+ TRY (cl_kernel_work_group_sz, ker, local_wk_sz, 3, &wk_grp_sz);
/* Directly from the user defined values */
header.local_sz[0] = local_wk_sz[0];
header.exec_mask = ~0;
/* offsets are evenly divided by the local sizes */
- if (global_work_offset)
- for (i = 0; i < wk_dim; ++i)
- offset[i] = global_work_offset[i]/local_wk_sz[i];
+ offset[0] = global_wk_off[0] / local_wk_sz[0];
+ offset[1] = global_wk_off[1] / local_wk_sz[1];
+ offset[2] = global_wk_off[2] / local_wk_sz[2];
/* Compute the local size per wg and the offsets for each local buffer */
header.local_mem_sz = cl_kernel_local_memory_sz(ker);
if (cst_sz > 0) {
char *data = NULL;
assert(ker->cst_buffer);
- data = cl_kernel_create_cst_buffer(ker,wk_dim,global_wk_sz,local_wk_sz);
+ data = cl_kernel_create_cst_buffer(ker, global_wk_sz, local_wk_sz);
gpgpu_upload_constants(gpgpu, data, cst_sz);
cl_free(data);
}
wk_grp_n = 1;
- for (i = 0; i < wk_dim; ++i) {
+ for (i = 0; i < 3; ++i) {
TRY_ALLOC (ids[i], (cl_local_id_t*) cl_malloc(wk_grp_sz*sizeof(uint16_t)));
grp_end[i] = offset[i] + global_wk_sz[i] / local_wk_sz[i];
wk_grp_n *= grp_end[i]-offset[i];
}
thread_n = wk_grp_sz / 16;
- batch_sz = cl_ker_compute_batch_sz(ker, wk_dim, wk_grp_n, thread_n);
+ batch_sz = cl_kernel_compute_batch_sz(ker, wk_grp_n, thread_n);
/* Start a new batch buffer */
gpgpu_batch_reset(gpgpu, batch_sz);
* Local IDs are shared from work group to work group. We allocate the
* buffers once and reuse them
*/
- if (wk_dim == 3) {
- curr = 0;
- for (i = 0; i < local_wk_sz[0]; ++i)
- for (j = 0; j < local_wk_sz[1]; ++j)
- for (k = 0; k < local_wk_sz[2]; ++k, ++curr) {
- ((uint16_t*) ids[0])[curr] = i;
- ((uint16_t*) ids[1])[curr] = j;
- ((uint16_t*) ids[2])[curr] = k;
- }
- for (header.grp_n[0] = offset[0]; header.grp_n[0] < grp_end[0]; ++header.grp_n[0])
- for (header.grp_n[1] = offset[1]; header.grp_n[1] < grp_end[1]; ++header.grp_n[1])
- for (header.grp_n[2] = offset[2]; header.grp_n[2] < grp_end[2]; ++header.grp_n[2]) {
- if (ker->patch.exec_env.has_barriers)
- gpgpu_update_barrier(gpgpu, barrierID, thread_n);
- cl_command_queue_enqueue_wk_grp3(queue, ids, &header, thread_n, barrierID);
- barrierID = (barrierID + 1) % 16;
- }
- }
- else if (wk_dim == 2) {
- curr = 0;
- for (i = 0; i < local_wk_sz[0]; ++i)
- for (j = 0; j < local_wk_sz[1]; ++j, ++curr) {
- ((uint16_t*) ids[0])[curr] = i;
- ((uint16_t*) ids[1])[curr] = j;
- }
- for (header.grp_n[0] = offset[0]; header.grp_n[0] < grp_end[0]; ++header.grp_n[0])
- for (header.grp_n[1] = offset[1]; header.grp_n[1] < grp_end[1]; ++header.grp_n[1]) {
- if (ker->patch.exec_env.has_barriers)
- gpgpu_update_barrier(gpgpu, barrierID, thread_n);
- cl_command_queue_enqueue_wk_grp2(queue, ids, &header, thread_n, barrierID);
- barrierID = (barrierID + 1) % 16;
- }
+ curr = 0;
+ for (i = 0; i < local_wk_sz[0]; ++i)
+ for (j = 0; j < local_wk_sz[1]; ++j)
+ for (k = 0; k < local_wk_sz[2]; ++k, ++curr) {
+ ((uint16_t*) ids[0])[curr] = i;
+ ((uint16_t*) ids[1])[curr] = j;
+ ((uint16_t*) ids[2])[curr] = k;
}
- else {
- for (i = 0; i < local_wk_sz[0]; ++i)
- ((uint16_t*) ids[0])[i] = i;
- for (header.grp_n[0] = offset[0]; header.grp_n[0] < grp_end[0]; ++header.grp_n[0]) {
- if (ker->patch.exec_env.has_barriers)
- gpgpu_update_barrier(gpgpu, barrierID, thread_n);
- cl_command_queue_enqueue_wk_grp1(queue, ids, &header, thread_n, barrierID);
- barrierID = (barrierID + 1) % 16;
- }
+ for (header.grp_n[0] = offset[0]; header.grp_n[0] < grp_end[0]; ++header.grp_n[0])
+ for (header.grp_n[1] = offset[1]; header.grp_n[1] < grp_end[1]; ++header.grp_n[1])
+ for (header.grp_n[2] = offset[2]; header.grp_n[2] < grp_end[2]; ++header.grp_n[2]) {
+ if (ker->patch.exec_env.has_barriers)
+ gpgpu_update_barrier(gpgpu, barrierID, thread_n);
+ cl_command_queue_enqueue_wk_grp(queue, ids, &header, thread_n, barrierID);
+ barrierID = (barrierID + 1) % 16;
}
gpgpu_batch_end(gpgpu, 0);
#include <stdio.h>
#include <string.h>
+static INLINE size_t
+cl_kernel_compute_batch_sz(cl_kernel k)
+{
+ size_t sz = 256 + 16; /* upper bound: batch prelude + GPGPU_WALKER */
+ return sz;
+}
+
+static cl_int
+cl_set_local_ids(char *data,
+ const size_t *local_wk_sz,
+ size_t cst_sz,
+ size_t id_offset,
+ size_t thread_n)
+{
+ uint16_t *ids[3] = {NULL,NULL,NULL};
+ size_t i, j, k, curr = 0;
+ cl_int err = CL_SUCCESS;
+
+ for (i = 0; i < 3; ++i)
+ TRY_ALLOC(ids[i], (uint16_t*) cl_calloc(thread_n*16, sizeof(uint16_t)));
+
+ /* Compute the IDs */
+ for (i = 0; i < local_wk_sz[0]; ++i)
+ for (j = 0; j < local_wk_sz[1]; ++j)
+ for (k = 0; k < local_wk_sz[2]; ++k, ++curr) {
+ ids[0][curr] = i;
+ ids[1][curr] = j;
+ ids[2][curr] = k;
+ }
+
+ /* Copy them to the constant buffer */
+ curr = 0;
+ data += id_offset;
+ for (i = 0; i < thread_n; ++i, data += cst_sz) {
+ uint16_t *ids0 = (uint16_t *) (data + 0); /* each row: 16 uint16_t == 32 bytes */
+ uint16_t *ids1 = (uint16_t *) (data + 32);
+ uint16_t *ids2 = (uint16_t *) (data + 64);
+ for (j = 0; j < 16; ++j, ++curr) { /* SIMD16 */
+ ids0[j] = ids[0][curr];
+ ids1[j] = ids[1][curr];
+ ids2[j] = ids[2][curr];
+ }
+ }
+
+error:
+ for (i = 0; i < 3; ++i)
+ cl_free(ids[i]);
+ return err;
+}
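+
+/* cl_set_local_ids appends, for each SIMD16 thread, three 32-byte rows
+ * holding the x, y and z local IDs of its 16 work items, right after the
+ * replicated user constants (at id_offset within each cst_sz slot).
+ */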
+
LOCAL cl_int
cl_command_queue_ND_range_gen7(cl_command_queue queue,
- cl_kernel ker,
- cl_uint wk_dim,
- const size_t *global_work_offset,
- const size_t *global_wk_sz,
- const size_t *local_wk_sz)
+ cl_kernel ker,
+ const size_t *global_wk_off,
+ const size_t *global_wk_sz,
+ const size_t *local_wk_sz)
{
-#if 0
cl_context ctx = queue->ctx;
intel_gpgpu_t *gpgpu = queue->gpgpu;
drm_intel_bo *private_bo = NULL, *scratch_bo = NULL;
+ char *user = NULL; /* User defined constants first */
+ char *data = NULL; /* Complete constant buffer to upload */
genx_gpgpu_kernel_t kernel;
- const size_t cst_sz = ker->patch.curbe.sz;
-#endif
+ const size_t local_mem_sz = cl_kernel_local_memory_sz(ker);
+ size_t local_sz, batch_sz, cst_sz = ker->patch.curbe.sz;
+ size_t i, thread_n, id_offset;
+ cl_int err = CL_SUCCESS;
+
+ /* Setup kernel */
+ kernel.name = "OCL kernel";
+ kernel.grf_blocks = 128;
+ kernel.bin = NULL;
+ kernel.size = 0;
+ kernel.bo = ker->bo;
+ kernel.barrierID = 0;
+
+ /* All arguments must have been set */
+ TRY (cl_kernel_check_args, ker);
+
+ /* Check that the local work sizes are OK */
+ TRY (cl_kernel_work_group_sz, ker, local_wk_sz, 3, &local_sz);
+ thread_n = local_sz / 16; /* SIMD16 only */
+
+ /* Fill the constant buffer. Basically, we have to build one set of
+ * constants for each thread. The constants also include the local IDs we
+ * append after all the other regular values (function parameters...)
+ */
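+ /* Illustrative per-thread layout, assuming 64 bytes of user constants
+  * (so id_offset == 64 after alignment):
+  *   bytes [0, 64)    user curbe values
+  *   bytes [64, 96)   local IDs x (16 uint16_t, one per SIMD16 lane)
+  *   bytes [96, 128)  local IDs y
+  *   bytes [128, 160) local IDs z
+  */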
+ if (cst_sz > 0) {
+ assert(ker->cst_buffer);
+ user = cl_kernel_create_cst_buffer(ker, global_wk_sz, local_wk_sz);
+ }
+ id_offset = cst_sz = ALIGN(cst_sz, 32); /* Align the user data on 32 bytes */
+ kernel.cst_sz = cst_sz += 3 * 32; /* Add the local IDs (3 rows of 16 words) */
+ TRY_ALLOC (data, (char*) cl_calloc(thread_n, cst_sz));
+ if (user != NULL) /* Copy only the user constants, not the enlarged slot */
+   for (i = 0; i < thread_n; ++i)
+     memcpy(data + cst_sz * i, user, ker->patch.curbe.sz);
+ TRY (cl_set_local_ids, data, local_wk_sz, cst_sz, id_offset, thread_n);
+
+ /* Initialize the GPGPU state and bind the surfaces */
+ gpgpu_state_init(gpgpu, ctx->device->max_compute_unit, 4, 64, cst_sz / 32, 64);
+ if (queue->last_batch != NULL)
+ drm_intel_bo_unreference(queue->last_batch);
+ queue->last_batch = NULL;
+ cl_command_queue_bind_surface(queue, ker, NULL, &private_bo, &scratch_bo, local_mem_sz);
+ gpgpu_states_setup(gpgpu, &kernel, 1);
+
+ /* We always have constants with Gen7 (the local IDs at least are uploaded) */
+ gpgpu_upload_constants(gpgpu, data, thread_n*cst_sz);
+
+ /* Start a new batch buffer */
+ batch_sz = cl_kernel_compute_batch_sz(ker);
+ gpgpu_batch_reset(gpgpu, batch_sz);
+ gpgpu_batch_start(gpgpu);
+
+ /* Issue the GPGPU_WALKER command */
+ gpgpu_walker(gpgpu, thread_n, global_wk_off, global_wk_sz, local_wk_sz);
+
+ /* Close the batch buffer and submit it */
+ gpgpu_batch_end(gpgpu, 0);
+ gpgpu_flush(gpgpu);
- return CL_SUCCESS;
+error:
+ /* Release all temporary buffers */
+ if (private_bo)
+ drm_intel_bo_unreference(private_bo);
+ if (scratch_bo)
+ drm_intel_bo_unreference(scratch_bo);
+ cl_free(data);
+ cl_free(user);
+ return err;
}
}
LOCAL char*
-cl_kernel_create_cst_buffer(cl_kernel k,
- cl_uint wk_dim,
- const size_t *global_wk_sz,
- const size_t *local_wk_sz)
+cl_kernel_create_cst_buffer(cl_kernel k, const size_t *global_wk_sz, const size_t *local_wk_sz)
{
cl_curbe_patch_info_t *info = NULL;
const size_t sz = k->patch.curbe.sz;
LOCAL cl_int
cl_kernel_work_group_sz(cl_kernel ker,
const size_t *local_wk_sz,
- cl_uint wk_dim,
+ uint32_t wk_dim,
size_t *wk_grp_sz)
{
cl_int err = CL_SUCCESS;
+ size_t sz = 0; /* so the error path does not read an uninitialized value */
cl_uint i;
for (i = 0; i < wk_dim; ++i)
err = CL_INVALID_WORK_ITEM_SIZE;
goto error;
}
- *wk_grp_sz = local_wk_sz[0];
+ sz = local_wk_sz[0];
for (i = 1; i < wk_dim; ++i)
- *wk_grp_sz *= local_wk_sz[i];
- FATAL_IF (*wk_grp_sz % 16, "Work group size must be a multiple of 16");
- if (*wk_grp_sz > ker->program->ctx->device->max_work_group_size) {
+ sz *= local_wk_sz[i];
+ FATAL_IF (sz % 16, "Work group size must be a multiple of 16");
+ if (sz > ker->program->ctx->device->max_work_group_size) {
err = CL_INVALID_WORK_ITEM_SIZE;
goto error;
}
error:
+ if (wk_grp_sz)
+ *wk_grp_sz = sz;
return err;
}
/* Allocate, fill and return the CURBE */
extern char*
-cl_kernel_create_cst_buffer(cl_kernel k,
- cl_uint wk_dim,
- const size_t *global_wk_sz,
- const size_t *local_wk_sz);
+cl_kernel_create_cst_buffer(cl_kernel k, const size_t *global_wk_sz, const size_t *local_wk_sz);
/* Compute and check the work group size from the user provided local size */
extern cl_int
#define CMD_STATE_PREFETCH CMD(0, 0, 3)
#define CMD_MEDIA_GATEWAY_STATE CMD(2, 0, 3)
#define CMD_MEDIA_STATE_FLUSH CMD(2, 0, 4)
+#define CMD_GPGPU_WALKER CMD(2, 1, 5)
#define CMD_STATE_BASE_ADDRESS CMD(0, 1, 1)
#define CMD_STATE_SIP CMD(0, 1, 2)
#define CMD_SAMPLER_PALETTE_LOAD CMD(3, 1, 2)
#define CMD_MEDIA_STATE_POINTERS CMD(2, 0, 0)
-#define CMD_MEDIA CMD(2, 1, 0)
-#define CMD_MEDIA_EX CMD(2, 1, 1)
+#define CMD_MEDIA CMD(2, 1, 0)
+#define CMD_MEDIA_EX CMD(2, 1, 1)
#define CMD_PIPELINED_POINTERS CMD(3, 0, 0)
#define CMD_BINDING_TABLE_POINTERS CMD(3, 0, 1)
BEGIN_BATCH(state->batch, 8);
OUT_BATCH(state->batch, CMD_MEDIA_STATE_POINTERS | (8-2));
- struct gen6_vfe_state_inline* vfe = (struct gen6_vfe_state_inline*)
+ gen6_vfe_state_inline_t* vfe = (gen6_vfe_state_inline_t*)
intel_batchbuffer_alloc_space(state->batch,0);
- memset(vfe, 0, sizeof(struct gen6_vfe_state_inline));
+ memset(vfe, 0, sizeof(gen6_vfe_state_inline_t));
vfe->vfe1.fast_preempt = 1;
+ vfe->vfe1.gpgpu_mode = state->drv->gen_ver > 6 ? 1 : 0;
vfe->vfe1.bypass_gateway_ctl = 1;
vfe->vfe1.reset_gateway_timer = 1;
vfe->vfe1.urb_entries = state->urb.num_vfe_entries;
/* vfe->vfe3.curbe_size = 63; */
/* vfe->vfe3.urbe_size = 13; */
vfe->vfe4.scoreboard_enable = 1;
- intel_batchbuffer_alloc_space(state->batch, sizeof(struct gen6_vfe_state_inline));
+ intel_batchbuffer_alloc_space(state->batch, sizeof(gen6_vfe_state_inline_t));
ADVANCE_BATCH(state->batch);
}
return (char*) intel_batchbuffer_alloc_space(state->batch,sz);
}
+LOCAL void
+gpgpu_walker(intel_gpgpu_t *state,
+ uint32_t thread_n,
+ const size_t global_wk_off[3],
+ const size_t global_wk_sz[3],
+ const size_t local_wk_sz[3])
+{
+ BEGIN_BATCH(state->batch, 11);
+ OUT_BATCH(state->batch, CMD_GPGPU_WALKER | (11 - 2));
+ OUT_BATCH(state->batch, 0); /* kernel index */
+ OUT_BATCH(state->batch, (1 << 30) | (thread_n-1)); /* SIMD16 | thread max */
+ OUT_BATCH(state->batch, global_wk_off[0]);
+ OUT_BATCH(state->batch, global_wk_sz[0]-1);
+ OUT_BATCH(state->batch, global_wk_off[1]);
+ OUT_BATCH(state->batch, global_wk_sz[1]-1);
+ OUT_BATCH(state->batch, global_wk_off[2]);
+ OUT_BATCH(state->batch, global_wk_sz[2]-1);
+ OUT_BATCH(state->batch, ~0x0); /* right execution mask */
+ OUT_BATCH(state->batch, ~0x0); /* bottom execution mask */
+ ADVANCE_BATCH(state->batch);
+}
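+
+/* Unlike the Gen6 path, which loops over work groups emitting one MEDIA
+ * OBJECT each, a single GPGPU_WALKER spawns every thread group; the two
+ * trailing ~0x0 dwords leave the right and bottom execution masks fully
+ * enabled.
+ */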
+
LOCAL int32_t
intel_gpgpu_version(intel_gpgpu_t *gpgpu)
{
*/
extern char* gpgpu_run_with_inline(intel_gpgpu_t*, int32_t ki, size_t sz);
+/* Spawns all thread groups through a single GPGPU_WALKER command */
+extern void
+gpgpu_walker(intel_gpgpu_t *state,
+ uint32_t thread_n,
+ const size_t global_wk_off[3],
+ const size_t global_wk_sz[3],
+ const size_t local_wk_sz[3]);
+
#endif /* __GENX_GPGPU_H__ */