const size_t sz = max_thread *
k->patch.private_surf.size *
k->patch.exec_env.largest_compiled_simd_sz;
- assert(k->patch.exec_env.largest_compiled_simd_sz == 16);
+ // assert(k->patch.exec_env.largest_compiled_simd_sz == 16);
assert(k->patch.private_surf.offset % SURFACE_SZ == 0);
index = k->patch.private_surf.offset / SURFACE_SZ;
assert(index != MAX_SURFACES - 1);
const size_t sz = max_thread * /* XXX is it given per lane ??? */
k->patch.scratch.size *
k->patch.exec_env.largest_compiled_simd_sz;
- assert(k->patch.exec_env.largest_compiled_simd_sz == 16);
+ // assert(k->patch.exec_env.largest_compiled_simd_sz == 16);
assert(k->patch.scratch.offset % SURFACE_SZ == 0);
assert(index != MAX_SURFACES - 1);
index = k->patch.scratch.offset / SURFACE_SZ;
static cl_int
cl_set_local_ids(char *data,
const size_t *local_wk_sz,
+ size_t simd_sz,
size_t cst_sz,
size_t id_offset,
size_t thread_n)
cl_int err = CL_SUCCESS;
for (i = 0; i < 3; ++i)
- TRY_ALLOC(ids[i], (uint16_t*) cl_calloc(sizeof(uint16_t), thread_n*16));
+ TRY_ALLOC(ids[i], (uint16_t*) cl_calloc(sizeof(uint16_t), thread_n*simd_sz));
/* Compute the IDs */
for (k = 0; k < local_wk_sz[2]; ++k)
curr = 0;
data += id_offset;
for (i = 0; i < thread_n; ++i, data += cst_sz) {
- uint16_t *ids0 = (uint16_t *) (data + 0);
- uint16_t *ids1 = (uint16_t *) (data + 32);
- uint16_t *ids2 = (uint16_t *) (data + 64);
- for (j = 0; j < 16; ++j, ++curr) {
+ /* The compiler uses one GRF per local ID (8 x 32 bits == 16 x 16 bits) */
+ uint16_t *ids0 = (uint16_t *) (data + 0);
+ uint16_t *ids1 = (uint16_t *) (data + 1*16*sizeof(uint16_t));
+ uint16_t *ids2 = (uint16_t *) (data + 2*16*sizeof(uint16_t));
+ for (j = 0; j < simd_sz; ++j, ++curr) {
ids0[j] = ids[0][curr];
ids1[j] = ids[1][curr];
ids2[j] = ids[2][curr];
char *curbe = NULL; /* Does not include per-thread local IDs */
char *final_curbe = NULL; /* Includes them */
genx_gpgpu_kernel_t kernel;
+ const size_t simd_sz = ker->patch.exec_env.largest_compiled_simd_sz;
size_t local_sz, batch_sz, cst_sz = ker->patch.curbe.sz;
size_t i, thread_n, id_offset;
cl_int err = CL_SUCCESS;
/* Check that the local work sizes are OK */
TRY (cl_kernel_work_group_sz, ker, local_wk_sz, 3, &local_sz);
- kernel.thread_n = thread_n = local_sz / 16; /* SIMD16 only */
+ kernel.thread_n = thread_n = local_sz / simd_sz;
/* CURBE step 1. Allocate and fill fields shared by threads in workgroup */
if (cst_sz > 0) {
TRY_ALLOC (final_curbe, (char*) cl_calloc(thread_n, cst_sz));
for (i = 0; i < thread_n; ++i)
memcpy(final_curbe + cst_sz * i, curbe, cst_sz);
- TRY (cl_set_local_ids, final_curbe, local_wk_sz, cst_sz, id_offset, thread_n);
+ TRY (cl_set_local_ids, final_curbe, local_wk_sz, simd_sz, cst_sz, id_offset, thread_n);
gpgpu_upload_constants(gpgpu, final_curbe, thread_n*cst_sz);
/* Start a new batch buffer */
gpgpu_batch_start(gpgpu);
/* Issue the GPGPU_WALKER command */
- gpgpu_walker(gpgpu, thread_n, global_wk_off, global_wk_sz, local_wk_sz);
+ gpgpu_walker(gpgpu, simd_sz, thread_n, global_wk_off, global_wk_sz, local_wk_sz);
/* Close the batch buffer and submit it */
gpgpu_batch_end(gpgpu, 0);
#include <stdint.h>
#include <assert.h>
+/* Do not include the full dependency */
+struct intel_driver;
+/* Get the command buffer interface */
+extern struct _drm_intel_bufmgr* intel_driver_get_buf(struct intel_driver*);
+/* Get the Gen HW version */
+extern uint32_t intel_driver_get_ver(struct intel_driver*);
+
static cl_int
cl_context_properties_is_ok(const cl_context_properties *properties)
{
TRY_ALLOC_NO_ERR (ctx->intel_drv, cl_intel_driver_new());
ctx->magic = CL_MAGIC_CONTEXT_HEADER;
ctx->ref_n = 1;
+ ctx->ver = intel_driver_get_ver(ctx->intel_drv);
pthread_mutex_init(&ctx->program_lock, NULL);
pthread_mutex_init(&ctx->queue_lock, NULL);
pthread_mutex_init(&ctx->buffer_lock, NULL);
goto exit;
}
-struct intel_driver;
-extern struct _drm_intel_bufmgr* intel_driver_get_buf(struct intel_driver*);
-
struct _drm_intel_bufmgr*
cl_context_get_intel_bufmgr(cl_context ctx)
{
pthread_mutex_t program_lock; /* To allocate and deallocate programs */
pthread_mutex_t buffer_lock; /* To allocate and deallocate buffers */
pthread_mutex_t sampler_lock; /* To allocate and deallocate samplers */
+ uint32_t ver; /* Gen version */
};
/* Implement OpenCL function */
64));
drm_intel_bo_subdata(k->bo, 0, k->kernel_heap_sz, k->kernel_heap);
- /* We have some restrictions on the compiled binary */
- FATAL_IF (k->patch.exec_env.largest_compiled_simd_sz != 16, "Unsupported SIMD size");
- FATAL_IF (k->patch.exec_env.compiled_simd16 == 0, "Unsupported SIMD size");
+ /* We have some restrictions on the compiled binary, depending on the Gen version */
+ FATAL_IF (k->program->ctx->ver == 6 &&
+ k->patch.exec_env.largest_compiled_simd_sz != 16, "Unsupported SIMD size");
+ FATAL_IF (k->program->ctx->ver == 6 &&
+ k->patch.exec_env.compiled_simd16 == 0, "Unsupported SIMD size");
+ FATAL_IF (k->program->ctx->ver > 6 &&
+ k->patch.exec_env.largest_compiled_simd_sz == 32, "Unsupported SIMD size");
error:
return err;
};
/* Size of the surface state as encoded in the binary blob */
-#define SURFACE_SZ 32
+#define SURFACE_SZ 64
/* Allocate an empty kernel */
extern cl_kernel cl_kernel_new(void);
return drv->bufmgr;
}
+/* Return the Gen HW generation number cached on the driver (see gen_ver) */
+LOCAL uint32_t
+intel_driver_get_ver(struct intel_driver *drv)
+{
+  return drv->gen_ver;
+}
+
LOCAL int
cl_intel_get_device_id(void)
{
LOCAL void
gpgpu_walker(intel_gpgpu_t *state,
+ uint32_t simd_sz,
uint32_t thread_n,
const size_t global_wk_off[3],
const size_t global_wk_sz[3],
global_wk_sz[1] / local_wk_sz[1],
global_wk_sz[2] / local_wk_sz[2]
};
-
+ assert(simd_sz == 8 || simd_sz == 16);
BEGIN_BATCH(state->batch, 11);
OUT_BATCH(state->batch, CMD_GPGPU_WALKER | 9);
OUT_BATCH(state->batch, 0); /* kernel index == 0 */
- OUT_BATCH(state->batch, (1 << 30) | (thread_n-1)); /* SIMD16 | thread max */
+ if (simd_sz == 16)
+ OUT_BATCH(state->batch, (1 << 30) | (thread_n-1)); /* SIMD16 | thread max */
+ else
+ OUT_BATCH(state->batch, (0 << 30) | (thread_n-1)); /* SIMD8 | thread max */
OUT_BATCH(state->batch, global_wk_off[0]);
OUT_BATCH(state->batch, global_wk_dim[0]);
OUT_BATCH(state->batch, global_wk_off[1]);
/* Will spawn all threads */
extern void
gpgpu_walker(intel_gpgpu_t *state,
+ uint32_t simd_sz,
uint32_t thread_n,
const size_t global_wk_off[3],
const size_t global_wk_sz[3],