cl_context.c
cl_command_queue.c
cl_command_queue_gen7.c
- cl_buffer.c
- cl_buffer.cpp
- sim/sim_buffer.c
+ cl_driver.c
+ cl_driver.cpp
+ sim/sim_driver.c
intel/intel_gpgpu.c
intel/intel_batchbuffer.c
intel/intel_driver.c
#include "cl_mem.h"
#include "cl_utils.h"
#include "cl_alloc.h"
-#include "intel_bufmgr.h"
-#include "intel/intel_gpgpu.h"
+#include "cl_driver.h"
#include <assert.h>
#include <stdio.h>
queue->magic = CL_MAGIC_QUEUE_HEADER;
queue->ref_n = 1;
queue->ctx = ctx;
- TRY_ALLOC_NO_ERR (queue->gpgpu,
- intel_gpgpu_new((struct intel_driver*) ctx->intel_drv));
+ TRY_ALLOC_NO_ERR (queue->gpgpu, cl_gpgpu_new(ctx->drv));
/* Append the command queue in the list */
pthread_mutex_lock(&ctx->queue_lock);
}
cl_mem_delete(queue->perf);
cl_context_delete(queue->ctx);
- intel_gpgpu_delete(queue->gpgpu);
+ cl_gpgpu_delete(queue->gpgpu);
queue->magic = CL_MAGIC_DEAD_HEADER; /* For safety */
cl_free(queue);
}
atomic_inc(&queue->ref_n);
}
- LOCAL cl_int
+LOCAL cl_int
cl_command_queue_bind_surface(cl_command_queue queue,
cl_kernel k,
char *curbe,
- drm_intel_bo **local,
- drm_intel_bo **priv,
- drm_intel_bo **scratch,
+ cl_buffer **local,
+ cl_buffer **priv,
+ cl_buffer **scratch,
uint32_t local_sz)
{
cl_context ctx = queue->ctx;
- intel_gpgpu_t *gpgpu = queue->gpgpu;
- drm_intel_bufmgr *bufmgr = cl_context_get_intel_bufmgr(ctx);
- drm_intel_bo *sync_bo = NULL;
+ cl_gpgpu *gpgpu = queue->gpgpu;
+ cl_buffer_mgr *bufmgr = cl_context_get_bufmgr(ctx);
+ cl_buffer *sync_bo = NULL;
cl_int err = CL_SUCCESS;
#if 0
cl_context ctx = queue->ctx;
intel_gpgpu_t *gpgpu = queue->gpgpu;
drm_intel_bufmgr *bufmgr = cl_context_get_intel_bufmgr(ctx);
cl_mem mem = NULL;
- drm_intel_bo *bo = NULL, *sync_bo = NULL;
+ cl_buffer *bo = NULL, *sync_bo = NULL;
const size_t max_thread = ctx->device->max_compute_unit;
cl_int err = CL_SUCCESS;
uint32_t i, index;
assert(k->patch.local_surf.offset % SURFACE_SZ == 0);
index = k->patch.local_surf.offset / SURFACE_SZ;
assert(index != MAX_SURFACES - 1);
- *local = drm_intel_bo_alloc(bufmgr, "CL local surface", sz, 64);
+ *local = cl_buffer_alloc(bufmgr, "CL local surface", sz, 64);
gpgpu_bind_buf(gpgpu, index, *local, cc_llc_l3);
}
else if (local)
assert(k->patch.private_surf.offset % SURFACE_SZ == 0);
index = k->patch.private_surf.offset / SURFACE_SZ;
assert(index != MAX_SURFACES - 1);
- *priv = drm_intel_bo_alloc(bufmgr, "CL private surface", sz, 64);
+ *priv = cl_buffer_alloc(bufmgr, "CL private surface", sz, 64);
gpgpu_bind_buf(gpgpu, index, *priv, cc_llc_l3);
}
else if(priv)
assert(k->patch.scratch.offset % SURFACE_SZ == 0);
assert(index != MAX_SURFACES - 1);
index = k->patch.scratch.offset / SURFACE_SZ;
- *scratch = drm_intel_bo_alloc(bufmgr, "CL scratch surface", sz, 64);
+ *scratch = cl_buffer_alloc(bufmgr, "CL scratch surface", sz, 64);
gpgpu_bind_buf(gpgpu, index, *scratch, cc_llc_l3);
}
else if (scratch)
*scratch = NULL;
#endif
/* Now bind a bo used for synchronization */
- sync_bo = drm_intel_bo_alloc(bufmgr, "sync surface", 64, 64);
- gpgpu_bind_buf(gpgpu, MAX_SURFACES-1, sync_bo, cc_llc_l3);
+ sync_bo = cl_buffer_alloc(bufmgr, "sync surface", 64, 64);
+ cl_gpgpu_bind_buf(gpgpu, MAX_SURFACES-1, sync_bo, cc_llc_l3);
if (queue->last_batch != NULL)
- drm_intel_bo_unreference(queue->last_batch);
+ cl_buffer_unreference(queue->last_batch);
queue->last_batch = sync_bo;
// error:
}
#if USE_FULSIM
-extern void drm_intel_bufmgr_gem_stop_aubfile(drm_intel_bufmgr*);
-extern void drm_intel_bufmgr_gem_set_aubfile(drm_intel_bufmgr*, FILE*);
-extern void aub_exec_dump_raw_file(drm_intel_bo*, size_t offset, size_t sz);
+extern void drm_intel_bufmgr_gem_stop_aubfile(cl_buffer_mgr*);
+extern void drm_intel_bufmgr_gem_set_aubfile(cl_buffer_mgr*, FILE*);
+extern void aub_exec_dump_raw_file(cl_buffer*, size_t offset, size_t sz);
static void
cl_run_fulsim(void)
const size_t *global_wk_sz,
const size_t *local_wk_sz)
{
- intel_gpgpu_t *gpgpu = queue->gpgpu;
- const int32_t ver = intel_gpgpu_version(gpgpu);
+ const int32_t ver = cl_driver_get_ver(queue->ctx->drv);
cl_int err = CL_SUCCESS;
#if USE_FULSIM
- drm_intel_bufmgr *bufmgr = NULL;
+ cl_buffer_mgr *bufmgr = NULL;
FILE *file = fopen("dump.aub", "wb");
FATAL_IF (file == NULL, "Unable to open file dump.aub");
- bufmgr = cl_context_get_intel_bufmgr(queue->ctx);
+ bufmgr = cl_context_get_bufmgr(queue->ctx);
drm_intel_bufmgr_gem_set_aubfile(bufmgr, file);
#endif /* USE_FULSIM */
{
if (queue->last_batch == NULL)
return CL_SUCCESS;
- drm_intel_bo_wait_rendering(queue->last_batch);
- drm_intel_bo_unreference(queue->last_batch);
+ cl_buffer_wait_rendering(queue->last_batch);
+ cl_buffer_unreference(queue->last_batch);
queue->last_batch = NULL;
return CL_SUCCESS;
}
#define __CL_COMMAND_QUEUE_H__
#include "cl_internals.h"
+#include "cl_driver.h"
#include "CL/cl.h"
#include <stdint.h>
/* Basically, this is a (kind-of) batch buffer */
struct _cl_command_queue {
- uint64_t magic; /* To identify it as a command queue */
- volatile int ref_n; /* We reference count this object */
- cl_context ctx; /* Its parent context */
- cl_command_queue prev, next; /* We chain the command queues together */
- struct intel_gpgpu *gpgpu; /* Setup all GEN commands */
- cl_mem perf; /* Where to put the perf counters */
- cl_mem fulsim_out; /* Fulsim will output this buffer */
- struct _drm_intel_bo *last_batch;/* To synchronize using clFinish */
+ uint64_t magic; /* To identify it as a command queue */
+ volatile int ref_n; /* We reference count this object */
+ cl_context ctx; /* Its parent context */
+ cl_command_queue prev, next; /* We chain the command queues together */
+ cl_gpgpu *gpgpu; /* Setup all GEN commands */
+ cl_mem perf; /* Where to put the perf counters */
+ cl_mem fulsim_out; /* Fulsim will output this buffer */
+ cl_buffer *last_batch; /* To synchronize using clFinish */
};
/* Allocate and initialize a new command queue. Also insert it in the list of
extern cl_int cl_command_queue_bind_surface(cl_command_queue queue,
cl_kernel k,
char *curbe,
- struct _drm_intel_bo **local,
- struct _drm_intel_bo **priv,
- struct _drm_intel_bo **scratch,
+ cl_buffer **local,
+ cl_buffer **priv,
+ cl_buffer **scratch,
uint32_t local_sz);
#endif /* __CL_COMMAND_QUEUE_H__ */
#include "cl_mem.h"
#include "cl_utils.h"
#include "cl_alloc.h"
-#include "intel_bufmgr.h"
-#include "intel/intel_gpgpu.h"
#include <assert.h>
#include <stdio.h>
const size_t *local_wk_sz)
{
cl_context ctx = queue->ctx;
- intel_gpgpu_t *gpgpu = queue->gpgpu;
+ cl_gpgpu *gpgpu = queue->gpgpu;
char *curbe = NULL; /* Does not include per-thread local IDs */
char *final_curbe = NULL; /* Includes them */
- drm_intel_bo *private_bo = NULL, *scratch_bo = NULL;
- genx_gpgpu_kernel_t kernel;
+ cl_buffer *private_bo = NULL, *scratch_bo = NULL;
+ cl_gpgpu_kernel_t kernel;
const uint32_t simd_sz = cl_kernel_get_simd_width(ker);
size_t i, batch_sz = 0u, local_sz = 0u, thread_n = 0u, id_offset = 0u, cst_sz = 0u;
cl_int err = CL_SUCCESS;
kernel.cst_sz = cst_sz += 3 * 32; /* Add local IDs (16 words) */
/* Setup the kernel */
- gpgpu_state_init(gpgpu, ctx->device->max_compute_unit, cst_sz / 32);
+ cl_gpgpu_state_init(gpgpu, ctx->device->max_compute_unit, cst_sz / 32);
if (queue->last_batch != NULL)
- drm_intel_bo_unreference(queue->last_batch);
+ cl_buffer_unreference(queue->last_batch);
queue->last_batch = NULL;
cl_command_queue_bind_surface(queue, ker, curbe, NULL, &private_bo, &scratch_bo, 0);
- gpgpu_states_setup(gpgpu, &kernel, 1);
+ cl_gpgpu_states_setup(gpgpu, &kernel, 1);
/* CURBE step 2. Give the localID and upload it to video memory */
TRY_ALLOC (final_curbe, (char*) alloca(thread_n * cst_sz));
for (i = 0; i < thread_n; ++i)
memcpy(final_curbe + cst_sz * i, curbe, cst_sz - 3*32);
TRY (cl_set_local_ids, final_curbe, local_wk_sz, simd_sz, cst_sz, id_offset, thread_n);
- gpgpu_upload_constants(gpgpu, final_curbe, thread_n*cst_sz);
+ cl_gpgpu_upload_constants(gpgpu, final_curbe, thread_n*cst_sz);
/* Start a new batch buffer */
batch_sz = cl_kernel_compute_batch_sz(ker);
- gpgpu_batch_reset(gpgpu, batch_sz);
- gpgpu_batch_start(gpgpu);
+ cl_gpgpu_batch_reset(gpgpu, batch_sz);
+ cl_gpgpu_batch_start(gpgpu);
/* Issue the GPGPU_WALKER command */
- gpgpu_walker(gpgpu, simd_sz, thread_n, global_wk_off, global_wk_sz, local_wk_sz);
+ cl_gpgpu_walker(gpgpu, simd_sz, thread_n, global_wk_off, global_wk_sz, local_wk_sz);
/* Close the batch buffer and submit it */
- gpgpu_batch_end(gpgpu, 0);
- gpgpu_flush(gpgpu);
-
-#if 0
- cl_context ctx = queue->ctx;
- intel_gpgpu_t *gpgpu = queue->gpgpu;
- drm_intel_bo *private_bo = NULL, *scratch_bo = NULL;
- char *curbe = NULL; /* Does not include per-thread local IDs */
- char *final_curbe = NULL; /* Includes them */
- genx_gpgpu_kernel_t kernel;
- //const size_t simd_sz = ker->patch.exec_env.largest_compiled_simd_sz;
- const size_t simd_sz = 16;
- size_t local_sz, batch_sz, cst_sz = ker->patch.curbe.sz;
- size_t i, thread_n, id_offset;
- cl_int err = CL_SUCCESS;
-
- /* Setup kernel */
- kernel.name = "OCL kernel";
- kernel.grf_blocks = 128;
- kernel.bin = ker->kernel_heap; // _PLASMA ; NULL
- kernel.size = ker->kernel_heap_sz; // _PLASMA ; 0
- kernel.bo = ker->bo;
- kernel.barrierID = 0;
- kernel.use_barrier = ker->patch.exec_env.has_barriers;
- kernel.slm_sz = cl_kernel_local_memory_sz(ker);
-
- /* All arguments must have been set */
- TRY (cl_kernel_check_args, ker);
-
- /* Check that the local work sizes are OK */
- TRY (cl_kernel_work_group_sz, ker, local_wk_sz, 3, &local_sz);
- //kernel.thread_n = thread_n = local_sz / simd_sz;
- kernel.thread_n = thread_n = local_sz / simd_sz;
-
- /* CURBE step 1. Allocate and fill fields shared by threads in workgroup */
- if (cst_sz > 0) {
- assert(ker->cst_buffer);
- curbe = cl_kernel_create_cst_buffer(ker,
- global_wk_off,
- global_wk_sz,
- local_wk_sz,
- 3,
- thread_n);
- }
- id_offset = cst_sz = ALIGN(cst_sz, 32); /* Align the user data on 32 bytes */
- kernel.cst_sz = cst_sz += 3 * 32; /* Add local IDs (16 words) */
-
- /* Setup the kernel */
- gpgpu_state_init(gpgpu, ctx->device->max_compute_unit, cst_sz / 32);
- if (queue->last_batch != NULL)
- drm_intel_bo_unreference(queue->last_batch);
- queue->last_batch = NULL;
- cl_command_queue_bind_surface(queue, ker, curbe, NULL, &private_bo, &scratch_bo, 0);
- gpgpu_states_setup(gpgpu, &kernel, 1);
-
- /* CURBE step 2. Give the localID and upload it to video memory */
- TRY_ALLOC (final_curbe, (char*) cl_calloc(thread_n, cst_sz));
- for (i = 0; i < thread_n; ++i)
- memcpy(final_curbe + cst_sz * i, curbe, cst_sz);
- TRY (cl_set_local_ids, final_curbe, local_wk_sz, simd_sz, cst_sz, id_offset, thread_n);
- gpgpu_upload_constants(gpgpu, final_curbe, thread_n*cst_sz);
-
- /* Start a new batch buffer */
- batch_sz = cl_kernel_compute_batch_sz(ker);
- gpgpu_batch_reset(gpgpu, batch_sz);
- gpgpu_batch_start(gpgpu);
-
- /* Issue the GPGPU_WALKER command */
- gpgpu_walker(gpgpu, simd_sz, thread_n, global_wk_off, global_wk_sz, local_wk_sz);
-
- /* Close the batch buffer and submit it */
- gpgpu_batch_end(gpgpu, 0);
- gpgpu_flush(gpgpu);
-
-error:
- /* Release all temporary buffers */
- if (private_bo) drm_intel_bo_unreference(private_bo);
- if (scratch_bo) drm_intel_bo_unreference(scratch_bo);
- cl_free(final_curbe);
- cl_free(curbe);
- return err;
-#endif
+ cl_gpgpu_batch_end(gpgpu, 0);
+ cl_gpgpu_flush(gpgpu);
error:
return err;
#include "cl_mem.h"
#include "cl_alloc.h"
#include "cl_utils.h"
+#include "cl_driver.h"
#include "CL/cl.h"
#include <stdint.h>
#include <assert.h>
-/* Do not include the full dependency */
-struct intel_driver;
-/* Get the command buffer interface */
-extern struct _drm_intel_bufmgr* intel_driver_get_bufmgr(struct intel_driver*);
-/* Get the Gen HW version */
-extern uint32_t intel_driver_get_ver(struct intel_driver*);
-
static cl_int
cl_context_properties_is_ok(const cl_context_properties *properties)
{
cl_context ctx = NULL;
TRY_ALLOC_NO_ERR (ctx, CALLOC(struct _cl_context));
- TRY_ALLOC_NO_ERR (ctx->intel_drv, cl_intel_driver_new());
+ TRY_ALLOC_NO_ERR (ctx->drv, cl_driver_new());
ctx->magic = CL_MAGIC_CONTEXT_HEADER;
ctx->ref_n = 1;
- ctx->ver = intel_driver_get_ver(ctx->intel_drv);
+ ctx->ver = cl_driver_get_ver(ctx->drv);
pthread_mutex_init(&ctx->program_lock, NULL);
pthread_mutex_init(&ctx->queue_lock, NULL);
pthread_mutex_init(&ctx->buffer_lock, NULL);
assert(ctx->queues == NULL);
assert(ctx->programs == NULL);
assert(ctx->buffers == NULL);
- assert(ctx->intel_drv);
- cl_intel_driver_delete(ctx->intel_drv);
+ assert(ctx->drv);
+ cl_driver_delete(ctx->drv);
ctx->magic = CL_MAGIC_DEAD_HEADER; /* For safety */
cl_free(ctx);
}
goto exit;
}
-struct _drm_intel_bufmgr*
-cl_context_get_intel_bufmgr(cl_context ctx)
+cl_buffer_mgr*
+cl_context_get_bufmgr(cl_context ctx)
{
- return intel_driver_get_bufmgr((struct intel_driver*) ctx->intel_drv);
+ return cl_driver_get_bufmgr(ctx->drv);
}
#define __CL_CONTEXT_H__
#include "cl_internals.h"
+#include "cl_driver.h"
#include "CL/cl.h"
#include <stdint.h>
struct _cl_context {
uint64_t magic; /* To identify it as a context */
volatile int ref_n; /* We reference count this object */
- struct intel_driver *intel_drv; /* Handles the real HW */
+ cl_driver *drv; /* Handles HW or simulator */
cl_device_id device; /* All information about the GPU device */
cl_command_queue queues; /* All command queues currently allocated */
cl_program programs; /* All programs currently allocated */
cl_command_queue_properties,
cl_int*);
-/* Use for all GPU buffers */
-extern struct _drm_intel_bufmgr* cl_context_get_intel_bufmgr(cl_context);
-
/* Enqueue a ND Range kernel */
extern cl_int cl_context_ND_kernel(cl_context,
cl_command_queue,
const size_t*);
/* Used for allocation */
-extern struct _drm_intel_bufmgr*
-cl_context_get_intel_bufmgr(cl_context ctx);
+extern cl_buffer_mgr* cl_context_get_bufmgr(cl_context ctx);
#endif /* __CL_CONTEXT_H__ */
* Author: Benjamin Segovia <benjamin.segovia@intel.com>
*/
-#include "cl_buffer.h"
+#include "cl_driver.h"
+#include "cl_utils.h"
#include <stdlib.h>
-cl_buffer_alloc_cb *cl_buffer_alloc = NULL;
-cl_buffer_unreference_cb *cl_buffer_unreference = NULL;
-cl_buffer_map_cb *cl_buffer_map = NULL;
-cl_buffer_unmap_cb *cl_buffer_unmap = NULL;
-cl_buffer_pin_cb *cl_buffer_pin = NULL;
-cl_buffer_unpin_cb *cl_buffer_unpin = NULL;
-cl_buffer_subdata_cb *cl_buffer_subdata = NULL;
-cl_buffer_emit_reloc_cb *cl_buffer_emit_reloc = NULL;
-cl_driver_get_bufmgr_cb *cl_driver_get_bufmgr = NULL;
-cl_driver_get_ver_cb *cl_driver_get_ver = NULL;
+/* Driver */
+LOCAL cl_driver_new_cb *cl_driver_new = NULL;
+LOCAL cl_driver_delete_cb *cl_driver_delete = NULL;
+LOCAL cl_driver_get_bufmgr_cb *cl_driver_get_bufmgr = NULL;
+
+/* Buffer */
+LOCAL cl_driver_get_ver_cb *cl_driver_get_ver = NULL;
+LOCAL cl_buffer_alloc_cb *cl_buffer_alloc = NULL;
+LOCAL cl_buffer_reference_cb *cl_buffer_reference = NULL;
+LOCAL cl_buffer_unreference_cb *cl_buffer_unreference = NULL;
+LOCAL cl_buffer_map_cb *cl_buffer_map = NULL;
+LOCAL cl_buffer_unmap_cb *cl_buffer_unmap = NULL;
+LOCAL cl_buffer_get_virtual_cb *cl_buffer_get_virtual = NULL;
+LOCAL cl_buffer_pin_cb *cl_buffer_pin = NULL;
+LOCAL cl_buffer_unpin_cb *cl_buffer_unpin = NULL;
+LOCAL cl_buffer_subdata_cb *cl_buffer_subdata = NULL;
+LOCAL cl_buffer_emit_reloc_cb *cl_buffer_emit_reloc = NULL;
+LOCAL cl_buffer_wait_rendering_cb *cl_buffer_wait_rendering = NULL;
+
+/* GPGPU */
+LOCAL cl_gpgpu_new_cb *cl_gpgpu_new = NULL;
+LOCAL cl_gpgpu_delete_cb *cl_gpgpu_delete = NULL;
+LOCAL cl_gpgpu_bind_buf_cb *cl_gpgpu_bind_buf = NULL;
+LOCAL cl_gpgpu_bind_image2D_cb *cl_gpgpu_bind_image2D = NULL;
+LOCAL cl_gpgpu_state_init_cb *cl_gpgpu_state_init = NULL;
+LOCAL cl_gpgpu_set_perf_counters_cb *cl_gpgpu_set_perf_counters = NULL;
+LOCAL cl_gpgpu_upload_constants_cb *cl_gpgpu_upload_constants = NULL;
+LOCAL cl_gpgpu_states_setup_cb *cl_gpgpu_states_setup = NULL;
+LOCAL cl_gpgpu_update_barrier_cb *cl_gpgpu_update_barrier = NULL;
+LOCAL cl_gpgpu_upload_samplers_cb *cl_gpgpu_upload_samplers = NULL;
+LOCAL cl_gpgpu_batch_reset_cb *cl_gpgpu_batch_reset = NULL;
+LOCAL cl_gpgpu_batch_start_cb *cl_gpgpu_batch_start = NULL;
+LOCAL cl_gpgpu_batch_end_cb *cl_gpgpu_batch_end = NULL;
+LOCAL cl_gpgpu_flush_cb *cl_gpgpu_flush = NULL;
+LOCAL cl_gpgpu_walker_cb *cl_gpgpu_walker = NULL;
*/
extern "C" {
-#include "sim/sim_buffer.h"
-#include "intel/intel_buffer.h"
+#include "sim/sim_driver.h"
+#include "intel/intel_driver.h"
#include "cl_utils.h"
#include <stdlib.h>
#include <string.h>
#define __CL_BUFFER_H__
#include <stdint.h>
+#include <stdlib.h>
-/* Hide behind some call backs the buffer allocation / deallocation ... This
+/* XXX needed for previous driver */
+#define MAX_SURFACES 128
+#define MAX_SAMPLERS 16
+
+/**************************************************************************
+ * cl_driver:
+ * Hide behind some call backs the buffer allocation / deallocation ... This
* will allow us to make the use of a software performance simulator easier and
* to minimize the code specific for the HW and for the simulator
- */
+ **************************************************************************/
/* Encapsulates command buffer / data buffer / kernels */
typedef struct cl_buffer cl_buffer;
/* Encapsulates the driver backend functionalities */
typedef struct cl_driver cl_driver;
+/* Encapsulates the gpgpu stream of commands */
+typedef struct cl_gpgpu cl_gpgpu;
+
+/**************************************************************************
+ * Driver
+ **************************************************************************/
/* Create a new driver */
typedef cl_driver* (cl_driver_new_cb)(void);
-extern cl_driver_new_cb cl_driver_new;
+extern cl_driver_new_cb *cl_driver_new;
/* Delete the driver */
-typedef cl_driver* (cl_driver_delete_cb)(void);
-extern cl_driver_delete_cb cl_driver_delete;
+typedef void (cl_driver_delete_cb)(cl_driver*);
+extern cl_driver_delete_cb *cl_driver_delete;
/* Get the buffer manager from the driver */
typedef cl_buffer_mgr* (cl_driver_get_bufmgr_cb)(cl_driver*);
typedef uint32_t (cl_driver_get_ver_cb)(cl_driver*);
extern cl_driver_get_ver_cb *cl_driver_get_ver;
+/**************************************************************************
+ * GPGPU command streamer
+ **************************************************************************/
+/* Describe texture tiling */
+typedef enum cl_gpgpu_tiling {
+ GPGPU_NO_TILE = 0,
+ GPGPU_TILE_X = 1,
+ GPGPU_TILE_Y = 2,
+} cl_gpgpu_tiling_t;
+
+/* Cache control options */
+enum gen6_cache_control {
+ cc_gtt = 0x0,
+ cc_l3 = 0x1,
+ cc_llc = 0x2,
+ cc_llc_l3 = 0x3
+};
+
+
+/* Use this structure to bind kernels in the gpgpu state */
+typedef struct cl_gpgpu_kernel {
+ const char *name; /* kernel name and bo name */
+ uint32_t grf_blocks; /* register blocks kernel wants (in 8 reg blocks) */
+ uint32_t cst_sz; /* total size of all constants */
+ cl_buffer *bo; /* kernel code in the proper addr space */
+ int32_t barrierID; /* barrierID for _this_ kernel */
+ uint32_t use_barrier:1; /* For gen7 (automatic barrier management) */
+ uint32_t thread_n:15; /* For gen7 (automatic barrier management) */
+ uint32_t slm_sz:16; /* For gen7 (automatic SLM allocation) */
+} cl_gpgpu_kernel_t;
+
+/* Create a new gpgpu state */
+typedef cl_gpgpu* (cl_gpgpu_new_cb)(cl_driver*);
+extern cl_gpgpu_new_cb *cl_gpgpu_new;
+
+/* Delete the gpgpu state */
+typedef void (cl_gpgpu_delete_cb)(cl_gpgpu*);
+extern cl_gpgpu_delete_cb *cl_gpgpu_delete;
+
+/* Bind a regular unformatted buffer */
+typedef void (cl_gpgpu_bind_buf_cb)(cl_gpgpu*, int32_t index, cl_buffer *, uint32_t cchint);
+extern cl_gpgpu_bind_buf_cb *cl_gpgpu_bind_buf;
+
+/* Set a 2d texture */
+typedef void (cl_gpgpu_bind_image2D_cb)(cl_gpgpu *state,
+ int32_t index,
+ struct cl_buffer* obj_bo,
+ uint32_t format,
+ int32_t w,
+ int32_t h,
+ int pitch,
+ cl_gpgpu_tiling_t tiling);
+extern cl_gpgpu_bind_image2D_cb *cl_gpgpu_bind_image2D;
+
+/* Configure internal state */
+typedef void (cl_gpgpu_state_init_cb)(cl_gpgpu*, uint32_t max_threads, uint32_t size_cs_entry);
+extern cl_gpgpu_state_init_cb *cl_gpgpu_state_init;
+
+/* Set the buffer object where to report performance counters */
+typedef void (cl_gpgpu_set_perf_counters_cb)(cl_gpgpu*, struct cl_buffer *perf);
+extern cl_gpgpu_set_perf_counters_cb *cl_gpgpu_set_perf_counters;
+
+/* Fills current constant buffer with data */
+typedef void (cl_gpgpu_upload_constants_cb)(cl_gpgpu*, const void* data, uint32_t size);
+extern cl_gpgpu_upload_constants_cb *cl_gpgpu_upload_constants;
+
+/* Setup all indirect states */
+typedef void (cl_gpgpu_states_setup_cb)(cl_gpgpu*, cl_gpgpu_kernel_t* kernel, uint32_t ker_n);
+extern cl_gpgpu_states_setup_cb *cl_gpgpu_states_setup;
+
+/* Make HW threads use barrierID */
+typedef void (cl_gpgpu_update_barrier_cb)(cl_gpgpu*, uint32_t barrierID, uint32_t thread_n);
+extern cl_gpgpu_update_barrier_cb *cl_gpgpu_update_barrier;
+
+/* Upload the constant samplers as specified inside the OCL kernel */
+typedef void (cl_gpgpu_upload_samplers_cb)(cl_gpgpu *state, const void *data, uint32_t n);
+extern cl_gpgpu_upload_samplers_cb *cl_gpgpu_upload_samplers;
+
+/* Set a sampler */
+typedef void (cl_gpgpu_set_sampler_cb)(cl_gpgpu*, uint32_t index, uint32_t non_normalized);
+extern cl_gpgpu_set_sampler_cb *cl_gpgpu_set_sampler;
+
+/* Allocate the batch buffer and return the BO used for the batch buffer */
+typedef void (cl_gpgpu_batch_reset_cb)(cl_gpgpu*, size_t sz);
+extern cl_gpgpu_batch_reset_cb *cl_gpgpu_batch_reset;
+
+/* Atomic begin, pipeline select, urb, pipeline state and constant buffer */
+typedef void (cl_gpgpu_batch_start_cb)(cl_gpgpu*);
+extern cl_gpgpu_batch_start_cb *cl_gpgpu_batch_start;
+
+/* atomic end with possibly inserted flush */
+typedef void (cl_gpgpu_batch_end_cb)(cl_gpgpu*, int32_t flush_mode);
+extern cl_gpgpu_batch_end_cb *cl_gpgpu_batch_end;
+
+/* Flush the command buffer */
+typedef void (cl_gpgpu_flush_cb)(cl_gpgpu *state);
+extern cl_gpgpu_flush_cb *cl_gpgpu_flush;
+
+/* Will spawn all threads */
+typedef void (cl_gpgpu_walker_cb)(cl_gpgpu *state,
+ uint32_t simd_sz,
+ uint32_t thread_n,
+ const size_t global_wk_off[3],
+ const size_t global_wk_sz[3],
+ const size_t local_wk_sz[3]);
+extern cl_gpgpu_walker_cb *cl_gpgpu_walker;
+
+/**************************************************************************
+ * Buffer
+ **************************************************************************/
/* Allocate a buffer */
typedef cl_buffer* (cl_buffer_alloc_cb)(cl_buffer_mgr*, const char*, unsigned long, unsigned long);
extern cl_buffer_alloc_cb *cl_buffer_alloc;
typedef void (cl_buffer_unreference_cb)(cl_buffer*);
extern cl_buffer_unreference_cb *cl_buffer_unreference;
+/* Add one more ref on a buffer */
+typedef void (cl_buffer_reference_cb)(cl_buffer*);
+extern cl_buffer_reference_cb *cl_buffer_reference;
+
/* Map a buffer */
-typedef void* (cl_buffer_map_cb)(cl_buffer*);
+typedef int (cl_buffer_map_cb)(cl_buffer*, uint32_t write_enable);
extern cl_buffer_map_cb *cl_buffer_map;
/* Unmap a buffer */
-typedef void* (cl_buffer_unmap_cb)(cl_buffer*);
+typedef int (cl_buffer_unmap_cb)(cl_buffer*);
extern cl_buffer_unmap_cb *cl_buffer_unmap;
+/* Get the virtual address (when mapped) */
+typedef void* (cl_buffer_get_virtual_cb)(cl_buffer*);
+extern cl_buffer_get_virtual_cb *cl_buffer_get_virtual;
+
/* Pin a buffer */
-typedef int (cl_buffer_pin_cb)(cl_buffer*);
+typedef int (cl_buffer_pin_cb)(cl_buffer*, uint32_t alignment);
extern cl_buffer_pin_cb *cl_buffer_pin;
/* Unpin a buffer */
typedef int (cl_buffer_emit_reloc_cb) (cl_buffer *, uint32_t, cl_buffer*, uint32_t, uint32_t, uint32_t);
extern cl_buffer_emit_reloc_cb *cl_buffer_emit_reloc;
+/* Wait for all pending rendering for this buffer to complete */
+typedef void (cl_buffer_wait_rendering_cb) (cl_buffer *);
+extern cl_buffer_wait_rendering_cb *cl_buffer_wait_rendering;
+
#endif /* __CL_BUFFER_H__ */
#include "cl_image.h"
#include "cl_utils.h"
-#ifdef _PLASMA
-#include "plasma/intel_defines.h"
-#else
#include "intel/intel_defines.h"
-#endif
#include <assert.h>
#include "cl_alloc.h"
#include "cl_utils.h"
#include "CL/cl.h"
-#include "intel_bufmgr.h"
-#include "intel/intel_gpgpu.h"
#include "gen/program.h"
#include <stdio.h>
if (atomic_dec(&k->ref_n) > 1) return;
/* Release one reference on all bos we own */
- if (k->bo) drm_intel_bo_unreference(k->bo);
- if (k->const_bo) drm_intel_bo_unreference(k->const_bo);
+ if (k->bo) cl_buffer_unreference(k->bo);
+ if (k->const_bo) cl_buffer_unreference(k->const_bo);
/* This will be true for kernels created by clCreateKernel */
if (k->ref_its_program) cl_program_delete(k->program);
cl_kernel_setup(cl_kernel k, const struct GenKernel *gen_kernel)
{
cl_context ctx = k->program->ctx;
- drm_intel_bufmgr *bufmgr = cl_context_get_intel_bufmgr(ctx);
+ cl_buffer_mgr *bufmgr = cl_context_get_bufmgr(ctx);
/* Allocate the gen code here */
const uint32_t code_sz = GenKernelGetCodeSize(gen_kernel);
const char *code = GenKernelGetCode(gen_kernel);
- k->bo = drm_intel_bo_alloc(bufmgr, "CL kernel", code_sz, 64u);
+ k->bo = cl_buffer_alloc(bufmgr, "CL kernel", code_sz, 64u);
/* Upload the code */
- drm_intel_bo_subdata(k->bo, 0, code_sz, code);
+ cl_buffer_subdata(k->bo, 0, code_sz, code);
k->gen_kernel = gen_kernel;
}
to->program = from->program;
/* Retain the bos */
- if (from->bo) drm_intel_bo_reference(from->bo);
- if (from->const_bo) drm_intel_bo_reference(from->const_bo);
+ if (from->bo) cl_buffer_reference(from->bo);
+ if (from->const_bo) cl_buffer_reference(from->const_bo);
/* We retain the program destruction since this kernel (user allocated)
* depends on the program for some of its pointers
#include "cl_defs.h"
#include "cl_internals.h"
+#include "cl_driver.h"
#include "CL/cl.h"
#include <stdint.h>
struct _cl_kernel {
uint64_t magic; /* To identify it as a kernel */
volatile int ref_n; /* We reference count this object */
- struct _drm_intel_bo *bo; /* The code itself */
- struct _drm_intel_bo *const_bo; /* Buffer for all __constants values in the OCL program */
+ struct cl_buffer *bo; /* The code itself */
+ struct cl_buffer *const_bo; /* Buffer for all __constants values in the OCL program */
cl_program program; /* Owns this structure (and pointers) */
const struct GenKernel *gen_kernel; /* (Opaque) compiler structure for the OCL kernel */
uint8_t ref_its_program; /* True only for the user kernel (those created by clCreateKernel) */
#include "cl_utils.h"
#include "cl_alloc.h"
#include "cl_device_id.h"
-
-#include "intel/intel_driver.h"
-#include "intel/intel_gpgpu.h"
-#include "intel_bufmgr.h" /* libdrm_intel */
-#include "cl_buffer.h"
+#include "cl_driver.h"
#include "CL/cl.h"
#include "CL/cl_intel.h"
cl_int is_tiled,
cl_int *errcode)
{
- drm_intel_bufmgr *bufmgr = NULL;
+ cl_buffer_mgr *bufmgr = NULL;
cl_mem mem = NULL;
cl_int err = CL_SUCCESS;
size_t alignment = 64;
alignment = 4096;
/* Allocate space in memory */
- bufmgr = cl_context_get_intel_bufmgr(ctx);
+ bufmgr = cl_context_get_bufmgr(ctx);
assert(bufmgr);
- mem->bo = drm_intel_bo_alloc(bufmgr, "CL memory object", sz, alignment);
+ mem->bo = cl_buffer_alloc(bufmgr, "CL memory object", sz, alignment);
if (UNLIKELY(mem->bo == NULL)) {
err = CL_MEM_ALLOCATION_FAILURE;
goto error;
/* Copy the data if required */
if (flags & CL_MEM_COPY_HOST_PTR) /* TODO check other flags too */
- drm_intel_bo_subdata(mem->bo, 0, sz, data);
+ cl_buffer_subdata(mem->bo, 0, sz, data);
exit:
if (errcode_ret)
{
size_t x, y, p;
char *dst;
- drm_intel_bo_map(mem->bo, 1);
- dst = drm_intel_bo_get_virtual(mem->bo);
+ cl_buffer_map(mem->bo, 1);
+ dst = cl_buffer_get_virtual(mem->bo);
for (y = 0; y < h; ++y) {
char *src = (char*) data + pitch * y;
for (x = 0; x < w; ++x) {
src += bpp;
}
}
- drm_intel_bo_unmap(mem->bo);
+ cl_buffer_unmap(mem->bo);
}
static const uint32_t tile_sz = 4096; /* 4KB per tile */
char *img = NULL;
char *end = (char*) data + pitch * h;
- drm_intel_bo_map(mem->bo, 1);
- img = drm_intel_bo_get_virtual(mem->bo);
+ cl_buffer_map(mem->bo, 1);
+ img = cl_buffer_get_virtual(mem->bo);
for (tiley = 0; tiley < tiley_n; ++tiley)
for (tilex = 0; tilex < tilex_n; ++tilex) {
char *tile = img + (tilex + tiley * tilex_n) * tile_sz;
}
}
}
- drm_intel_bo_unmap(mem->bo);
+ cl_buffer_unmap(mem->bo);
}
static void
char *img = NULL;
char *end = (char*) data + pitch * h;
- drm_intel_bo_map(mem->bo, 1);
- img = drm_intel_bo_get_virtual(mem->bo);
+ cl_buffer_map(mem->bo, 1);
+ img = cl_buffer_get_virtual(mem->bo);
for (tiley = 0; tiley < tiley_n; ++tiley)
for (tilex = 0; tilex < tilex_n; ++tilex) {
char *tile = img + (tiley * tilex_n + tilex) * tile_sz;
}
}
}
- drm_intel_bo_unmap(mem->bo);
+ cl_buffer_unmap(mem->bo);
}
LOCAL cl_mem
#undef DO_IMAGE_ERROR
/* Pick up tiling mode (we do only linear on SNB) */
- if (ctx->intel_drv->gen_ver != 6)
+ if (cl_driver_get_ver(ctx->drv) != 6)
tiling = CL_TILE_Y;
/* Tiling requires to align both pitch and height */
if (atomic_dec(&mem->ref_n) > 1)
return;
if (LIKELY(mem->bo != NULL))
- drm_intel_bo_unreference(mem->bo);
+ cl_buffer_unreference(mem->bo);
/* Remove it from the list */
assert(mem->ctx);
LOCAL void*
cl_mem_map(cl_mem mem)
{
- drm_intel_bo_map(mem->bo, 1);
- assert(drm_intel_bo_get_virtual(mem->bo));
- return drm_intel_bo_get_virtual(mem->bo);
+ cl_buffer_map(mem->bo, 1);
+ assert(cl_buffer_get_virtual(mem->bo));
+ return cl_buffer_get_virtual(mem->bo);
}
LOCAL cl_int
cl_mem_unmap(cl_mem mem)
{
- drm_intel_bo_unmap(mem->bo);
+ cl_buffer_unmap(mem->bo);
return CL_SUCCESS;
}
assert(mem);
if (UNLIKELY((mem->flags & CL_MEM_PINNABLE) == 0))
return CL_INVALID_MEM;
- drm_intel_bo_pin(mem->bo, 4096);
+ cl_buffer_pin(mem->bo, 4096);
return CL_SUCCESS;
}
assert(mem);
if (UNLIKELY((mem->flags & CL_MEM_PINNABLE) == 0))
return CL_INVALID_MEM;
- drm_intel_bo_unpin(mem->bo);
+ cl_buffer_unpin(mem->bo);
return CL_SUCCESS;
}
#define __CL_MEM_H__
#include "cl_internals.h"
+#include "cl_driver.h"
#include "CL/cl.h"
-/* Store the object in video memory */
-struct _drm_intel_bo;
-
typedef enum cl_image_tiling {
CL_NO_TILE = 0,
CL_TILE_X = 1,
struct _cl_mem {
uint64_t magic; /* To identify it as a memory object */
volatile int ref_n; /* This object is reference counted */
- struct _drm_intel_bo *bo; /* Data in GPU memory */
+ cl_buffer *bo; /* Data in GPU memory */
cl_mem prev, next; /* We chain the memory buffers together */
cl_context ctx; /* Context it belongs to */
cl_mem_flags flags; /* Flags specified at the creation time */
/* Add one more reference to this object */
extern void cl_mem_add_ref(cl_mem);
-/* Directly map a memory object (just use drm_intel_bo_map) */
+/* Directly map a memory object */
extern void *cl_mem_map(cl_mem);
-/* Unmap a memory object (just use drm_intel_bo_unmap) */
+/* Unmap a memory object */
extern cl_int cl_mem_unmap(cl_mem);
/* Pin/unpin the buffer in memory (you must be root) */
*/
#include "intel_driver.h"
+#include "intel_gpgpu.h"
#include "intel_batchbuffer.h"
#include "x11/dricommon.h"
#include "cl_utils.h"
#include "cl_alloc.h"
+#include "cl_driver.h"
#include "cl_genx_driver.h"
#define SET_BLOCKED_SIGSET(DRIVER) do { \
RESTORE_BLOCKED_SIGSET(DRIVER); \
} while (0)
-
LOCAL intel_driver_t*
intel_driver_new(void)
{
return drv->gen_ver;
}
+LOCAL uint32_t drm_intel_bo_get_size(drm_intel_bo *bo) { return bo->size; }
+LOCAL void* drm_intel_bo_get_virtual(drm_intel_bo *bo) { return bo->virtual; }
+
LOCAL void
intel_setup_callbacks(void)
{
+ cl_driver_new = (cl_driver_new_cb *) cl_intel_driver_new;
+ cl_driver_delete = (cl_driver_delete_cb *) cl_intel_driver_delete;
+ cl_driver_get_ver = (cl_driver_get_ver_cb *) intel_driver_get_ver;
+ cl_driver_get_bufmgr = (cl_driver_get_bufmgr_cb *) intel_driver_get_bufmgr;
cl_buffer_alloc = (cl_buffer_alloc_cb *) drm_intel_bo_alloc;
+ cl_buffer_reference = (cl_buffer_reference_cb *) drm_intel_bo_reference;
cl_buffer_unreference = (cl_buffer_unreference_cb *) drm_intel_bo_unreference;
cl_buffer_map = (cl_buffer_map_cb *) drm_intel_bo_map;
cl_buffer_unmap = (cl_buffer_unmap_cb *) drm_intel_bo_unmap;
+ cl_buffer_get_virtual = (cl_buffer_get_virtual_cb *) drm_intel_bo_get_virtual;
cl_buffer_pin = (cl_buffer_pin_cb *) drm_intel_bo_pin;
cl_buffer_unpin = (cl_buffer_unpin_cb *) drm_intel_bo_unpin;
cl_buffer_subdata = (cl_buffer_subdata_cb *) drm_intel_bo_subdata;
cl_buffer_emit_reloc = (cl_buffer_emit_reloc_cb *) drm_intel_bo_emit_reloc;
- cl_driver_get_bufmgr = (cl_driver_get_bufmgr_cb *) intel_driver_get_bufmgr;
- cl_driver_get_ver = (cl_driver_get_ver_cb *) intel_driver_get_ver;
+ cl_buffer_wait_rendering = (cl_buffer_wait_rendering_cb *) drm_intel_bo_wait_rendering;
+ intel_set_gpgpu_callbacks();
}
{
intel_driver_t *drv;
intel_batchbuffer_t *batch;
- genx_gpgpu_kernel_t *ker;
+ cl_gpgpu_kernel_t *ker;
struct {
dri_bo *bo;
uint32_t max_threads; /* max threads requested by the user */
};
-LOCAL intel_gpgpu_t*
-intel_gpgpu_new(intel_driver_t *drv)
-{
- intel_gpgpu_t *state = NULL;
+typedef struct intel_gpgpu intel_gpgpu_t;
- TRY_ALLOC_NO_ERR (state, CALLOC(intel_gpgpu_t));
- state->drv = drv;
- state->batch = intel_batchbuffer_new(state->drv);
- assert(state->batch);
- intel_batchbuffer_init(state->batch, state->drv);
-
-exit:
- return state;
-error:
- intel_gpgpu_delete(state);
- state = NULL;
- goto exit;
-}
-
-LOCAL void
+static void
intel_gpgpu_delete(intel_gpgpu_t *state)
{
if (state == NULL)
cl_free(state);
}
+static intel_gpgpu_t*
+intel_gpgpu_new(intel_driver_t *drv)
+{
+ intel_gpgpu_t *state = NULL;
+
+ TRY_ALLOC_NO_ERR (state, CALLOC(intel_gpgpu_t));
+ state->drv = drv;
+ state->batch = intel_batchbuffer_new(state->drv);
+ assert(state->batch);
+ intel_batchbuffer_init(state->batch, state->drv);
+
+exit:
+ return state;
+error:
+ intel_gpgpu_delete(state);
+ state = NULL;
+ goto exit;
+}
+
static void
-gpgpu_select_pipeline(intel_gpgpu_t *state)
+intel_gpgpu_select_pipeline(intel_gpgpu_t *state)
{
BEGIN_BATCH(state->batch, 1);
OUT_BATCH(state->batch, CMD_PIPELINE_SELECT | PIPELINE_SELECT_MEDIA);
}
static void
-gpgpu_set_base_address(intel_gpgpu_t *state)
+intel_gpgpu_set_base_address(intel_gpgpu_t *state)
{
const uint32_t def_cc = cc_llc_l3; /* default Cache Control value */
BEGIN_BATCH(state->batch, 10);
}
static void
-gpgpu_load_vfe_state(intel_gpgpu_t *state)
+intel_gpgpu_load_vfe_state(intel_gpgpu_t *state)
{
BEGIN_BATCH(state->batch, 8);
OUT_BATCH(state->batch, CMD_MEDIA_STATE_POINTERS | (8-2));
intel_batchbuffer_alloc_space(state->batch,0);
memset(vfe, 0, sizeof(struct gen6_vfe_state_inline));
- vfe->vfe1.gpgpu_mode =
- (state->drv->gen_ver == 7 || state->drv->gen_ver == 75) ? 1 : 0;
+ vfe->vfe1.gpgpu_mode = 1;
vfe->vfe1.bypass_gateway_ctl = 1;
vfe->vfe1.reset_gateway_timer = 1;
vfe->vfe1.max_threads = state->max_threads - 1;
vfe->vfe1.urb_entries = 64;
- if (state->drv->gen_ver >= 7) {
- vfe->vfe3.curbe_size = 480;
- vfe->vfe4.scoreboard_mask = 0;
- } else {
- vfe->vfe3.curbe_size = 63;
- vfe->vfe3.urb_size = 13;
- vfe->vfe4.scoreboard_mask = 0x80000000;
- }
-
- //M
- //p
+ vfe->vfe3.curbe_size = 480;
+ vfe->vfe4.scoreboard_mask = 0;
//vfe->vfe3.urb_size = 13;
//vfe->vfe4.scoreboard_mask = (state->drv->gen_ver == 7 || state->drv->gen_ver == 75) ? 0 : 0x80000000;
intel_batchbuffer_alloc_space(state->batch, sizeof(gen6_vfe_state_inline_t));
}
static void
-gpgpu_load_constant_buffer(intel_gpgpu_t *state)
+intel_gpgpu_load_constant_buffer(intel_gpgpu_t *state)
{
BEGIN_BATCH(state->batch, 4);
OUT_BATCH(state->batch, CMD(2,0,1) | (4 - 2)); /* length-2 */
}
static void
-gpgpu_load_idrt(intel_gpgpu_t *state)
+intel_gpgpu_load_idrt(intel_gpgpu_t *state)
{
BEGIN_BATCH(state->batch, 4);
OUT_BATCH(state->batch, CMD(2,0,2) | (4 - 2)); /* length-2 */
};
static void
-gpgpu_pipe_control(intel_gpgpu_t *state)
+intel_gpgpu_pipe_control(intel_gpgpu_t *state)
{
BEGIN_BATCH(state->batch, SIZEOF32(gen6_pipe_control_t));
gen6_pipe_control_t* pc = (gen6_pipe_control_t*)
ADVANCE_BATCH(state->batch);
}
-LOCAL void
+static void
intel_gpgpu_set_L3(intel_gpgpu_t *state, uint32_t use_barrier)
{
BEGIN_BATCH(state->batch, 6);
OUT_BATCH(state->batch, gpgpu_l3_config_reg2[4]);
ADVANCE_BATCH(state->batch);
- gpgpu_pipe_control(state);
+ intel_gpgpu_pipe_control(state);
}
-LOCAL void
-gpgpu_batch_start(intel_gpgpu_t *state)
+static void
+intel_gpgpu_batch_start(intel_gpgpu_t *state)
{
intel_batchbuffer_start_atomic(state->batch, 256);
- gpgpu_pipe_control(state);
+ intel_gpgpu_pipe_control(state);
if (state->drv->gen_ver == 7 || state->drv->gen_ver == 75)
intel_gpgpu_set_L3(state, state->ker->use_barrier);
- gpgpu_select_pipeline(state);
- gpgpu_set_base_address(state);
- gpgpu_load_vfe_state(state);
- gpgpu_load_constant_buffer(state);
- gpgpu_load_idrt(state);
+ intel_gpgpu_select_pipeline(state);
+ intel_gpgpu_set_base_address(state);
+ intel_gpgpu_load_vfe_state(state);
+ intel_gpgpu_load_constant_buffer(state);
+ intel_gpgpu_load_idrt(state);
if (state->perf_b.bo) {
BEGIN_BATCH(state->batch, 3);
}
}
-LOCAL void
-gpgpu_batch_end(intel_gpgpu_t *state, int32_t flush_mode)
+static void
+intel_gpgpu_batch_end(intel_gpgpu_t *state, int32_t flush_mode)
{
/* Insert the performance counter command */
if (state->perf_b.bo) {
ADVANCE_BATCH(state->batch);
}
- if(flush_mode) gpgpu_pipe_control(state);
+ if(flush_mode) intel_gpgpu_pipe_control(state);
intel_batchbuffer_end_atomic(state->batch);
}
-LOCAL void
-gpgpu_batch_reset(intel_gpgpu_t *state, size_t sz)
+static void
+intel_gpgpu_batch_reset(intel_gpgpu_t *state, size_t sz)
{
intel_batchbuffer_reset(state->batch, sz);
}
-LOCAL void
-gpgpu_flush(intel_gpgpu_t *state)
+static void
+intel_gpgpu_flush(intel_gpgpu_t *state)
{
intel_batchbuffer_flush(state->batch);
}
-LOCAL void
-gpgpu_state_init(intel_gpgpu_t *state,
- uint32_t max_threads,
- uint32_t size_cs_entry)
+static void
+intel_gpgpu_state_init(intel_gpgpu_t *state,
+ uint32_t max_threads,
+ uint32_t size_cs_entry)
{
dri_bo *bo;
}
static void
-gpgpu_set_buf_reloc_gen7(intel_gpgpu_t *state, int32_t index, dri_bo* obj_bo)
+intel_gpgpu_set_buf_reloc_gen7(intel_gpgpu_t *state, int32_t index, dri_bo* obj_bo)
{
surface_heap_t *heap = state->surface_heap_b.bo->virtual;
heap->binding_table[index] = offsetof(surface_heap_t, surface) +
}
static void
-gpgpu_bind_buf_gen7(intel_gpgpu_t *state,
- int32_t index,
- dri_bo* obj_bo,
- uint32_t size,
- uint32_t cchint)
+intel_gpgpu_bind_buf_gen7(intel_gpgpu_t *state,
+ int32_t index,
+ dri_bo* obj_bo,
+ uint32_t size,
+ uint32_t cchint)
{
surface_heap_t *heap = state->surface_heap_b.bo->virtual;
gen7_surface_state_t *ss = (gen7_surface_state_t *) heap->surface[index];
ss->ss2.height = (size_ss & 0x1fff80) >> 7; /* bits 20:7 of size_ss */
ss->ss3.depth = (size_ss & 0xffe00000) >> 20; /* bits 27:21 of size_ss */
ss->ss5.cache_control = cc_llc_l3;
- gpgpu_set_buf_reloc_gen7(state, index, obj_bo);
+ intel_gpgpu_set_buf_reloc_gen7(state, index, obj_bo);
}
static void
-gpgpu_bind_image2D_gen7(intel_gpgpu_t *state,
- int32_t index,
- dri_bo* obj_bo,
- uint32_t format,
- int32_t w,
- int32_t h,
- int32_t pitch,
- int32_t tiling)
+intel_gpgpu_bind_image2D_gen7(intel_gpgpu_t *state,
+ int32_t index,
+ dri_bo* obj_bo,
+ uint32_t format,
+ int32_t w,
+ int32_t h,
+ int32_t pitch,
+ int32_t tiling)
{
surface_heap_t *heap = state->surface_heap_b.bo->virtual;
gen7_surface_state_t *ss = (gen7_surface_state_t *) heap->surface[index];
ss->ss0.tiled_surface = 1;
ss->ss0.tile_walk = I965_TILEWALK_YMAJOR;
}
- gpgpu_set_buf_reloc_gen7(state, index, obj_bo);
+ intel_gpgpu_set_buf_reloc_gen7(state, index, obj_bo);
}
-LOCAL void
-gpgpu_bind_buf(intel_gpgpu_t *state,
- int32_t index,
- dri_bo* obj_bo,
- uint32_t cchint)
+static void
+intel_gpgpu_bind_buf(intel_gpgpu_t *state,
+ int32_t index,
+ drm_intel_bo *obj_bo,
+ uint32_t cchint)
{
- uint32_t size = obj_bo->size;
-
+ const uint32_t size = obj_bo->size;
assert(index < MAX_SURFACES);
if (state->drv->gen_ver == 7 || state->drv->gen_ver == 75)
- gpgpu_bind_buf_gen7(state, index, obj_bo, size, cchint);
+ intel_gpgpu_bind_buf_gen7(state, index, obj_bo, size, cchint);
else
NOT_IMPLEMENTED;
}
-LOCAL void
-gpgpu_bind_image2D(intel_gpgpu_t *state,
- int32_t index,
- dri_bo* obj_bo,
- uint32_t format,
- int32_t w,
- int32_t h,
- int32_t pitch,
- gpgpu_tiling_t tiling)
+static void
+intel_gpgpu_bind_image2D(intel_gpgpu_t *state,
+ int32_t index,
+ cl_buffer *obj_bo,
+ uint32_t format,
+ int32_t w,
+ int32_t h,
+ int32_t pitch,
+ cl_gpgpu_tiling_t tiling)
{
assert(index < MAX_SURFACES);
if (state->drv->gen_ver == 7 || state->drv->gen_ver == 75)
- gpgpu_bind_image2D_gen7(state, index, obj_bo, format, w, h, pitch, tiling);
+ intel_gpgpu_bind_image2D_gen7(state, index, (drm_intel_bo*) obj_bo, format, w, h, pitch, tiling);
else
NOT_IMPLEMENTED;
}
static void
-gpgpu_build_idrt(intel_gpgpu_t *state,
- genx_gpgpu_kernel_t *kernel,
- uint32_t ker_n)
+intel_gpgpu_build_idrt(intel_gpgpu_t *state,
+ cl_gpgpu_kernel_t *kernel,
+ uint32_t ker_n)
{
gen6_interface_descriptor_t *desc;
- dri_bo *bo;
+ drm_intel_bo *bo = NULL, *ker_bo = NULL;
uint32_t i;
bo = state->idrt_b.bo;
for (i = 0; i < ker_n; i++) {
memset(desc, 0, sizeof(*desc));
- desc->desc0.kernel_start_pointer = kernel[i].bo->offset >> 6; /* reloc */
+ ker_bo = (drm_intel_bo *) kernel[i].bo;
+ desc->desc0.kernel_start_pointer = ker_bo->offset >> 6; /* reloc */
desc->desc2.sampler_state_pointer = state->sampler_state_b.bo->offset >> 5;
desc->desc3.binding_table_entry_count = 0; /* no prefetch */
desc->desc3.binding_table_pointer = 0;
I915_GEM_DOMAIN_INSTRUCTION, 0,
0,
i * sizeof(*desc) + offsetof(gen6_interface_descriptor_t, desc0),
- kernel[i].bo);
+ ker_bo);
dri_bo_emit_reloc(bo,
I915_GEM_DOMAIN_INSTRUCTION, 0,
dri_bo_unmap(bo);
}
-LOCAL void
-gpgpu_upload_constants(intel_gpgpu_t *state, const void* data, uint32_t size)
+static void
+intel_gpgpu_upload_constants(intel_gpgpu_t *state, const void* data, uint32_t size)
{
unsigned char *constant_buffer = NULL;
dri_bo_unmap(state->curbe_b.bo);
}
-LOCAL void
-gpgpu_upload_samplers(intel_gpgpu_t *state, const void *data, uint32_t n)
+static void
+intel_gpgpu_upload_samplers(intel_gpgpu_t *state, const void *data, uint32_t n)
{
if (n) {
/*sizeof(gen6_sampler_state_t) == sizeof(gen7_surface_state_t) */
}
}
-LOCAL void
-gpgpu_states_setup(intel_gpgpu_t *state, genx_gpgpu_kernel_t *kernel, uint32_t ker_n)
+static void
+intel_gpgpu_states_setup(intel_gpgpu_t *state, cl_gpgpu_kernel_t *kernel, uint32_t ker_n)
{
state->ker = kernel;
- gpgpu_build_idrt(state, kernel, ker_n);
+ intel_gpgpu_build_idrt(state, kernel, ker_n);
dri_bo_unmap(state->surface_heap_b.bo);
dri_bo_unmap(state->sampler_state_b.bo);
}
-LOCAL void
-gpgpu_update_barrier(intel_gpgpu_t *state, uint32_t barrierID, uint32_t thread_n)
+static void
+intel_gpgpu_update_barrier(intel_gpgpu_t *state, uint32_t barrierID, uint32_t thread_n)
{
BEGIN_BATCH(state->batch, 4);
OUT_BATCH(state->batch, CMD_MEDIA_STATE_FLUSH | 0);
ADVANCE_BATCH(state->batch);
}
-LOCAL void
-gpgpu_set_perf_counters(intel_gpgpu_t *state, dri_bo *perf)
+static void
+intel_gpgpu_set_perf_counters(intel_gpgpu_t *state, cl_buffer *perf)
{
if (state->perf_b.bo)
drm_intel_bo_unreference(state->perf_b.bo);
- drm_intel_bo_reference(perf);
- state->perf_b.bo = perf;
-}
-
-LOCAL void
-gpgpu_run(intel_gpgpu_t *state, int32_t ki)
-{
- BEGIN_BATCH(state->batch, 6);
- OUT_BATCH(state->batch, GEN_CMD_MEDIA_OBJECT | 5);
- OUT_BATCH(state->batch, ki); /* kernel index + DBGOBJ=0 << 8 */
- OUT_BATCH(state->batch, 0); /* no children, no thread sync, no scoreboard, any half-slice, indirect data size = 0 */
- OUT_BATCH(state->batch, 0); /* no indirect data ptr */
- OUT_BATCH(state->batch, 0); /* scoreboard Y,X = 0 or just 0 in reg.0 */
- OUT_BATCH(state->batch, 0); /* scoreboard color, mask = 0 or just 0 in reg.1 */
- OUT_BATCH(state->batch, 0); /* scoreboard color, mask = 0 or just 0 in reg.1 */
- ADVANCE_BATCH(state->batch);
-}
-
-LOCAL char*
-gpgpu_run_with_inline(intel_gpgpu_t *state, int32_t ki, size_t sz)
-{
- const uint32_t len = (uint32_t) (sz >> 2);
-
- assert(sz % sizeof(int32_t) == 0);
- BEGIN_BATCH(state->batch, len + 6);
- OUT_BATCH(state->batch, GEN_CMD_MEDIA_OBJECT | (len + 4));
- OUT_BATCH(state->batch, ki); /* kernel index + DBGOBJ=0 << 8 */
- OUT_BATCH(state->batch, 0); /* no children, no thread sync, no scoreboard, any half-slice, indirect data size = 0 */
- OUT_BATCH(state->batch, 0); /* no indirect data ptr */
- OUT_BATCH(state->batch, 0); /* scoreboard Y,X = 0 or just 0 in reg.0 */
- OUT_BATCH(state->batch, 0); /* scoreboard color, mask = 0 or just 0 in reg.1 */
- return (char*) intel_batchbuffer_alloc_space(state->batch,sz);
+ drm_intel_bo_reference((drm_intel_bo*) perf);
+ state->perf_b.bo = (drm_intel_bo*) perf;
}
-LOCAL void
-gpgpu_walker(intel_gpgpu_t *state,
- uint32_t simd_sz,
- uint32_t thread_n,
- const size_t global_wk_off[3],
- const size_t global_wk_sz[3],
- const size_t local_wk_sz[3])
+static void
+intel_gpgpu_walker(intel_gpgpu_t *state,
+ uint32_t simd_sz,
+ uint32_t thread_n,
+ const size_t global_wk_off[3],
+ const size_t global_wk_sz[3],
+ const size_t local_wk_sz[3])
{
const uint32_t global_wk_dim[3] = {
global_wk_sz[0] / local_wk_sz[0],
ADVANCE_BATCH(state->batch);
}
-LOCAL int32_t
-intel_gpgpu_version(intel_gpgpu_t *gpgpu)
-{
- return gpgpu->drv->gen_ver;
+LOCAL void
+intel_set_gpgpu_callbacks(void)
+{
+ cl_gpgpu_new = (cl_gpgpu_new_cb *) intel_gpgpu_new;
+ cl_gpgpu_delete = (cl_gpgpu_delete_cb *) intel_gpgpu_delete;
+ cl_gpgpu_bind_image2D = (cl_gpgpu_bind_image2D_cb *) intel_gpgpu_bind_image2D;
+ cl_gpgpu_bind_buf = (cl_gpgpu_bind_buf_cb *) intel_gpgpu_bind_buf;
+ cl_gpgpu_state_init = (cl_gpgpu_state_init_cb *) intel_gpgpu_state_init;
+ cl_gpgpu_set_perf_counters = (cl_gpgpu_set_perf_counters_cb *) intel_gpgpu_set_perf_counters;
+ cl_gpgpu_upload_constants = (cl_gpgpu_upload_constants_cb *) intel_gpgpu_upload_constants;
+ cl_gpgpu_states_setup = (cl_gpgpu_states_setup_cb *) intel_gpgpu_states_setup;
+ cl_gpgpu_update_barrier = (cl_gpgpu_update_barrier_cb *) intel_gpgpu_update_barrier;
+ cl_gpgpu_upload_samplers = (cl_gpgpu_upload_samplers_cb *) intel_gpgpu_upload_samplers;
+ cl_gpgpu_batch_reset = (cl_gpgpu_batch_reset_cb *) intel_gpgpu_batch_reset;
+ cl_gpgpu_batch_start = (cl_gpgpu_batch_start_cb *) intel_gpgpu_batch_start;
+ cl_gpgpu_batch_end = (cl_gpgpu_batch_end_cb *) intel_gpgpu_batch_end;
+ cl_gpgpu_flush = (cl_gpgpu_flush_cb *) intel_gpgpu_flush;
+ cl_gpgpu_walker = (cl_gpgpu_walker_cb *) intel_gpgpu_walker;
}
-uint32_t drm_intel_bo_get_size(drm_intel_bo *bo)
-{
- return (bo->size);
-}
-void *drm_intel_bo_get_virtual(drm_intel_bo *bo)
-{
- return (bo->virtual);
-}
#define __GENX_GPGPU_H__
#include "cl_utils.h"
-#include "intel_bufmgr.h"
+#include "cl_driver.h"
#include <stdlib.h>
#include <stdint.h>
-#define MAX_SURFACES 128
-#define MAX_SAMPLERS 16
-
-enum gen6_cache_control {
- cc_gtt = 0x0,
- cc_l3 = 0x1,
- cc_llc = 0x2,
- cc_llc_l3 = 0x3
-};
-
-/* Use this structure to bind kernels in the gpgpu state */
-typedef struct genx_gpgpu_kernel {
- const char *name; /* kernel name and bo name */
- uint32_t grf_blocks; /* register blocks kernel wants (in 8 reg blocks) */
- uint32_t cst_sz; /* total size of all constants */
- struct _drm_intel_bo *bo;/* kernel code in the proper addr space */
- int32_t barrierID; /* barrierID for _this_ kernel */
- uint32_t use_barrier:1; /* For gen7 (automatic barrier management) */
- uint32_t thread_n:15; /* For gen7 (automatic barrier management) */
- uint32_t slm_sz:16; /* For gen7 (automatic SLM allocation) */
-} genx_gpgpu_kernel_t;
-
-/* Convenient abstraction of the device */
-struct intel_driver;
-
-/* Covenient way to talk to the device */
-typedef struct intel_gpgpu intel_gpgpu_t;
-
-/* Buffer object as exposed by drm_intel */
-struct _drm_intel_bo;
-
-/* Allocate and initialize a GPGPU state */
-extern intel_gpgpu_t* intel_gpgpu_new(struct intel_driver*);
-
-/* Destroy and deallocate a GPGPU state */
-extern void intel_gpgpu_delete(intel_gpgpu_t*);
-
-/* Get the device generation */
-extern int32_t intel_gpgpu_version(intel_gpgpu_t*);
-
-/* Buffer management wrapper APIs */
-extern uint32_t drm_intel_bo_get_size(drm_intel_bo *bo);
-extern void *drm_intel_bo_get_virtual(drm_intel_bo *bo);
-
-/* Set typeless buffer descriptor in the current binding table */
-extern void gpgpu_bind_buf(intel_gpgpu_t*,
- int32_t index,
- struct _drm_intel_bo* obj_bo,
- uint32_t cchint);
-
-typedef enum gpgpu_tiling {
- GPGPU_NO_TILE = 0,
- GPGPU_TILE_X = 1,
- GPGPU_TILE_Y = 2,
-} gpgpu_tiling_t;
-
-/* Set a 2d texture */
-extern void gpgpu_bind_image2D(intel_gpgpu_t *state,
- int32_t index,
- struct _drm_intel_bo* obj_bo,
- uint32_t format,
- int32_t w,
- int32_t h,
- int pitch,
- gpgpu_tiling_t tiling);
-
-/* Configure state, size in 512-bit units */
-extern void gpgpu_state_init(intel_gpgpu_t*, uint32_t max_threads, uint32_t size_cs_entry);
-
-/* Set the buffer object where to report performance counters */
-extern void gpgpu_set_perf_counters(intel_gpgpu_t*, struct _drm_intel_bo *perf);
-
-/* Fills current constant buffer with data */
-extern void gpgpu_upload_constants(intel_gpgpu_t*, const void* data, uint32_t size);
-
-/* Setup all indirect states */
-extern void gpgpu_states_setup(intel_gpgpu_t*, genx_gpgpu_kernel_t* kernel, uint32_t ker_n);
-
-/* Make HW threads use barrierID */
-extern void gpgpu_update_barrier(intel_gpgpu_t*, uint32_t barrierID, uint32_t thread_n);
-
-/* Upload the constant samplers as specified inside the OCL kernel */
-extern void gpgpu_upload_samplers(intel_gpgpu_t *state, const void *data, uint32_t n);
-
-/* Set a sampler */
-extern void gpgpu_set_sampler(intel_gpgpu_t*, uint32_t index, uint32_t non_normalized);
-
-/* Allocate the batch buffer and return the BO used for the batch buffer */
-extern void gpgpu_batch_reset(intel_gpgpu_t*, size_t sz);
-
-/* Atomic begin, pipeline select, urb, pipeline state and constant buffer */
-extern void gpgpu_batch_start(intel_gpgpu_t*);
-
-/* atomic end with possibly inserted flush */
-extern void gpgpu_batch_end(intel_gpgpu_t*, int32_t flush_mode);
-
-/* Emit MI_FLUSH */
-extern void gpgpu_flush(intel_gpgpu_t*);
-
-/* Enqueue a MEDIA object with no inline data */
-extern void gpgpu_run(intel_gpgpu_t*, int32_t ki);
-
-/* Enqueue a MEDIA object with inline data to push afterward. Returns the
- * pointer where to push. sz is the size of the data we are going to pass
- */
-extern char* gpgpu_run_with_inline(intel_gpgpu_t*, int32_t ki, size_t sz);
-
-/* Will spawn all threads */
-extern void
-gpgpu_walker(intel_gpgpu_t *state,
- uint32_t simd_sz,
- uint32_t thread_n,
- const size_t global_wk_off[3],
- const size_t global_wk_sz[3],
- const size_t local_wk_sz[3]);
+/* Set the gpgpu related call backs */
+extern void intel_set_gpgpu_callbacks(void);
#endif /* __GENX_GPGPU_H__ */
#include "cl_utils.h"
#include "cl_alloc.h"
-#include "sim/sim_buffer.h"
+#include "sim/sim_driver.h"
#include "CL/cl.h"
#include <stdlib.h>
#include <assert.h>
#include <string.h>
-#include "cl_buffer.h"
+#include "cl_driver.h"
/* Just to count allocations */
typedef struct sim_bufmgr { volatile int buf_n; } sim_bufmgr_t;
sim_buffer_delete(buf);
}
-static void*
-sim_buffer_map(sim_buffer_t *buf)
-{
- assert(buf);
- return buf->data;
-}
-
static int
sim_buffer_subdata(sim_buffer_t *buf, unsigned long offset, unsigned long size, const void *data)
{
{
return 1;
}
+
+static int sim_buffer_map(sim_buffer_t *buf, uint32_t write_enable) {return 0;}
static int sim_buffer_unmap(sim_buffer_t *buf) {return 0;}
-static int sim_buffer_pin(sim_buffer_t *buf) {return 0;}
+static int sim_buffer_pin(sim_buffer_t *buf, uint32_t alignment) {return 0;}
static int sim_buffer_unpin(sim_buffer_t *buf) {return 0;}
LOCAL void
* Author: Benjamin Segovia <benjamin.segovia@intel.com>
*/
-#ifndef __SIM_BUFFER_H__
-#define __SIM_BUFFER_H__
+#ifndef __SIM_DRIVER_H__
+#define __SIM_DRIVER_H__
/* Setup the call backs when using the simulator */
extern void sim_setup_callbacks(void);
-#endif /* __SIM_BUFFER_H__ */
+#endif /* __SIM_DRIVER_H__ */
+