From 0ae66a4b425602d1a1a1c04687e0b7c8deae3577 Mon Sep 17 00:00:00 2001 From: Benjamin Segovia Date: Thu, 29 Mar 2012 17:26:36 +0000 Subject: [PATCH] Removed all direct dependencies to the intel driver (ie files contained in intel/) and replaced them by dependencies to the call backs. This should allow an easier integration with the performance simulator --- src/CMakeLists.txt | 6 +- src/cl_command_queue.c | 53 +++--- src/cl_command_queue.h | 23 +-- src/cl_command_queue_gen7.c | 106 ++---------- src/cl_context.c | 22 +-- src/cl_context.h | 9 +- src/cl_driver.c | 48 ++++-- src/cl_driver.cpp | 4 +- src/cl_driver.h | 151 ++++++++++++++++- src/cl_image.c | 4 - src/cl_kernel.c | 16 +- src/cl_kernel.h | 5 +- src/cl_mem.c | 48 +++--- src/cl_mem.h | 10 +- src/intel/intel_driver.c | 16 +- src/intel/intel_gpgpu.c | 292 +++++++++++++++------------------ src/intel/intel_gpgpu.h | 119 +------------- src/sim/{sim_buffer.c => sim_driver.c} | 15 +- src/sim/{sim_buffer.h => sim_driver.h} | 7 +- 19 files changed, 436 insertions(+), 518 deletions(-) rename src/sim/{sim_buffer.c => sim_driver.c} (94%) rename src/sim/{sim_buffer.h => sim_driver.h} (91%) diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index facc990..d3517c1 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -18,9 +18,9 @@ SET(OPENCL_SRC cl_context.c cl_command_queue.c cl_command_queue_gen7.c - cl_buffer.c - cl_buffer.cpp - sim/sim_buffer.c + cl_driver.c + cl_driver.cpp + sim/sim_driver.c intel/intel_gpgpu.c intel/intel_batchbuffer.c intel/intel_driver.c diff --git a/src/cl_command_queue.c b/src/cl_command_queue.c index cef1485..0be37c8 100644 --- a/src/cl_command_queue.c +++ b/src/cl_command_queue.c @@ -25,8 +25,7 @@ #include "cl_mem.h" #include "cl_utils.h" #include "cl_alloc.h" -#include "intel_bufmgr.h" -#include "intel/intel_gpgpu.h" +#include "cl_driver.h" #include #include @@ -42,8 +41,7 @@ cl_command_queue_new(cl_context ctx) queue->magic = CL_MAGIC_QUEUE_HEADER; queue->ref_n = 1; queue->ctx = ctx; - 
TRY_ALLOC_NO_ERR (queue->gpgpu, - intel_gpgpu_new((struct intel_driver*) ctx->intel_drv)); + TRY_ALLOC_NO_ERR (queue->gpgpu, cl_gpgpu_new(ctx->drv)); /* Append the command queue in the list */ pthread_mutex_lock(&ctx->queue_lock); @@ -87,7 +85,7 @@ cl_command_queue_delete(cl_command_queue queue) } cl_mem_delete(queue->perf); cl_context_delete(queue->ctx); - intel_gpgpu_delete(queue->gpgpu); + cl_gpgpu_delete(queue->gpgpu); queue->magic = CL_MAGIC_DEAD_HEADER; /* For safety */ cl_free(queue); } @@ -98,26 +96,26 @@ cl_command_queue_add_ref(cl_command_queue queue) atomic_inc(&queue->ref_n); } - LOCAL cl_int +LOCAL cl_int cl_command_queue_bind_surface(cl_command_queue queue, cl_kernel k, char *curbe, - drm_intel_bo **local, - drm_intel_bo **priv, - drm_intel_bo **scratch, + cl_buffer **local, + cl_buffer **priv, + cl_buffer **scratch, uint32_t local_sz) { cl_context ctx = queue->ctx; - intel_gpgpu_t *gpgpu = queue->gpgpu; - drm_intel_bufmgr *bufmgr = cl_context_get_intel_bufmgr(ctx); - drm_intel_bo *sync_bo = NULL; + cl_gpgpu *gpgpu = queue->gpgpu; + cl_buffer_mgr *bufmgr = cl_context_get_bufmgr(ctx); + cl_buffer *sync_bo = NULL; cl_int err = CL_SUCCESS; #if 0 cl_context ctx = queue->ctx; intel_gpgpu_t *gpgpu = queue->gpgpu; drm_intel_bufmgr *bufmgr = cl_context_get_intel_bufmgr(ctx); cl_mem mem = NULL; - drm_intel_bo *bo = NULL, *sync_bo = NULL; + cl_buffer *bo = NULL, *sync_bo = NULL; const size_t max_thread = ctx->device->max_compute_unit; cl_int err = CL_SUCCESS; uint32_t i, index; @@ -163,7 +161,7 @@ cl_command_queue_bind_surface(cl_command_queue queue, assert(k->patch.local_surf.offset % SURFACE_SZ == 0); index = k->patch.local_surf.offset / SURFACE_SZ; assert(index != MAX_SURFACES - 1); - *local = drm_intel_bo_alloc(bufmgr, "CL local surface", sz, 64); + *local = cl_buffer_alloc(bufmgr, "CL local surface", sz, 64); gpgpu_bind_buf(gpgpu, index, *local, cc_llc_l3); } else if (local) @@ -178,7 +176,7 @@ cl_command_queue_bind_surface(cl_command_queue queue, 
assert(k->patch.private_surf.offset % SURFACE_SZ == 0); index = k->patch.private_surf.offset / SURFACE_SZ; assert(index != MAX_SURFACES - 1); - *priv = drm_intel_bo_alloc(bufmgr, "CL private surface", sz, 64); + *priv = cl_buffer_alloc(bufmgr, "CL private surface", sz, 64); gpgpu_bind_buf(gpgpu, index, *priv, cc_llc_l3); } else if(priv) @@ -193,17 +191,17 @@ cl_command_queue_bind_surface(cl_command_queue queue, assert(k->patch.scratch.offset % SURFACE_SZ == 0); assert(index != MAX_SURFACES - 1); index = k->patch.scratch.offset / SURFACE_SZ; - *scratch = drm_intel_bo_alloc(bufmgr, "CL scratch surface", sz, 64); + *scratch = cl_buffer_alloc(bufmgr, "CL scratch surface", sz, 64); gpgpu_bind_buf(gpgpu, index, *scratch, cc_llc_l3); } else if (scratch) *scratch = NULL; #endif /* Now bind a bo used for synchronization */ - sync_bo = drm_intel_bo_alloc(bufmgr, "sync surface", 64, 64); - gpgpu_bind_buf(gpgpu, MAX_SURFACES-1, sync_bo, cc_llc_l3); + sync_bo = cl_buffer_alloc(bufmgr, "sync surface", 64, 64); + cl_gpgpu_bind_buf(gpgpu, MAX_SURFACES-1, sync_bo, cc_llc_l3); if (queue->last_batch != NULL) - drm_intel_bo_unreference(queue->last_batch); + cl_buffer_unreference(queue->last_batch); queue->last_batch = sync_bo; // error: @@ -212,9 +210,9 @@ cl_command_queue_bind_surface(cl_command_queue queue, } #if USE_FULSIM -extern void drm_intel_bufmgr_gem_stop_aubfile(drm_intel_bufmgr*); -extern void drm_intel_bufmgr_gem_set_aubfile(drm_intel_bufmgr*, FILE*); -extern void aub_exec_dump_raw_file(drm_intel_bo*, size_t offset, size_t sz); +extern void drm_intel_bufmgr_gem_stop_aubfile(cl_buffer_mgr*); +extern void drm_intel_bufmgr_gem_set_aubfile(cl_buffer_mgr*, FILE*); +extern void aub_exec_dump_raw_file(cl_buffer*, size_t offset, size_t sz); static void cl_run_fulsim(void) @@ -435,15 +433,14 @@ cl_command_queue_ND_range(cl_command_queue queue, const size_t *global_wk_sz, const size_t *local_wk_sz) { - intel_gpgpu_t *gpgpu = queue->gpgpu; - const int32_t ver = 
intel_gpgpu_version(gpgpu); + const int32_t ver = cl_driver_get_ver(queue->ctx->drv); cl_int err = CL_SUCCESS; #if USE_FULSIM - drm_intel_bufmgr *bufmgr = NULL; + cl_buffer_mgr *bufmgr = NULL; FILE *file = fopen("dump.aub", "wb"); FATAL_IF (file == NULL, "Unable to open file dump.aub"); - bufmgr = cl_context_get_intel_bufmgr(queue->ctx); + bufmgr = cl_context_get_bufmgr(queue->ctx); drm_intel_bufmgr_gem_set_aubfile(bufmgr, file); #endif /* USE_FULSIM */ @@ -469,8 +466,8 @@ cl_command_queue_finish(cl_command_queue queue) { if (queue->last_batch == NULL) return CL_SUCCESS; - drm_intel_bo_wait_rendering(queue->last_batch); - drm_intel_bo_unreference(queue->last_batch); + cl_buffer_wait_rendering(queue->last_batch); + cl_buffer_unreference(queue->last_batch); queue->last_batch = NULL; return CL_SUCCESS; } diff --git a/src/cl_command_queue.h b/src/cl_command_queue.h index c47350e..f03e61c 100644 --- a/src/cl_command_queue.h +++ b/src/cl_command_queue.h @@ -21,6 +21,7 @@ #define __CL_COMMAND_QUEUE_H__ #include "cl_internals.h" +#include "cl_driver.h" #include "CL/cl.h" #include @@ -28,14 +29,14 @@ struct intel_gpgpu; /* Basically, this is a (kind-of) batch buffer */ struct _cl_command_queue { - uint64_t magic; /* To identify it as a command queue */ - volatile int ref_n; /* We reference count this object */ - cl_context ctx; /* Its parent context */ - cl_command_queue prev, next; /* We chain the command queues together */ - struct intel_gpgpu *gpgpu; /* Setup all GEN commands */ - cl_mem perf; /* Where to put the perf counters */ - cl_mem fulsim_out; /* Fulsim will output this buffer */ - struct _drm_intel_bo *last_batch;/* To synchronize using clFinish */ + uint64_t magic; /* To identify it as a command queue */ + volatile int ref_n; /* We reference count this object */ + cl_context ctx; /* Its parent context */ + cl_command_queue prev, next; /* We chain the command queues together */ + cl_gpgpu *gpgpu; /* Setup all GEN commands */ + cl_mem perf; /* Where to put the 
perf counters */ + cl_mem fulsim_out; /* Fulsim will output this buffer */ + cl_buffer *last_batch; /* To synchronize using clFinish */ }; /* Allocate and initialize a new command queue. Also insert it in the list of @@ -69,9 +70,9 @@ extern cl_int cl_command_queue_finish(cl_command_queue); extern cl_int cl_command_queue_bind_surface(cl_command_queue queue, cl_kernel k, char *curbe, - struct _drm_intel_bo **local, - struct _drm_intel_bo **priv, - struct _drm_intel_bo **scratch, + cl_buffer **local, + cl_buffer **priv, + cl_buffer **scratch, uint32_t local_sz); #endif /* __CL_COMMAND_QUEUE_H__ */ diff --git a/src/cl_command_queue_gen7.c b/src/cl_command_queue_gen7.c index ecf27dd..78af68f 100644 --- a/src/cl_command_queue_gen7.c +++ b/src/cl_command_queue_gen7.c @@ -25,8 +25,6 @@ #include "cl_mem.h" #include "cl_utils.h" #include "cl_alloc.h" -#include "intel_bufmgr.h" -#include "intel/intel_gpgpu.h" #include #include @@ -90,11 +88,11 @@ cl_command_queue_ND_range_gen7(cl_command_queue queue, const size_t *local_wk_sz) { cl_context ctx = queue->ctx; - intel_gpgpu_t *gpgpu = queue->gpgpu; + cl_gpgpu *gpgpu = queue->gpgpu; char *curbe = NULL; /* Does not include per-thread local IDs */ char *final_curbe = NULL; /* Includes them */ - drm_intel_bo *private_bo = NULL, *scratch_bo = NULL; - genx_gpgpu_kernel_t kernel; + cl_buffer *private_bo = NULL, *scratch_bo = NULL; + cl_gpgpu_kernel_t kernel; const uint32_t simd_sz = cl_kernel_get_simd_width(ker); size_t i, batch_sz = 0u, local_sz = 0u, thread_n = 0u, id_offset = 0u, cst_sz = 0u; cl_int err = CL_SUCCESS; @@ -115,12 +113,12 @@ cl_command_queue_ND_range_gen7(cl_command_queue queue, kernel.cst_sz = cst_sz += 3 * 32; /* Add local IDs (16 words) */ /* Setup the kernel */ - gpgpu_state_init(gpgpu, ctx->device->max_compute_unit, cst_sz / 32); + cl_gpgpu_state_init(gpgpu, ctx->device->max_compute_unit, cst_sz / 32); if (queue->last_batch != NULL) - drm_intel_bo_unreference(queue->last_batch); + 
cl_buffer_unreference(queue->last_batch); queue->last_batch = NULL; cl_command_queue_bind_surface(queue, ker, curbe, NULL, &private_bo, &scratch_bo, 0); - gpgpu_states_setup(gpgpu, &kernel, 1); + cl_gpgpu_states_setup(gpgpu, &kernel, 1); /* CURBE step 2. Give the localID and upload it to video memory */ TRY_ALLOC (final_curbe, (char*) alloca(thread_n * cst_sz)); @@ -128,99 +126,19 @@ cl_command_queue_ND_range_gen7(cl_command_queue queue, for (i = 0; i < thread_n; ++i) memcpy(final_curbe + cst_sz * i, curbe, cst_sz - 3*32); TRY (cl_set_local_ids, final_curbe, local_wk_sz, simd_sz, cst_sz, id_offset, thread_n); - gpgpu_upload_constants(gpgpu, final_curbe, thread_n*cst_sz); + cl_gpgpu_upload_constants(gpgpu, final_curbe, thread_n*cst_sz); /* Start a new batch buffer */ batch_sz = cl_kernel_compute_batch_sz(ker); - gpgpu_batch_reset(gpgpu, batch_sz); - gpgpu_batch_start(gpgpu); + cl_gpgpu_batch_reset(gpgpu, batch_sz); + cl_gpgpu_batch_start(gpgpu); /* Issue the GPGPU_WALKER command */ - gpgpu_walker(gpgpu, simd_sz, thread_n, global_wk_off, global_wk_sz, local_wk_sz); + cl_gpgpu_walker(gpgpu, simd_sz, thread_n, global_wk_off, global_wk_sz, local_wk_sz); /* Close the batch buffer and submit it */ - gpgpu_batch_end(gpgpu, 0); - gpgpu_flush(gpgpu); - -#if 0 - cl_context ctx = queue->ctx; - intel_gpgpu_t *gpgpu = queue->gpgpu; - drm_intel_bo *private_bo = NULL, *scratch_bo = NULL; - char *curbe = NULL; /* Does not include per-thread local IDs */ - char *final_curbe = NULL; /* Includes them */ - genx_gpgpu_kernel_t kernel; - //const size_t simd_sz = ker->patch.exec_env.largest_compiled_simd_sz; - const size_t simd_sz = 16; - size_t local_sz, batch_sz, cst_sz = ker->patch.curbe.sz; - size_t i, thread_n, id_offset; - cl_int err = CL_SUCCESS; - - /* Setup kernel */ - kernel.name = "OCL kernel"; - kernel.grf_blocks = 128; - kernel.bin = ker->kernel_heap; // _PLASMA ; NULL - kernel.size = ker->kernel_heap_sz; // _PLASMA ; 0 - kernel.bo = ker->bo; - kernel.barrierID = 0; - 
kernel.use_barrier = ker->patch.exec_env.has_barriers; - kernel.slm_sz = cl_kernel_local_memory_sz(ker); - - /* All arguments must have been set */ - TRY (cl_kernel_check_args, ker); - - /* Check that the local work sizes are OK */ - TRY (cl_kernel_work_group_sz, ker, local_wk_sz, 3, &local_sz); - //kernel.thread_n = thread_n = local_sz / simd_sz; - kernel.thread_n = thread_n = local_sz / simd_sz; - - /* CURBE step 1. Allocate and fill fields shared by threads in workgroup */ - if (cst_sz > 0) { - assert(ker->cst_buffer); - curbe = cl_kernel_create_cst_buffer(ker, - global_wk_off, - global_wk_sz, - local_wk_sz, - 3, - thread_n); - } - id_offset = cst_sz = ALIGN(cst_sz, 32); /* Align the user data on 32 bytes */ - kernel.cst_sz = cst_sz += 3 * 32; /* Add local IDs (16 words) */ - - /* Setup the kernel */ - gpgpu_state_init(gpgpu, ctx->device->max_compute_unit, cst_sz / 32); - if (queue->last_batch != NULL) - drm_intel_bo_unreference(queue->last_batch); - queue->last_batch = NULL; - cl_command_queue_bind_surface(queue, ker, curbe, NULL, &private_bo, &scratch_bo, 0); - gpgpu_states_setup(gpgpu, &kernel, 1); - - /* CURBE step 2. 
Give the localID and upload it to video memory */ - TRY_ALLOC (final_curbe, (char*) cl_calloc(thread_n, cst_sz)); - for (i = 0; i < thread_n; ++i) - memcpy(final_curbe + cst_sz * i, curbe, cst_sz); - TRY (cl_set_local_ids, final_curbe, local_wk_sz, simd_sz, cst_sz, id_offset, thread_n); - gpgpu_upload_constants(gpgpu, final_curbe, thread_n*cst_sz); - - /* Start a new batch buffer */ - batch_sz = cl_kernel_compute_batch_sz(ker); - gpgpu_batch_reset(gpgpu, batch_sz); - gpgpu_batch_start(gpgpu); - - /* Issue the GPGPU_WALKER command */ - gpgpu_walker(gpgpu, simd_sz, thread_n, global_wk_off, global_wk_sz, local_wk_sz); - - /* Close the batch buffer and submit it */ - gpgpu_batch_end(gpgpu, 0); - gpgpu_flush(gpgpu); - -error: - /* Release all temporary buffers */ - if (private_bo) drm_intel_bo_unreference(private_bo); - if (scratch_bo) drm_intel_bo_unreference(scratch_bo); - cl_free(final_curbe); - cl_free(curbe); - return err; -#endif + cl_gpgpu_batch_end(gpgpu, 0); + cl_gpgpu_flush(gpgpu); error: return err; diff --git a/src/cl_context.c b/src/cl_context.c index 2e8ff24..3d926b4 100644 --- a/src/cl_context.c +++ b/src/cl_context.c @@ -25,6 +25,7 @@ #include "cl_mem.h" #include "cl_alloc.h" #include "cl_utils.h" +#include "cl_driver.h" #include "CL/cl.h" @@ -33,13 +34,6 @@ #include #include -/* Do not include the full dependency */ -struct intel_driver; -/* Get the command buffer interface */ -extern struct _drm_intel_bufmgr* intel_driver_get_bufmgr(struct intel_driver*); -/* Get the Gen HW version */ -extern uint32_t intel_driver_get_ver(struct intel_driver*); - static cl_int cl_context_properties_is_ok(const cl_context_properties *properties) { @@ -130,10 +124,10 @@ cl_context_new(void) cl_context ctx = NULL; TRY_ALLOC_NO_ERR (ctx, CALLOC(struct _cl_context)); - TRY_ALLOC_NO_ERR (ctx->intel_drv, cl_intel_driver_new()); + TRY_ALLOC_NO_ERR (ctx->drv, cl_driver_new()); ctx->magic = CL_MAGIC_CONTEXT_HEADER; ctx->ref_n = 1; - ctx->ver = 
intel_driver_get_ver(ctx->intel_drv); + ctx->ver = cl_driver_get_ver(ctx->drv); pthread_mutex_init(&ctx->program_lock, NULL); pthread_mutex_init(&ctx->queue_lock, NULL); pthread_mutex_init(&ctx->buffer_lock, NULL); @@ -163,8 +157,8 @@ cl_context_delete(cl_context ctx) assert(ctx->queues == NULL); assert(ctx->programs == NULL); assert(ctx->buffers == NULL); - assert(ctx->intel_drv); - cl_intel_driver_delete(ctx->intel_drv); + assert(ctx->drv); + cl_driver_delete(ctx->drv); ctx->magic = CL_MAGIC_DEAD_HEADER; /* For safety */ cl_free(ctx); } @@ -202,9 +196,9 @@ error: goto exit; } -struct _drm_intel_bufmgr* -cl_context_get_intel_bufmgr(cl_context ctx) +struct cl_buffer_mgr* +cl_context_get_bufmgr(cl_context ctx) { - return intel_driver_get_bufmgr((struct intel_driver*) ctx->intel_drv); + return cl_driver_get_bufmgr(ctx->drv); } diff --git a/src/cl_context.h b/src/cl_context.h index f2c6302..27f6d87 100644 --- a/src/cl_context.h +++ b/src/cl_context.h @@ -21,6 +21,7 @@ #define __CL_CONTEXT_H__ #include "cl_internals.h" +#include "cl_driver.h" #include "CL/cl.h" #include @@ -33,7 +34,7 @@ struct intel_driver; struct _cl_context { uint64_t magic; /* To identify it as a context */ volatile int ref_n; /* We reference count this object */ - struct intel_driver *intel_drv; /* Handles the real HW */ + cl_driver *drv; /* Handles HW or simulator */ cl_device_id device; /* All information about the GPU device */ cl_command_queue queues; /* All command queues currently allocated */ cl_program programs; /* All programs currently allocated */ @@ -69,9 +70,6 @@ extern cl_command_queue cl_context_create_queue(cl_context, cl_command_queue_properties, cl_int*); -/* Use for all GPU buffers */ -extern struct _drm_intel_bufmgr* cl_context_get_intel_bufmgr(cl_context); - /* Enqueue a ND Range kernel */ extern cl_int cl_context_ND_kernel(cl_context, cl_command_queue, @@ -82,8 +80,7 @@ extern cl_int cl_context_ND_kernel(cl_context, const size_t*); /* Used for allocation */ -extern struct 
_drm_intel_bufmgr* -cl_context_get_intel_bufmgr(cl_context ctx); +extern cl_buffer_mgr* cl_context_get_bufmgr(cl_context ctx); #endif /* __CL_CONTEXT_H__ */ diff --git a/src/cl_driver.c b/src/cl_driver.c index ed2aaa0..65828e2 100644 --- a/src/cl_driver.c +++ b/src/cl_driver.c @@ -17,17 +17,43 @@ * Author: Benjamin Segovia */ -#include "cl_buffer.h" +#include "cl_driver.h" +#include "cl_utils.h" #include -cl_buffer_alloc_cb *cl_buffer_alloc = NULL; -cl_buffer_unreference_cb *cl_buffer_unreference = NULL; -cl_buffer_map_cb *cl_buffer_map = NULL; -cl_buffer_unmap_cb *cl_buffer_unmap = NULL; -cl_buffer_pin_cb *cl_buffer_pin = NULL; -cl_buffer_unpin_cb *cl_buffer_unpin = NULL; -cl_buffer_subdata_cb *cl_buffer_subdata = NULL; -cl_buffer_emit_reloc_cb *cl_buffer_emit_reloc = NULL; -cl_driver_get_bufmgr_cb *cl_driver_get_bufmgr = NULL; -cl_driver_get_ver_cb *cl_driver_get_ver = NULL; +/* Driver */ +LOCAL cl_driver_new_cb *cl_driver_new = NULL; +LOCAL cl_driver_delete_cb *cl_driver_delete = NULL; +LOCAL cl_driver_get_bufmgr_cb *cl_driver_get_bufmgr = NULL; + +/* Buffer */ +LOCAL cl_driver_get_ver_cb *cl_driver_get_ver = NULL; +LOCAL cl_buffer_alloc_cb *cl_buffer_alloc = NULL; +LOCAL cl_buffer_reference_cb *cl_buffer_reference = NULL; +LOCAL cl_buffer_unreference_cb *cl_buffer_unreference = NULL; +LOCAL cl_buffer_map_cb *cl_buffer_map = NULL; +LOCAL cl_buffer_unmap_cb *cl_buffer_unmap = NULL; +LOCAL cl_buffer_get_virtual_cb *cl_buffer_get_virtual = NULL; +LOCAL cl_buffer_pin_cb *cl_buffer_pin = NULL; +LOCAL cl_buffer_unpin_cb *cl_buffer_unpin = NULL; +LOCAL cl_buffer_subdata_cb *cl_buffer_subdata = NULL; +LOCAL cl_buffer_emit_reloc_cb *cl_buffer_emit_reloc = NULL; +LOCAL cl_buffer_wait_rendering_cb *cl_buffer_wait_rendering = NULL; + +/* GPGPU */ +LOCAL cl_gpgpu_new_cb *cl_gpgpu_new = NULL; +LOCAL cl_gpgpu_delete_cb *cl_gpgpu_delete = NULL; +LOCAL cl_gpgpu_bind_buf_cb *cl_gpgpu_bind_buf = NULL; +LOCAL cl_gpgpu_bind_image2D_cb *cl_gpgpu_bind_image2D = NULL; +LOCAL 
cl_gpgpu_state_init_cb *cl_gpgpu_state_init = NULL; +LOCAL cl_gpgpu_set_perf_counters_cb *cl_gpgpu_set_perf_counters = NULL; +LOCAL cl_gpgpu_upload_constants_cb *cl_gpgpu_upload_constants = NULL; +LOCAL cl_gpgpu_states_setup_cb *cl_gpgpu_states_setup = NULL; +LOCAL cl_gpgpu_update_barrier_cb *cl_gpgpu_update_barrier = NULL; +LOCAL cl_gpgpu_upload_samplers_cb *cl_gpgpu_upload_samplers = NULL; +LOCAL cl_gpgpu_batch_reset_cb *cl_gpgpu_batch_reset = NULL; +LOCAL cl_gpgpu_batch_start_cb *cl_gpgpu_batch_start = NULL; +LOCAL cl_gpgpu_batch_end_cb *cl_gpgpu_batch_end = NULL; +LOCAL cl_gpgpu_flush_cb *cl_gpgpu_flush = NULL; +LOCAL cl_gpgpu_walker_cb *cl_gpgpu_walker = NULL; diff --git a/src/cl_driver.cpp b/src/cl_driver.cpp index 87e1f2e..df26518 100644 --- a/src/cl_driver.cpp +++ b/src/cl_driver.cpp @@ -18,8 +18,8 @@ */ extern "C" { -#include "sim/sim_buffer.h" -#include "intel/intel_buffer.h" +#include "sim/sim_driver.h" +#include "intel/intel_driver.h" #include "cl_utils.h" #include #include diff --git a/src/cl_driver.h b/src/cl_driver.h index ff9fa6f..116959e 100644 --- a/src/cl_driver.h +++ b/src/cl_driver.h @@ -21,11 +21,18 @@ #define __CL_BUFFER_H__ #include +#include -/* Hide behind some call backs the buffer allocation / deallocation ... This +/* XXX needed for previous driver */ +#define MAX_SURFACES 128 +#define MAX_SAMPLERS 16 + +/************************************************************************** + * cl_driver: + * Hide behind some call backs the buffer allocation / deallocation ... 
This * will allow us to make the use of a software performance simulator easier and * to minimize the code specific for the HW and for the simulator - */ + **************************************************************************/ /* Encapsulates command buffer / data buffer / kernels */ typedef struct cl_buffer cl_buffer; @@ -36,13 +43,19 @@ typedef struct cl_buffer_mgr cl_buffer_mgr; /* Encapsulates the driver backend functionalities */ typedef struct cl_driver cl_driver; +/* Encapsulates the gpgpu stream of commands */ +typedef struct cl_gpgpu cl_gpgpu; + +/************************************************************************** + * Driver + **************************************************************************/ /* Create a new driver */ typedef cl_driver* (cl_driver_new_cb)(void); -extern cl_driver_new_cb cl_driver_new; +extern cl_driver_new_cb *cl_driver_new; /* Delete the driver */ -typedef cl_driver* (cl_driver_delete_cb)(void); -extern cl_driver_delete_cb cl_driver_delete; +typedef void (cl_driver_delete_cb)(cl_driver*); +extern cl_driver_delete_cb *cl_driver_delete; /* Get the buffer manager from the driver */ typedef cl_buffer_mgr* (cl_driver_get_bufmgr_cb)(cl_driver*); @@ -52,6 +65,116 @@ extern cl_driver_get_bufmgr_cb *cl_driver_get_bufmgr; typedef uint32_t (cl_driver_get_ver_cb)(cl_driver*); extern cl_driver_get_ver_cb *cl_driver_get_ver; +/************************************************************************** + * GPGPU command streamer + **************************************************************************/ +/* Describe texture tiling */ +typedef enum cl_gpgpu_tiling { + GPGPU_NO_TILE = 0, + GPGPU_TILE_X = 1, + GPGPU_TILE_Y = 2, +} cl_gpgpu_tiling_t; + +/* Cache control options */ +enum gen6_cache_control { + cc_gtt = 0x0, + cc_l3 = 0x1, + cc_llc = 0x2, + cc_llc_l3 = 0x3 +}; + + +/* Use this structure to bind kernels in the gpgpu state */ +typedef struct cl_gpgpu_kernel { + const char *name; /* kernel name and bo name */ + uint32_t 
grf_blocks; /* register blocks kernel wants (in 8 reg blocks) */ + uint32_t cst_sz; /* total size of all constants */ + cl_buffer *bo; /* kernel code in the proper addr space */ + int32_t barrierID; /* barrierID for _this_ kernel */ + uint32_t use_barrier:1; /* For gen7 (automatic barrier management) */ + uint32_t thread_n:15; /* For gen7 (automatic barrier management) */ + uint32_t slm_sz:16; /* For gen7 (automatic SLM allocation) */ +} cl_gpgpu_kernel_t; + +/* Create a new gpgpu state */ +typedef cl_gpgpu* (cl_gpgpu_new_cb)(cl_driver*); +extern cl_gpgpu_new_cb *cl_gpgpu_new; + +/* Delete the gpgpu state */ +typedef void (cl_gpgpu_delete_cb)(cl_gpgpu*); +extern cl_gpgpu_delete_cb *cl_gpgpu_delete; + +/* Bind a regular unformatted buffer */ +typedef void (cl_gpgpu_bind_buf_cb)(cl_gpgpu*, int32_t index, cl_buffer *, uint32_t cchint); +extern cl_gpgpu_bind_buf_cb *cl_gpgpu_bind_buf; + +/* Set a 2d texture */ +typedef void (cl_gpgpu_bind_image2D_cb)(cl_gpgpu state, + int32_t index, + struct cl_buffer* obj_bo, + uint32_t format, + int32_t w, + int32_t h, + int pitch, + cl_gpgpu_tiling_t tiling); +extern cl_gpgpu_bind_image2D_cb *cl_gpgpu_bind_image2D; + +/* Configure internal state */ +typedef void (cl_gpgpu_state_init_cb)(cl_gpgpu*, uint32_t max_threads, uint32_t size_cs_entry); +extern cl_gpgpu_state_init_cb *cl_gpgpu_state_init; + +/* Set the buffer object where to report performance counters */ +typedef void (cl_gpgpu_set_perf_counters_cb)(cl_gpgpu*, struct cl_buffer *perf); +extern cl_gpgpu_set_perf_counters_cb *cl_gpgpu_set_perf_counters; + +/* Fills current constant buffer with data */ +typedef void (cl_gpgpu_upload_constants_cb)(cl_gpgpu*, const void* data, uint32_t size); +extern cl_gpgpu_upload_constants_cb *cl_gpgpu_upload_constants; + +/* Setup all indirect states */ +typedef void (cl_gpgpu_states_setup_cb)(cl_gpgpu*, cl_gpgpu_kernel_t* kernel, uint32_t ker_n); +extern cl_gpgpu_states_setup_cb *cl_gpgpu_states_setup; + +/* Make HW threads use barrierID */ 
+typedef void (cl_gpgpu_update_barrier_cb)(cl_gpgpu*, uint32_t barrierID, uint32_t thread_n); +extern cl_gpgpu_update_barrier_cb *cl_gpgpu_update_barrier; + +/* Upload the constant samplers as specified inside the OCL kernel */ +typedef void (cl_gpgpu_upload_samplers_cb)(cl_gpgpu *state, const void *data, uint32_t n); +extern cl_gpgpu_upload_samplers_cb *cl_gpgpu_upload_samplers; + +/* Set a sampler */ +typedef void (cl_gpgpu_set_sampler_cb)(cl_gpgpu*, uint32_t index, uint32_t non_normalized); +extern cl_gpgpu_set_sampler_cb *cl_gpgpu_set_sampler; + +/* Allocate the batch buffer and return the BO used for the batch buffer */ +typedef void (cl_gpgpu_batch_reset_cb)(cl_gpgpu*, size_t sz); +extern cl_gpgpu_batch_reset_cb *cl_gpgpu_batch_reset; + +/* Atomic begin, pipeline select, urb, pipeline state and constant buffer */ +typedef void (cl_gpgpu_batch_start_cb)(cl_gpgpu*); +extern cl_gpgpu_batch_start_cb *cl_gpgpu_batch_start; + +/* atomic end with possibly inserted flush */ +typedef void (cl_gpgpu_batch_end_cb)(cl_gpgpu*, int32_t flush_mode); +extern cl_gpgpu_batch_end_cb *cl_gpgpu_batch_end; + +/* Flush the command buffer */ +typedef void (cl_gpgpu_flush_cb)(cl_gpgpu *state); +extern cl_gpgpu_flush_cb *cl_gpgpu_flush; + +/* Will spawn all threads */ +typedef void (cl_gpgpu_walker_cb)(cl_gpgpu *state, + uint32_t simd_sz, + uint32_t thread_n, + const size_t global_wk_off[3], + const size_t global_wk_sz[3], + const size_t local_wk_sz[3]); +extern cl_gpgpu_walker_cb *cl_gpgpu_walker; + +/************************************************************************** + * Buffer + **************************************************************************/ /* Allocate a buffer */ typedef cl_buffer* (cl_buffer_alloc_cb)(cl_buffer_mgr*, const char*, unsigned long, unsigned long); extern cl_buffer_alloc_cb *cl_buffer_alloc; @@ -60,16 +183,24 @@ extern cl_buffer_alloc_cb *cl_buffer_alloc; typedef void (cl_buffer_unreference_cb)(cl_buffer*); extern cl_buffer_unreference_cb 
*cl_buffer_unreference; +/* Add one more ref on a buffer */ +typedef void (cl_buffer_reference_cb)(cl_buffer*); +extern cl_buffer_reference_cb *cl_buffer_reference; + /* Map a buffer */ -typedef void* (cl_buffer_map_cb)(cl_buffer*); +typedef int (cl_buffer_map_cb)(cl_buffer*, uint32_t write_enable); extern cl_buffer_map_cb *cl_buffer_map; /* Unmap a buffer */ -typedef void* (cl_buffer_unmap_cb)(cl_buffer*); +typedef int (cl_buffer_unmap_cb)(cl_buffer*); extern cl_buffer_unmap_cb *cl_buffer_unmap; +/* Get the virtual address (when mapped) */ +typedef void* (cl_buffer_get_virtual_cb)(cl_buffer*); +extern cl_buffer_get_virtual_cb *cl_buffer_get_virtual; + /* Pin a buffer */ -typedef int (cl_buffer_pin_cb)(cl_buffer*); +typedef int (cl_buffer_pin_cb)(cl_buffer*, uint32_t alignment); extern cl_buffer_pin_cb *cl_buffer_pin; /* Unpin a buffer */ @@ -84,5 +215,9 @@ extern cl_buffer_subdata_cb *cl_buffer_subdata; typedef int (cl_buffer_emit_reloc_cb) (cl_buffer *, uint32_t, cl_buffer*, uint32_t, uint32_t, uint32_t); extern cl_buffer_emit_reloc_cb *cl_buffer_emit_reloc; +/* Wait for all pending rendering for this buffer to complete */ +typedef int (cl_buffer_wait_rendering_cb) (cl_buffer *); +extern cl_buffer_wait_rendering_cb *cl_buffer_wait_rendering; + #endif /* __CL_BUFFER_H__ */ diff --git a/src/cl_image.c b/src/cl_image.c index 143c9fb..bba741d 100644 --- a/src/cl_image.c +++ b/src/cl_image.c @@ -19,11 +19,7 @@ #include "cl_image.h" #include "cl_utils.h" -#ifdef _PLASMA -#include "plasma/intel_defines.h" -#else #include "intel/intel_defines.h" -#endif #include diff --git a/src/cl_kernel.c b/src/cl_kernel.c index bc63fb8..6214aea 100644 --- a/src/cl_kernel.c +++ b/src/cl_kernel.c @@ -25,8 +25,6 @@ #include "cl_alloc.h" #include "cl_utils.h" #include "CL/cl.h" -#include "intel_bufmgr.h" -#include "intel/intel_gpgpu.h" #include "gen/program.h" #include @@ -45,8 +43,8 @@ cl_kernel_delete(cl_kernel k) if (atomic_dec(&k->ref_n) > 1) return; /* Release one reference on all 
bos we own */ - if (k->bo) drm_intel_bo_unreference(k->bo); - if (k->const_bo) drm_intel_bo_unreference(k->const_bo); + if (k->bo) cl_buffer_unreference(k->bo); + if (k->const_bo) cl_buffer_unreference(k->const_bo); /* This will be true for kernels created by clCreateKernel */ if (k->ref_its_program) cl_program_delete(k->program); @@ -104,15 +102,15 @@ LOCAL void cl_kernel_setup(cl_kernel k, const struct GenKernel *gen_kernel) { cl_context ctx = k->program->ctx; - drm_intel_bufmgr *bufmgr = cl_context_get_intel_bufmgr(ctx); + cl_buffer_mgr *bufmgr = cl_context_get_bufmgr(ctx); /* Allocate the gen code here */ const uint32_t code_sz = GenKernelGetCodeSize(gen_kernel); const char *code = GenKernelGetCode(gen_kernel); - k->bo = drm_intel_bo_alloc(bufmgr, "CL kernel", code_sz, 64u); + k->bo = cl_buffer_alloc(bufmgr, "CL kernel", code_sz, 64u); /* Upload the code */ - drm_intel_bo_subdata(k->bo, 0, code_sz, code); + cl_buffer_subdata(k->bo, 0, code_sz, code); k->gen_kernel = gen_kernel; } @@ -132,8 +130,8 @@ cl_kernel_dup(const cl_kernel from) to->program = from->program; /* Retain the bos */ - if (from->bo) drm_intel_bo_reference(from->bo); - if (from->const_bo) drm_intel_bo_reference(from->const_bo); + if (from->bo) cl_buffer_reference(from->bo); + if (from->const_bo) cl_buffer_reference(from->const_bo); /* We retain the program destruction since this kernel (user allocated) * depends on the program for some of its pointers diff --git a/src/cl_kernel.h b/src/cl_kernel.h index f5a7876..fbc17ca 100644 --- a/src/cl_kernel.h +++ b/src/cl_kernel.h @@ -22,6 +22,7 @@ #include "cl_defs.h" #include "cl_internals.h" +#include "cl_driver.h" #include "CL/cl.h" #include @@ -34,8 +35,8 @@ struct GenKernel; struct _cl_kernel { uint64_t magic; /* To identify it as a kernel */ volatile int ref_n; /* We reference count this object */ - struct _drm_intel_bo *bo; /* The code itself */ - struct _drm_intel_bo *const_bo; /* Buffer for all __constants values in the OCL program */ + struct 
cl_buffer *bo; /* The code itself */ + struct cl_buffer *const_bo; /* Buffer for all __constants values in the OCL program */ cl_program program; /* Owns this structure (and pointers) */ const struct GenKernel *gen_kernel; /* (Opaque) compiler structure for the OCL kernel */ uint8_t ref_its_program; /* True only for the user kernel (those created by clCreateKernel) */ diff --git a/src/cl_mem.c b/src/cl_mem.c index a1aa53b..2d29b9d 100644 --- a/src/cl_mem.c +++ b/src/cl_mem.c @@ -23,11 +23,7 @@ #include "cl_utils.h" #include "cl_alloc.h" #include "cl_device_id.h" - -#include "intel/intel_driver.h" -#include "intel/intel_gpgpu.h" -#include "intel_bufmgr.h" /* libdrm_intel */ -#include "cl_buffer.h" +#include "cl_driver.h" #include "CL/cl.h" #include "CL/cl_intel.h" @@ -41,7 +37,7 @@ cl_mem_allocate(cl_context ctx, cl_int is_tiled, cl_int *errcode) { - drm_intel_bufmgr *bufmgr = NULL; + cl_buffer_mgr *bufmgr = NULL; cl_mem mem = NULL; cl_int err = CL_SUCCESS; size_t alignment = 64; @@ -67,9 +63,9 @@ cl_mem_allocate(cl_context ctx, alignment = 4096; /* Allocate space in memory */ - bufmgr = cl_context_get_intel_bufmgr(ctx); + bufmgr = cl_context_get_bufmgr(ctx); assert(bufmgr); - mem->bo = drm_intel_bo_alloc(bufmgr, "CL memory object", sz, alignment); + mem->bo = cl_buffer_alloc(bufmgr, "CL memory object", sz, alignment); if (UNLIKELY(mem->bo == NULL)) { err = CL_MEM_ALLOCATION_FAILURE; goto error; @@ -119,7 +115,7 @@ cl_mem_new(cl_context ctx, /* Copy the data if required */ if (flags & CL_MEM_COPY_HOST_PTR) /* TODO check other flags too */ - drm_intel_bo_subdata(mem->bo, 0, sz, data); + cl_buffer_subdata(mem->bo, 0, sz, data); exit: if (errcode_ret) @@ -141,8 +137,8 @@ cl_mem_copy_data_linear(cl_mem mem, { size_t x, y, p; char *dst; - drm_intel_bo_map(mem->bo, 1); - dst = drm_intel_bo_get_virtual(mem->bo); + cl_buffer_map(mem->bo, 1); + dst = cl_buffer_get_virtual(mem->bo); for (y = 0; y < h; ++y) { char *src = (char*) data + pitch * y; for (x = 0; x < w; ++x) { @@ 
-152,7 +148,7 @@ cl_mem_copy_data_linear(cl_mem mem, src += bpp; } } - drm_intel_bo_unmap(mem->bo); + cl_buffer_unmap(mem->bo); } static const uint32_t tile_sz = 4096; /* 4KB per tile */ @@ -179,8 +175,8 @@ cl_mem_copy_data_tilex(cl_mem mem, char *img = NULL; char *end = (char*) data + pitch * h; - drm_intel_bo_map(mem->bo, 1); - img = drm_intel_bo_get_virtual(mem->bo); + cl_buffer_map(mem->bo, 1); + img = cl_buffer_get_virtual(mem->bo); for (tiley = 0; tiley < tiley_n; ++tiley) for (tilex = 0; tilex < tilex_n; ++tilex) { char *tile = img + (tilex + tiley * tilex_n) * tile_sz; @@ -193,7 +189,7 @@ cl_mem_copy_data_tilex(cl_mem mem, } } } - drm_intel_bo_unmap(mem->bo); + cl_buffer_unmap(mem->bo); } static void @@ -214,8 +210,8 @@ cl_mem_copy_data_tiley(cl_mem mem, char *img = NULL; char *end = (char*) data + pitch * h; - drm_intel_bo_map(mem->bo, 1); - img = drm_intel_bo_get_virtual(mem->bo); + cl_buffer_map(mem->bo, 1); + img = cl_buffer_get_virtual(mem->bo); for (tiley = 0; tiley < tiley_n; ++tiley) for (tilex = 0; tilex < tilex_n; ++tilex) { char *tile = img + (tiley * tilex_n + tilex) * tile_sz; @@ -229,7 +225,7 @@ cl_mem_copy_data_tiley(cl_mem mem, } } } - drm_intel_bo_unmap(mem->bo); + cl_buffer_unmap(mem->bo); } LOCAL cl_mem @@ -279,7 +275,7 @@ cl_mem_new_image2D(cl_context ctx, #undef DO_IMAGE_ERROR /* Pick up tiling mode (we do only linear on SNB) */ - if (ctx->intel_drv->gen_ver != 6) + if (cl_driver_get_ver(ctx->drv) != 6) tiling = CL_TILE_Y; /* Tiling requires to align both pitch and height */ @@ -336,7 +332,7 @@ cl_mem_delete(cl_mem mem) if (atomic_dec(&mem->ref_n) > 1) return; if (LIKELY(mem->bo != NULL)) - drm_intel_bo_unreference(mem->bo); + cl_buffer_unreference(mem->bo); /* Remove it from the list */ assert(mem->ctx); @@ -363,15 +359,15 @@ cl_mem_add_ref(cl_mem mem) LOCAL void* cl_mem_map(cl_mem mem) { - drm_intel_bo_map(mem->bo, 1); - assert(drm_intel_bo_get_virtual(mem->bo)); - return drm_intel_bo_get_virtual(mem->bo); + cl_buffer_map(mem->bo, 1); 
+ assert(cl_buffer_get_virtual(mem->bo)); + return cl_buffer_get_virtual(mem->bo); } LOCAL cl_int cl_mem_unmap(cl_mem mem) { - drm_intel_bo_unmap(mem->bo); + cl_buffer_unmap(mem->bo); return CL_SUCCESS; } @@ -381,7 +377,7 @@ cl_mem_pin(cl_mem mem) assert(mem); if (UNLIKELY((mem->flags & CL_MEM_PINNABLE) == 0)) return CL_INVALID_MEM; - drm_intel_bo_pin(mem->bo, 4096); + cl_buffer_pin(mem->bo, 4096); return CL_SUCCESS; } @@ -391,7 +387,7 @@ cl_mem_unpin(cl_mem mem) assert(mem); if (UNLIKELY((mem->flags & CL_MEM_PINNABLE) == 0)) return CL_INVALID_MEM; - drm_intel_bo_unpin(mem->bo); + cl_buffer_unpin(mem->bo); return CL_SUCCESS; } diff --git a/src/cl_mem.h b/src/cl_mem.h index 1177a76..00dcd93 100644 --- a/src/cl_mem.h +++ b/src/cl_mem.h @@ -21,11 +21,9 @@ #define __CL_MEM_H__ #include "cl_internals.h" +#include "cl_driver.h" #include "CL/cl.h" -/* Store the object in video memory */ -struct _drm_intel_bo; - typedef enum cl_image_tiling { CL_NO_TILE = 0, CL_TILE_X = 1, @@ -36,7 +34,7 @@ typedef enum cl_image_tiling { struct _cl_mem { uint64_t magic; /* To identify it as a memory object */ volatile int ref_n; /* This object is reference counted */ - struct _drm_intel_bo *bo; /* Data in GPU memory */ + cl_buffer *bo; /* Data in GPU memory */ cl_mem prev, next; /* We chain the memory buffers together */ cl_context ctx; /* Context it belongs to */ cl_mem_flags flags; /* Flags specified at the creation time */ @@ -67,10 +65,10 @@ extern void cl_mem_delete(cl_mem); /* Add one more reference to this object */ extern void cl_mem_add_ref(cl_mem); -/* Directly map a memory object (just use drm_intel_bo_map) */ +/* Directly map a memory object */ extern void *cl_mem_map(cl_mem); -/* Unmap a memory object (just use drm_intel_bo_unmap) */ +/* Unmap a memory object */ extern cl_int cl_mem_unmap(cl_mem); /* Pin/unpin the buffer in memory (you must be root) */ diff --git a/src/intel/intel_driver.c b/src/intel/intel_driver.c index f151d92..def75f1 100644 --- a/src/intel/intel_driver.c 
+++ b/src/intel/intel_driver.c @@ -18,6 +18,7 @@ */ #include "intel_driver.h" +#include "intel_gpgpu.h" #include "intel_batchbuffer.h" #include "x11/dricommon.h" @@ -32,6 +33,7 @@ #include "cl_utils.h" #include "cl_alloc.h" +#include "cl_driver.h" #include "cl_genx_driver.h" #define SET_BLOCKED_SIGSET(DRIVER) do { \ @@ -59,7 +61,6 @@ RESTORE_BLOCKED_SIGSET(DRIVER); \ } while (0) - LOCAL intel_driver_t* intel_driver_new(void) { @@ -340,18 +341,27 @@ intel_driver_get_ver(struct intel_driver *drv) return drv->gen_ver; } +LOCAL uint32_t drm_intel_bo_get_size(drm_intel_bo *bo) { return bo->size; } +LOCAL void* drm_intel_bo_get_virtual(drm_intel_bo *bo) { return bo->virtual; } + LOCAL void intel_setup_callbacks(void) { + cl_driver_new = (cl_driver_new_cb *) cl_intel_driver_new; + cl_driver_delete = (cl_driver_delete_cb *) cl_intel_driver_delete; + cl_driver_get_ver = (cl_driver_get_ver_cb *) intel_driver_get_ver; + cl_driver_get_bufmgr = (cl_driver_get_bufmgr_cb *) intel_driver_get_bufmgr; cl_buffer_alloc = (cl_buffer_alloc_cb *) drm_intel_bo_alloc; + cl_buffer_reference = (cl_buffer_reference_cb *) drm_intel_bo_reference; cl_buffer_unreference = (cl_buffer_unreference_cb *) drm_intel_bo_unreference; cl_buffer_map = (cl_buffer_map_cb *) drm_intel_bo_map; cl_buffer_unmap = (cl_buffer_unmap_cb *) drm_intel_bo_unmap; + cl_buffer_get_virtual = (cl_buffer_get_virtual_cb *) drm_intel_bo_get_virtual; cl_buffer_pin = (cl_buffer_pin_cb *) drm_intel_bo_pin; cl_buffer_unpin = (cl_buffer_unpin_cb *) drm_intel_bo_unpin; cl_buffer_subdata = (cl_buffer_subdata_cb *) drm_intel_bo_subdata; cl_buffer_emit_reloc = (cl_buffer_emit_reloc_cb *) drm_intel_bo_emit_reloc; - cl_driver_get_bufmgr = (cl_driver_get_bufmgr_cb *) intel_driver_get_bufmgr; - cl_driver_get_ver = (cl_driver_get_ver_cb *) intel_driver_get_ver; + cl_buffer_wait_rendering = (cl_buffer_wait_rendering_cb *) drm_intel_bo_wait_rendering; + intel_set_gpgpu_callbacks(); } diff --git a/src/intel/intel_gpgpu.c b/src/intel/intel_gpgpu.c 
index 7f01f61..a82ba6e 100644 --- a/src/intel/intel_gpgpu.c +++ b/src/intel/intel_gpgpu.c @@ -58,7 +58,7 @@ struct intel_gpgpu { intel_driver_t *drv; intel_batchbuffer_t *batch; - genx_gpgpu_kernel_t *ker; + cl_gpgpu_kernel_t *ker; struct { dri_bo *bo; @@ -78,26 +78,9 @@ struct intel_gpgpu uint32_t max_threads; /* max threads requested by the user */ }; -LOCAL intel_gpgpu_t* -intel_gpgpu_new(intel_driver_t *drv) -{ - intel_gpgpu_t *state = NULL; +typedef struct intel_gpgpu intel_gpgpu_t; - TRY_ALLOC_NO_ERR (state, CALLOC(intel_gpgpu_t)); - state->drv = drv; - state->batch = intel_batchbuffer_new(state->drv); - assert(state->batch); - intel_batchbuffer_init(state->batch, state->drv); - -exit: - return state; -error: - intel_gpgpu_delete(state); - state = NULL; - goto exit; -} - -LOCAL void +static void intel_gpgpu_delete(intel_gpgpu_t *state) { if (state == NULL) @@ -118,8 +101,27 @@ intel_gpgpu_delete(intel_gpgpu_t *state) cl_free(state); } +static intel_gpgpu_t* +intel_gpgpu_new(intel_driver_t *drv) +{ + intel_gpgpu_t *state = NULL; + + TRY_ALLOC_NO_ERR (state, CALLOC(intel_gpgpu_t)); + state->drv = drv; + state->batch = intel_batchbuffer_new(state->drv); + assert(state->batch); + intel_batchbuffer_init(state->batch, state->drv); + +exit: + return state; +error: + intel_gpgpu_delete(state); + state = NULL; + goto exit; +} + static void -gpgpu_select_pipeline(intel_gpgpu_t *state) +intel_gpgpu_select_pipeline(intel_gpgpu_t *state) { BEGIN_BATCH(state->batch, 1); OUT_BATCH(state->batch, CMD_PIPELINE_SELECT | PIPELINE_SELECT_MEDIA); @@ -127,7 +129,7 @@ gpgpu_select_pipeline(intel_gpgpu_t *state) } static void -gpgpu_set_base_address(intel_gpgpu_t *state) +intel_gpgpu_set_base_address(intel_gpgpu_t *state) { const uint32_t def_cc = cc_llc_l3; /* default Cache Control value */ BEGIN_BATCH(state->batch, 10); @@ -162,7 +164,7 @@ gpgpu_set_base_address(intel_gpgpu_t *state) } static void -gpgpu_load_vfe_state(intel_gpgpu_t *state) +intel_gpgpu_load_vfe_state(intel_gpgpu_t 
*state) { BEGIN_BATCH(state->batch, 8); OUT_BATCH(state->batch, CMD_MEDIA_STATE_POINTERS | (8-2)); @@ -171,23 +173,13 @@ gpgpu_load_vfe_state(intel_gpgpu_t *state) intel_batchbuffer_alloc_space(state->batch,0); memset(vfe, 0, sizeof(struct gen6_vfe_state_inline)); - vfe->vfe1.gpgpu_mode = - (state->drv->gen_ver == 7 || state->drv->gen_ver == 75) ? 1 : 0; + vfe->vfe1.gpgpu_mode = 1; vfe->vfe1.bypass_gateway_ctl = 1; vfe->vfe1.reset_gateway_timer = 1; vfe->vfe1.max_threads = state->max_threads - 1; vfe->vfe1.urb_entries = 64; - if (state->drv->gen_ver >= 7) { - vfe->vfe3.curbe_size = 480; - vfe->vfe4.scoreboard_mask = 0; - } else { - vfe->vfe3.curbe_size = 63; - vfe->vfe3.urb_size = 13; - vfe->vfe4.scoreboard_mask = 0x80000000; - } - - //M - //p + vfe->vfe3.curbe_size = 480; + vfe->vfe4.scoreboard_mask = 0; //vfe->vfe3.urb_size = 13; //vfe->vfe4.scoreboard_mask = (state->drv->gen_ver == 7 || state->drv->gen_ver == 75) ? 0 : 0x80000000; intel_batchbuffer_alloc_space(state->batch, sizeof(gen6_vfe_state_inline_t)); @@ -195,7 +187,7 @@ gpgpu_load_vfe_state(intel_gpgpu_t *state) } static void -gpgpu_load_constant_buffer(intel_gpgpu_t *state) +intel_gpgpu_load_constant_buffer(intel_gpgpu_t *state) { BEGIN_BATCH(state->batch, 4); OUT_BATCH(state->batch, CMD(2,0,1) | (4 - 2)); /* length-2 */ @@ -213,7 +205,7 @@ gpgpu_load_constant_buffer(intel_gpgpu_t *state) } static void -gpgpu_load_idrt(intel_gpgpu_t *state) +intel_gpgpu_load_idrt(intel_gpgpu_t *state) { BEGIN_BATCH(state->batch, 4); OUT_BATCH(state->batch, CMD(2,0,2) | (4 - 2)); /* length-2 */ @@ -311,7 +303,7 @@ enum GFX3DSTATE_PIPELINED_SUBOPCODE }; static void -gpgpu_pipe_control(intel_gpgpu_t *state) +intel_gpgpu_pipe_control(intel_gpgpu_t *state) { BEGIN_BATCH(state->batch, SIZEOF32(gen6_pipe_control_t)); gen6_pipe_control_t* pc = (gen6_pipe_control_t*) @@ -328,7 +320,7 @@ gpgpu_pipe_control(intel_gpgpu_t *state) ADVANCE_BATCH(state->batch); } -LOCAL void +static void intel_gpgpu_set_L3(intel_gpgpu_t *state, 
uint32_t use_barrier) { BEGIN_BATCH(state->batch, 6); @@ -347,21 +339,21 @@ intel_gpgpu_set_L3(intel_gpgpu_t *state, uint32_t use_barrier) OUT_BATCH(state->batch, gpgpu_l3_config_reg2[4]); ADVANCE_BATCH(state->batch); - gpgpu_pipe_control(state); + intel_gpgpu_pipe_control(state); } -LOCAL void -gpgpu_batch_start(intel_gpgpu_t *state) +static void +intel_gpgpu_batch_start(intel_gpgpu_t *state) { intel_batchbuffer_start_atomic(state->batch, 256); - gpgpu_pipe_control(state); + intel_gpgpu_pipe_control(state); if (state->drv->gen_ver == 7 || state->drv->gen_ver == 75) intel_gpgpu_set_L3(state, state->ker->use_barrier); - gpgpu_select_pipeline(state); - gpgpu_set_base_address(state); - gpgpu_load_vfe_state(state); - gpgpu_load_constant_buffer(state); - gpgpu_load_idrt(state); + intel_gpgpu_select_pipeline(state); + intel_gpgpu_set_base_address(state); + intel_gpgpu_load_vfe_state(state); + intel_gpgpu_load_constant_buffer(state); + intel_gpgpu_load_idrt(state); if (state->perf_b.bo) { BEGIN_BATCH(state->batch, 3); @@ -378,8 +370,8 @@ gpgpu_batch_start(intel_gpgpu_t *state) } } -LOCAL void -gpgpu_batch_end(intel_gpgpu_t *state, int32_t flush_mode) +static void +intel_gpgpu_batch_end(intel_gpgpu_t *state, int32_t flush_mode) { /* Insert the performance counter command */ if (state->perf_b.bo) { @@ -396,26 +388,26 @@ gpgpu_batch_end(intel_gpgpu_t *state, int32_t flush_mode) ADVANCE_BATCH(state->batch); } - if(flush_mode) gpgpu_pipe_control(state); + if(flush_mode) intel_gpgpu_pipe_control(state); intel_batchbuffer_end_atomic(state->batch); } -LOCAL void -gpgpu_batch_reset(intel_gpgpu_t *state, size_t sz) +static void +intel_gpgpu_batch_reset(intel_gpgpu_t *state, size_t sz) { intel_batchbuffer_reset(state->batch, sz); } -LOCAL void -gpgpu_flush(intel_gpgpu_t *state) +static void +intel_gpgpu_flush(intel_gpgpu_t *state) { intel_batchbuffer_flush(state->batch); } -LOCAL void -gpgpu_state_init(intel_gpgpu_t *state, - uint32_t max_threads, - uint32_t size_cs_entry) +static 
void +intel_gpgpu_state_init(intel_gpgpu_t *state, + uint32_t max_threads, + uint32_t size_cs_entry) { dri_bo *bo; @@ -477,7 +469,7 @@ gpgpu_state_init(intel_gpgpu_t *state, } static void -gpgpu_set_buf_reloc_gen7(intel_gpgpu_t *state, int32_t index, dri_bo* obj_bo) +intel_gpgpu_set_buf_reloc_gen7(intel_gpgpu_t *state, int32_t index, dri_bo* obj_bo) { surface_heap_t *heap = state->surface_heap_b.bo->virtual; heap->binding_table[index] = offsetof(surface_heap_t, surface) + @@ -492,11 +484,11 @@ gpgpu_set_buf_reloc_gen7(intel_gpgpu_t *state, int32_t index, dri_bo* obj_bo) } static void -gpgpu_bind_buf_gen7(intel_gpgpu_t *state, - int32_t index, - dri_bo* obj_bo, - uint32_t size, - uint32_t cchint) +intel_gpgpu_bind_buf_gen7(intel_gpgpu_t *state, + int32_t index, + dri_bo* obj_bo, + uint32_t size, + uint32_t cchint) { surface_heap_t *heap = state->surface_heap_b.bo->virtual; gen7_surface_state_t *ss = (gen7_surface_state_t *) heap->surface[index]; @@ -509,18 +501,18 @@ gpgpu_bind_buf_gen7(intel_gpgpu_t *state, ss->ss2.height = (size_ss & 0x1fff80) >> 7; /* bits 20:7 of size_ss */ ss->ss3.depth = (size_ss & 0xffe00000) >> 20; /* bits 27:21 of size_ss */ ss->ss5.cache_control = cc_llc_l3; - gpgpu_set_buf_reloc_gen7(state, index, obj_bo); + intel_gpgpu_set_buf_reloc_gen7(state, index, obj_bo); } static void -gpgpu_bind_image2D_gen7(intel_gpgpu_t *state, - int32_t index, - dri_bo* obj_bo, - uint32_t format, - int32_t w, - int32_t h, - int32_t pitch, - int32_t tiling) +intel_gpgpu_bind_image2D_gen7(intel_gpgpu_t *state, + int32_t index, + dri_bo* obj_bo, + uint32_t format, + int32_t w, + int32_t h, + int32_t pitch, + int32_t tiling) { surface_heap_t *heap = state->surface_heap_b.bo->virtual; gen7_surface_state_t *ss = (gen7_surface_state_t *) heap->surface[index]; @@ -539,48 +531,47 @@ gpgpu_bind_image2D_gen7(intel_gpgpu_t *state, ss->ss0.tiled_surface = 1; ss->ss0.tile_walk = I965_TILEWALK_YMAJOR; } - gpgpu_set_buf_reloc_gen7(state, index, obj_bo); + 
intel_gpgpu_set_buf_reloc_gen7(state, index, obj_bo); } -LOCAL void -gpgpu_bind_buf(intel_gpgpu_t *state, - int32_t index, - dri_bo* obj_bo, - uint32_t cchint) +static void +intel_gpgpu_bind_buf(intel_gpgpu_t *state, + int32_t index, + drm_intel_bo *obj_bo, + uint32_t cchint) { - uint32_t size = obj_bo->size; - + const uint32_t size = obj_bo->size; assert(index < MAX_SURFACES); if (state->drv->gen_ver == 7 || state->drv->gen_ver == 75) - gpgpu_bind_buf_gen7(state, index, obj_bo, size, cchint); + intel_gpgpu_bind_buf_gen7(state, index, obj_bo, size, cchint); else NOT_IMPLEMENTED; } -LOCAL void -gpgpu_bind_image2D(intel_gpgpu_t *state, - int32_t index, - dri_bo* obj_bo, - uint32_t format, - int32_t w, - int32_t h, - int32_t pitch, - gpgpu_tiling_t tiling) +static void +intel_gpgpu_bind_image2D(intel_gpgpu_t *state, + int32_t index, + cl_buffer *obj_bo, + uint32_t format, + int32_t w, + int32_t h, + int32_t pitch, + cl_gpgpu_tiling_t tiling) { assert(index < MAX_SURFACES); if (state->drv->gen_ver == 7 || state->drv->gen_ver == 75) - gpgpu_bind_image2D_gen7(state, index, obj_bo, format, w, h, pitch, tiling); + intel_gpgpu_bind_image2D_gen7(state, index, (drm_intel_bo*) obj_bo, format, w, h, pitch, tiling); else NOT_IMPLEMENTED; } static void -gpgpu_build_idrt(intel_gpgpu_t *state, - genx_gpgpu_kernel_t *kernel, - uint32_t ker_n) +intel_gpgpu_build_idrt(intel_gpgpu_t *state, + cl_gpgpu_kernel_t *kernel, + uint32_t ker_n) { gen6_interface_descriptor_t *desc; - dri_bo *bo; + drm_intel_bo *bo = NULL, *ker_bo = NULL; uint32_t i; bo = state->idrt_b.bo; @@ -590,7 +581,8 @@ gpgpu_build_idrt(intel_gpgpu_t *state, for (i = 0; i < ker_n; i++) { memset(desc, 0, sizeof(*desc)); - desc->desc0.kernel_start_pointer = kernel[i].bo->offset >> 6; /* reloc */ + ker_bo = (drm_intel_bo *) kernel[i].bo; + desc->desc0.kernel_start_pointer = ker_bo->offset >> 6; /* reloc */ desc->desc2.sampler_state_pointer = state->sampler_state_b.bo->offset >> 5; desc->desc3.binding_table_entry_count = 0; /* 
no prefetch */ desc->desc3.binding_table_pointer = 0; @@ -624,7 +616,7 @@ gpgpu_build_idrt(intel_gpgpu_t *state, I915_GEM_DOMAIN_INSTRUCTION, 0, 0, i * sizeof(*desc) + offsetof(gen6_interface_descriptor_t, desc0), - kernel[i].bo); + ker_bo); dri_bo_emit_reloc(bo, I915_GEM_DOMAIN_INSTRUCTION, 0, @@ -637,8 +629,8 @@ gpgpu_build_idrt(intel_gpgpu_t *state, dri_bo_unmap(bo); } -LOCAL void -gpgpu_upload_constants(intel_gpgpu_t *state, const void* data, uint32_t size) +static void +intel_gpgpu_upload_constants(intel_gpgpu_t *state, const void* data, uint32_t size) { unsigned char *constant_buffer = NULL; @@ -649,8 +641,8 @@ gpgpu_upload_constants(intel_gpgpu_t *state, const void* data, uint32_t size) dri_bo_unmap(state->curbe_b.bo); } -LOCAL void -gpgpu_upload_samplers(intel_gpgpu_t *state, const void *data, uint32_t n) +static void +intel_gpgpu_upload_samplers(intel_gpgpu_t *state, const void *data, uint32_t n) { if (n) { /*sizeof(gen6_sampler_state_t) == sizeof(gen7_surface_state_t) */ @@ -659,17 +651,17 @@ gpgpu_upload_samplers(intel_gpgpu_t *state, const void *data, uint32_t n) } } -LOCAL void -gpgpu_states_setup(intel_gpgpu_t *state, genx_gpgpu_kernel_t *kernel, uint32_t ker_n) +static void +intel_gpgpu_states_setup(intel_gpgpu_t *state, cl_gpgpu_kernel_t *kernel, uint32_t ker_n) { state->ker = kernel; - gpgpu_build_idrt(state, kernel, ker_n); + intel_gpgpu_build_idrt(state, kernel, ker_n); dri_bo_unmap(state->surface_heap_b.bo); dri_bo_unmap(state->sampler_state_b.bo); } -LOCAL void -gpgpu_update_barrier(intel_gpgpu_t *state, uint32_t barrierID, uint32_t thread_n) +static void +intel_gpgpu_update_barrier(intel_gpgpu_t *state, uint32_t barrierID, uint32_t thread_n) { BEGIN_BATCH(state->batch, 4); OUT_BATCH(state->batch, CMD_MEDIA_STATE_FLUSH | 0); @@ -679,52 +671,22 @@ gpgpu_update_barrier(intel_gpgpu_t *state, uint32_t barrierID, uint32_t thread_n ADVANCE_BATCH(state->batch); } -LOCAL void -gpgpu_set_perf_counters(intel_gpgpu_t *state, dri_bo *perf) +static void 
+intel_gpgpu_set_perf_counters(intel_gpgpu_t *state, cl_buffer *perf) { if (state->perf_b.bo) drm_intel_bo_unreference(state->perf_b.bo); - drm_intel_bo_reference(perf); - state->perf_b.bo = perf; -} - -LOCAL void -gpgpu_run(intel_gpgpu_t *state, int32_t ki) -{ - BEGIN_BATCH(state->batch, 6); - OUT_BATCH(state->batch, GEN_CMD_MEDIA_OBJECT | 5); - OUT_BATCH(state->batch, ki); /* kernel index + DBGOBJ=0 << 8 */ - OUT_BATCH(state->batch, 0); /* no children, no thread sync, no scoreboard, any half-slice, indirect data size = 0 */ - OUT_BATCH(state->batch, 0); /* no indirect data ptr */ - OUT_BATCH(state->batch, 0); /* scoreboard Y,X = 0 or just 0 in reg.0 */ - OUT_BATCH(state->batch, 0); /* scoreboard color, mask = 0 or just 0 in reg.1 */ - OUT_BATCH(state->batch, 0); /* scoreboard color, mask = 0 or just 0 in reg.1 */ - ADVANCE_BATCH(state->batch); -} - -LOCAL char* -gpgpu_run_with_inline(intel_gpgpu_t *state, int32_t ki, size_t sz) -{ - const uint32_t len = (uint32_t) (sz >> 2); - - assert(sz % sizeof(int32_t) == 0); - BEGIN_BATCH(state->batch, len + 6); - OUT_BATCH(state->batch, GEN_CMD_MEDIA_OBJECT | (len + 4)); - OUT_BATCH(state->batch, ki); /* kernel index + DBGOBJ=0 << 8 */ - OUT_BATCH(state->batch, 0); /* no children, no thread sync, no scoreboard, any half-slice, indirect data size = 0 */ - OUT_BATCH(state->batch, 0); /* no indirect data ptr */ - OUT_BATCH(state->batch, 0); /* scoreboard Y,X = 0 or just 0 in reg.0 */ - OUT_BATCH(state->batch, 0); /* scoreboard color, mask = 0 or just 0 in reg.1 */ - return (char*) intel_batchbuffer_alloc_space(state->batch,sz); + drm_intel_bo_reference((drm_intel_bo*) perf); + state->perf_b.bo = (drm_intel_bo*) perf; } -LOCAL void -gpgpu_walker(intel_gpgpu_t *state, - uint32_t simd_sz, - uint32_t thread_n, - const size_t global_wk_off[3], - const size_t global_wk_sz[3], - const size_t local_wk_sz[3]) +static void +intel_gpgpu_walker(intel_gpgpu_t *state, + uint32_t simd_sz, + uint32_t thread_n, + const size_t global_wk_off[3], 
+ const size_t global_wk_sz[3], + const size_t local_wk_sz[3]) { const uint32_t global_wk_dim[3] = { global_wk_sz[0] / local_wk_sz[0], @@ -755,17 +717,23 @@ gpgpu_walker(intel_gpgpu_t *state, ADVANCE_BATCH(state->batch); } -LOCAL int32_t -intel_gpgpu_version(intel_gpgpu_t *gpgpu) -{ - return gpgpu->drv->gen_ver; +LOCAL void +intel_set_gpgpu_callbacks(void) +{ + cl_gpgpu_new = (cl_gpgpu_new_cb *) intel_gpgpu_new; + cl_gpgpu_delete = (cl_gpgpu_delete_cb *) intel_gpgpu_delete; + cl_gpgpu_bind_image2D = (cl_gpgpu_bind_image2D_cb *) intel_gpgpu_bind_image2D; + cl_gpgpu_bind_buf = (cl_gpgpu_bind_buf_cb *) intel_gpgpu_bind_buf; + cl_gpgpu_state_init = (cl_gpgpu_state_init_cb *) intel_gpgpu_state_init; + cl_gpgpu_set_perf_counters = (cl_gpgpu_set_perf_counters_cb *) intel_gpgpu_set_perf_counters; + cl_gpgpu_upload_constants = (cl_gpgpu_upload_constants_cb *) intel_gpgpu_upload_constants; + cl_gpgpu_states_setup = (cl_gpgpu_states_setup_cb *) intel_gpgpu_states_setup; + cl_gpgpu_update_barrier = (cl_gpgpu_update_barrier_cb *) intel_gpgpu_update_barrier; + cl_gpgpu_upload_samplers = (cl_gpgpu_upload_samplers_cb *) intel_gpgpu_upload_samplers; + cl_gpgpu_batch_reset = (cl_gpgpu_batch_reset_cb *) intel_gpgpu_batch_reset; + cl_gpgpu_batch_start = (cl_gpgpu_batch_start_cb *) intel_gpgpu_batch_start; + cl_gpgpu_batch_end = (cl_gpgpu_batch_end_cb *) intel_gpgpu_batch_end; + cl_gpgpu_flush = (cl_gpgpu_flush_cb *) intel_gpgpu_flush; + cl_gpgpu_walker = (cl_gpgpu_walker_cb *) intel_gpgpu_walker; } -uint32_t drm_intel_bo_get_size(drm_intel_bo *bo) -{ - return (bo->size); -} -void *drm_intel_bo_get_virtual(drm_intel_bo *bo) -{ - return (bo->virtual); -} diff --git a/src/intel/intel_gpgpu.h b/src/intel/intel_gpgpu.h index e3b8dfd..006a55b 100644 --- a/src/intel/intel_gpgpu.h +++ b/src/intel/intel_gpgpu.h @@ -21,126 +21,13 @@ #define __GENX_GPGPU_H__ #include "cl_utils.h" -#include "intel_bufmgr.h" +#include "cl_driver.h" #include #include -#define MAX_SURFACES 128 -#define MAX_SAMPLERS 
16 - -enum gen6_cache_control { - cc_gtt = 0x0, - cc_l3 = 0x1, - cc_llc = 0x2, - cc_llc_l3 = 0x3 -}; - -/* Use this structure to bind kernels in the gpgpu state */ -typedef struct genx_gpgpu_kernel { - const char *name; /* kernel name and bo name */ - uint32_t grf_blocks; /* register blocks kernel wants (in 8 reg blocks) */ - uint32_t cst_sz; /* total size of all constants */ - struct _drm_intel_bo *bo;/* kernel code in the proper addr space */ - int32_t barrierID; /* barrierID for _this_ kernel */ - uint32_t use_barrier:1; /* For gen7 (automatic barrier management) */ - uint32_t thread_n:15; /* For gen7 (automatic barrier management) */ - uint32_t slm_sz:16; /* For gen7 (automatic SLM allocation) */ -} genx_gpgpu_kernel_t; - -/* Convenient abstraction of the device */ -struct intel_driver; - -/* Covenient way to talk to the device */ -typedef struct intel_gpgpu intel_gpgpu_t; - -/* Buffer object as exposed by drm_intel */ -struct _drm_intel_bo; - -/* Allocate and initialize a GPGPU state */ -extern intel_gpgpu_t* intel_gpgpu_new(struct intel_driver*); - -/* Destroy and deallocate a GPGPU state */ -extern void intel_gpgpu_delete(intel_gpgpu_t*); - -/* Get the device generation */ -extern int32_t intel_gpgpu_version(intel_gpgpu_t*); - -/* Buffer management wrapper APIs */ -extern uint32_t drm_intel_bo_get_size(drm_intel_bo *bo); -extern void *drm_intel_bo_get_virtual(drm_intel_bo *bo); - -/* Set typeless buffer descriptor in the current binding table */ -extern void gpgpu_bind_buf(intel_gpgpu_t*, - int32_t index, - struct _drm_intel_bo* obj_bo, - uint32_t cchint); - -typedef enum gpgpu_tiling { - GPGPU_NO_TILE = 0, - GPGPU_TILE_X = 1, - GPGPU_TILE_Y = 2, -} gpgpu_tiling_t; - -/* Set a 2d texture */ -extern void gpgpu_bind_image2D(intel_gpgpu_t *state, - int32_t index, - struct _drm_intel_bo* obj_bo, - uint32_t format, - int32_t w, - int32_t h, - int pitch, - gpgpu_tiling_t tiling); - -/* Configure state, size in 512-bit units */ -extern void 
gpgpu_state_init(intel_gpgpu_t*, uint32_t max_threads, uint32_t size_cs_entry); - -/* Set the buffer object where to report performance counters */ -extern void gpgpu_set_perf_counters(intel_gpgpu_t*, struct _drm_intel_bo *perf); - -/* Fills current constant buffer with data */ -extern void gpgpu_upload_constants(intel_gpgpu_t*, const void* data, uint32_t size); - -/* Setup all indirect states */ -extern void gpgpu_states_setup(intel_gpgpu_t*, genx_gpgpu_kernel_t* kernel, uint32_t ker_n); - -/* Make HW threads use barrierID */ -extern void gpgpu_update_barrier(intel_gpgpu_t*, uint32_t barrierID, uint32_t thread_n); - -/* Upload the constant samplers as specified inside the OCL kernel */ -extern void gpgpu_upload_samplers(intel_gpgpu_t *state, const void *data, uint32_t n); - -/* Set a sampler */ -extern void gpgpu_set_sampler(intel_gpgpu_t*, uint32_t index, uint32_t non_normalized); - -/* Allocate the batch buffer and return the BO used for the batch buffer */ -extern void gpgpu_batch_reset(intel_gpgpu_t*, size_t sz); - -/* Atomic begin, pipeline select, urb, pipeline state and constant buffer */ -extern void gpgpu_batch_start(intel_gpgpu_t*); - -/* atomic end with possibly inserted flush */ -extern void gpgpu_batch_end(intel_gpgpu_t*, int32_t flush_mode); - -/* Emit MI_FLUSH */ -extern void gpgpu_flush(intel_gpgpu_t*); - -/* Enqueue a MEDIA object with no inline data */ -extern void gpgpu_run(intel_gpgpu_t*, int32_t ki); - -/* Enqueue a MEDIA object with inline data to push afterward. Returns the - * pointer where to push. 
sz is the size of the data we are going to pass - */ -extern char* gpgpu_run_with_inline(intel_gpgpu_t*, int32_t ki, size_t sz); - -/* Will spawn all threads */ -extern void -gpgpu_walker(intel_gpgpu_t *state, - uint32_t simd_sz, - uint32_t thread_n, - const size_t global_wk_off[3], - const size_t global_wk_sz[3], - const size_t local_wk_sz[3]); +/* Set the gpgpu related call backs */ +extern void intel_set_gpgpu_callbacks(void); #endif /* __GENX_GPGPU_H__ */ diff --git a/src/sim/sim_buffer.c b/src/sim/sim_driver.c similarity index 94% rename from src/sim/sim_buffer.c rename to src/sim/sim_driver.c index 820c770..5e5cfad 100644 --- a/src/sim/sim_buffer.c +++ b/src/sim/sim_driver.c @@ -19,13 +19,13 @@ #include "cl_utils.h" #include "cl_alloc.h" -#include "sim/sim_buffer.h" +#include "sim/sim_driver.h" #include "CL/cl.h" #include #include #include -#include "cl_buffer.h" +#include "cl_driver.h" /* Just to count allocations */ typedef struct sim_bufmgr { volatile int buf_n; } sim_bufmgr_t; @@ -81,13 +81,6 @@ sim_buffer_unreference(sim_buffer_t *buf) sim_buffer_delete(buf); } -static void* -sim_buffer_map(sim_buffer_t *buf) -{ - assert(buf); - return buf->data; -} - static int sim_buffer_subdata(sim_buffer_t *buf, unsigned long offset, unsigned long size, const void *data) { @@ -107,8 +100,10 @@ sim_buffer_emit_reloc(sim_buffer_t *buf, { return 1; } + +static int sim_buffer_map(sim_buffer_t *buf, uint32_t write_enable) {return 0;} static int sim_buffer_unmap(sim_buffer_t *buf) {return 0;} -static int sim_buffer_pin(sim_buffer_t *buf) {return 0;} +static int sim_buffer_pin(sim_buffer_t *buf, uint32_t alignment) {return 0;} static int sim_buffer_unpin(sim_buffer_t *buf) {return 0;} LOCAL void diff --git a/src/sim/sim_buffer.h b/src/sim/sim_driver.h similarity index 91% rename from src/sim/sim_buffer.h rename to src/sim/sim_driver.h index 230d89c..f80f234 100644 --- a/src/sim/sim_buffer.h +++ b/src/sim/sim_driver.h @@ -17,11 +17,12 @@ * Author: Benjamin Segovia */ 
-#ifndef __SIM_BUFFER_H__ -#define __SIM_BUFFER_H__ +#ifndef __SIM_DRIVER_H__ +#define __SIM_DRIVER_H__ /* Setup the call backs when using the simulator */ extern void sim_setup_callbacks(void); -#endif /* __SIM_BUFFER_H__ */ +#endif /* __SIM_DRIVER_H__ */ + -- 2.7.4