From 68cec04a5b9b44bf636b3eb0342cc2fac596814b Mon Sep 17 00:00:00 2001 From: Zhigang Gong Date: Thu, 3 Jul 2014 11:33:10 +0800 Subject: [PATCH] runtime: fix a gpgpu event and thread local gpgpu handling bug. When pending a command queue, we need to record the whole gpgpu structure not just the batch buffer. For the following reason: 1. We need to keep those private buffer, for example those printf buffers. 2. We need to make sure this gpgpu will not be reused by other enqueuement. v2: Don't try to flush all user event attached to the queue. Just need to flush the current event when doing command queue flush. Signed-off-by: Zhigang Gong Reviewed-by: "Yang, Rong R" --- src/cl_api.c | 3 +- src/cl_command_queue.c | 14 +++++++-- src/cl_command_queue.h | 4 +++ src/cl_driver.h | 8 ++---- src/cl_driver_defs.c | 4 +-- src/cl_enqueue.c | 2 +- src/cl_event.c | 26 ++++++++++++++--- src/cl_event.h | 7 +++-- src/cl_thread.c | 20 +++++++++++++ src/cl_thread.h | 3 ++ src/intel/intel_batchbuffer.c | 13 --------- src/intel/intel_batchbuffer.h | 1 - src/intel/intel_gpgpu.c | 66 +++++++++++++++++-------------------------- 13 files changed, 97 insertions(+), 74 deletions(-) diff --git a/src/cl_api.c b/src/cl_api.c index d54ada6..8759027 100644 --- a/src/cl_api.c +++ b/src/cl_api.c @@ -69,7 +69,7 @@ handle_events(cl_command_queue queue, cl_int num, const cl_event *wait_list, cl_event* event, enqueue_data* data, cl_command_type type) { cl_int status = cl_event_wait_events(num, wait_list, queue); - cl_event e; + cl_event e = NULL; if(event != NULL || status == CL_ENQUEUE_EXECUTE_DEFER) { e = cl_event_new(queue->ctx, queue, type, event!=NULL); @@ -85,6 +85,7 @@ handle_events(cl_command_queue queue, cl_int num, const cl_event *wait_list, cl_event_new_enqueue_callback(e, data, num, wait_list); } } + queue->current_event = e; return status; } diff --git a/src/cl_command_queue.c b/src/cl_command_queue.c index 3184acb..d9718bf 100644 --- a/src/cl_command_queue.c +++ b/src/cl_command_queue.c @@ -28,6 +28,7 @@ #include "cl_alloc.h" #include "cl_driver.h" #include "cl_khr_icd.h" +#include "cl_event.h" #include "performance.h" #include @@ -421,10 +422,9 @@ error: return err; } -LOCAL cl_int -cl_command_queue_flush(cl_command_queue queue) +LOCAL void +cl_command_queue_flush_gpgpu(cl_command_queue queue, cl_gpgpu gpgpu) { - GET_QUEUE_THREAD_GPGPU(queue); size_t global_wk_sz[3]; void* printf_info = cl_gpgpu_get_printf_info(gpgpu, global_wk_sz); @@ -447,7 +447,15 @@ cl_command_queue_flush(cl_command_queue queue) global_wk_sz[0] = global_wk_sz[1] = global_wk_sz[2] = 0; cl_gpgpu_set_printf_info(gpgpu, NULL, global_wk_sz); } +} +LOCAL cl_int +cl_command_queue_flush(cl_command_queue queue) +{ + GET_QUEUE_THREAD_GPGPU(queue); + cl_command_queue_flush_gpgpu(queue, gpgpu); + if (queue->current_event) + cl_event_flush(queue->current_event); cl_invalid_thread_gpgpu(queue); return CL_SUCCESS; } diff --git a/src/cl_command_queue.h b/src/cl_command_queue.h index b79d63a..bd70f25 100644 --- a/src/cl_command_queue.h +++ b/src/cl_command_queue.h @@ -41,6 +41,7 @@ struct _cl_command_queue { cl_int wait_events_num; /* Number of Non-complete user events */ cl_int wait_events_size; /* The size of array that wait_events point to */ cl_event last_event; /* The last event in the queue, for enqueue mark used */ + cl_event current_event; /* Current event. */ cl_command_queue_properties props; /* Queue properties */ cl_command_queue prev, next; /* We chain the command queues together */ void *thread_data; /* Used to store thread context data */ @@ -82,6 +83,9 @@ cl_int cl_command_queue_set_fulsim_buffer(cl_command_queue, cl_mem); /* Flush for the command queue */ extern cl_int cl_command_queue_flush(cl_command_queue); +/* Flush for the specified gpgpu */ +extern void cl_command_queue_flush_gpgpu(cl_command_queue, cl_gpgpu); + /* Wait for the completion of the command queue */ extern cl_int cl_command_queue_finish(cl_command_queue); diff --git a/src/cl_driver.h b/src/cl_driver.h index 2999eb7..3d1d8d8 100644 --- a/src/cl_driver.h +++ b/src/cl_driver.h @@ -197,13 +197,9 @@ extern cl_gpgpu_event_new_cb *cl_gpgpu_event_new; typedef int (cl_gpgpu_event_update_status_cb)(cl_gpgpu_event, int); extern cl_gpgpu_event_update_status_cb *cl_gpgpu_event_update_status; -/* pending flush the batch buffer of this event */ -typedef void (cl_gpgpu_event_pending_cb)(cl_gpgpu, cl_gpgpu_event); -extern cl_gpgpu_event_pending_cb *cl_gpgpu_event_pending; - /* flush the batch buffer of this event */ -typedef void (cl_gpgpu_event_resume_cb)(cl_gpgpu_event); -extern cl_gpgpu_event_resume_cb *cl_gpgpu_event_resume; +typedef void (cl_gpgpu_event_flush_cb)(cl_gpgpu_event); +extern cl_gpgpu_event_flush_cb *cl_gpgpu_event_flush; /* cancel exec batch buffer of this event */ typedef void (cl_gpgpu_event_cancel_cb)(cl_gpgpu_event); diff --git a/src/cl_driver_defs.c b/src/cl_driver_defs.c index c9385c4..72f25d9 100644 --- a/src/cl_driver_defs.c +++ b/src/cl_driver_defs.c @@ -79,9 +79,7 @@ LOCAL cl_gpgpu_walker_cb *cl_gpgpu_walker = NULL; LOCAL cl_gpgpu_bind_sampler_cb *cl_gpgpu_bind_sampler = NULL; LOCAL cl_gpgpu_event_new_cb *cl_gpgpu_event_new = NULL; LOCAL cl_gpgpu_event_update_status_cb *cl_gpgpu_event_update_status = NULL; -LOCAL cl_gpgpu_event_pending_cb *cl_gpgpu_event_pending = NULL; -LOCAL cl_gpgpu_event_resume_cb *cl_gpgpu_event_resume = NULL; -LOCAL cl_gpgpu_event_cancel_cb *cl_gpgpu_event_cancel = NULL; +LOCAL cl_gpgpu_event_flush_cb *cl_gpgpu_event_flush = NULL; LOCAL cl_gpgpu_event_delete_cb *cl_gpgpu_event_delete = NULL; LOCAL cl_gpgpu_event_get_exec_timestamp_cb *cl_gpgpu_event_get_exec_timestamp = NULL; LOCAL cl_gpgpu_event_get_gpu_cur_timestamp_cb *cl_gpgpu_event_get_gpu_cur_timestamp = NULL; diff --git a/src/cl_enqueue.c b/src/cl_enqueue.c index 7fa44ff..af118ad 100644 --- a/src/cl_enqueue.c +++ b/src/cl_enqueue.c @@ -461,7 +461,7 @@ cl_int cl_enqueue_handle(cl_event event, enqueue_data* data) case EnqueueNDRangeKernel: case EnqueueFillBuffer: case EnqueueFillImage: - cl_gpgpu_event_resume((cl_gpgpu_event)data->ptr); + cl_event_flush(event); return CL_SUCCESS; case EnqueueNativeKernel: return cl_enqueue_native_kernel(data); diff --git a/src/cl_event.c b/src/cl_event.c index 1229820..d40881a 100644 --- a/src/cl_event.c +++ b/src/cl_event.c @@ -46,6 +46,17 @@ cl_event_is_gpu_command_type(cl_command_type type) } } +void cl_event_flush(cl_event event) +{ + assert(event->gpgpu_event != NULL); + if (event->gpgpu) { + cl_command_queue_flush_gpgpu(event->queue, event->gpgpu); + cl_gpgpu_delete(event->gpgpu); + event->gpgpu = NULL; + } + cl_gpgpu_event_flush(event->gpgpu_event); +} + cl_event cl_event_new(cl_context ctx, cl_command_queue queue, cl_command_type type, cl_bool emplict) { cl_event event = NULL; @@ -138,6 +149,11 @@ void cl_event_delete(cl_event event) pthread_mutex_unlock(&event->ctx->event_lock); cl_context_delete(event->ctx); + if (event->gpgpu) { + fprintf(stderr, "Warning: a event is deleted with a pending enqueued task.\n"); + cl_gpgpu_delete(event->gpgpu); + event->gpgpu = NULL; + } cl_free(event); } @@ -257,7 +273,6 @@ void cl_event_new_enqueue_callback(cl_event event, cl_command_queue queue = event->queue; cl_int i; cl_int err = CL_SUCCESS; - GET_QUEUE_THREAD_GPGPU(data->queue); /* Allocate and initialize the structure itself */ TRY_ALLOC_NO_ERR (cb, CALLOC(enqueue_callback)); @@ -347,7 +362,7 @@ void cl_event_new_enqueue_callback(cl_event event, } } if(data->queue != NULL && event->gpgpu_event != NULL) { - cl_gpgpu_event_pending(gpgpu, event->gpgpu_event); + event->gpgpu = cl_thread_gpgpu_take(event->queue); data->ptr = (void *)event->gpgpu_event; } cb->data = *data; @@ -397,8 +412,11 @@ void cl_event_set_status(cl_event event, cl_int status) if(event->gpgpu_event) cl_gpgpu_event_update_status(event->gpgpu_event, 1); //now set complet, need refine } else { - if(event->gpgpu_event) - cl_gpgpu_event_cancel(event->gpgpu_event); //Error cancel the enqueue + if(event->gpgpu_event) { + // Error then cancel the enqueued event. + cl_gpgpu_delete(event->gpgpu); + event->gpgpu = NULL; + } } event->status = status; //Change the event status after enqueue and befor unlock diff --git a/src/cl_event.h b/src/cl_event.h index bd8a5c7..3c23d74 100644 --- a/src/cl_event.h +++ b/src/cl_event.h @@ -63,6 +63,7 @@ struct _cl_event { cl_command_queue queue; /* The command queue associated with event */ cl_command_type type; /* The command type associated with event */ cl_int status; /* The execution status */ + cl_gpgpu gpgpu; /* Current gpgpu, owned by this structure. */ cl_gpgpu_event gpgpu_event; /* The event object communicate with hardware */ user_callback* user_cb; /* The event callback functions */ enqueue_callback* enqueue_cb; /* This event's enqueue */ @@ -95,9 +96,11 @@ cl_int cl_event_marker_with_wait_list(cl_command_queue, cl_uint, const cl_event cl_int cl_event_barrier_with_wait_list(cl_command_queue, cl_uint, const cl_event *, cl_event*); /* Do the event profiling */ cl_int cl_event_get_timestamp(cl_event event, cl_profiling_info param_name); -/*insert the user event*/ +/* insert the user event */ cl_int cl_event_insert_user_event(user_event** p_u_ev, cl_event event); -/*remove the user event*/ +/* remove the user event */ cl_int cl_event_remove_user_event(user_event** p_u_ev, cl_event event); +/* flush the event's pending gpgpu batch buffer and notify driver this gpgpu event has been flushed. */ +void cl_event_flush(cl_event event); #endif /* __CL_EVENT_H__ */ diff --git a/src/cl_thread.c b/src/cl_thread.c index 4692ed7..5713d70 100644 --- a/src/cl_thread.c +++ b/src/cl_thread.c @@ -209,6 +209,26 @@ void cl_invalid_thread_gpgpu(cl_command_queue queue) spec->valid = 0; } +cl_gpgpu cl_thread_gpgpu_take(cl_command_queue queue) +{ + queue_thread_private *thread_private = ((queue_thread_private *)(queue->thread_data)); + thread_spec_data* spec = NULL; + + pthread_mutex_lock(&thread_private->thread_data_lock); + spec = thread_private->threads_data[thread_id]; + assert(spec); + pthread_mutex_unlock(&thread_private->thread_data_lock); + + if (!spec->valid) + return NULL; + + assert(spec->gpgpu); + cl_gpgpu gpgpu = spec->gpgpu; + spec->gpgpu = NULL; + spec->valid = 0; + return gpgpu; +} + /* The destructor for clean the thread specific data. */ void cl_thread_data_destroy(cl_command_queue queue) { diff --git a/src/cl_thread.h b/src/cl_thread.h index bf855a2..ecc99ad 100644 --- a/src/cl_thread.h +++ b/src/cl_thread.h @@ -41,4 +41,7 @@ void cl_set_thread_batch_buf(cl_command_queue queue, void* buf); /* Used to get the batch buffer of each thread. */ void* cl_get_thread_batch_buf(cl_command_queue queue); +/* take current gpgpu from the thread gpgpu pool. */ +cl_gpgpu cl_thread_gpgpu_take(cl_command_queue queue); + #endif /* __CL_THREAD_H__ */ diff --git a/src/intel/intel_batchbuffer.c b/src/intel/intel_batchbuffer.c index d0da77a..7767db3 100644 --- a/src/intel/intel_batchbuffer.c +++ b/src/intel/intel_batchbuffer.c @@ -185,16 +185,3 @@ intel_batchbuffer_delete(intel_batchbuffer_t *batch) cl_free(batch); } - -LOCAL void -intel_batchbuffer_take(intel_batchbuffer_t *from, intel_batchbuffer_t *to) -{ - *to = *from; - //Need not unreference the buffer, to will unreference it. - from->buffer = NULL; - from->map = NULL; - from->ptr = NULL; - from->size = 0; - from->atomic = 0; - from->enable_slm = 0; -} diff --git a/src/intel/intel_batchbuffer.h b/src/intel/intel_batchbuffer.h index c62043e..0c3bc13 100644 --- a/src/intel/intel_batchbuffer.h +++ b/src/intel/intel_batchbuffer.h @@ -101,7 +101,6 @@ extern void intel_batchbuffer_init(intel_batchbuffer_t*, struct intel_driver*); extern void intel_batchbuffer_terminate(intel_batchbuffer_t*); extern void intel_batchbuffer_flush(intel_batchbuffer_t*); extern void intel_batchbuffer_reset(intel_batchbuffer_t*, size_t sz); -extern void intel_batchbuffer_take(intel_batchbuffer_t*, intel_batchbuffer_t *); static INLINE uint32_t intel_batchbuffer_space(const intel_batchbuffer_t *batch) diff --git a/src/intel/intel_gpgpu.c b/src/intel/intel_gpgpu.c index a7c449d..d00bc83 100644 --- a/src/intel/intel_gpgpu.c +++ b/src/intel/intel_gpgpu.c @@ -60,9 +60,8 @@ typedef struct surface_heap { } surface_heap_t; typedef struct intel_event { - intel_batchbuffer_t *batch; - drm_intel_bo* buffer; - drm_intel_bo* ts_buf; + drm_intel_bo *buffer; + drm_intel_bo *ts_buf; int status; } intel_event_t; @@ -569,12 +568,19 @@ intel_gpgpu_check_binded_buf_address(intel_gpgpu_t *gpgpu) } static void +intel_gpgpu_flush_batch_buffer(intel_batchbuffer_t *batch) +{ + assert(batch); + intel_batchbuffer_emit_mi_flush(batch); + intel_batchbuffer_flush(batch); +} + +static void intel_gpgpu_flush(intel_gpgpu_t *gpgpu) { if (!gpgpu->batch || !gpgpu->batch->buffer) return; - intel_batchbuffer_emit_mi_flush(gpgpu->batch); - intel_batchbuffer_flush(gpgpu->batch); + intel_gpgpu_flush_batch_buffer(gpgpu->batch); intel_gpgpu_check_binded_buf_address(gpgpu); } @@ -1156,11 +1162,10 @@ intel_gpgpu_event_new(intel_gpgpu_t *gpgpu) intel_event_t *event = NULL; TRY_ALLOC_NO_ERR (event, CALLOC(intel_event_t)); - event->status = command_queued; - event->batch = NULL; event->buffer = gpgpu->batch->buffer; - if(event->buffer != NULL) + if (event->buffer) drm_intel_bo_reference(event->buffer); + event->status = command_queued; if(gpgpu->time_stamp_b.bo) { event->ts_buf = gpgpu->time_stamp_b.bo; @@ -1175,6 +1180,17 @@ error: goto exit; } +/* + The upper layer already flushed the batch buffer, just update + internal status to command_submitted. +*/ +static void +intel_gpgpu_event_flush(intel_event_t *event) +{ + assert(event->status == command_queued); + event->status = command_running; +} + static int intel_gpgpu_event_update_status(intel_event_t *event, int wait) { @@ -1182,7 +1198,7 @@ intel_gpgpu_event_update_status(intel_event_t *event, int wait) return event->status; if (event->buffer && - event->batch == NULL && //have flushed + event->status == command_running && !drm_intel_bo_busy(event->buffer)) { event->status = command_complete; drm_intel_bo_unreference(event->buffer); @@ -1203,36 +1219,8 @@ intel_gpgpu_event_update_status(intel_event_t *event, int wait) } static void -intel_gpgpu_event_pending(intel_gpgpu_t *gpgpu, intel_event_t *event) -{ - assert(event->buffer); //This is gpu enqueue command - assert(event->batch == NULL); //This command haven't pengding. - event->batch = intel_batchbuffer_new(gpgpu->drv); - assert(event->batch); - intel_batchbuffer_take(gpgpu->batch, event->batch); -} - -static void -intel_gpgpu_event_resume(intel_event_t *event) -{ - assert(event->batch); //This command have pending. - intel_batchbuffer_flush(event->batch); - intel_batchbuffer_delete(event->batch); - event->batch = NULL; -} - -static void -intel_gpgpu_event_cancel(intel_event_t *event) -{ - assert(event->batch); //This command have pending. - intel_batchbuffer_delete(event->batch); - event->batch = NULL; -} - -static void intel_gpgpu_event_delete(intel_event_t *event) { - assert(event->batch == NULL); //This command must have been flushed. if(event->buffer) drm_intel_bo_unreference(event->buffer); if(event->ts_buf) @@ -1412,10 +1400,8 @@ intel_set_gpgpu_callbacks(int device_id) cl_gpgpu_bind_sampler = (cl_gpgpu_bind_sampler_cb *) intel_gpgpu_bind_sampler; cl_gpgpu_set_scratch = (cl_gpgpu_set_scratch_cb *) intel_gpgpu_set_scratch; cl_gpgpu_event_new = (cl_gpgpu_event_new_cb *)intel_gpgpu_event_new; + cl_gpgpu_event_flush = (cl_gpgpu_event_flush_cb *)intel_gpgpu_event_flush; cl_gpgpu_event_update_status = (cl_gpgpu_event_update_status_cb *)intel_gpgpu_event_update_status; - cl_gpgpu_event_pending = (cl_gpgpu_event_pending_cb *)intel_gpgpu_event_pending; - cl_gpgpu_event_resume = (cl_gpgpu_event_resume_cb *)intel_gpgpu_event_resume; - cl_gpgpu_event_cancel = (cl_gpgpu_event_cancel_cb *)intel_gpgpu_event_cancel; cl_gpgpu_event_delete = (cl_gpgpu_event_delete_cb *)intel_gpgpu_event_delete; cl_gpgpu_event_get_exec_timestamp = (cl_gpgpu_event_get_exec_timestamp_cb *)intel_gpgpu_event_get_exec_timestamp; cl_gpgpu_event_get_gpu_cur_timestamp = (cl_gpgpu_event_get_gpu_cur_timestamp_cb *)intel_gpgpu_event_get_gpu_cur_timestamp; -- 2.7.4