From e9a9ac6f77fe876b84edfb638c3075d6a149c812 Mon Sep 17 00:00:00 2001
From: Rob Clark
Date: Mon, 19 Apr 2021 15:20:15 -0700
Subject: [PATCH] freedreno/drm: Async submit support

Move the submit ioctl to its own thread to unblock the driver thread
and let it move on to the next frame.

Note that I did experiment with doing the append_bo() parts
synchronously, on the theory that we should be more likely to hit the
fast path if we did that part of submit merging before the bo was
potentially re-used in the next batch/submit.  It helped some things
by a couple percent, but hurt more things.

Signed-off-by: Rob Clark
Part-of:
---
 src/freedreno/drm/freedreno_device.c            |   5 +-
 src/freedreno/drm/freedreno_ringbuffer.h        |   9 ++-
 src/freedreno/drm/msm_device.c                  |  14 +++-
 src/freedreno/drm/msm_priv.h                    |  11 ++-
 src/freedreno/drm/msm_ringbuffer_sp.c           | 102 ++++++++++++++++++++++--
 src/gallium/drivers/freedreno/freedreno_fence.c |  37 ++++++++-
 src/gallium/drivers/freedreno/freedreno_fence.h |   6 ++
 7 files changed, 169 insertions(+), 15 deletions(-)

diff --git a/src/freedreno/drm/freedreno_device.c b/src/freedreno/drm/freedreno_device.c
index f9d4293..dc01f65 100644
--- a/src/freedreno/drm/freedreno_device.c
+++ b/src/freedreno/drm/freedreno_device.c
@@ -33,8 +33,7 @@
 #include "freedreno_drmif.h"
 #include "freedreno_priv.h"
 
-struct fd_device *kgsl_device_new(int fd);
-struct fd_device *msm_device_new(int fd);
+struct fd_device *msm_device_new(int fd, drmVersionPtr version);
 
 struct fd_device *
 fd_device_new(int fd)
@@ -58,7 +57,7 @@ fd_device_new(int fd)
          goto out;
       }
 
-      dev = msm_device_new(fd);
+      dev = msm_device_new(fd, version);
       dev->version = version->version_minor;
 #if HAVE_FREEDRENO_KGSL
    } else if (!strcmp(version->name, "kgsl")) {
diff --git a/src/freedreno/drm/freedreno_ringbuffer.h b/src/freedreno/drm/freedreno_ringbuffer.h
index 3cf1343..8e6b3e9 100644
--- a/src/freedreno/drm/freedreno_ringbuffer.h
+++ b/src/freedreno/drm/freedreno_ringbuffer.h
@@ -30,7 +30,7 @@
 #include <stdio.h>
 #include "util/u_atomic.h"
 #include "util/u_debug.h"
-#include "util/u_dynarray.h"
+#include "util/u_queue.h"
 
 #include "adreno_common.xml.h"
 #include "adreno_pm4.xml.h"
@@ -98,6 +98,13 @@ struct fd_ringbuffer *fd_submit_new_ringbuffer(struct fd_submit *submit,
  * out-fence-fd
  */
 struct fd_submit_fence {
+   /**
+    * The ready fence is signaled once the submit is actually flushed down
+    * to the kernel, and fence/fence_fd are populated.  You must wait for
+    * this fence to be signaled before reading fence/fence_fd.
+    */
+   struct util_queue_fence ready;
+
    struct fd_fence fence;
 
    /**
diff --git a/src/freedreno/drm/msm_device.c b/src/freedreno/drm/msm_device.c
index 7922ea0..6af709f 100644
--- a/src/freedreno/drm/msm_device.c
+++ b/src/freedreno/drm/msm_device.c
@@ -34,6 +34,9 @@ static void
 msm_device_destroy(struct fd_device *dev)
 {
    struct msm_device *msm_dev = to_msm_device(dev);
+   if (util_queue_is_initialized(&msm_dev->submit_queue)) {
+      util_queue_destroy(&msm_dev->submit_queue);
+   }
    free(msm_dev);
 }
 
@@ -45,7 +48,7 @@ static const struct fd_device_funcs funcs = {
 };
 
 struct fd_device *
-msm_device_new(int fd)
+msm_device_new(int fd, drmVersionPtr version)
 {
    struct msm_device *msm_dev;
    struct fd_device *dev;
@@ -61,6 +64,15 @@ msm_device_new(int fd)
    dev = &msm_dev->base;
    dev->funcs = &funcs;
 
+   /* async submit_queue currently only used for msm_submit_sp: */
+   if (version->version_minor >= FD_VERSION_SOFTPIN) {
+      /* Note the name is intentionally short to avoid the queue
+       * thread's comm truncating the interesting part of the
+       * process name.
+       */
+      util_queue_init(&msm_dev->submit_queue, "sq", 8, 1, 0);
+   }
+
    dev->bo_size = sizeof(struct msm_bo);
 
    return dev;
diff --git a/src/freedreno/drm/msm_priv.h b/src/freedreno/drm/msm_priv.h
index 76c7f2f..e4fe084 100644
--- a/src/freedreno/drm/msm_priv.h
+++ b/src/freedreno/drm/msm_priv.h
@@ -40,10 +40,11 @@ struct msm_device {
    struct fd_device base;
 
    struct fd_bo_cache ring_cache;
+   struct util_queue submit_queue;
 };
 FD_DEFINE_CAST(fd_device, msm_device);
 
-struct fd_device *msm_device_new(int fd);
+struct fd_device *msm_device_new(int fd, drmVersionPtr version);
 
 struct msm_pipe {
    struct fd_pipe base;
@@ -54,6 +55,14 @@ struct msm_pipe {
    uint32_t chip_id;
    uint32_t queue_id;
    struct slab_parent_pool ring_pool;
+
+   /**
+    * The last fence seqno that was flushed to the kernel (doesn't mean
+    * that it is complete, just that the kernel knows about it)
+    */
+   uint32_t last_submit_fence;
+
+   uint32_t last_enqueue_fence;   /* just for debugging */
 };
 FD_DEFINE_CAST(fd_pipe, msm_pipe);
diff --git a/src/freedreno/drm/msm_ringbuffer_sp.c b/src/freedreno/drm/msm_ringbuffer_sp.c
index 1ebbc86..809c480 100644
--- a/src/freedreno/drm/msm_ringbuffer_sp.c
+++ b/src/freedreno/drm/msm_ringbuffer_sp.c
@@ -26,6 +26,7 @@
 
 #include <assert.h>
 #include <inttypes.h>
+#include <pthread.h>
 
 #include "util/hash_table.h"
 #include "util/os_file.h"
@@ -41,6 +42,14 @@
 
 #define INIT_SIZE 0x1000
 
+/* In the pipe->flush() path, we don't have a util_queue_fence we can wait
+ * on, so we use a condition variable instead.  Note that pipe->flush() is
+ * not expected to be a common/hot path.
+ */
+static pthread_cond_t flush_cnd = PTHREAD_COND_INITIALIZER;
+static pthread_mutex_t flush_mtx = PTHREAD_MUTEX_INITIALIZER;
+
+
 struct msm_submit_sp {
    struct fd_submit base;
 
@@ -64,6 +73,13 @@ struct msm_submit_sp {
     */
    int in_fence_fd;
    struct fd_submit_fence *out_fence;
+
+   /* State for enqueued submits:
+    */
+   struct list_head submit_list;   /* includes this submit as last element */
+
+   /* Used in case out_fence==NULL: */
+   struct util_queue_fence fence;
 };
 FD_DEFINE_CAST(fd_submit, msm_submit_sp);
 
@@ -369,14 +385,65 @@ flush_submit_list(struct list_head *submit_list)
    if (!bos_on_stack)
       free(submit_bos);
 
+   pthread_mutex_lock(&flush_mtx);
+   assert(fd_fence_before(msm_pipe->last_submit_fence, msm_submit->base.fence));
+   msm_pipe->last_submit_fence = msm_submit->base.fence;
+   pthread_cond_broadcast(&flush_cnd);
+   pthread_mutex_unlock(&flush_mtx);
+
    if (msm_submit->in_fence_fd != -1)
       close(msm_submit->in_fence_fd);
 
-   fd_submit_del(&msm_submit->base);
-
    return ret;
 }
 
+static void
+msm_submit_sp_flush_execute(void *job, int thread_index)
+{
+   struct fd_submit *submit = job;
+   struct msm_submit_sp *msm_submit = to_msm_submit_sp(submit);
+
+   flush_submit_list(&msm_submit->submit_list);
+
+   DEBUG_MSG("finish: %u", submit->fence);
+}
+
+static void
+msm_submit_sp_flush_cleanup(void *job, int thread_index)
+{
+   struct fd_submit *submit = job;
+   fd_submit_del(submit);
+}
+
+static int
+enqueue_submit_list(struct list_head *submit_list)
+{
+   struct fd_submit *submit = last_submit(submit_list);
+   struct msm_submit_sp *msm_submit = to_msm_submit_sp(submit);
+   struct msm_device *msm_dev = to_msm_device(submit->pipe->dev);
+
+   list_replace(submit_list, &msm_submit->submit_list);
+   list_inithead(submit_list);
+
+   struct util_queue_fence *fence;
+   if (msm_submit->out_fence) {
+      fence = &msm_submit->out_fence->ready;
+   } else {
+      util_queue_fence_init(&msm_submit->fence);
+      fence = &msm_submit->fence;
+   }
+
+   DEBUG_MSG("enqueue: %u", submit->fence);
+
+   util_queue_add_job(&msm_dev->submit_queue,
+                      submit,
+                      fence,
+                      msm_submit_sp_flush_execute,
+                      msm_submit_sp_flush_cleanup,
+                      0);
+
+   return 0;
+}
+
 static bool
 should_defer(struct fd_submit *submit)
 {
@@ -402,6 +469,7 @@ msm_submit_sp_flush(struct fd_submit *submit, int in_fence_fd,
                     struct fd_submit_fence *out_fence)
 {
    struct fd_device *dev = submit->pipe->dev;
+   struct msm_pipe *msm_pipe = to_msm_pipe(submit->pipe);
 
    /* Acquire lock before flush_prep() because it is possible to race between
     * this and pipe->flush():
@@ -420,15 +488,16 @@ msm_submit_sp_flush(struct fd_submit *submit, int in_fence_fd,
       list_inithead(&dev->deferred_submits);
       dev->deferred_cmds = 0;
 
-      simple_mtx_unlock(&dev->submit_lock);
-      flush_submit_list(&submit_list);
-      simple_mtx_lock(&dev->submit_lock);
+      enqueue_submit_list(&submit_list);
    }
 
    list_addtail(&fd_submit_ref(submit)->node, &dev->deferred_submits);
 
    bool has_shared = msm_submit_sp_flush_prep(submit, in_fence_fd, out_fence);
 
+   assert(fd_fence_before(msm_pipe->last_enqueue_fence, submit->fence));
+   msm_pipe->last_enqueue_fence = submit->fence;
+
    /* If we don't need an out-fence, we can defer the submit.
    *
    * TODO we could defer submits with in-fence as well.. if we took our own
@@ -436,6 +505,7 @@ msm_submit_sp_flush(struct fd_submit *submit, int in_fence_fd,
    * deferred submits
    */
    if ((in_fence_fd == -1) && !out_fence && !has_shared && should_defer(submit)) {
+      DEBUG_MSG("defer: %u", submit->fence);
      dev->deferred_cmds += fd_ringbuffer_cmd_count(submit->primary);
      assert(dev->deferred_cmds == fd_dev_count_deferred_cmds(dev));
      simple_mtx_unlock(&dev->submit_lock);
@@ -451,19 +521,24 @@ msm_submit_sp_flush(struct fd_submit *submit, int in_fence_fd,
 
    simple_mtx_unlock(&dev->submit_lock);
 
-   return flush_submit_list(&submit_list);
+   return enqueue_submit_list(&submit_list);
 }
 
 void
 msm_pipe_sp_flush(struct fd_pipe *pipe, uint32_t fence)
 {
+   struct msm_pipe *msm_pipe = to_msm_pipe(pipe);
    struct fd_device *dev = pipe->dev;
    struct list_head submit_list;
 
+   DEBUG_MSG("flush: %u", fence);
+
    list_inithead(&submit_list);
 
    simple_mtx_lock(&dev->submit_lock);
 
+   assert(!fd_fence_after(fence, msm_pipe->last_enqueue_fence));
+
    foreach_submit_safe (deferred_submit, &dev->deferred_submits) {
      /* We should never have submits from multiple pipes in the deferred
      * list.  If we did, we couldn't compare their fence to our fence,
@@ -485,9 +560,20 @@ msm_pipe_sp_flush(struct fd_pipe *pipe, uint32_t fence)
    simple_mtx_unlock(&dev->submit_lock);
 
    if (list_is_empty(&submit_list))
-      return;
+      goto flush_sync;
+
+   enqueue_submit_list(&submit_list);
 
-   flush_submit_list(&submit_list);
+flush_sync:
+   /* Once we are sure that we've enqueued at least up to the requested
+    * submit, we need to be sure that the submit queue has caught up and
+    * flushed them to the kernel.
+    */
+   pthread_mutex_lock(&flush_mtx);
+   while (fd_fence_before(msm_pipe->last_submit_fence, fence)) {
+      pthread_cond_wait(&flush_cnd, &flush_mtx);
+   }
+   pthread_mutex_unlock(&flush_mtx);
 }
 
 static void
diff --git a/src/gallium/drivers/freedreno/freedreno_fence.c b/src/gallium/drivers/freedreno/freedreno_fence.c
index a4d4be9..5279efe 100644
--- a/src/gallium/drivers/freedreno/freedreno_fence.c
+++ b/src/gallium/drivers/freedreno/freedreno_fence.c
@@ -59,6 +59,16 @@ fence_flush(struct pipe_context *pctx, struct pipe_fence_handle *fence,
       }
    }
 
+   /* If, after the pre-created unflushed fence is flushed, we end up
+    * re-populated from a previous last_fence, then *that* is the one
+    * whose submit_fence.ready we want to wait on:
+    */
+   if (fence->last_fence) {
+      return fence_flush(pctx, fence->last_fence, timeout);
+   }
+
+   util_queue_fence_wait(&fence->submit_fence.ready);
+
    /* We've already waited for batch to be flushed and fence->batch
    * to be cleared:
    */
@@ -69,6 +79,8 @@ fence_flush(struct pipe_context *pctx, struct pipe_fence_handle *fence,
    if (fence->batch)
       fd_batch_flush(fence->batch);
 
+   util_queue_fence_wait(&fence->submit_fence.ready);
+
    debug_assert(!fence->batch);
 
    return true;
@@ -81,19 +93,37 @@ fd_fence_repopulate(struct pipe_fence_handle *fence, struct pipe_fence_handle *l
    * might have been)
    */
   assert(!fence->submit_fence.use_fence_fd);
+   assert(!last_fence->batch);
 
   fence->submit_fence.fence = last_fence->submit_fence.fence;
+
+   fd_fence_ref(&fence->last_fence, last_fence);
+
+   /* We have nothing to flush, so nothing will clear the batch reference
+    * (which is normally done when the batch is flushed), so do it now:
+    */
+   fd_fence_set_batch(fence, NULL);
 }
 
 static void
 fd_fence_destroy(struct pipe_fence_handle *fence)
 {
+   fd_fence_ref(&fence->last_fence, NULL);
+
    tc_unflushed_batch_token_reference(&fence->tc_token, NULL);
 
    if (fence->submit_fence.use_fence_fd)
       close(fence->submit_fence.fence_fd);
 
   if (fence->syncobj)
      drmSyncobjDestroy(fd_device_fd(fence->screen->dev), fence->syncobj);
 
   fd_pipe_del(fence->pipe);
+
+   /* TODO might be worth trying harder to avoid a potential stall here,
+    * but that would require the submit somehow holding a reference to
+    * the pipe_fence_handle.. and I'm not sure if it is a thing that is
+    * likely to matter much.
+    */
+   util_queue_fence_wait(&fence->submit_fence.ready);
+
    FREE(fence);
 }
 
@@ -113,6 +143,9 @@ fd_fence_finish(struct pipe_screen *pscreen, struct pipe_context *pctx,
    if (!fence_flush(pctx, fence, timeout))
       return false;
 
+   if (fence->last_fence)
+      fence = fence->last_fence;
+
    if (fence->submit_fence.use_fence_fd) {
       int ret = sync_wait(fence->submit_fence.fence_fd, timeout / 1000000);
       return ret == 0;
@@ -136,9 +169,10 @@ fence_create(struct fd_context *ctx, struct fd_batch *batch, int fence_fd,
 
    pipe_reference_init(&fence->reference, 1);
    util_queue_fence_init(&fence->ready);
+   util_queue_fence_init(&fence->submit_fence.ready);
 
    fence->ctx = ctx;
-   fence->batch = batch;
+   fd_fence_set_batch(fence, batch);
    fence->pipe = fd_pipe_ref(ctx->pipe);
    fence->screen = ctx->screen;
    fence->submit_fence.fence_fd = fence_fd;
@@ -237,6 +271,7 @@ fd_fence_set_batch(struct pipe_fence_handle *fence, struct fd_batch *batch)
    if (batch) {
       assert(!fence->batch);
       fence->batch = batch;
+      batch->needs_flush = true;
    } else {
       fence->batch = NULL;
diff --git a/src/gallium/drivers/freedreno/freedreno_fence.h b/src/gallium/drivers/freedreno/freedreno_fence.h
index 947ac97..4d9efb8 100644
--- a/src/gallium/drivers/freedreno/freedreno_fence.h
+++ b/src/gallium/drivers/freedreno/freedreno_fence.h
@@ -35,6 +35,12 @@ struct pipe_fence_handle {
    struct pipe_reference reference;
 
+   /* When a pre-created unflushed fence has no actual rendering to flush,
+    * and the last_fence optimization is used, this will be a reference to
+    * the *actual* fence which needs to be flushed before waiting.
+    */
+   struct pipe_fence_handle *last_fence;
+
    /* fence holds a weak reference to the batch until the batch is flushed, to
    * accommodate PIPE_FLUSH_DEFERRED.  When the batch is actually flushed, it
    * is cleared (before the batch reference is dropped).  If we need to wait
-- 
2.7.4
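
For readers unfamiliar with the enqueue/flush split above, here is a
minimal, self-contained sketch of the pattern this patch builds on.  It
is plain pthreads, not mesa code, and all names are hypothetical: the
driver thread enqueues submits without blocking, a single queue thread
flushes them to the kernel in order, and a synchronous flush() sleeps on
a condition variable until the queue thread's published
last_submit_fence catches up, mirroring the flush_cnd/flush_mtx scheme
used by msm_pipe_sp_flush() in the patch.

/* Sketch: single-threaded submit queue with fence-based flush sync.
 * Hypothetical names; the real driver uses util_queue and compares
 * fences with fd_fence_before() to handle seqno wraparound.
 */
#include <pthread.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

struct job {
   uint32_t fence;   /* seqno assigned at enqueue time */
   struct job *next;
};

static struct job *head, *tail;     /* FIFO of enqueued submits */
static uint32_t last_submit_fence;  /* last seqno flushed to the "kernel" */
static bool shutting_down;
static pthread_mutex_t mtx = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t job_cnd = PTHREAD_COND_INITIALIZER;   /* new work */
static pthread_cond_t flush_cnd = PTHREAD_COND_INITIALIZER; /* progress */

static void *
queue_thread(void *arg)
{
   (void)arg;
   pthread_mutex_lock(&mtx);
   while (!shutting_down || head) {
      if (!head) {
         pthread_cond_wait(&job_cnd, &mtx);
         continue;
      }
      struct job *job = head;
      head = job->next;
      if (!head)
         tail = NULL;
      /* Drop the lock for the slow part (stands in for the submit ioctl): */
      pthread_mutex_unlock(&mtx);
      printf("submit: %u\n", job->fence);
      pthread_mutex_lock(&mtx);
      /* Publish progress and wake anyone blocked in flush(): */
      last_submit_fence = job->fence;
      pthread_cond_broadcast(&flush_cnd);
      free(job);
   }
   pthread_mutex_unlock(&mtx);
   return NULL;
}

/* Driver-thread side: hand a submit to the queue without blocking. */
static uint32_t
enqueue(void)
{
   static uint32_t next_fence;
   struct job *job = calloc(1, sizeof(*job));

   pthread_mutex_lock(&mtx);
   uint32_t fence = job->fence = ++next_fence;
   if (tail)
      tail->next = job;
   else
      head = job;
   tail = job;
   pthread_cond_signal(&job_cnd);
   pthread_mutex_unlock(&mtx);

   return fence;
}

/* Like msm_pipe_sp_flush(): block until everything up to 'fence' has
 * actually been handed to the kernel by the queue thread.
 */
static void
flush(uint32_t fence)
{
   pthread_mutex_lock(&mtx);
   while (last_submit_fence < fence)   /* real code: fd_fence_before() */
      pthread_cond_wait(&flush_cnd, &mtx);
   pthread_mutex_unlock(&mtx);
}

int
main(void)
{
   pthread_t t;
   pthread_create(&t, NULL, queue_thread, NULL);

   uint32_t last = 0;
   for (int i = 0; i < 4; i++)
      last = enqueue();
   flush(last);   /* returns only once "submit: 4" has been flushed */

   pthread_mutex_lock(&mtx);
   shutting_down = true;
   pthread_cond_signal(&job_cnd);
   pthread_mutex_unlock(&mtx);
   pthread_join(t, NULL);
   return 0;
}

Note the same design choice as the patch: the per-job fence (out_fence->ready
in the real code) serves callers who care about one specific submit, while the
shared condition variable serves pipe->flush(), which only needs "everything
up to seqno N", and is acceptable there because that path is not hot.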