From cccdc513e3ee58b4119e9686c3086b606500d6a9 Mon Sep 17 00:00:00 2001 From: Rob Clark Date: Sat, 17 Apr 2021 12:51:02 -0700 Subject: [PATCH] freedreno/drm/sp: Implement deferred submit merging For submits flushed with (a) no required fence, and (b) no externally visible effects (ie. imported/exported bo), we can defer flushing the submit and merge it into a later submit. This is a bit more work in userspace, but it cuts down the number of submit ioctls. And a common case is that later submits overlap in the bo's used (for example, blit upload to a buffer, which is then used in the following draw pass), so it reduces the net amount of work needed to be done in the kernel to handle the submit ioctl. Closes: https://gitlab.freedesktop.org/mesa/mesa/-/issues/19 Signed-off-by: Rob Clark Part-of: --- src/freedreno/drm/msm_pipe.c | 1 + src/freedreno/drm/msm_priv.h | 1 + src/freedreno/drm/msm_ringbuffer_sp.c | 234 +++++++++++++++++++++++++++++----- 3 files changed, 204 insertions(+), 32 deletions(-) diff --git a/src/freedreno/drm/msm_pipe.c b/src/freedreno/drm/msm_pipe.c index 8de3c5c..8793494 100644 --- a/src/freedreno/drm/msm_pipe.c +++ b/src/freedreno/drm/msm_pipe.c @@ -177,6 +177,7 @@ msm_pipe_destroy(struct fd_pipe *pipe) static const struct fd_pipe_funcs sp_funcs = { .ringbuffer_new_object = msm_ringbuffer_sp_new_object, .submit_new = msm_submit_sp_new, + .flush = msm_pipe_sp_flush, .get_param = msm_pipe_get_param, .wait = msm_pipe_wait, .destroy = msm_pipe_destroy, diff --git a/src/freedreno/drm/msm_priv.h b/src/freedreno/drm/msm_priv.h index c04fa79..76c7f2f 100644 --- a/src/freedreno/drm/msm_priv.h +++ b/src/freedreno/drm/msm_priv.h @@ -67,6 +67,7 @@ struct fd_ringbuffer *msm_ringbuffer_sp_new_object(struct fd_pipe *pipe, struct fd_submit *msm_submit_new(struct fd_pipe *pipe); struct fd_submit *msm_submit_sp_new(struct fd_pipe *pipe); +void msm_pipe_sp_flush(struct fd_pipe *pipe, uint32_t fence); void msm_pipe_sp_ringpool_init(struct msm_pipe *msm_pipe); void 
msm_pipe_sp_ringpool_fini(struct msm_pipe *msm_pipe); diff --git a/src/freedreno/drm/msm_ringbuffer_sp.c b/src/freedreno/drm/msm_ringbuffer_sp.c index e6c8418..f8becaf 100644 --- a/src/freedreno/drm/msm_ringbuffer_sp.c +++ b/src/freedreno/drm/msm_ringbuffer_sp.c @@ -28,6 +28,7 @@ #include #include "util/hash_table.h" +#include "util/os_file.h" #include "util/slab.h" #include "drm/freedreno_ringbuffer.h" @@ -57,6 +58,12 @@ struct msm_submit_sp { * so we can reclaim extra space at it's end. */ struct fd_ringbuffer *suballoc_ring; + + /* Flush args, potentially attached to the last submit in the list + * of submits to merge: + */ + int in_fence_fd; + struct fd_submit_fence *out_fence; }; FD_DEFINE_CAST(fd_submit, msm_submit_sp); @@ -108,7 +115,7 @@ msm_submit_append_bo(struct msm_submit_sp *submit, struct fd_bo *bo) /* NOTE: it is legal to use the same bo on different threads for * different submits. But it is not legal to use the same submit - * from given threads. + * from different threads. 
*/ idx = READ_ONCE(msm_bo->idx); @@ -213,10 +220,12 @@ msm_submit_sp_new_ringbuffer(struct fd_submit *submit, uint32_t size, * 2) Add cmdstream bos to bos table * 3) Update bo fences */ -static void -msm_submit_sp_flush_prep(struct fd_submit *submit) +static bool +msm_submit_sp_flush_prep(struct fd_submit *submit, int in_fence_fd, + struct fd_submit_fence *out_fence) { struct msm_submit_sp *msm_submit = to_msm_submit_sp(submit); + bool has_shared = false; finalize_current_cmd(submit->primary); @@ -227,43 +236,93 @@ msm_submit_sp_flush_prep(struct fd_submit *submit) msm_submit_append_bo(msm_submit, primary->u.cmds[i].ring_bo); simple_mtx_lock(&table_lock); - for (unsigned i = 0; i < msm_submit->nr_bos; i++) + for (unsigned i = 0; i < msm_submit->nr_bos; i++) { fd_bo_add_fence(msm_submit->bos[i], submit->pipe, submit->fence); + has_shared |= msm_submit->bos[i]->shared; + } simple_mtx_unlock(&table_lock); + + msm_submit->out_fence = out_fence; + msm_submit->in_fence_fd = (in_fence_fd == -1) ? 
+ -1 : os_dupfd_cloexec(in_fence_fd); + + return has_shared; } static int -msm_submit_sp_flush_finish(struct fd_submit *submit, int in_fence_fd, - struct fd_submit_fence *out_fence) +flush_submit_list(struct list_head *submit_list) { - struct msm_submit_sp *msm_submit = to_msm_submit_sp(submit); - struct msm_pipe *msm_pipe = to_msm_pipe(submit->pipe); + struct msm_submit_sp *msm_submit = to_msm_submit_sp(last_submit(submit_list)); + struct msm_pipe *msm_pipe = to_msm_pipe(msm_submit->base.pipe); struct drm_msm_gem_submit req = { .flags = msm_pipe->pipe, .queueid = msm_pipe->queue_id, }; int ret; - struct msm_ringbuffer_sp *primary = - to_msm_ringbuffer_sp(submit->primary); - struct drm_msm_gem_submit_cmd cmds[primary->u.nr_cmds]; - - for (unsigned i = 0; i < primary->u.nr_cmds; i++) { - cmds[i].type = MSM_SUBMIT_CMD_BUF; - cmds[i].submit_idx = - msm_submit_append_bo(msm_submit, primary->u.cmds[i].ring_bo); - cmds[i].submit_offset = primary->offset; - cmds[i].size = primary->u.cmds[i].size; - cmds[i].pad = 0; - cmds[i].nr_relocs = 0; + unsigned nr_cmds = 0; + + /* Determine the number of extra cmds from deferred submits that + * we will be merging in: + */ + foreach_submit (submit, submit_list) { + assert(submit->pipe == &msm_pipe->base); + nr_cmds += to_msm_ringbuffer_sp(submit->primary)->u.nr_cmds; + } + + struct drm_msm_gem_submit_cmd cmds[nr_cmds]; + + unsigned cmd_idx = 0; + + /* Build up the table of cmds, and for all but the last submit in the + * list, merge their bo tables into the last submit.
+ */ + foreach_submit_safe (submit, submit_list) { + struct msm_ringbuffer_sp *deferred_primary = + to_msm_ringbuffer_sp(submit->primary); + + for (unsigned i = 0; i < deferred_primary->u.nr_cmds; i++) { + cmds[cmd_idx].type = MSM_SUBMIT_CMD_BUF; + cmds[cmd_idx].submit_idx = + msm_submit_append_bo(msm_submit, deferred_primary->u.cmds[i].ring_bo); + cmds[cmd_idx].submit_offset = deferred_primary->offset; + cmds[cmd_idx].size = deferred_primary->u.cmds[i].size; + cmds[cmd_idx].pad = 0; + cmds[cmd_idx].nr_relocs = 0; + + cmd_idx++; + } + + /* We are merging all the submits in the list into the last submit, + * so the remainder of the loop body doesn't apply to the last submit + */ + if (submit == last_submit(submit_list)) { + DEBUG_MSG("merged %u submits", cmd_idx); + break; + } + + struct msm_submit_sp *msm_deferred_submit = to_msm_submit_sp(submit); + for (unsigned i = 0; i < msm_deferred_submit->nr_bos; i++) { + /* Note: if bo is used in both the current submit and the deferred + * submit being merged, we expect to hit the fast-path as we add it + * to the current submit: + */ + msm_submit_append_bo(msm_submit, msm_deferred_submit->bos[i]); + } + + /* Now that the cmds/bos have been transferred over to the current submit, + * we can remove the deferred submit from the list and drop its reference + */ + list_del(&submit->node); + fd_submit_del(submit); } - if (in_fence_fd != -1) { + if (msm_submit->in_fence_fd != -1) { req.flags |= MSM_SUBMIT_FENCE_FD_IN | MSM_SUBMIT_NO_IMPLICIT; - req.fence_fd = in_fence_fd; + req.fence_fd = msm_submit->in_fence_fd; } - if (out_fence && out_fence->use_fence_fd) { + if (msm_submit->out_fence && msm_submit->out_fence->use_fence_fd) { req.flags |= MSM_SUBMIT_FENCE_FD_OUT; } @@ -289,35 +348,146 @@ msm_submit_sp_flush_finish(struct fd_submit *submit, int in_fence_fd, struct fd_submit_fence *out_fence) submit_bos[i].presumed = 0; } - req.bos = 
VOID2U64(submit_bos); + req.nr_bos = msm_submit->nr_bos; + req.cmds = VOID2U64(cmds); + req.nr_cmds = nr_cmds; DEBUG_MSG("nr_cmds=%u, nr_bos=%u", req.nr_cmds, req.nr_bos); - ret = drmCommandWriteRead(submit->pipe->dev->fd, DRM_MSM_GEM_SUBMIT, &req, + ret = drmCommandWriteRead(msm_pipe->base.dev->fd, DRM_MSM_GEM_SUBMIT, &req, sizeof(req)); if (ret) { ERROR_MSG("submit failed: %d (%s)", ret, strerror(errno)); msm_dump_submit(&req); - } else if (!ret && out_fence) { - out_fence->fence.kfence = req.fence; - out_fence->fence.ufence = submit->fence; - out_fence->fence_fd = req.fence_fd; + } else if (!ret && msm_submit->out_fence) { + msm_submit->out_fence->fence.kfence = req.fence; + msm_submit->out_fence->fence.ufence = msm_submit->base.fence; + msm_submit->out_fence->fence_fd = req.fence_fd; } if (!bos_on_stack) free(submit_bos); + if (msm_submit->in_fence_fd != -1) + close(msm_submit->in_fence_fd); + + fd_submit_del(&msm_submit->base); + return ret; } +static bool +should_defer(struct fd_submit *submit) +{ + struct msm_submit_sp *msm_submit = to_msm_submit_sp(submit); + + /* if too many bo's, it may not be worth the CPU cost of submit merging: */ + if (msm_submit->nr_bos > 30) + return false; + + /* On the kernel side, with 32K ringbuffer, we have an upper limit of 2k + * cmds before we exceed the size of the ringbuffer, which results in + * deadlock writing into the RB (ie. 
kernel doesn't finish writing into + * the RB so it doesn't kick the GPU to start consuming from the RB) + */ + if (submit->pipe->dev->deferred_cmds > 128) + return false; + + return true; +} + static int msm_submit_sp_flush(struct fd_submit *submit, int in_fence_fd, struct fd_submit_fence *out_fence) { - msm_submit_sp_flush_prep(submit); + struct fd_device *dev = submit->pipe->dev; + + /* Acquire lock before flush_prep() because it is possible to race between + * this and pipe->flush(): + */ + simple_mtx_lock(&dev->submit_lock); + + /* If there are deferred submits from another fd_pipe, flush them now, + * since we can't merge submits from different submitqueue's (ie. they + * could have different priority, etc) + */ + if (!list_is_empty(&dev->deferred_submits) && + (last_submit(&dev->deferred_submits)->pipe != submit->pipe)) { + struct list_head submit_list; + + list_replace(&dev->deferred_submits, &submit_list); + list_inithead(&dev->deferred_submits); + dev->deferred_cmds = 0; + + simple_mtx_unlock(&dev->submit_lock); + flush_submit_list(&submit_list); + simple_mtx_lock(&dev->submit_lock); + } + + list_addtail(&fd_submit_ref(submit)->node, &dev->deferred_submits); + + bool has_shared = msm_submit_sp_flush_prep(submit, in_fence_fd, out_fence); + + /* If we don't need an out-fence, we can defer the submit. + * + * TODO we could defer submits with in-fence as well.. 
if we took our own + * reference to the fd, and merged all the in-fence-fd's when we flush the + * deferred submits + */ + if ((in_fence_fd == -1) && !out_fence && !has_shared && should_defer(submit)) { + dev->deferred_cmds += fd_ringbuffer_cmd_count(submit->primary); + assert(dev->deferred_cmds == fd_dev_count_deferred_cmds(dev)); + simple_mtx_unlock(&dev->submit_lock); + + return 0; + } + + struct list_head submit_list; + + list_replace(&dev->deferred_submits, &submit_list); + list_inithead(&dev->deferred_submits); + dev->deferred_cmds = 0; + + simple_mtx_unlock(&dev->submit_lock); + + return flush_submit_list(&submit_list); +} + +void +msm_pipe_sp_flush(struct fd_pipe *pipe, uint32_t fence) +{ + struct fd_device *dev = pipe->dev; + struct list_head submit_list; + + list_inithead(&submit_list); + + simple_mtx_lock(&dev->submit_lock); + + foreach_submit_safe (deferred_submit, &dev->deferred_submits) { + /* We should never have submits from multiple pipes in the deferred + * list. If we did, we couldn't compare their fence to our fence, + * since each fd_pipe is an independent timeline. + */ + if (deferred_submit->pipe != pipe) + break; + + if (fd_fence_after(deferred_submit->fence, fence)) + break; + + list_del(&deferred_submit->node); + list_addtail(&deferred_submit->node, &submit_list); + dev->deferred_cmds -= fd_ringbuffer_cmd_count(deferred_submit->primary); + } + + assert(dev->deferred_cmds == fd_dev_count_deferred_cmds(dev)); + + simple_mtx_unlock(&dev->submit_lock); + + if (list_is_empty(&submit_list)) + return; - return msm_submit_sp_flush_finish(submit, in_fence_fd, out_fence); + flush_submit_list(&submit_list); } static void -- 2.7.4