From cccdc513e3ee58b4119e9686c3086b606500d6a9 Mon Sep 17 00:00:00 2001 From: Rob Clark Date: Sat, 17 Apr 2021 12:51:02 -0700 Subject: [PATCH] freedreno/drm/sp: Implement deferred submit merging For submits flushed with (a) no required fence, and (b) no externally visible effects (ie. imported/exported bo), we can defer flushing the submit and merge it into a later submit. This is a bit more work in userspace, but it cuts down the number of submit ioctls. And a common case is that later submits overlap in the bo's used (for example, blit upload to a buffer, which is then used in the following draw pass), so it reduces the net amount of work needed to be done in the kernel to handle the submit ioctl. Closes: https://gitlab.freedesktop.org/mesa/mesa/-/issues/19 Signed-off-by: Rob Clark Part-of: --- src/freedreno/drm/msm_pipe.c | 1 + src/freedreno/drm/msm_priv.h | 1 + src/freedreno/drm/msm_ringbuffer_sp.c | 234 +++++++++++++++++++++++++++++----- 3 files changed, 204 insertions(+), 32 deletions(-) diff --git a/src/freedreno/drm/msm_pipe.c b/src/freedreno/drm/msm_pipe.c index 8de3c5c..8793494 100644 --- a/src/freedreno/drm/msm_pipe.c +++ b/src/freedreno/drm/msm_pipe.c @@ -177,6 +177,7 @@ msm_pipe_destroy(struct fd_pipe *pipe) static const struct fd_pipe_funcs sp_funcs = { .ringbuffer_new_object = msm_ringbuffer_sp_new_object, .submit_new = msm_submit_sp_new, + .flush = msm_pipe_sp_flush, .get_param = msm_pipe_get_param, .wait = msm_pipe_wait, .destroy = msm_pipe_destroy, diff --git a/src/freedreno/drm/msm_priv.h b/src/freedreno/drm/msm_priv.h index c04fa79..76c7f2f 100644 --- a/src/freedreno/drm/msm_priv.h +++ b/src/freedreno/drm/msm_priv.h @@ -67,6 +67,7 @@ struct fd_ringbuffer *msm_ringbuffer_sp_new_object(struct fd_pipe *pipe, struct fd_submit *msm_submit_new(struct fd_pipe *pipe); struct fd_submit *msm_submit_sp_new(struct fd_pipe *pipe); +void msm_pipe_sp_flush(struct fd_pipe *pipe, uint32_t fence); void msm_pipe_sp_ringpool_init(struct msm_pipe *msm_pipe); void 
msm_pipe_sp_ringpool_fini(struct msm_pipe *msm_pipe); diff --git a/src/freedreno/drm/msm_ringbuffer_sp.c b/src/freedreno/drm/msm_ringbuffer_sp.c index e6c8418..f8becaf 100644 --- a/src/freedreno/drm/msm_ringbuffer_sp.c +++ b/src/freedreno/drm/msm_ringbuffer_sp.c @@ -28,6 +28,7 @@ #include #include "util/hash_table.h" +#include "util/os_file.h" #include "util/slab.h" #include "drm/freedreno_ringbuffer.h" @@ -57,6 +58,12 @@ struct msm_submit_sp { * so we can reclaim extra space at it's end. */ struct fd_ringbuffer *suballoc_ring; + + /* Flush args, potentially attached to the last submit in the list + * of submits to merge: + */ + int in_fence_fd; + struct fd_submit_fence *out_fence; }; FD_DEFINE_CAST(fd_submit, msm_submit_sp); @@ -108,7 +115,7 @@ msm_submit_append_bo(struct msm_submit_sp *submit, struct fd_bo *bo) /* NOTE: it is legal to use the same bo on different threads for * different submits. But it is not legal to use the same submit - * from given threads. + * from different threads. 
*/ idx = READ_ONCE(msm_bo->idx); @@ -213,10 +220,12 @@ msm_submit_sp_new_ringbuffer(struct fd_submit *submit, uint32_t size, * 2) Add cmdstream bos to bos table * 3) Update bo fences */ -static void -msm_submit_sp_flush_prep(struct fd_submit *submit) +static bool +msm_submit_sp_flush_prep(struct fd_submit *submit, int in_fence_fd, + struct fd_submit_fence *out_fence) { struct msm_submit_sp *msm_submit = to_msm_submit_sp(submit); + bool has_shared = false; finalize_current_cmd(submit->primary); @@ -227,43 +236,93 @@ msm_submit_sp_flush_prep(struct fd_submit *submit) msm_submit_append_bo(msm_submit, primary->u.cmds[i].ring_bo); simple_mtx_lock(&table_lock); - for (unsigned i = 0; i < msm_submit->nr_bos; i++) + for (unsigned i = 0; i < msm_submit->nr_bos; i++) { fd_bo_add_fence(msm_submit->bos[i], submit->pipe, submit->fence); + has_shared |= msm_submit->bos[i]->shared; + } simple_mtx_unlock(&table_lock); + + msm_submit->out_fence = out_fence; + msm_submit->in_fence_fd = (in_fence_fd == -1) ? 
+ -1 : os_dupfd_cloexec(in_fence_fd); + + return has_shared; } static int -msm_submit_sp_flush_finish(struct fd_submit *submit, int in_fence_fd, - struct fd_submit_fence *out_fence) +flush_submit_list(struct list_head *submit_list) { - struct msm_submit_sp *msm_submit = to_msm_submit_sp(submit); - struct msm_pipe *msm_pipe = to_msm_pipe(submit->pipe); + struct msm_submit_sp *msm_submit = to_msm_submit_sp(last_submit(submit_list)); + struct msm_pipe *msm_pipe = to_msm_pipe(msm_submit->base.pipe); struct drm_msm_gem_submit req = { .flags = msm_pipe->pipe, .queueid = msm_pipe->queue_id, }; int ret; - struct msm_ringbuffer_sp *primary = - to_msm_ringbuffer_sp(submit->primary); - struct drm_msm_gem_submit_cmd cmds[primary->u.nr_cmds]; - - for (unsigned i = 0; i < primary->u.nr_cmds; i++) { - cmds[i].type = MSM_SUBMIT_CMD_BUF; - cmds[i].submit_idx = - msm_submit_append_bo(msm_submit, primary->u.cmds[i].ring_bo); - cmds[i].submit_offset = primary->offset; - cmds[i].size = primary->u.cmds[i].size; - cmds[i].pad = 0; - cmds[i].nr_relocs = 0; + unsigned nr_cmds = 0; + + /* Determine the number of extra cmds from deferred submits that + * we will be merging in: + */ + foreach_submit (submit, submit_list) { + assert(submit->pipe == &msm_pipe->base); + nr_cmds += to_msm_ringbuffer_sp(submit->primary)->u.nr_cmds; + } + + struct drm_msm_gem_submit_cmd cmds[nr_cmds]; + + unsigned cmd_idx = 0; + + /* Build up the table of cmds, and for all but the last submit in the + * list, merge their bo tables into the last submit.
+ */ + foreach_submit_safe (submit, submit_list) { + struct msm_ringbuffer_sp *deferred_primary = + to_msm_ringbuffer_sp(submit->primary); + + for (unsigned i = 0; i < deferred_primary->u.nr_cmds; i++) { + cmds[cmd_idx].type = MSM_SUBMIT_CMD_BUF; + cmds[cmd_idx].submit_idx = + msm_submit_append_bo(msm_submit, deferred_primary->u.cmds[i].ring_bo); + cmds[cmd_idx].submit_offset = deferred_primary->offset; + cmds[cmd_idx].size = deferred_primary->u.cmds[i].size; + cmds[cmd_idx].pad = 0; + cmds[cmd_idx].nr_relocs = 0; + + cmd_idx++; + } + + /* We are merging all the submits in the list into the last submit, + * so the remainder of the loop body doesn't apply to the last submit + */ + if (submit == last_submit(submit_list)) { + DEBUG_MSG("merged %u submits", cmd_idx); + break; + } + + struct msm_submit_sp *msm_deferred_submit = to_msm_submit_sp(submit); + for (unsigned i = 0; i < msm_deferred_submit->nr_bos; i++) { + /* Note: if bo is used in both the current submit and the deferred + * submit being merged, we expect to hit the fast-path as we add it + * to the current submit: + */ + msm_submit_append_bo(msm_submit, msm_deferred_submit->bos[i]); + } + + /* Now that the cmds/bos have been transferred over to the current submit, + * we can remove the deferred submit from the list and drop its reference + */ + list_del(&submit->node); + fd_submit_del(submit); } - if (in_fence_fd != -1) { + if (msm_submit->in_fence_fd != -1) { req.flags |= MSM_SUBMIT_FENCE_FD_IN | MSM_SUBMIT_NO_IMPLICIT; - req.fence_fd = in_fence_fd; + req.fence_fd = msm_submit->in_fence_fd; } - if (out_fence && out_fence->use_fence_fd) { + if (msm_submit->out_fence && msm_submit->out_fence->use_fence_fd) { req.flags |= MSM_SUBMIT_FENCE_FD_OUT; } @@ -289,35 +348,146 @@ msm_submit_sp_flush_finish(struct fd_submit *submit, int in_fence_fd, struct fd_submit_fence *out_fence) submit_bos[i].presumed = 0; } - req.bos = 
VOID2U64(submit_bos); + req.nr_bos = msm_submit->nr_bos; + req.cmds = VOID2U64(cmds); + req.nr_cmds = nr_cmds; DEBUG_MSG("nr_cmds=%u, nr_bos=%u", req.nr_cmds, req.nr_bos); - ret = drmCommandWriteRead(submit->pipe->dev->fd, DRM_MSM_GEM_SUBMIT, &req, + ret = drmCommandWriteRead(msm_pipe->base.dev->fd, DRM_MSM_GEM_SUBMIT, &req, sizeof(req)); if (ret) { ERROR_MSG("submit failed: %d (%s)", ret, strerror(errno)); msm_dump_submit(&req); - } else if (!ret && out_fence) { - out_fence->fence.kfence = req.fence; - out_fence->fence.ufence = submit->fence; - out_fence->fence_fd = req.fence_fd; + } else if (!ret && msm_submit->out_fence) { + msm_submit->out_fence->fence.kfence = req.fence; + msm_submit->out_fence->fence.ufence = msm_submit->base.fence; + msm_submit->out_fence->fence_fd = req.fence_fd; } if (!bos_on_stack) free(submit_bos); + if (msm_submit->in_fence_fd != -1) + close(msm_submit->in_fence_fd); + + fd_submit_del(&msm_submit->base); + return ret; } +static bool +should_defer(struct fd_submit *submit) +{ + struct msm_submit_sp *msm_submit = to_msm_submit_sp(submit); + + /* if too many bo's, it may not be worth the CPU cost of submit merging: */ + if (msm_submit->nr_bos > 30) + return false; + + /* On the kernel side, with 32K ringbuffer, we have an upper limit of 2k + * cmds before we exceed the size of the ringbuffer, which results in + * deadlock writing into the RB (ie. 
kernel doesn't finish writing into + * the RB so it doesn't kick the GPU to start consuming from the RB) + */ + if (submit->pipe->dev->deferred_cmds > 128) + return false; + + return true; +} + static int msm_submit_sp_flush(struct fd_submit *submit, int in_fence_fd, struct fd_submit_fence *out_fence) { - msm_submit_sp_flush_prep(submit); + struct fd_device *dev = submit->pipe->dev; + + /* Acquire lock before flush_prep() because it is possible to race between + * this and pipe->flush(): + */ + simple_mtx_lock(&dev->submit_lock); + + /* If there are deferred submits from another fd_pipe, flush them now, + * since we can't merge submits from different submitqueue's (ie. they + * could have different priority, etc) + */ + if (!list_is_empty(&dev->deferred_submits) && + (last_submit(&dev->deferred_submits)->pipe != submit->pipe)) { + struct list_head submit_list; + + list_replace(&dev->deferred_submits, &submit_list); + list_inithead(&dev->deferred_submits); + dev->deferred_cmds = 0; + + simple_mtx_unlock(&dev->submit_lock); + flush_submit_list(&submit_list); + simple_mtx_lock(&dev->submit_lock); + } + + list_addtail(&fd_submit_ref(submit)->node, &dev->deferred_submits); + + bool has_shared = msm_submit_sp_flush_prep(submit, in_fence_fd, out_fence); + + /* If we don't need an out-fence, we can defer the submit. + * + * TODO we could defer submits with in-fence as well.. 
if we took our own + * reference to the fd, and merged all the in-fence-fd's when we flush the + * deferred submits + */ + if ((in_fence_fd == -1) && !out_fence && !has_shared && should_defer(submit)) { + dev->deferred_cmds += fd_ringbuffer_cmd_count(submit->primary); + assert(dev->deferred_cmds == fd_dev_count_deferred_cmds(dev)); + simple_mtx_unlock(&dev->submit_lock); + + return 0; + } + + struct list_head submit_list; + + list_replace(&dev->deferred_submits, &submit_list); + list_inithead(&dev->deferred_submits); + dev->deferred_cmds = 0; + + simple_mtx_unlock(&dev->submit_lock); + + return flush_submit_list(&submit_list); +} + +void +msm_pipe_sp_flush(struct fd_pipe *pipe, uint32_t fence) +{ + struct fd_device *dev = pipe->dev; + struct list_head submit_list; + + list_inithead(&submit_list); + + simple_mtx_lock(&dev->submit_lock); + + foreach_submit_safe (deferred_submit, &dev->deferred_submits) { + /* We should never have submits from multiple pipes in the deferred + * list. If we did, we couldn't compare their fence to our fence, + * since each fd_pipe is an independent timeline. + */ + if (deferred_submit->pipe != pipe) + break; + + if (fd_fence_after(deferred_submit->fence, fence)) + break; + + list_del(&deferred_submit->node); + list_addtail(&deferred_submit->node, &submit_list); + dev->deferred_cmds -= fd_ringbuffer_cmd_count(deferred_submit->primary); + } + + assert(dev->deferred_cmds == fd_dev_count_deferred_cmds(dev)); + + simple_mtx_unlock(&dev->submit_lock); + + if (list_is_empty(&submit_list)) + return; - return msm_submit_sp_flush_finish(submit, in_fence_fd, out_fence); + flush_submit_list(&submit_list); } static void -- 2.7.4