From e9a9ac6f77fe876b84edfb638c3075d6a149c812 Mon Sep 17 00:00:00 2001
From: Rob Clark
Date: Mon, 19 Apr 2021 15:20:15 -0700
Subject: [PATCH] freedreno/drm: Async submit support

Move the submit ioctl to its own thread to unblock the driver thread
and let it move on to the next frame.

Note that I did experiment with doing the append_bo() parts
synchronously, on the theory that we should be more likely to hit the
fast path if we did that part of submit merging before the bo was
potentially re-used in the next batch/submit.  It helped some things
by a couple percent, but hurt more things.

Signed-off-by: Rob Clark
Part-of:
---
 src/freedreno/drm/freedreno_device.c            |   5 +-
 src/freedreno/drm/freedreno_ringbuffer.h        |   9 ++-
 src/freedreno/drm/msm_device.c                  |  14 +++-
 src/freedreno/drm/msm_priv.h                    |  11 ++-
 src/freedreno/drm/msm_ringbuffer_sp.c           | 102 ++++++++++++++++++++++--
 src/gallium/drivers/freedreno/freedreno_fence.c |  37 ++++++++-
 src/gallium/drivers/freedreno/freedreno_fence.h |   6 ++
 7 files changed, 169 insertions(+), 15 deletions(-)

diff --git a/src/freedreno/drm/freedreno_device.c b/src/freedreno/drm/freedreno_device.c
index f9d4293..dc01f65 100644
--- a/src/freedreno/drm/freedreno_device.c
+++ b/src/freedreno/drm/freedreno_device.c
@@ -33,8 +33,7 @@
 #include "freedreno_drmif.h"
 #include "freedreno_priv.h"
 
-struct fd_device *kgsl_device_new(int fd);
-struct fd_device *msm_device_new(int fd);
+struct fd_device *msm_device_new(int fd, drmVersionPtr version);
 
 struct fd_device *
 fd_device_new(int fd)
@@ -58,7 +57,7 @@ fd_device_new(int fd)
          goto out;
       }
 
-      dev = msm_device_new(fd);
+      dev = msm_device_new(fd, version);
       dev->version = version->version_minor;
 #if HAVE_FREEDRENO_KGSL
    } else if (!strcmp(version->name, "kgsl")) {
diff --git a/src/freedreno/drm/freedreno_ringbuffer.h b/src/freedreno/drm/freedreno_ringbuffer.h
index 3cf1343..8e6b3e9 100644
--- a/src/freedreno/drm/freedreno_ringbuffer.h
+++ b/src/freedreno/drm/freedreno_ringbuffer.h
@@ -30,7 +30,7 @@
 #include <stdio.h>
 #include "util/u_atomic.h"
 #include "util/u_debug.h"
-#include "util/u_dynarray.h"
+#include "util/u_queue.h"
 
 #include "adreno_common.xml.h"
 #include "adreno_pm4.xml.h"
@@ -98,6 +98,13 @@ struct fd_ringbuffer *fd_submit_new_ringbuffer(struct fd_submit *submit,
  * out-fence-fd
  */
 struct fd_submit_fence {
+   /**
+    * The ready fence is signaled once the submit is actually flushed down
+    * to the kernel, and fence/fence_fd are populated.  You must wait for
+    * this fence to be signaled before reading fence/fence_fd.
+    */
+   struct util_queue_fence ready;
+
    struct fd_fence fence;
 
    /**
diff --git a/src/freedreno/drm/msm_device.c b/src/freedreno/drm/msm_device.c
index 7922ea0..6af709f 100644
--- a/src/freedreno/drm/msm_device.c
+++ b/src/freedreno/drm/msm_device.c
@@ -34,6 +34,9 @@ static void
 msm_device_destroy(struct fd_device *dev)
 {
    struct msm_device *msm_dev = to_msm_device(dev);
+   if (util_queue_is_initialized(&msm_dev->submit_queue)) {
+      util_queue_destroy(&msm_dev->submit_queue);
+   }
    free(msm_dev);
 }
 
@@ -45,7 +48,7 @@ static const struct fd_device_funcs funcs = {
 };
 
 struct fd_device *
-msm_device_new(int fd)
+msm_device_new(int fd, drmVersionPtr version)
 {
    struct msm_device *msm_dev;
    struct fd_device *dev;
@@ -61,6 +64,15 @@ msm_device_new(int fd)
    dev = &msm_dev->base;
    dev->funcs = &funcs;
 
+   /* async submit_queue currently only used for msm_submit_sp: */
+   if (version->version_minor >= FD_VERSION_SOFTPIN) {
+      /* Note the name is intentionally short to avoid the queue
+       * thread's comm truncating the interesting part of the
+       * process name.
+       */
+      util_queue_init(&msm_dev->submit_queue, "sq", 8, 1, 0);
+   }
+
    dev->bo_size = sizeof(struct msm_bo);
 
    return dev;
diff --git a/src/freedreno/drm/msm_priv.h b/src/freedreno/drm/msm_priv.h
index 76c7f2f..e4fe084 100644
--- a/src/freedreno/drm/msm_priv.h
+++ b/src/freedreno/drm/msm_priv.h
@@ -40,10 +40,11 @@ struct msm_device {
    struct fd_device base;
 
    struct fd_bo_cache ring_cache;
+   struct util_queue submit_queue;
 };
 FD_DEFINE_CAST(fd_device, msm_device);
 
-struct fd_device *msm_device_new(int fd);
+struct fd_device *msm_device_new(int fd, drmVersionPtr version);
 
 struct msm_pipe {
    struct fd_pipe base;
@@ -54,6 +55,14 @@ struct msm_pipe {
    uint32_t chip_id;
    uint32_t queue_id;
    struct slab_parent_pool ring_pool;
+
+   /**
+    * The last fence seqno that was flushed to the kernel (doesn't mean
+    * that it is complete, just that the kernel knows about it)
+    */
+   uint32_t last_submit_fence;
+
+   uint32_t last_enqueue_fence;   /* just for debugging */
 };
 FD_DEFINE_CAST(fd_pipe, msm_pipe);
diff --git a/src/freedreno/drm/msm_ringbuffer_sp.c b/src/freedreno/drm/msm_ringbuffer_sp.c
index 1ebbc86..809c480 100644
--- a/src/freedreno/drm/msm_ringbuffer_sp.c
+++ b/src/freedreno/drm/msm_ringbuffer_sp.c
@@ -26,6 +26,7 @@
 
 #include <assert.h>
 #include <inttypes.h>
+#include <pthread.h>
 
 #include "util/hash_table.h"
 #include "util/os_file.h"
@@ -41,6 +42,14 @@
 
 #define INIT_SIZE 0x1000
 
+/* In the pipe->flush() path, we don't have a util_queue_fence we can wait
+ * on, so we use a condition variable instead.  Note that pipe->flush() is
+ * not expected to be a common/hot path.
+ */
+static pthread_cond_t flush_cnd = PTHREAD_COND_INITIALIZER;
+static pthread_mutex_t flush_mtx = PTHREAD_MUTEX_INITIALIZER;
+
+
 struct msm_submit_sp {
    struct fd_submit base;
 
@@ -64,6 +73,13 @@ struct msm_submit_sp {
     */
    int in_fence_fd;
    struct fd_submit_fence *out_fence;
+
+   /* State for enqueued submits:
+    */
+   struct list_head submit_list;   /* includes this submit as last element */
+
+   /* Used in case out_fence==NULL: */
+   struct util_queue_fence fence;
 };
 FD_DEFINE_CAST(fd_submit, msm_submit_sp);
 
@@ -369,14 +385,65 @@ flush_submit_list(struct list_head *submit_list)
    if (!bos_on_stack)
       free(submit_bos);
 
+   pthread_mutex_lock(&flush_mtx);
+   assert(fd_fence_before(msm_pipe->last_submit_fence, msm_submit->base.fence));
+   msm_pipe->last_submit_fence = msm_submit->base.fence;
+   pthread_cond_broadcast(&flush_cnd);
+   pthread_mutex_unlock(&flush_mtx);
+
    if (msm_submit->in_fence_fd != -1)
       close(msm_submit->in_fence_fd);
 
-   fd_submit_del(&msm_submit->base);
-
    return ret;
 }
 
+static void
+msm_submit_sp_flush_execute(void *job, int thread_index)
+{
+   struct fd_submit *submit = job;
+   struct msm_submit_sp *msm_submit = to_msm_submit_sp(submit);
+
+   flush_submit_list(&msm_submit->submit_list);
+
+   DEBUG_MSG("finish: %u", submit->fence);
+}
+
+static void
+msm_submit_sp_flush_cleanup(void *job, int thread_index)
+{
+   struct fd_submit *submit = job;
+   fd_submit_del(submit);
+}
+
+static int
+enqueue_submit_list(struct list_head *submit_list)
+{
+   struct fd_submit *submit = last_submit(submit_list);
+   struct msm_submit_sp *msm_submit = to_msm_submit_sp(submit);
+   struct msm_device *msm_dev = to_msm_device(submit->pipe->dev);
+
+   list_replace(submit_list, &msm_submit->submit_list);
+   list_inithead(submit_list);
+
+   struct util_queue_fence *fence;
+   if (msm_submit->out_fence) {
+      fence = &msm_submit->out_fence->ready;
+   } else {
+      util_queue_fence_init(&msm_submit->fence);
+      fence = &msm_submit->fence;
+   }
+
+   DEBUG_MSG("enqueue: %u", submit->fence);
+
+   util_queue_add_job(&msm_dev->submit_queue,
+                      submit,
+                      fence,
+                      msm_submit_sp_flush_execute,
+                      msm_submit_sp_flush_cleanup,
+                      0);
+
+   return 0;
+}
+
 static bool
 should_defer(struct fd_submit *submit)
 {
@@ -402,6 +469,7 @@ msm_submit_sp_flush(struct fd_submit *submit, int in_fence_fd,
                     struct fd_submit_fence *out_fence)
 {
    struct fd_device *dev = submit->pipe->dev;
+   struct msm_pipe *msm_pipe = to_msm_pipe(submit->pipe);
 
    /* Acquire lock before flush_prep() because it is possible to race between
     * this and pipe->flush():
@@ -420,15 +488,16 @@ msm_submit_sp_flush(struct fd_submit *submit, int in_fence_fd,
       list_inithead(&dev->deferred_submits);
       dev->deferred_cmds = 0;
 
-      simple_mtx_unlock(&dev->submit_lock);
-      flush_submit_list(&submit_list);
-      simple_mtx_lock(&dev->submit_lock);
+      enqueue_submit_list(&submit_list);
    }
 
    list_addtail(&fd_submit_ref(submit)->node, &dev->deferred_submits);
 
    bool has_shared = msm_submit_sp_flush_prep(submit, in_fence_fd, out_fence);
 
+   assert(fd_fence_before(msm_pipe->last_enqueue_fence, submit->fence));
+   msm_pipe->last_enqueue_fence = submit->fence;
+
    /* If we don't need an out-fence, we can defer the submit.
    *
    * TODO we could defer submits with in-fence as well.. if we took our own
@@ -436,6 +505,7 @@ msm_submit_sp_flush(struct fd_submit *submit, int in_fence_fd,
    * deferred submits
    */
    if ((in_fence_fd == -1) && !out_fence && !has_shared && should_defer(submit)) {
+      DEBUG_MSG("defer: %u", submit->fence);
      dev->deferred_cmds += fd_ringbuffer_cmd_count(submit->primary);
      assert(dev->deferred_cmds == fd_dev_count_deferred_cmds(dev));
      simple_mtx_unlock(&dev->submit_lock);
@@ -451,19 +521,24 @@ msm_submit_sp_flush(struct fd_submit *submit, int in_fence_fd,
 
    simple_mtx_unlock(&dev->submit_lock);
 
-   return flush_submit_list(&submit_list);
+   return enqueue_submit_list(&submit_list);
 }
 
 void
 msm_pipe_sp_flush(struct fd_pipe *pipe, uint32_t fence)
 {
+   struct msm_pipe *msm_pipe = to_msm_pipe(pipe);
    struct fd_device *dev = pipe->dev;
    struct list_head submit_list;
 
+   DEBUG_MSG("flush: %u", fence);
+
    list_inithead(&submit_list);
 
    simple_mtx_lock(&dev->submit_lock);
 
+   assert(!fd_fence_after(fence, msm_pipe->last_enqueue_fence));
+
    foreach_submit_safe (deferred_submit, &dev->deferred_submits) {
      /* We should never have submits from multiple pipes in the deferred
      * list.  If we did, we couldn't compare their fence to our fence,
@@ -485,9 +560,20 @@ msm_pipe_sp_flush(struct fd_pipe *pipe, uint32_t fence)
    simple_mtx_unlock(&dev->submit_lock);
 
    if (list_is_empty(&submit_list))
-      return;
+      goto flush_sync;
+
+   enqueue_submit_list(&submit_list);
 
-   flush_submit_list(&submit_list);
+flush_sync:
+   /* Once we are sure that we've enqueued at least up to the requested
+    * submit, we need to be sure that the submit queue has caught up and
+    * flushed them to the kernel.
+    */
+   pthread_mutex_lock(&flush_mtx);
+   while (fd_fence_before(msm_pipe->last_submit_fence, fence)) {
+      pthread_cond_wait(&flush_cnd, &flush_mtx);
+   }
+   pthread_mutex_unlock(&flush_mtx);
 }
 
 static void
diff --git a/src/gallium/drivers/freedreno/freedreno_fence.c b/src/gallium/drivers/freedreno/freedreno_fence.c
index a4d4be9..5279efe 100644
--- a/src/gallium/drivers/freedreno/freedreno_fence.c
+++ b/src/gallium/drivers/freedreno/freedreno_fence.c
@@ -59,6 +59,16 @@ fence_flush(struct pipe_context *pctx, struct pipe_fence_handle *fence,
       }
    }
 
+   /* If, after the pre-created unflushed fence is flushed, we end up
+    * re-populated from a previous last_fence, then *that* is the one
+    * whose submit_fence.ready we want to wait on:
+    */
+   if (fence->last_fence) {
+      return fence_flush(pctx, fence->last_fence, timeout);
+   }
+
+   util_queue_fence_wait(&fence->submit_fence.ready);
+
    /* We've already waited for batch to be flushed and fence->batch
    * to be cleared:
    */
@@ -69,6 +79,8 @@ fence_flush(struct pipe_context *pctx, struct pipe_fence_handle *fence,
    if (fence->batch)
       fd_batch_flush(fence->batch);
 
+   util_queue_fence_wait(&fence->submit_fence.ready);
+
    debug_assert(!fence->batch);
 
    return true;
@@ -81,19 +93,37 @@ fd_fence_repopulate(struct pipe_fence_handle *fence, struct pipe_fence_handle *l
    * might have been)
    */
   assert(!fence->submit_fence.use_fence_fd);
+   assert(!last_fence->batch);
 
   fence->submit_fence.fence = last_fence->submit_fence.fence;
+
+   fd_fence_ref(&fence->last_fence, last_fence);
+
+   /* We have nothing to flush, so nothing will clear the batch reference
+    * (which is normally done when the batch is flushed), so do it now:
+    */
+   fd_fence_set_batch(fence, NULL);
 }
 
 static void
 fd_fence_destroy(struct pipe_fence_handle *fence)
 {
+   fd_fence_ref(&fence->last_fence, NULL);
+
    tc_unflushed_batch_token_reference(&fence->tc_token, NULL);
 
    if (fence->submit_fence.use_fence_fd)
       close(fence->submit_fence.fence_fd);
 
   if (fence->syncobj)
      drmSyncobjDestroy(fd_device_fd(fence->screen->dev), fence->syncobj);
 
   fd_pipe_del(fence->pipe);
+
+   /* TODO might be worth trying harder to avoid a potential stall here,
+    * but that would require the submit somehow holding a reference to
+    * the pipe_fence_handle.. and I'm not sure if it is a thing that is
+    * likely to matter much.
+    */
+   util_queue_fence_wait(&fence->submit_fence.ready);
+
    FREE(fence);
 }
 
@@ -113,6 +143,9 @@ fd_fence_finish(struct pipe_screen *pscreen, struct pipe_context *pctx,
    if (!fence_flush(pctx, fence, timeout))
       return false;
 
+   if (fence->last_fence)
+      fence = fence->last_fence;
+
    if (fence->submit_fence.use_fence_fd) {
       int ret = sync_wait(fence->submit_fence.fence_fd, timeout / 1000000);
       return ret == 0;
@@ -136,9 +169,10 @@ fence_create(struct fd_context *ctx, struct fd_batch *batch, int fence_fd,
 
    pipe_reference_init(&fence->reference, 1);
    util_queue_fence_init(&fence->ready);
+   util_queue_fence_init(&fence->submit_fence.ready);
 
    fence->ctx = ctx;
-   fence->batch = batch;
+   fd_fence_set_batch(fence, batch);
    fence->pipe = fd_pipe_ref(ctx->pipe);
    fence->screen = ctx->screen;
    fence->submit_fence.fence_fd = fence_fd;
@@ -237,6 +271,7 @@ fd_fence_set_batch(struct pipe_fence_handle *fence, struct fd_batch *batch)
    if (batch) {
       assert(!fence->batch);
       fence->batch = batch;
+      batch->needs_flush = true;
    } else {
       fence->batch = NULL;
diff --git a/src/gallium/drivers/freedreno/freedreno_fence.h b/src/gallium/drivers/freedreno/freedreno_fence.h
index 947ac97..4d9efb8 100644
--- a/src/gallium/drivers/freedreno/freedreno_fence.h
+++ b/src/gallium/drivers/freedreno/freedreno_fence.h
@@ -35,6 +35,12 @@ struct pipe_fence_handle {
    struct pipe_reference reference;
 
+   /* When a pre-created unflushed fence has no actual rendering to flush,
+    * and the last_fence optimization is used, this will be a reference to
+    * the *actual* fence which needs to be flushed before waiting.
+    */
+   struct pipe_fence_handle *last_fence;
+
    /* fence holds a weak reference to the batch until the batch is flushed, to
    * accommodate PIPE_FLUSH_DEFERRED.  When the batch is actually flushed, it
    * is cleared (before the batch reference is dropped).  If we need to wait
-- 
2.7.4
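
For readers unfamiliar with the enqueue/flush split above, here is a
minimal, self-contained sketch of the pattern this patch builds on.  It
is plain pthreads, not mesa code, and all names are hypothetical: the
driver thread enqueues submits without blocking, a single queue thread
flushes them to the kernel in order, and a synchronous flush() sleeps on
a condition variable until the queue thread's published
last_submit_fence catches up, mirroring the flush_cnd/flush_mtx scheme
used by msm_pipe_sp_flush() in the patch.

/* Sketch: single-threaded submit queue with fence-based flush sync.
 * Hypothetical names; the real driver uses util_queue and compares
 * fences with fd_fence_before() to handle seqno wraparound.
 */
#include <pthread.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

struct job {
   uint32_t fence;   /* seqno assigned at enqueue time */
   struct job *next;
};

static struct job *head, *tail;     /* FIFO of enqueued submits */
static uint32_t last_submit_fence;  /* last seqno flushed to the "kernel" */
static bool shutting_down;
static pthread_mutex_t mtx = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t job_cnd = PTHREAD_COND_INITIALIZER;   /* new work */
static pthread_cond_t flush_cnd = PTHREAD_COND_INITIALIZER; /* progress */

static void *
queue_thread(void *arg)
{
   (void)arg;
   pthread_mutex_lock(&mtx);
   while (!shutting_down || head) {
      if (!head) {
         pthread_cond_wait(&job_cnd, &mtx);
         continue;
      }
      struct job *job = head;
      head = job->next;
      if (!head)
         tail = NULL;
      /* Drop the lock for the slow part (stands in for the submit ioctl): */
      pthread_mutex_unlock(&mtx);
      printf("submit: %u\n", job->fence);
      pthread_mutex_lock(&mtx);
      /* Publish progress and wake anyone blocked in flush(): */
      last_submit_fence = job->fence;
      pthread_cond_broadcast(&flush_cnd);
      free(job);
   }
   pthread_mutex_unlock(&mtx);
   return NULL;
}

/* Driver-thread side: hand a submit to the queue without blocking. */
static uint32_t
enqueue(void)
{
   static uint32_t next_fence;
   struct job *job = calloc(1, sizeof(*job));

   pthread_mutex_lock(&mtx);
   uint32_t fence = job->fence = ++next_fence;
   if (tail)
      tail->next = job;
   else
      head = job;
   tail = job;
   pthread_cond_signal(&job_cnd);
   pthread_mutex_unlock(&mtx);

   return fence;
}

/* Like msm_pipe_sp_flush(): block until everything up to 'fence' has
 * actually been handed to the kernel by the queue thread.
 */
static void
flush(uint32_t fence)
{
   pthread_mutex_lock(&mtx);
   while (last_submit_fence < fence)   /* real code: fd_fence_before() */
      pthread_cond_wait(&flush_cnd, &mtx);
   pthread_mutex_unlock(&mtx);
}

int
main(void)
{
   pthread_t t;
   pthread_create(&t, NULL, queue_thread, NULL);

   uint32_t last = 0;
   for (int i = 0; i < 4; i++)
      last = enqueue();
   flush(last);   /* returns only once "submit: 4" has been flushed */

   pthread_mutex_lock(&mtx);
   shutting_down = true;
   pthread_cond_signal(&job_cnd);
   pthread_mutex_unlock(&mtx);
   pthread_join(t, NULL);
   return 0;
}

Note the same design choice as the patch: the per-job fence (out_fence->ready
in the real code) serves callers who care about one specific submit, while the
shared condition variable serves pipe->flush(), which only needs "everything
up to seqno N", and is acceptable there because that path is not hot.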