From 57a254c48de4533e1ea4169b1d2532458814a2c7 Mon Sep 17 00:00:00 2001
From: Iago Toral Quiroga
Date: Mon, 18 May 2020 10:41:11 +0200
Subject: [PATCH] v3dv: implement events

This reverts a previous half-attempt at an implementation of events
using a BO to hold the event state, and provides a full implementation.

V3D doesn't have any built-in GPU functionality to wait on any kind of
events, so we need to implement this in the driver and therefore we no
longer need to use a BO for the event state. Instead, we implement GPU
waits by using a CPU job for the wait operation that spawns a wait
thread if the wait operation doesn't have all its events signaled by
the time it is processed.

To implement the semantics of the wait correctly, any jobs in the same
command buffer that come after the wait will not be emitted until the
wait thread completes.

If a submit spawns any wait threads for a command buffer we can't
signal any semaphores for it until all the wait threads complete and we
know that all the jobs for those command buffers have been submitted.
The same applies to the submit fence, if present.

Part-of: 
---
 src/broadcom/vulkan/v3dv_cmd_buffer.c | 139 ++++++++--
 src/broadcom/vulkan/v3dv_device.c     |  46 +---
 src/broadcom/vulkan/v3dv_private.h    |  70 ++++-
 src/broadcom/vulkan/v3dv_queue.c      | 496 +++++++++++++++++++++++++++++++---
 4 files changed, 640 insertions(+), 111 deletions(-)

diff --git a/src/broadcom/vulkan/v3dv_cmd_buffer.c b/src/broadcom/vulkan/v3dv_cmd_buffer.c
index 73c19e2..4afd8f2 100644
--- a/src/broadcom/vulkan/v3dv_cmd_buffer.c
+++ b/src/broadcom/vulkan/v3dv_cmd_buffer.c
@@ -130,6 +130,7 @@ cmd_buffer_init(struct v3dv_cmd_buffer *cmd_buffer,
 
    list_inithead(&cmd_buffer->private_objs);
    list_inithead(&cmd_buffer->submit_jobs);
+   list_inithead(&cmd_buffer->list_link);
 
    assert(pool);
    list_addtail(&cmd_buffer->pool_link, &pool->cmd_buffers);
@@ -161,6 +162,42 @@ cmd_buffer_create(struct v3dv_device *device,
    return VK_SUCCESS;
 }
 
+static void
+job_destroy_gpu_cl_resources(struct v3dv_job *job)
+{
+   assert(job->type == V3DV_JOB_TYPE_GPU_CL);
+
+   v3dv_cl_destroy(&job->bcl);
+   v3dv_cl_destroy(&job->rcl);
+   v3dv_cl_destroy(&job->indirect);
+
+   /* Since we don't ref BOs when we add them to the command buffer, don't
+    * unref them here either. BOs will be freed when their corresponding API
+    * objects are destroyed.
+    */
+   _mesa_set_destroy(job->bos, NULL);
+
+   /* Extra BOs need to be destroyed with the job, since they were created
+    * internally by the driver for it.
+    */
+   set_foreach(job->extra_bos, entry) {
+      struct v3dv_bo *bo = (struct v3dv_bo *)entry->key;
+      v3dv_bo_free(job->device, bo);
+   }
+   _mesa_set_destroy(job->extra_bos, NULL);
+
+   v3dv_bo_free(job->device, job->tile_alloc);
+   v3dv_bo_free(job->device, job->tile_state);
+}
+
+static void
+job_destroy_cpu_wait_events_resources(struct v3dv_job *job)
+{
+   assert(job->type == V3DV_JOB_TYPE_CPU_WAIT_EVENTS);
+   assert(job->cmd_buffer);
+   vk_free(&job->cmd_buffer->device->alloc, job->cpu.event_wait.events);
+}
+
 void
 v3dv_job_destroy(struct v3dv_job *job)
 {
@@ -168,30 +205,15 @@ v3dv_job_destroy(struct v3dv_job *job)
 
    list_del(&job->list_link);
 
-   if (job->type == V3DV_JOB_TYPE_GPU_CL) {
-      v3dv_cl_destroy(&job->bcl);
-      v3dv_cl_destroy(&job->rcl);
-      v3dv_cl_destroy(&job->indirect);
-
-      /* Since we don't ref BOs, when we add them to the command buffer, don't
-       * unref them here either.
-       */
-#if 0
-      set_foreach(job->bos, entry) {
-         struct v3dv_bo *bo = (struct v3dv_bo *)entry->key;
-         v3dv_bo_free(cmd_buffer->device, bo);
-      }
-#endif
-      _mesa_set_destroy(job->bos, NULL);
-
-      set_foreach(job->extra_bos, entry) {
-         struct v3dv_bo *bo = (struct v3dv_bo *)entry->key;
-         v3dv_bo_free(job->device, bo);
-      }
-      _mesa_set_destroy(job->extra_bos, NULL);
-
-      v3dv_bo_free(job->device, job->tile_alloc);
-      v3dv_bo_free(job->device, job->tile_state);
+   switch (job->type) {
+   case V3DV_JOB_TYPE_GPU_CL:
+      job_destroy_gpu_cl_resources(job);
+      break;
+   case V3DV_JOB_TYPE_CPU_WAIT_EVENTS:
+      job_destroy_cpu_wait_events_resources(job);
+      break;
+   default:
+      break;
    }
 
    vk_free(&job->device->alloc, job);
@@ -3747,18 +3769,50 @@ v3dv_cmd_buffer_add_tfu_job(struct v3dv_cmd_buffer *cmd_buffer,
 
 void
 v3dv_CmdSetEvent(VkCommandBuffer commandBuffer,
-                 VkEvent event,
+                 VkEvent _event,
                  VkPipelineStageFlags stageMask)
 {
-   assert(!"vkCmdSetEvent not implemented yet");
+   V3DV_FROM_HANDLE(v3dv_cmd_buffer, cmd_buffer, commandBuffer);
+   V3DV_FROM_HANDLE(v3dv_event, event, _event);
+
+   /* Event (re)sets can only happen outside a render pass instance so we
+    * should not be in the middle of job recording.
+    */
+   assert(cmd_buffer->state.pass == NULL);
+   assert(cmd_buffer->state.job == NULL);
+
+   struct v3dv_job *job =
+      cmd_buffer_create_cpu_job(cmd_buffer->device,
+                                V3DV_JOB_TYPE_CPU_SET_EVENT,
+                                cmd_buffer, -1);
+   job->cpu.event_set.event = event;
+   job->cpu.event_set.state = 1;
+
+   list_addtail(&job->list_link, &cmd_buffer->submit_jobs);
 }
 
 void
 v3dv_CmdResetEvent(VkCommandBuffer commandBuffer,
-                   VkEvent event,
+                   VkEvent _event,
                    VkPipelineStageFlags stageMask)
 {
-   assert(!"vkCmdResetEvent not implemented yet");
+   V3DV_FROM_HANDLE(v3dv_cmd_buffer, cmd_buffer, commandBuffer);
+   V3DV_FROM_HANDLE(v3dv_event, event, _event);
+
+   /* Event (re)sets can only happen outside a render pass instance so we
+    * should not be in the middle of job recording.
+    */
+   assert(cmd_buffer->state.pass == NULL);
+   assert(cmd_buffer->state.job == NULL);
+
+   struct v3dv_job *job =
+      cmd_buffer_create_cpu_job(cmd_buffer->device,
+                                V3DV_JOB_TYPE_CPU_SET_EVENT,
+                                cmd_buffer, -1);
+   job->cpu.event_set.event = event;
+   job->cpu.event_set.state = 0;
+
+   list_addtail(&job->list_link, &cmd_buffer->submit_jobs);
 }
 
 void
@@ -3774,5 +3828,32 @@ v3dv_CmdWaitEvents(VkCommandBuffer commandBuffer,
                    uint32_t imageMemoryBarrierCount,
                    const VkImageMemoryBarrier *pImageMemoryBarriers)
 {
-   assert(!"vkCmdWaitEvents not implemented yet");
+   V3DV_FROM_HANDLE(v3dv_cmd_buffer, cmd_buffer, commandBuffer);
+
+   /* vkCmdWaitEvents can be recorded inside a render pass, so we might have
+    * an active job.
+    *
+    * FIXME: Since we can't signal/reset events inside a render pass, we
+    * could, in theory, move this wait to an earlier point, such as before
+    * the current job if it is inside a render pass, to avoid the split.
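+    *
+    * For now we conservatively finish any in-flight job and record the wait
+    * as its own CPU job in submission order.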
+    */
+   v3dv_cmd_buffer_finish_job(cmd_buffer);
+
+   assert(eventCount > 0);
+
+   struct v3dv_job *job =
+      cmd_buffer_create_cpu_job(cmd_buffer->device,
+                                V3DV_JOB_TYPE_CPU_WAIT_EVENTS,
+                                cmd_buffer, -1);
+
+   const uint32_t event_list_size = sizeof(struct v3dv_event *) * eventCount;
+
+   job->cpu.event_wait.event_count = eventCount;
+   job->cpu.event_wait.events =
+      vk_alloc(&cmd_buffer->device->alloc, event_list_size, 8,
+               VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
+   for (uint32_t i = 0; i < eventCount; i++)
+      job->cpu.event_wait.events[i] = v3dv_event_from_handle(pEvents[i]);
+
+   list_addtail(&job->list_link, &cmd_buffer->submit_jobs);
 }
diff --git a/src/broadcom/vulkan/v3dv_device.c b/src/broadcom/vulkan/v3dv_device.c
index 10940a7..52775ba 100644
--- a/src/broadcom/vulkan/v3dv_device.c
+++ b/src/broadcom/vulkan/v3dv_device.c
@@ -41,6 +41,7 @@
 #include "drm-uapi/v3d_drm.h"
 
 #include "format/u_format.h"
+#include "u_atomic.h"
 #include "vk_util.h"
 
 #ifdef VK_USE_PLATFORM_XCB_KHR
@@ -1028,12 +1029,16 @@ queue_init(struct v3dv_device *device, struct v3dv_queue *queue)
    queue->_loader_data.loaderMagic = ICD_LOADER_MAGIC;
    queue->device = device;
    queue->flags = 0;
+   list_inithead(&queue->submit_wait_list);
+   pthread_mutex_init(&queue->mutex, NULL);
    return VK_SUCCESS;
 }
 
 static void
 queue_finish(struct v3dv_queue *queue)
 {
+   assert(list_is_empty(&queue->submit_wait_list));
+   pthread_mutex_destroy(&queue->mutex);
 }
 
 static void
@@ -1229,6 +1234,8 @@ v3dv_CreateDevice(VkPhysicalDevice physicalDevice,
       device->display_fd = -1;
    }
 
+   pthread_mutex_init(&device->mutex, NULL);
+
    result = queue_init(device, &device->queue);
    if (result != VK_SUCCESS)
       goto fail;
@@ -1265,6 +1272,7 @@ v3dv_DestroyDevice(VkDevice _device,
 
    v3dv_DeviceWaitIdle(_device);
    queue_finish(&device->queue);
+   pthread_mutex_destroy(&device->mutex);
    drmSyncobjDestroy(device->render_fd, device->last_job_sync);
    destroy_device_meta(device);
@@ -1289,20 +1297,7 @@ VkResult
 v3dv_DeviceWaitIdle(VkDevice _device)
 {
    V3DV_FROM_HANDLE(v3dv_device, device, _device);
-
-   int ret = drmSyncobjWait(device->render_fd,
-                            &device->last_job_sync, 1, INT64_MAX, 0, NULL);
-   if (ret)
-      return VK_ERROR_DEVICE_LOST;
-
-   return VK_SUCCESS;
-}
-
-VkResult
-v3dv_QueueWaitIdle(VkQueue _queue)
-{
-   V3DV_FROM_HANDLE(v3dv_queue, queue, _queue);
-   return v3dv_DeviceWaitIdle(v3dv_device_to_handle(queue->device));
+   return v3dv_QueueWaitIdle(v3dv_queue_to_handle(&device->queue));
 }
 
 VkResult
@@ -1909,25 +1904,11 @@ v3dv_CreateEvent(VkDevice _device,
    if (!event)
       return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY);
 
-   event->bo = v3dv_bo_alloc(device, 4096, "Event BO");
-   if (!event->bo)
-      goto fail_alloc;
-
-   bool ok = v3dv_bo_map(device, event->bo, 4096);
-   if (!ok)
-      goto fail_map;
-
    /* Events are created in the unsignaled state */
-   *((uint32_t *) event->bo->map) = 0;
+   event->state = false;
 
    *pEvent = v3dv_event_to_handle(event);
    return VK_SUCCESS;
-
-fail_map:
-   v3dv_bo_free(device, event->bo);
-fail_alloc:
-   vk_free2(&device->alloc, pAllocator, event);
-   return vk_error(device->instance, VK_ERROR_OUT_OF_DEVICE_MEMORY);
 }
 
 void
@@ -1941,7 +1922,6 @@ v3dv_DestroyEvent(VkDevice _device,
 
    if (!event)
       return;
 
-   v3dv_bo_free(device, event->bo);
    vk_free2(&device->alloc, pAllocator, event);
 }
 
@@ -1949,14 +1929,14 @@ VkResult
 v3dv_GetEventStatus(VkDevice _device, VkEvent _event)
 {
    V3DV_FROM_HANDLE(v3dv_event, event, _event);
-   return *((uint32_t *) event->bo->map) == 1 ? VK_EVENT_SET : VK_EVENT_RESET;
+   return p_atomic_read(&event->state) ? VK_EVENT_SET : VK_EVENT_RESET;
 }
 
 VkResult
 v3dv_SetEvent(VkDevice _device, VkEvent _event)
 {
    V3DV_FROM_HANDLE(v3dv_event, event, _event);
-   *((uint32_t *) event->bo->map) = 1;
+   p_atomic_set(&event->state, 1);
    return VK_SUCCESS;
 }
 
@@ -1964,7 +1944,7 @@ VkResult
 v3dv_ResetEvent(VkDevice _device, VkEvent _event)
 {
    V3DV_FROM_HANDLE(v3dv_event, event, _event);
-   *((uint32_t *) event->bo->map) = 0;
+   p_atomic_set(&event->state, 0);
    return VK_SUCCESS;
 }
 
diff --git a/src/broadcom/vulkan/v3dv_private.h b/src/broadcom/vulkan/v3dv_private.h
index db9fa76..f42ce0d 100644
--- a/src/broadcom/vulkan/v3dv_private.h
+++ b/src/broadcom/vulkan/v3dv_private.h
@@ -202,12 +202,46 @@ struct v3dv_instance {
    struct vk_debug_report_instance debug_report_callbacks;
 };
 
+/* Tracks wait threads spawned from a single vkQueueSubmit call */
+struct v3dv_queue_submit_wait_info {
+   struct list_head list_link;
+
+   struct v3dv_device *device;
+
+   /* List of wait threads spawned for any command buffers in a particular
+    * call to vkQueueSubmit.
+    */
+   uint32_t wait_thread_count;
+   struct {
+      pthread_t thread;
+      bool finished;
+   } wait_threads[16];
+
+   /* The master wait thread for the entire submit. This will wait for all
+    * other threads in this submit to complete before processing signal
+    * semaphores and fences.
+    */
+   pthread_t master_wait_thread;
+
+   /* List of semaphores (and fence) to signal after all wait threads have
+    * completed and all command buffer jobs in the submission have been sent
+    * to the GPU.
+    */
+   uint32_t signal_semaphore_count;
+   VkSemaphore *signal_semaphores;
+   VkFence fence;
+};
+
 struct v3dv_queue {
    VK_LOADER_DATA _loader_data;
 
    struct v3dv_device *device;
    VkDeviceQueueCreateFlags flags;
+
+   /* A list of active v3dv_queue_submit_wait_info */
+   struct list_head submit_wait_list;
+
+   /* A mutex to prevent concurrent access to the list of wait threads */
+   mtx_t mutex;
 };
 
 struct v3dv_meta_color_clear_pipeline {
@@ -237,11 +271,12 @@ struct v3dv_device {
    struct v3d_device_info devinfo;
    struct v3dv_queue queue;
 
-   /* Last command buffer submitted on this device. We use this to check if
-    * the GPU is idle.
-    */
+   /* A sync object to track the last job submitted to the GPU.
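+    * This is a DRM syncobj handle: GPU submissions write it as their
+    * out_sync, and waiters consume it with drmSyncobjWait() or
+    * drmSyncobjExportSyncFile().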
+    */
    uint32_t last_job_sync;
 
+   /* A mutex to prevent concurrent access to last_job_sync from the queue */
+   mtx_t mutex;
+
    /* Resources used for meta operations */
    struct {
       mtx_t mtx;
@@ -618,9 +653,12 @@ enum v3dv_ez_state {
 enum v3dv_job_type {
    V3DV_JOB_TYPE_GPU_CL = 0,
    V3DV_JOB_TYPE_GPU_TFU,
+   V3DV_JOB_TYPE_GPU_CSD,
 
    V3DV_JOB_TYPE_CPU_RESET_QUERIES,
    V3DV_JOB_TYPE_CPU_END_QUERY,
    V3DV_JOB_TYPE_CPU_COPY_QUERY_RESULTS,
+   V3DV_JOB_TYPE_CPU_SET_EVENT,
+   V3DV_JOB_TYPE_CPU_WAIT_EVENTS,
 };
 
 struct v3dv_reset_query_cpu_job_info {
@@ -644,6 +682,20 @@ struct v3dv_copy_query_results_cpu_job_info {
    VkQueryResultFlags flags;
 };
 
+struct v3dv_event_set_cpu_job_info {
+   struct v3dv_event *event;
+   int state;
+};
+
+struct v3dv_event_wait_cpu_job_info {
+   /* List of events to wait on */
+   uint32_t event_count;
+   struct v3dv_event **events;
+
+   /* Whether any postponed jobs after the wait should wait on semaphores */
+   bool sem_wait;
+};
+
 struct v3dv_job {
    struct list_head list_link;
 
@@ -703,6 +755,8 @@ struct v3dv_job {
       struct v3dv_reset_query_cpu_job_info        query_reset;
       struct v3dv_end_query_cpu_job_info          query_end;
       struct v3dv_copy_query_results_cpu_job_info query_copy_results;
+      struct v3dv_event_set_cpu_job_info          event_set;
+      struct v3dv_event_wait_cpu_job_info         event_wait;
    } cpu;
 
    /* Job specs for TFU jobs */
@@ -879,6 +933,12 @@ struct v3dv_cmd_buffer {
    struct v3dv_cmd_pool *pool;
    struct list_head pool_link;
 
+   /* Used at submit time to link command buffers in the submission that have
+    * spawned wait threads, so we can then wait on all of them to complete
+    * before we process any signal semaphores or fences.
+    */
+   struct list_head list_link;
+
    VkCommandBufferUsageFlags usage_flags;
    VkCommandBufferLevel level;
 
@@ -975,7 +1035,7 @@ struct v3dv_fence {
 };
 
 struct v3dv_event {
-   struct v3dv_bo *bo;
+   int state;
 };
 
 struct v3dv_shader_module {
diff --git a/src/broadcom/vulkan/v3dv_queue.c b/src/broadcom/vulkan/v3dv_queue.c
index 63ab297..93d5451 100644
--- a/src/broadcom/vulkan/v3dv_queue.c
+++ b/src/broadcom/vulkan/v3dv_queue.c
@@ -26,6 +26,8 @@
 
 #include "broadcom/clif/clif_dump.h"
 
+#include "u_atomic.h"
+
 #include 
 #include 
@@ -77,6 +79,80 @@ get_absolute_timeout(uint64_t timeout)
 }
 
 static VkResult
+queue_submit_job(struct v3dv_queue *queue,
+                 struct v3dv_job *job,
+                 bool do_wait,
+                 pthread_t *wait_thread);
+
+/* Waits for active CPU wait threads spawned before the current thread to
+ * complete and submit all their GPU jobs.
+ */
+static void
+cpu_queue_wait_idle(struct v3dv_queue *queue)
+{
+   const pthread_t this_thread = pthread_self();
+
+retry:
+   mtx_lock(&queue->mutex);
+   list_for_each_entry(struct v3dv_queue_submit_wait_info, info,
+                       &queue->submit_wait_list, list_link) {
+      for (uint32_t i = 0; i < info->wait_thread_count; i++) {
+         if (info->wait_threads[i].finished)
+            continue;
+
+         /* Because we are testing this against the list of spawned threads
+          * it will never match for the main thread, so when we call this from
+          * the main thread we are effectively waiting for all active threads
+          * to complete, and otherwise we are only waiting for work submitted
+          * before the wait thread that called this (a wait thread should never
+          * be waiting for work submitted after it).
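+          * If the thread we find is unfinished and not ourselves, we drop
+          * the lock, sleep briefly and rescan the list from the start, since
+          * it may have changed while we slept.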
+ */ + if (info->wait_threads[i].thread == this_thread) + goto done; + + /* Wait and try again */ + mtx_unlock(&queue->mutex); + usleep(500); /* 0.5 ms */ + goto retry; + } + } + +done: + mtx_unlock(&queue->mutex); +} + +static VkResult +gpu_queue_wait_idle(struct v3dv_queue *queue) +{ + struct v3dv_device *device = queue->device; + + mtx_lock(&device->mutex); + uint32_t last_job_sync = device->last_job_sync; + mtx_unlock(&device->mutex); + + int ret = drmSyncobjWait(device->render_fd, + &last_job_sync, 1, INT64_MAX, 0, NULL); + if (ret) + return VK_ERROR_DEVICE_LOST; + + return VK_SUCCESS; +} + +VkResult +v3dv_QueueWaitIdle(VkQueue _queue) +{ + V3DV_FROM_HANDLE(v3dv_queue, queue, _queue); + + /* Check that we don't have any wait threads running in the CPU first, + * as these can spawn new GPU jobs. + */ + cpu_queue_wait_idle(queue); + + /* Check we don't have any GPU jobs running */ + return gpu_queue_wait_idle(queue); +} + +static VkResult handle_reset_query_cpu_job(struct v3dv_job *job) { /* We are about to reset query counters so we need to make sure that @@ -85,7 +161,9 @@ handle_reset_query_cpu_job(struct v3dv_job *job) * FIXME: we could avoid blocking the main thread for this if we use * submission thread. */ - v3dv_DeviceWaitIdle(v3dv_device_to_handle(job->device)); + VkResult result = gpu_queue_wait_idle(&job->device->queue); + if (result != VK_SUCCESS) + return result; struct v3dv_reset_query_cpu_job_info *info = &job->cpu.query_reset; for (uint32_t i = info->first; i < info->first + info->count; i++) { @@ -157,6 +235,161 @@ handle_copy_query_results_cpu_job(struct v3dv_job *job) } static VkResult +handle_set_event_cpu_job(struct v3dv_job *job, bool is_wait_thread) +{ + /* From the Vulkan 1.0 spec: + * + * "When vkCmdSetEvent is submitted to a queue, it defines an execution + * dependency on commands that were submitted before it, and defines an + * event signal operation which sets the event to the signaled state. + * The first synchronization scope includes every command previously + * submitted to the same queue, including those in the same command + * buffer and batch". + * + * So we should wait for all prior work to be completed before signaling + * the event, this includes all active CPU wait threads spawned for any + * command buffer submitted *before* this. + * + * FIXME: we could avoid blocking the main thread for this if we use a + * submission thread. + */ + + /* If we are calling this from a wait thread it will only wait + * wait threads sspawned before it, otherwise it will wait for + * all active threads to complete. 
+ */ + cpu_queue_wait_idle(&job->device->queue); + + VkResult result = gpu_queue_wait_idle(&job->device->queue); + if (result != VK_SUCCESS) + return result; + + struct v3dv_event_set_cpu_job_info *info = &job->cpu.event_set; + p_atomic_set(&info->event->state, info->state); + + return VK_SUCCESS; +} + +static bool +check_wait_events_complete(struct v3dv_job *job) +{ + assert(job->type == V3DV_JOB_TYPE_CPU_WAIT_EVENTS); + + struct v3dv_event_wait_cpu_job_info *info = &job->cpu.event_wait; + for (uint32_t i = 0; i < info->event_count; i++) { + if (!p_atomic_read(&info->events[i]->state)) + return false; + } + return true; +} + +static void +wait_thread_finish(struct v3dv_queue *queue, pthread_t thread) +{ + mtx_lock(&queue->mutex); + list_for_each_entry(struct v3dv_queue_submit_wait_info, info, + &queue->submit_wait_list, list_link) { + for (uint32_t i = 0; i < info->wait_thread_count; i++) { + if (info->wait_threads[i].thread == thread) { + info->wait_threads[i].finished = true; + goto done; + } + } + } + + unreachable(!"Failed to finish wait thread: not found"); + +done: + mtx_unlock(&queue->mutex); +} + +static void * +event_wait_thread_func(void *_job) +{ + struct v3dv_job *job = (struct v3dv_job *) _job; + assert(job->type == V3DV_JOB_TYPE_CPU_WAIT_EVENTS); + struct v3dv_event_wait_cpu_job_info *info = &job->cpu.event_wait; + + /* Wait for events to be signaled */ + const useconds_t wait_interval_ms = 1; + while (!check_wait_events_complete(job)) + usleep(wait_interval_ms * 1000); + + /* Now continue submitting pending jobs for the same command buffer after + * the wait job. + */ + struct v3dv_queue *queue = &job->device->queue; + list_for_each_entry_from(struct v3dv_job, pjob, job->list_link.next, + &job->cmd_buffer->submit_jobs, list_link) { + /* We don't want to spawn more than one wait thread per command buffer. + * If this job also requires a wait for events, we will do the wait here. + */ + VkResult result = queue_submit_job(queue, pjob, info->sem_wait, NULL); + if (result == VK_NOT_READY) { + while (!check_wait_events_complete(pjob)) { + usleep(wait_interval_ms * 1000); + } + result = VK_SUCCESS; + } + + if (result != VK_SUCCESS) { + fprintf(stderr, "Wait thread job execution failed.\n"); + goto done; + } + } + +done: + wait_thread_finish(queue, pthread_self()); + return NULL; +} + +static VkResult +spawn_event_wait_thread(struct v3dv_job *job, pthread_t *wait_thread) + +{ + assert(job->type == V3DV_JOB_TYPE_CPU_WAIT_EVENTS); + assert(job->cmd_buffer); + assert(wait_thread != NULL); + + if (pthread_create(wait_thread, NULL, event_wait_thread_func, job)) + return vk_error(job->device->instance, VK_ERROR_DEVICE_LOST); + + return VK_NOT_READY; +} + +static VkResult +handle_wait_events_cpu_job(struct v3dv_job *job, + bool sem_wait, + pthread_t *wait_thread) +{ + assert(job->type == V3DV_JOB_TYPE_CPU_WAIT_EVENTS); + struct v3dv_event_wait_cpu_job_info *info = &job->cpu.event_wait; + + /* If all events are signaled then we are done and can continue submitting + * the rest of the command buffer normally. + */ + if (check_wait_events_complete(job)) + return VK_SUCCESS; + + /* Otherwise, we put the rest of the command buffer on a wait thread until + * all events are signaled. We only spawn a new thread on the first + * wait job we see for a command buffer, any additional wait jobs in the + * same command buffer will run in that same wait thread and will get here + * with a NULL wait_thread pointer. 
+    *
+    * Also, whether we spawn a wait thread or not, we always return
+    * VK_NOT_READY (unless an error happened), so we stop trying to submit
+    * any jobs in the same command buffer after the wait job. The wait thread
+    * will attempt to submit them after the wait completes.
+    */
+   info->sem_wait = sem_wait;
+   if (wait_thread)
+      return spawn_event_wait_thread(job, wait_thread);
+   else
+      return VK_NOT_READY;
+}
+
+static VkResult
 process_semaphores_to_signal(struct v3dv_device *device,
                              uint32_t count, const VkSemaphore *sems)
 {
@@ -164,7 +397,9 @@ process_semaphores_to_signal(struct v3dv_device *device,
       return VK_SUCCESS;
 
    int fd;
+   mtx_lock(&device->mutex);
    drmSyncobjExportSyncFile(device->render_fd, device->last_job_sync, &fd);
+   mtx_unlock(&device->mutex);
    if (fd == -1)
       return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY);
 
@@ -198,7 +433,9 @@ process_fence_to_signal(struct v3dv_device *device, VkFence _fence)
    fence->fd = -1;
 
    int fd;
+   mtx_lock(&device->mutex);
    drmSyncobjExportSyncFile(device->render_fd, device->last_job_sync, &fd);
+   mtx_unlock(&device->mutex);
    if (fd == -1)
       return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY);
 
@@ -220,39 +457,22 @@ handle_cl_job(struct v3dv_queue *queue,
 
    struct drm_v3d_submit_cl submit;
 
-   /* RCL jobs don't start until the previous RCL job has finished so we don't
-    * really need to add a fence for those, however, we might need to wait on a
-    * CSD or TFU job, which are not serialized.
-    *
-    * FIXME: for now, if we are asked to wait on any semaphores, we just wait
-    * on the last job we submitted. In the future we might want to pass the
-    * actual syncobj of the wait semaphores so we don't block on the last RCL
-    * if we only need to wait for a previous CSD or TFU, for example, but
-    * we would have to extend our kernel interface to support the case where
-    * we have more than one semaphore to wait on.
-    */
-   submit.in_sync_bcl = 0;
-   submit.in_sync_rcl = do_wait ? device->last_job_sync : 0;
-
-   /* Update the sync object for the last rendering by this device. */
-   submit.out_sync = device->last_job_sync;
-
    submit.bcl_start = job->bcl.bo->offset;
    submit.bcl_end = job->bcl.bo->offset + v3dv_cl_offset(&job->bcl);
    submit.rcl_start = job->rcl.bo->offset;
    submit.rcl_end = job->rcl.bo->offset + v3dv_cl_offset(&job->rcl);
 
-   submit.flags = 0;
+   submit.qma = job->tile_alloc->offset;
+   submit.qms = job->tile_alloc->size;
+   submit.qts = job->tile_state->offset;
+
    /* FIXME: we already know that we support cache flush, as we only support
    * hw that supports that, but would be better to just DRM-ask it
    */
+   submit.flags = 0;
    if (job->tmu_dirty_rcl)
       submit.flags |= DRM_V3D_SUBMIT_CL_FLUSH_CACHE;
 
-   submit.qma = job->tile_alloc->offset;
-   submit.qms = job->tile_alloc->size;
-   submit.qts = job->tile_state->offset;
-
    submit.bo_handle_count = job->bo_count;
    uint32_t *bo_handles =
      (uint32_t *) malloc(sizeof(uint32_t) * MAX2(4, submit.bo_handle_count * 2));
@@ -264,9 +484,25 @@ handle_cl_job(struct v3dv_queue *queue,
    assert(bo_idx == submit.bo_handle_count);
    submit.bo_handles = (uintptr_t)(void *)bo_handles;
 
+   /* RCL jobs don't start until the previous RCL job has finished so we don't
+    * really need to add a fence for those, however, we might need to wait on a
+    * CSD or TFU job, which are not serialized.
+    *
+    * FIXME: for now, if we are asked to wait on any semaphores, we just wait
+    * on the last job we submitted. In the future we might want to pass the
+    * actual syncobj of the wait semaphores so we don't block on the last RCL
+    * if we only need to wait for a previous CSD or TFU, for example, but
+    * we would have to extend our kernel interface to support the case where
+    * we have more than one semaphore to wait on.
+    */
+   mtx_lock(&queue->device->mutex);
+   submit.in_sync_bcl = 0;
+   submit.in_sync_rcl = do_wait ? device->last_job_sync : 0;
+   submit.out_sync = device->last_job_sync;
    v3dv_clif_dump(device, job, &submit);
-   int ret = v3dv_ioctl(device->render_fd, DRM_IOCTL_V3D_SUBMIT_CL, &submit);
+   int ret = v3dv_ioctl(device->render_fd, DRM_IOCTL_V3D_SUBMIT_CL, &submit);
+   mtx_unlock(&queue->device->mutex);
 
    static bool warned = false;
    if (ret && !warned) {
       fprintf(stderr, "Draw call returned %s. Expect corruption.\n",
@@ -287,12 +523,14 @@ handle_tfu_job(struct v3dv_queue *queue,
                struct v3dv_job *job,
                bool do_wait)
 {
-   const struct v3dv_device *device = queue->device;
+   struct v3dv_device *device = queue->device;
 
+   mtx_lock(&device->mutex);
    job->tfu.in_sync = do_wait ? device->last_job_sync : 0;
    job->tfu.out_sync = device->last_job_sync;
-   int ret = v3dv_ioctl(device->render_fd, DRM_IOCTL_V3D_SUBMIT_TFU, &job->tfu);
+   int ret = v3dv_ioctl(device->render_fd, DRM_IOCTL_V3D_SUBMIT_TFU, &job->tfu);
+   mtx_unlock(&device->mutex);
+
    if (ret != 0) {
       fprintf(stderr, "Failed to submit TFU job: %d\n", ret);
       return vk_error(device->instance, VK_ERROR_DEVICE_LOST);
@@ -304,7 +542,8 @@ handle_tfu_job(struct v3dv_queue *queue,
 static VkResult
 queue_submit_job(struct v3dv_queue *queue,
                  struct v3dv_job *job,
-                 bool do_wait)
+                 bool do_wait,
+                 pthread_t *wait_thread)
 {
    assert(job);
 
@@ -319,6 +558,10 @@ queue_submit_job(struct v3dv_queue *queue,
       return handle_end_query_cpu_job(job);
    case V3DV_JOB_TYPE_CPU_COPY_QUERY_RESULTS:
       return handle_copy_query_results_cpu_job(job);
+   case V3DV_JOB_TYPE_CPU_SET_EVENT:
+      return handle_set_event_cpu_job(job, wait_thread != NULL);
+   case V3DV_JOB_TYPE_CPU_WAIT_EVENTS:
+      return handle_wait_events_cpu_job(job, do_wait, wait_thread);
    default:
       unreachable("Unhandled job type");
    }
@@ -439,13 +682,15 @@ queue_submit_noop_job(struct v3dv_queue *queue, const VkSubmitInfo *pSubmit)
       return result;
    }
 
-   return queue_submit_job(queue, noop_job, pSubmit->waitSemaphoreCount > 0);
+   return queue_submit_job(queue, noop_job, pSubmit->waitSemaphoreCount > 0,
+                           NULL);
 }
 
 static VkResult
 queue_submit_cmd_buffer(struct v3dv_queue *queue,
                         struct v3dv_cmd_buffer *cmd_buffer,
-                        const VkSubmitInfo *pSubmit)
+                        const VkSubmitInfo *pSubmit,
+                        pthread_t *wait_thread)
 {
    assert(cmd_buffer);
 
@@ -455,7 +700,8 @@ queue_submit_cmd_buffer(struct v3dv_queue *queue,
    list_for_each_entry_safe(struct v3dv_job, job,
                             &cmd_buffer->submit_jobs, list_link) {
       VkResult result = queue_submit_job(queue, job,
-                                         pSubmit->waitSemaphoreCount > 0);
+                                         pSubmit->waitSemaphoreCount > 0,
+                                         wait_thread);
       if (result != VK_SUCCESS)
          return result;
    }
@@ -463,12 +709,81 @@ queue_submit_cmd_buffer(struct v3dv_queue *queue,
    return VK_SUCCESS;
 }
 
+static void
+add_wait_thread_to_list(struct v3dv_device *device,
+                        pthread_t thread,
+                        struct v3dv_queue_submit_wait_info **wait_info)
+{
+   /* If this is the first time we spawn a wait thread for this queue
+    * submission, create a v3dv_queue_submit_wait_info to track this and
+    * any other threads in the same submission and add it to the global list
+    * in the queue.
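+    * The wait_info is removed from that list and freed by its master wait
+    * thread once all the threads in it have been joined (see
+    * master_wait_thread_func()).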
+ */ + if (*wait_info == NULL) { + *wait_info = + vk_zalloc(&device->alloc, sizeof(struct v3dv_queue_submit_wait_info), 8, + VK_SYSTEM_ALLOCATION_SCOPE_COMMAND); + (*wait_info)->device = device; + } + + /* And add the thread to the list of wait threads for this submission */ + const uint32_t thread_idx = (*wait_info)->wait_thread_count; + assert(thread_idx < 16); + (*wait_info)->wait_threads[thread_idx].thread = thread; + (*wait_info)->wait_threads[thread_idx].finished = false; + (*wait_info)->wait_thread_count++; +} + +static void +add_signal_semaphores_to_wait_list(struct v3dv_device *device, + const VkSubmitInfo *pSubmit, + struct v3dv_queue_submit_wait_info *wait_info) +{ + assert(wait_info); + + if (pSubmit->signalSemaphoreCount == 0) + return; + + /* FIXME: We put all the semaphores in a list and we signal all of them + * together from the submit master thread when the last wait thread in the + * submit completes. We could do better though: group the semaphores per + * submit and signal them as soon as all wait threads for a particular + * submit completes. Not sure if the extra work would be worth it though, + * since we only spawn waith threads for event waits and only when the + * event if set from the host after the queue submission. + */ + + /* Check the size of the current semaphore list */ + const uint32_t prev_count = wait_info->signal_semaphore_count; + const uint32_t prev_alloc_size = prev_count * sizeof(VkSemaphore); + VkSemaphore *prev_list = wait_info->signal_semaphores; + + /* Resize the list to hold the additional semaphores */ + const uint32_t extra_alloc_size = + pSubmit->signalSemaphoreCount * sizeof(VkSemaphore); + wait_info->signal_semaphore_count += pSubmit->signalSemaphoreCount; + wait_info->signal_semaphores = + vk_alloc(&device->alloc, prev_alloc_size + extra_alloc_size, 8, + VK_SYSTEM_ALLOCATION_SCOPE_COMMAND); + + /* Copy the old list to the new allocation and free the old list */ + if (prev_count > 0) { + memcpy(wait_info->signal_semaphores, prev_list, prev_alloc_size); + vk_free(&device->alloc, prev_list); + } + + /* Add the new semaphores to the list */ + memcpy(wait_info->signal_semaphores + prev_count, + pSubmit->pSignalSemaphores, extra_alloc_size); +} + static VkResult queue_submit_cmd_buffer_batch(struct v3dv_queue *queue, const VkSubmitInfo *pSubmit, - VkFence fence) + struct v3dv_queue_submit_wait_info **wait_info) { VkResult result = VK_SUCCESS; + bool has_wait_threads = false; /* Even if we don't have any actual work to submit we still need to wait * on the wait semaphores and signal the signal semaphores and fence, so @@ -479,9 +794,24 @@ queue_submit_cmd_buffer_batch(struct v3dv_queue *queue, result = queue_submit_noop_job(queue, pSubmit); } else { for (uint32_t i = 0; i < pSubmit->commandBufferCount; i++) { + pthread_t wait_thread; struct v3dv_cmd_buffer *cmd_buffer = v3dv_cmd_buffer_from_handle(pSubmit->pCommandBuffers[i]); - result = queue_submit_cmd_buffer(queue, cmd_buffer, pSubmit); + result = queue_submit_cmd_buffer(queue, cmd_buffer, pSubmit, + &wait_thread); + + /* We get VK_NOT_READY if we had to spawn a wait thread for the + * command buffer. In that scenario, we want to continue submitting + * any pending command buffers in the batch, but we don't want to + * process any signal semaphores for the batch until we know we have + * submitted every job for every command buffer in the batch. 
+ */ + if (result == VK_NOT_READY) { + result = VK_SUCCESS; + add_wait_thread_to_list(queue->device, wait_thread, wait_info); + has_wait_threads = true; + } + if (result != VK_SUCCESS) break; } @@ -490,13 +820,78 @@ queue_submit_cmd_buffer_batch(struct v3dv_queue *queue, if (result != VK_SUCCESS) return result; - result = process_semaphores_to_signal(queue->device, - pSubmit->signalSemaphoreCount, - pSubmit->pSignalSemaphores); + /* If had to emit any wait threads in this submit we need to wait for all + * of them to complete before we can signal any semaphores. + */ + if (!has_wait_threads) { + return process_semaphores_to_signal(queue->device, + pSubmit->signalSemaphoreCount, + pSubmit->pSignalSemaphores); + } else { + assert(*wait_info); + add_signal_semaphores_to_wait_list(queue->device, pSubmit, *wait_info); + return VK_NOT_READY; + } +} + +static void * +master_wait_thread_func(void *_wait_info) +{ + struct v3dv_queue_submit_wait_info *wait_info = + (struct v3dv_queue_submit_wait_info *) _wait_info; + + struct v3dv_queue *queue = &wait_info->device->queue; + + /* Wait for all command buffer wait threads to complete */ + for (uint32_t i = 0; i < wait_info->wait_thread_count; i++) { + int res = pthread_join(wait_info->wait_threads[i].thread, NULL); + if (res != 0) + fprintf(stderr, "Wait thread failed to join.\n"); + } + + /* Signal semaphores and fences */ + VkResult result; + result = process_semaphores_to_signal(wait_info->device, + wait_info->signal_semaphore_count, + wait_info->signal_semaphores); if (result != VK_SUCCESS) - return result; + fprintf(stderr, "Wait thread semaphore signaling failed."); - return VK_SUCCESS; + result = process_fence_to_signal(wait_info->device, wait_info->fence); + if (result != VK_SUCCESS) + fprintf(stderr, "Wait thread fence signaling failed."); + + /* Release wait_info */ + mtx_lock(&queue->mutex); + list_del(&wait_info->list_link); + mtx_unlock(&queue->mutex); + + vk_free(&wait_info->device->alloc, wait_info->signal_semaphores); + vk_free(&wait_info->device->alloc, wait_info); + + return NULL; +} + + +static VkResult +spawn_master_wait_thread(struct v3dv_queue *queue, + struct v3dv_queue_submit_wait_info *wait_info) + +{ + VkResult result = VK_SUCCESS; + + mtx_lock(&queue->mutex); + if (pthread_create(&wait_info->master_wait_thread, NULL, + master_wait_thread_func, wait_info)) { + result = vk_error(queue->device->instance, VK_ERROR_DEVICE_LOST); + goto done; + } + + list_addtail(&wait_info->list_link, &queue->submit_wait_list); + +done: + mtx_unlock(&queue->mutex); + return result; } VkResult @@ -507,18 +902,31 @@ v3dv_QueueSubmit(VkQueue _queue, { V3DV_FROM_HANDLE(v3dv_queue, queue, _queue); + struct v3dv_queue_submit_wait_info *wait_info = NULL; + VkResult result = VK_SUCCESS; for (uint32_t i = 0; i < submitCount; i++) { - result = queue_submit_cmd_buffer_batch(queue, &pSubmits[i], fence); - if (result != VK_SUCCESS) - return result; + result = queue_submit_cmd_buffer_batch(queue, &pSubmits[i], &wait_info); + if (result != VK_SUCCESS && result != VK_NOT_READY) + goto done; } - result = process_fence_to_signal(queue->device, fence); - if (result != VK_SUCCESS) - return result; + if (!wait_info) { + assert(result != VK_NOT_READY); + result = process_fence_to_signal(queue->device, fence); + goto done; + } - return VK_SUCCESS; + /* We emitted wait threads, so we have to spwan a master thread for this + * queue submission that waits for all other threads to complete and then + * will signal any semaphores and fences. 
+ */ + assert(wait_info); + wait_info->fence = fence; + result = spawn_master_wait_thread(queue, wait_info); + +done: + return result; } VkResult -- 2.7.4