v3dv: implement events

author Iago Toral Quiroga <itoral@igalia.com>

Mon, 18 May 2020 08:41:11 +0000 (10:41 +0200)

committer Marge Bot <eric+marge@anholt.net>

Tue, 13 Oct 2020 21:21:30 +0000 (21:21 +0000)
author Iago Toral Quiroga <itoral@igalia.com>
Mon, 18 May 2020 08:41:11 +0000 (10:41 +0200)
committer Marge Bot <eric+marge@anholt.net>
Tue, 13 Oct 2020 21:21:30 +0000 (21:21 +0000)
diff --git a/src/broadcom/vulkan/v3dv_cmd_buffer.c b/src/broadcom/vulkan/v3dv_cmd_buffer.c

index 73c19e2..4afd8f2 100644 (file)
--- a/src/broadcom/vulkan/v3dv_cmd_buffer.c
+++ b/src/broadcom/vulkan/v3dv_cmd_buffer.c
@@ -130,6 +130,7 @@ cmd_buffer_init(struct v3dv_cmd_buffer *cmd_buffer,
  
     list_inithead(&cmd_buffer->private_objs);
     list_inithead(&cmd_buffer->submit_jobs);
+   list_inithead(&cmd_buffer->list_link);
  
     assert(pool);
     list_addtail(&cmd_buffer->pool_link, &pool->cmd_buffers);
@@ -161,6 +162,42 @@ cmd_buffer_create(struct v3dv_device *device,
     return VK_SUCCESS;
  }
  
+static void
+job_destroy_gpu_cl_resources(struct v3dv_job *job)
+{
+   assert(job->type == V3DV_JOB_TYPE_GPU_CL);
+
+   v3dv_cl_destroy(&job->bcl);
+   v3dv_cl_destroy(&job->rcl);
+   v3dv_cl_destroy(&job->indirect);
+
+   /* Since we don't ref BOs when we add them to the command buffer, don't
+    * unref them here either. Bo's will be freed when their corresponding API
+    * objects are destroyed.
+    */
+   _mesa_set_destroy(job->bos, NULL);
+
+   /* Extra BOs need to be destroyed with the job, since they were created
+    * internally by the driver for it.
+    */
+   set_foreach(job->extra_bos, entry) {
+      struct v3dv_bo *bo = (struct v3dv_bo *)entry->key;
+      v3dv_bo_free(job->device, bo);
+   }
+   _mesa_set_destroy(job->extra_bos, NULL);
+
+   v3dv_bo_free(job->device, job->tile_alloc);
+   v3dv_bo_free(job->device, job->tile_state);
+}
+
+static void
+job_destroy_cpu_wait_events_resources(struct v3dv_job *job)
+{
+   assert(job->type == V3DV_JOB_TYPE_CPU_WAIT_EVENTS);
+   assert(job->cmd_buffer);
+   vk_free(&job->cmd_buffer->device->alloc, job->cpu.event_wait.events);
+}
+
  void
  v3dv_job_destroy(struct v3dv_job *job)
  {
@@ -168,30 +205,15 @@ v3dv_job_destroy(struct v3dv_job *job)
  
     list_del(&job->list_link);
  
-   if (job->type == V3DV_JOB_TYPE_GPU_CL) {
-      v3dv_cl_destroy(&job->bcl);
-      v3dv_cl_destroy(&job->rcl);
-      v3dv_cl_destroy(&job->indirect);
-
-      /* Since we don't ref BOs, when we add them to the command buffer, don't
-       * unref them here either.
-       */
-#if 0
-      set_foreach(job->bos, entry) {
-         struct v3dv_bo *bo = (struct v3dv_bo *)entry->key;
-         v3dv_bo_free(cmd_buffer->device, bo);
-      }
-#endif
-      _mesa_set_destroy(job->bos, NULL);
-
-      set_foreach(job->extra_bos, entry) {
-         struct v3dv_bo *bo = (struct v3dv_bo *)entry->key;
-         v3dv_bo_free(job->device, bo);
-      }
-      _mesa_set_destroy(job->extra_bos, NULL);
-
-      v3dv_bo_free(job->device, job->tile_alloc);
-      v3dv_bo_free(job->device, job->tile_state);
+   switch (job->type) {
+   case V3DV_JOB_TYPE_GPU_CL:
+      job_destroy_gpu_cl_resources(job);
+      break;
+   case V3DV_JOB_TYPE_CPU_WAIT_EVENTS:
+      job_destroy_cpu_wait_events_resources(job);
+      break;
+   default:
+      break;
     }
  
     vk_free(&job->device->alloc, job);
@@ -3747,18 +3769,50 @@ v3dv_cmd_buffer_add_tfu_job(struct v3dv_cmd_buffer *cmd_buffer,
  
  void
  v3dv_CmdSetEvent(VkCommandBuffer commandBuffer,
-                 VkEvent event,
+                 VkEvent _event,
                   VkPipelineStageFlags stageMask)
  {
-   assert(!"vkCmdSetEvent not implemented yet");
+   V3DV_FROM_HANDLE(v3dv_cmd_buffer, cmd_buffer, commandBuffer);
+   V3DV_FROM_HANDLE(v3dv_event, event, _event);
+
+   /* Event (re)sets can only happen outside a render pass instance so we
+    * should not be in the middle of job recording.
+    */
+   assert(cmd_buffer->state.pass == NULL);
+   assert(cmd_buffer->state.job == NULL);
+
+   struct v3dv_job *job =
+      cmd_buffer_create_cpu_job(cmd_buffer->device,
+                                V3DV_JOB_TYPE_CPU_SET_EVENT,
+                                cmd_buffer, -1);
+   job->cpu.event_set.event = event;
+   job->cpu.event_set.state = 1;
+
+   list_addtail(&job->list_link, &cmd_buffer->submit_jobs);
  }
  
  void
  v3dv_CmdResetEvent(VkCommandBuffer commandBuffer,
-                   VkEvent event,
+                   VkEvent _event,
                     VkPipelineStageFlags stageMask)
  {
-   assert(!"vkCmdResetEvent not implemented yet");
+   V3DV_FROM_HANDLE(v3dv_cmd_buffer, cmd_buffer, commandBuffer);
+   V3DV_FROM_HANDLE(v3dv_event, event, _event);
+
+   /* Event (re)sets can only happen outside a render pass instance so we
+    * should not be in the middle of job recording.
+    */
+   assert(cmd_buffer->state.pass == NULL);
+   assert(cmd_buffer->state.job == NULL);
+
+   struct v3dv_job *job =
+      cmd_buffer_create_cpu_job(cmd_buffer->device,
+                                V3DV_JOB_TYPE_CPU_SET_EVENT,
+                                cmd_buffer, -1);
+   job->cpu.event_set.event = event;
+   job->cpu.event_set.state = 0;
+
+   list_addtail(&job->list_link, &cmd_buffer->submit_jobs);
  }
  
  void
@@ -3774,5 +3828,32 @@ v3dv_CmdWaitEvents(VkCommandBuffer commandBuffer,
                     uint32_t imageMemoryBarrierCount,
                     const VkImageMemoryBarrier *pImageMemoryBarriers)
  {
-   assert(!"vkCmdWaitEvents not implemented yet");
+   V3DV_FROM_HANDLE(v3dv_cmd_buffer, cmd_buffer, commandBuffer);
+
+   /* vkCmdWaitEvents can be recorded inside a render pass, so we might have
+    * an active job.
+    *
+    * FIXME: Since we can't signal/reset events inside a render pass, we could,
+    *        in theory, move this wait to an earlier point, such as before the
+    *        current job if it is inside a render pass, to avoid the split.
+    */
+   v3dv_cmd_buffer_finish_job(cmd_buffer);
+
+   assert(eventCount > 0);
+
+   struct v3dv_job *job =
+      cmd_buffer_create_cpu_job(cmd_buffer->device,
+                                V3DV_JOB_TYPE_CPU_WAIT_EVENTS,
+                                cmd_buffer, -1);
+
+   const uint32_t event_list_size = sizeof(struct v3dv_event *) * eventCount;
+
+   job->cpu.event_wait.event_count = eventCount;
+   job->cpu.event_wait.events =
+      vk_alloc(&cmd_buffer->device->alloc, event_list_size, 8,
+               VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
+   for (uint32_t i = 0; i < eventCount; i++)
+      job->cpu.event_wait.events[i] = v3dv_event_from_handle(pEvents[i]);
+
+   list_addtail(&job->list_link, &cmd_buffer->submit_jobs);
  }
diff --git a/src/broadcom/vulkan/v3dv_device.c b/src/broadcom/vulkan/v3dv_device.c

index 10940a7..52775ba 100644 (file)
--- a/src/broadcom/vulkan/v3dv_device.c
+++ b/src/broadcom/vulkan/v3dv_device.c
@@ -41,6 +41,7 @@
  
  #include "drm-uapi/v3d_drm.h"
  #include "format/u_format.h"
+#include "u_atomic.h"
  #include "vk_util.h"
  
  #ifdef VK_USE_PLATFORM_XCB_KHR
@@ -1028,12 +1029,16 @@ queue_init(struct v3dv_device *device, struct v3dv_queue *queue)
     queue->_loader_data.loaderMagic = ICD_LOADER_MAGIC;
     queue->device = device;
     queue->flags = 0;
+   list_inithead(&queue->submit_wait_list);
+   pthread_mutex_init(&queue->mutex, NULL);
     return VK_SUCCESS;
  }
  
  static void
  queue_finish(struct v3dv_queue *queue)
  {
+   assert(list_is_empty(&queue->submit_wait_list));
+   pthread_mutex_destroy(&queue->mutex);
  }
  
  static void
@@ -1229,6 +1234,8 @@ v3dv_CreateDevice(VkPhysicalDevice physicalDevice,
        device->display_fd = -1;
     }
  
+   pthread_mutex_init(&device->mutex, NULL);
+
     result = queue_init(device, &device->queue);
     if (result != VK_SUCCESS)
        goto fail;
@@ -1265,6 +1272,7 @@ v3dv_DestroyDevice(VkDevice _device,
  
     v3dv_DeviceWaitIdle(_device);
     queue_finish(&device->queue);
+   pthread_mutex_destroy(&device->mutex);
     drmSyncobjDestroy(device->render_fd, device->last_job_sync);
     destroy_device_meta(device);
  
@@ -1289,20 +1297,7 @@ VkResult
  v3dv_DeviceWaitIdle(VkDevice _device)
  {
     V3DV_FROM_HANDLE(v3dv_device, device, _device);
-
-   int ret = drmSyncobjWait(device->render_fd,
-                            &device->last_job_sync, 1, INT64_MAX, 0, NULL);
-   if (ret)
-      return VK_ERROR_DEVICE_LOST;
-
-   return VK_SUCCESS;
-}
-
-VkResult
-v3dv_QueueWaitIdle(VkQueue _queue)
-{
-   V3DV_FROM_HANDLE(v3dv_queue, queue, _queue);
-   return v3dv_DeviceWaitIdle(v3dv_device_to_handle(queue->device));
+   return v3dv_QueueWaitIdle(v3dv_queue_to_handle(&device->queue));
  }
  
  VkResult
@@ -1909,25 +1904,11 @@ v3dv_CreateEvent(VkDevice _device,
     if (!event)
        return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY);
  
-   event->bo = v3dv_bo_alloc(device, 4096, "Event BO");
-   if (!event->bo)
-      goto fail_alloc;
-
-   bool ok = v3dv_bo_map(device, event->bo, 4096);
-   if (!ok)
-      goto fail_map;
-
     /* Events are created in the unsignaled state */
-   *((uint32_t *) event->bo->map) = 0;
+   event->state = false;
     *pEvent = v3dv_event_to_handle(event);
  
     return VK_SUCCESS;
-
-fail_map:
-   v3dv_bo_free(device, event->bo);
-fail_alloc:
-   vk_free2(&device->alloc, pAllocator, event);
-   return vk_error(device->instance, VK_ERROR_OUT_OF_DEVICE_MEMORY);
  }
  
  void
@@ -1941,7 +1922,6 @@ v3dv_DestroyEvent(VkDevice _device,
     if (!event)
        return;
  
-   v3dv_bo_free(device, event->bo);
     vk_free2(&device->alloc, pAllocator, event);
  }
  
@@ -1949,14 +1929,14 @@ VkResult
  v3dv_GetEventStatus(VkDevice _device, VkEvent _event)
  {
     V3DV_FROM_HANDLE(v3dv_event, event, _event);
-   return *((uint32_t *) event->bo->map) == 1 ? VK_EVENT_SET : VK_EVENT_RESET;
+   return p_atomic_read(&event->state) ? VK_EVENT_SET : VK_EVENT_RESET;
  }
  
  VkResult
  v3dv_SetEvent(VkDevice _device, VkEvent _event)
  {
     V3DV_FROM_HANDLE(v3dv_event, event, _event);
-   *((uint32_t *) event->bo->map) = 1;
+   p_atomic_set(&event->state, 1);
     return VK_SUCCESS;
  }
  
@@ -1964,7 +1944,7 @@ VkResult
  v3dv_ResetEvent(VkDevice _device, VkEvent _event)
  {
     V3DV_FROM_HANDLE(v3dv_event, event, _event);
-   *((uint32_t *) event->bo->map) = 0;
+   p_atomic_set(&event->state, 0);
     return VK_SUCCESS;
  }
  
diff --git a/src/broadcom/vulkan/v3dv_private.h b/src/broadcom/vulkan/v3dv_private.h

index db9fa76..f42ce0d 100644 (file)
--- a/src/broadcom/vulkan/v3dv_private.h
+++ b/src/broadcom/vulkan/v3dv_private.h
@@ -202,12 +202,46 @@ struct v3dv_instance {
     struct vk_debug_report_instance debug_report_callbacks;
  };
  
+/* Tracks wait threads spawned from a single vkQueueSubmit call */
+struct v3dv_queue_submit_wait_info {
+   struct list_head list_link;
+
+   struct v3dv_device *device;
+
+   /* List of wait threads spawned for any command buffers in a particular
+    * call to vkQueueSubmit.
+    */
+   uint32_t wait_thread_count;
+   struct {
+      pthread_t thread;
+      bool finished;
+   } wait_threads[16];
+
+   /* The master wait thread for the entire submit. This will wait for all
+    * other threads in this submit to complete  before processing signal
+    * semaphores and fences.
+    */
+   pthread_t master_wait_thread;
+
+   /* List of semaphores (and fence) to signal after all wait threads completed
+    * and all command buffer jobs in the submission have been sent to the GPU.
+    */
+   uint32_t signal_semaphore_count;
+   VkSemaphore *signal_semaphores;
+   VkFence fence;
+};
+
  struct v3dv_queue {
     VK_LOADER_DATA _loader_data;
  
     struct v3dv_device *device;
-
     VkDeviceQueueCreateFlags flags;
+
+   /* A list of active v3dv_queue_submit_wait_info */
+   struct list_head submit_wait_list;
+
+   /* A mutex to prevent concurrent access to the list of wait threads */
+   mtx_t mutex;
  };
  
  struct v3dv_meta_color_clear_pipeline {
@@ -237,11 +271,12 @@ struct v3dv_device {
     struct v3d_device_info devinfo;
     struct v3dv_queue queue;
  
-   /* Last command buffer submitted on this device. We use this to check if
-    * the GPU is idle.
-    */
+   /* A sync object to track the last job submitted to the GPU. */
     uint32_t last_job_sync;
  
+   /* A mutex to prevent concurrent access to last_job_sync from the queue */
+   mtx_t mutex;
+
     /* Resources used for meta operations */
     struct {
        mtx_t mtx;
@@ -618,9 +653,12 @@ enum v3dv_ez_state {
  enum v3dv_job_type {
     V3DV_JOB_TYPE_GPU_CL = 0,
     V3DV_JOB_TYPE_GPU_TFU,
+   V3DV_JOB_TYPE_GPU_CSD,
     V3DV_JOB_TYPE_CPU_RESET_QUERIES,
     V3DV_JOB_TYPE_CPU_END_QUERY,
     V3DV_JOB_TYPE_CPU_COPY_QUERY_RESULTS,
+   V3DV_JOB_TYPE_CPU_SET_EVENT,
+   V3DV_JOB_TYPE_CPU_WAIT_EVENTS,
  };
  
  struct v3dv_reset_query_cpu_job_info {
@@ -644,6 +682,20 @@ struct v3dv_copy_query_results_cpu_job_info {
     VkQueryResultFlags flags;
  };
  
+struct v3dv_event_set_cpu_job_info {
+   struct v3dv_event *event;
+   int state;
+};
+
+struct v3dv_event_wait_cpu_job_info {
+   /* List of events to wait on */
+   uint32_t event_count;
+   struct v3dv_event **events;
+
+   /* Whether any postponed jobs after the wait should wait on semaphores */
+   bool sem_wait;
+};
+
  struct v3dv_job {
     struct list_head list_link;
  
@@ -703,6 +755,8 @@ struct v3dv_job {
        struct v3dv_reset_query_cpu_job_info        query_reset;
        struct v3dv_end_query_cpu_job_info          query_end;
        struct v3dv_copy_query_results_cpu_job_info query_copy_results;
+      struct v3dv_event_set_cpu_job_info          event_set;
+      struct v3dv_event_wait_cpu_job_info         event_wait;
     } cpu;
  
     /* Job spects for TFU jobs */
@@ -879,6 +933,12 @@ struct v3dv_cmd_buffer {
     struct v3dv_cmd_pool *pool;
     struct list_head pool_link;
  
+   /* Used at submit time to link command buffers in the submission that have
+    * spawned wait threads, so we can then wait on all of them to complete
+    * before we process any signal sempahores or fences.
+    */
+   struct list_head list_link;
+
     VkCommandBufferUsageFlags usage_flags;
     VkCommandBufferLevel level;
  
@@ -975,7 +1035,7 @@ struct v3dv_fence {
  };
  
  struct v3dv_event {
-   struct v3dv_bo *bo;
+   int state;
  };
  
  struct v3dv_shader_module {
diff --git a/src/broadcom/vulkan/v3dv_queue.c b/src/broadcom/vulkan/v3dv_queue.c

index 63ab297..93d5451 100644 (file)
--- a/src/broadcom/vulkan/v3dv_queue.c
+++ b/src/broadcom/vulkan/v3dv_queue.c
@@ -26,6 +26,8 @@
  
  #include "broadcom/clif/clif_dump.h"
  
+#include "u_atomic.h"
+
  #include <errno.h>
  #include <time.h>
  
@@ -77,6 +79,80 @@ get_absolute_timeout(uint64_t timeout)
  }
  
  static VkResult
+queue_submit_job(struct v3dv_queue *queue,
+                 struct v3dv_job *job,
+                 bool do_wait,
+                 pthread_t *wait_thread);
+
+/* Waits for active CPU wait threads spawned before the current thread to
+ * complete and submit all their GPU jobs.
+ */
+static void
+cpu_queue_wait_idle(struct v3dv_queue *queue)
+{
+   const pthread_t this_thread = pthread_self();
+
+retry:
+   mtx_lock(&queue->mutex);
+   list_for_each_entry(struct v3dv_queue_submit_wait_info, info,
+                       &queue->submit_wait_list, list_link) {
+      for (uint32_t  i = 0; i < info->wait_thread_count; i++) {
+         if (info->wait_threads[i].finished)
+            continue;
+
+         /* Because we are testing this against the list of spawned threads
+          * it will never match for the main thread, so when we call this from
+          * the main thread we are effectively waiting for all active threads
+          * to complete, and otherwise we are only waiting for work submitted
+          * before the wait thread that called this (a wait thread should never
+          * be waiting for work submitted after it).
+          */
+         if (info->wait_threads[i].thread == this_thread)
+            goto done;
+
+         /* Wait and try again */
+         mtx_unlock(&queue->mutex);
+         usleep(500); /* 0.5 ms */
+         goto retry;
+      }
+   }
+
+done:
+   mtx_unlock(&queue->mutex);
+}
+
+static VkResult
+gpu_queue_wait_idle(struct v3dv_queue *queue)
+{
+   struct v3dv_device *device = queue->device;
+
+   mtx_lock(&device->mutex);
+   uint32_t last_job_sync = device->last_job_sync;
+   mtx_unlock(&device->mutex);
+
+   int ret = drmSyncobjWait(device->render_fd,
+                            &last_job_sync, 1, INT64_MAX, 0, NULL);
+   if (ret)
+      return VK_ERROR_DEVICE_LOST;
+
+   return VK_SUCCESS;
+}
+
+VkResult
+v3dv_QueueWaitIdle(VkQueue _queue)
+{
+   V3DV_FROM_HANDLE(v3dv_queue, queue, _queue);
+
+   /* Check that we don't have any wait threads running in the CPU first,
+    * as these can spawn new GPU jobs.
+    */
+   cpu_queue_wait_idle(queue);
+
+   /* Check we don't have any GPU jobs running */
+   return gpu_queue_wait_idle(queue);
+}
+
+static VkResult
  handle_reset_query_cpu_job(struct v3dv_job *job)
  {
     /* We are about to reset query counters so we need to make sure that
@@ -85,7 +161,9 @@ handle_reset_query_cpu_job(struct v3dv_job *job)
      * FIXME: we could avoid blocking the main thread for this if we use
      *        submission thread.
      */
-   v3dv_DeviceWaitIdle(v3dv_device_to_handle(job->device));
+   VkResult result = gpu_queue_wait_idle(&job->device->queue);
+   if (result != VK_SUCCESS)
+      return result;
  
     struct v3dv_reset_query_cpu_job_info *info = &job->cpu.query_reset;
     for (uint32_t i = info->first; i < info->first + info->count; i++) {
@@ -157,6 +235,161 @@ handle_copy_query_results_cpu_job(struct v3dv_job *job)
  }
  
  static VkResult
+handle_set_event_cpu_job(struct v3dv_job *job, bool is_wait_thread)
+{
+   /* From the Vulkan 1.0 spec:
+    *
+    *    "When vkCmdSetEvent is submitted to a queue, it defines an execution
+    *     dependency on commands that were submitted before it, and defines an
+    *     event signal operation which sets the event to the signaled state.
+    *     The first synchronization scope includes every command previously
+    *     submitted to the same queue, including those in the same command
+    *     buffer and batch".
+    *
+    * So we should wait for all prior work to be completed before signaling
+    * the event, this includes all active CPU wait threads spawned for any
+    * command buffer submitted *before* this.
+    *
+    * FIXME: we could avoid blocking the main thread for this if we use a
+    *        submission thread.
+    */
+
+   /* If we are calling this from a wait thread it will only wait
+    * wait threads sspawned before it, otherwise it will wait for
+    * all active threads to complete.
+    */
+   cpu_queue_wait_idle(&job->device->queue);
+
+   VkResult result = gpu_queue_wait_idle(&job->device->queue);
+   if (result != VK_SUCCESS)
+      return result;
+
+   struct v3dv_event_set_cpu_job_info *info = &job->cpu.event_set;
+   p_atomic_set(&info->event->state, info->state);
+
+   return VK_SUCCESS;
+}
+
+static bool
+check_wait_events_complete(struct v3dv_job *job)
+{
+   assert(job->type == V3DV_JOB_TYPE_CPU_WAIT_EVENTS);
+
+   struct v3dv_event_wait_cpu_job_info *info = &job->cpu.event_wait;
+   for (uint32_t i = 0; i < info->event_count; i++) {
+      if (!p_atomic_read(&info->events[i]->state))
+         return false;
+   }
+   return true;
+}
+
+static void
+wait_thread_finish(struct v3dv_queue *queue, pthread_t thread)
+{
+   mtx_lock(&queue->mutex);
+   list_for_each_entry(struct v3dv_queue_submit_wait_info, info,
+                       &queue->submit_wait_list, list_link) {
+      for (uint32_t  i = 0; i < info->wait_thread_count; i++) {
+         if (info->wait_threads[i].thread == thread) {
+            info->wait_threads[i].finished = true;
+            goto done;
+         }
+      }
+   }
+
+   unreachable(!"Failed to finish wait thread: not found");
+
+done:
+   mtx_unlock(&queue->mutex);
+}
+
+static void *
+event_wait_thread_func(void *_job)
+{
+   struct v3dv_job *job = (struct v3dv_job *) _job;
+   assert(job->type == V3DV_JOB_TYPE_CPU_WAIT_EVENTS);
+   struct v3dv_event_wait_cpu_job_info *info = &job->cpu.event_wait;
+
+   /* Wait for events to be signaled */
+   const useconds_t wait_interval_ms = 1;
+   while (!check_wait_events_complete(job))
+      usleep(wait_interval_ms * 1000);
+
+   /* Now continue submitting pending jobs for the same command buffer after
+    * the wait job.
+    */
+   struct v3dv_queue *queue = &job->device->queue;
+   list_for_each_entry_from(struct v3dv_job, pjob, job->list_link.next,
+                            &job->cmd_buffer->submit_jobs, list_link) {
+      /* We don't want to spawn more than one wait thread per command buffer.
+       * If this job also requires a wait for events, we will do the wait here.
+       */
+      VkResult result = queue_submit_job(queue, pjob, info->sem_wait, NULL);
+      if (result == VK_NOT_READY) {
+         while (!check_wait_events_complete(pjob)) {
+            usleep(wait_interval_ms * 1000);
+         }
+         result = VK_SUCCESS;
+      }
+
+      if (result != VK_SUCCESS) {
+         fprintf(stderr, "Wait thread job execution failed.\n");
+         goto done;
+      }
+   }
+
+done:
+   wait_thread_finish(queue, pthread_self());
+   return NULL;
+}
+
+static VkResult
+spawn_event_wait_thread(struct v3dv_job *job, pthread_t *wait_thread)
+
+{
+   assert(job->type == V3DV_JOB_TYPE_CPU_WAIT_EVENTS);
+   assert(job->cmd_buffer);
+   assert(wait_thread != NULL);
+
+   if (pthread_create(wait_thread, NULL, event_wait_thread_func, job))
+      return vk_error(job->device->instance, VK_ERROR_DEVICE_LOST);
+
+   return VK_NOT_READY;
+}
+
+static VkResult
+handle_wait_events_cpu_job(struct v3dv_job *job,
+                           bool sem_wait,
+                           pthread_t *wait_thread)
+{
+   assert(job->type == V3DV_JOB_TYPE_CPU_WAIT_EVENTS);
+   struct v3dv_event_wait_cpu_job_info *info = &job->cpu.event_wait;
+
+   /* If all events are signaled then we are done and can continue submitting
+    * the rest of the command buffer normally.
+    */
+   if (check_wait_events_complete(job))
+      return VK_SUCCESS;
+
+   /* Otherwise, we put the rest of the command buffer on a wait thread until
+    * all events are signaled. We only spawn a new thread on the first
+    * wait job we see for a command buffer, any additional wait jobs in the
+    * same command buffer will run in that same wait thread and will get here
+    * with a NULL wait_thread pointer.
+    *
+    * Also, whether we spawn a wait thread or not, we always return
+    * VK_NOT_READY (unless an error happened), so we stop trying to submit
+    * any jobs in the same command buffer after the wait job. The wait thread
+    * will attempt to submit them after the wait completes.
+    */
+   info->sem_wait = sem_wait;
+   if (wait_thread)
+      return spawn_event_wait_thread(job, wait_thread);
+   else
+      return VK_NOT_READY;
+}
+
+static VkResult
  process_semaphores_to_signal(struct v3dv_device *device,
                               uint32_t count, const VkSemaphore *sems)
  {
@@ -164,7 +397,9 @@ process_semaphores_to_signal(struct v3dv_device *device,
        return VK_SUCCESS;
  
     int fd;
+   mtx_lock(&device->mutex);
     drmSyncobjExportSyncFile(device->render_fd, device->last_job_sync, &fd);
+   mtx_unlock(&device->mutex);
     if (fd == -1)
        return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY);
  
@@ -198,7 +433,9 @@ process_fence_to_signal(struct v3dv_device *device, VkFence _fence)
     fence->fd = -1;
  
     int fd;
+   mtx_lock(&device->mutex);
     drmSyncobjExportSyncFile(device->render_fd, device->last_job_sync, &fd);
+   mtx_unlock(&device->mutex);
     if (fd == -1)
        return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY);
  
@@ -220,39 +457,22 @@ handle_cl_job(struct v3dv_queue *queue,
  
     struct drm_v3d_submit_cl submit;
  
-   /* RCL jobs don't start until the previous RCL job has finished so we don't
-    * really need to add a fence for those, however, we might need to wait on a
-    * CSD or TFU job, which are not serialized.
-    *
-    * FIXME: for now, if we are asked to wait on any semaphores, we just wait
-    * on the last job we submitted. In the future we might want to pass the
-    * actual syncobj of the wait semaphores so we don't block on the last RCL
-    * if we only need to wait for a previous CSD or TFU, for example, but
-    * we would have to extend our kernel interface to support the case where
-    * we have more than one semaphore to wait on.
-    */
-   submit.in_sync_bcl = 0;
-   submit.in_sync_rcl = do_wait ? device->last_job_sync : 0;
-
-   /* Update the sync object for the last rendering by this device. */
-   submit.out_sync = device->last_job_sync;
-
     submit.bcl_start = job->bcl.bo->offset;
     submit.bcl_end = job->bcl.bo->offset + v3dv_cl_offset(&job->bcl);
     submit.rcl_start = job->rcl.bo->offset;
     submit.rcl_end = job->rcl.bo->offset + v3dv_cl_offset(&job->rcl);
  
-   submit.flags = 0;
+   submit.qma = job->tile_alloc->offset;
+   submit.qms = job->tile_alloc->size;
+   submit.qts = job->tile_state->offset;
+
     /* FIXME: we already know that we support cache flush, as we only support
      * hw that supports that, but would be better to just DRM-ask it
      */
+   submit.flags = 0;
     if (job->tmu_dirty_rcl)
        submit.flags |= DRM_V3D_SUBMIT_CL_FLUSH_CACHE;
  
-   submit.qma = job->tile_alloc->offset;
-   submit.qms = job->tile_alloc->size;
-   submit.qts = job->tile_state->offset;
-
     submit.bo_handle_count = job->bo_count;
     uint32_t *bo_handles =
        (uint32_t *) malloc(sizeof(uint32_t) * MAX2(4, submit.bo_handle_count * 2));
@@ -264,9 +484,25 @@ handle_cl_job(struct v3dv_queue *queue,
     assert(bo_idx == submit.bo_handle_count);
     submit.bo_handles = (uintptr_t)(void *)bo_handles;
  
+   /* RCL jobs don't start until the previous RCL job has finished so we don't
+    * really need to add a fence for those, however, we might need to wait on a
+    * CSD or TFU job, which are not serialized.
+    *
+    * FIXME: for now, if we are asked to wait on any semaphores, we just wait
+    * on the last job we submitted. In the future we might want to pass the
+    * actual syncobj of the wait semaphores so we don't block on the last RCL
+    * if we only need to wait for a previous CSD or TFU, for example, but
+    * we would have to extend our kernel interface to support the case where
+    * we have more than one semaphore to wait on.
+    */
+   mtx_lock(&queue->device->mutex);
+   submit.in_sync_bcl = 0;
+   submit.in_sync_rcl = do_wait ? device->last_job_sync : 0;
+   submit.out_sync = device->last_job_sync;
     v3dv_clif_dump(device, job, &submit);
-
     int ret = v3dv_ioctl(device->render_fd, DRM_IOCTL_V3D_SUBMIT_CL, &submit);
+   mtx_unlock(&queue->device->mutex);
+
     static bool warned = false;
     if (ret && !warned) {
        fprintf(stderr, "Draw call returned %s. Expect corruption.\n",
@@ -287,12 +523,14 @@ handle_tfu_job(struct v3dv_queue *queue,
                 struct v3dv_job *job,
                 bool do_wait)
  {
-   const struct v3dv_device *device = queue->device;
+   struct v3dv_device *device = queue->device;
  
+   mtx_lock(&device->mutex);
     job->tfu.in_sync = do_wait ? device->last_job_sync : 0;
     job->tfu.out_sync = device->last_job_sync;
-
     int ret = v3dv_ioctl(device->render_fd, DRM_IOCTL_V3D_SUBMIT_TFU, &job->tfu);
+   mtx_unlock(&device->mutex);
+
     if (ret != 0) {
        fprintf(stderr, "Failed to submit TFU job: %d\n", ret);
        return vk_error(device->instance, VK_ERROR_DEVICE_LOST);
@@ -304,7 +542,8 @@ handle_tfu_job(struct v3dv_queue *queue,
  static VkResult
  queue_submit_job(struct v3dv_queue *queue,
                   struct v3dv_job *job,
-                 bool do_wait)
+                 bool do_wait,
+                 pthread_t *wait_thread)
  {
     assert(job);
  
@@ -319,6 +558,10 @@ queue_submit_job(struct v3dv_queue *queue,
        return handle_end_query_cpu_job(job);
     case V3DV_JOB_TYPE_CPU_COPY_QUERY_RESULTS:
        return handle_copy_query_results_cpu_job(job);
+   case V3DV_JOB_TYPE_CPU_SET_EVENT:
+      return handle_set_event_cpu_job(job, wait_thread != NULL);
+   case V3DV_JOB_TYPE_CPU_WAIT_EVENTS:
+      return handle_wait_events_cpu_job(job, do_wait, wait_thread);
     default:
        unreachable("Unhandled job type");
     }
@@ -439,13 +682,15 @@ queue_submit_noop_job(struct v3dv_queue *queue, const VkSubmitInfo *pSubmit)
           return result;
     }
  
-   return queue_submit_job(queue, noop_job, pSubmit->waitSemaphoreCount > 0);
+   return queue_submit_job(queue, noop_job, pSubmit->waitSemaphoreCount > 0,
+                           NULL);
  }
  
  static VkResult
  queue_submit_cmd_buffer(struct v3dv_queue *queue,
                          struct v3dv_cmd_buffer *cmd_buffer,
-                        const VkSubmitInfo *pSubmit)
+                        const VkSubmitInfo *pSubmit,
+                        pthread_t *wait_thread)
  {
     assert(cmd_buffer);
  
@@ -455,7 +700,8 @@ queue_submit_cmd_buffer(struct v3dv_queue *queue,
     list_for_each_entry_safe(struct v3dv_job, job,
                              &cmd_buffer->submit_jobs, list_link) {
        VkResult result = queue_submit_job(queue, job,
-                                         pSubmit->waitSemaphoreCount > 0);
+                                         pSubmit->waitSemaphoreCount > 0,
+                                         wait_thread);
        if (result != VK_SUCCESS)
           return result;
     }
@@ -463,12 +709,81 @@ queue_submit_cmd_buffer(struct v3dv_queue *queue,
     return VK_SUCCESS;
  }
  
+static void
+add_wait_thread_to_list(struct v3dv_device *device,
+                        pthread_t thread,
+                        struct v3dv_queue_submit_wait_info **wait_info)
+{
+   /* If this is the first time we spawn a wait thread for this queue
+    * submission create a v3dv_queue_submit_wait_info to track this and
+    * any other threads in the same submission and add it to the global list
+    * in the queue.
+    */
+   if (*wait_info == NULL) {
+      *wait_info =
+         vk_zalloc(&device->alloc, sizeof(struct v3dv_queue_submit_wait_info), 8,
+                   VK_SYSTEM_ALLOCATION_SCOPE_COMMAND);
+      (*wait_info)->device = device;
+   }
+
+   /* And add the thread to the list of wait threads for this submission */
+   const uint32_t thread_idx = (*wait_info)->wait_thread_count;
+   assert(thread_idx < 16);
+   (*wait_info)->wait_threads[thread_idx].thread = thread;
+   (*wait_info)->wait_threads[thread_idx].finished = false;
+   (*wait_info)->wait_thread_count++;
+}
+
+static void
+add_signal_semaphores_to_wait_list(struct v3dv_device *device,
+                                   const VkSubmitInfo *pSubmit,
+                                   struct v3dv_queue_submit_wait_info *wait_info)
+{
+   assert(wait_info);
+
+   if (pSubmit->signalSemaphoreCount == 0)
+      return;
+
+   /* FIXME: We put all the semaphores in a list and we signal all of them
+    * together from the submit master thread when the last wait thread in the
+    * submit completes. We could do better though: group the semaphores per
+    * submit and signal them as soon as all wait threads for a particular
+    * submit completes. Not sure if the extra work would be worth it though,
+    * since we only spawn waith threads for event waits and only when the
+    * event if set from the host after the queue submission.
+    */
+
+   /* Check the size of the current semaphore list */
+   const uint32_t prev_count = wait_info->signal_semaphore_count;
+   const uint32_t prev_alloc_size = prev_count * sizeof(VkSemaphore);
+   VkSemaphore *prev_list = wait_info->signal_semaphores;
+
+   /* Resize the list to hold the additional semaphores */
+   const uint32_t extra_alloc_size =
+      pSubmit->signalSemaphoreCount * sizeof(VkSemaphore);
+   wait_info->signal_semaphore_count += pSubmit->signalSemaphoreCount;
+   wait_info->signal_semaphores =
+      vk_alloc(&device->alloc, prev_alloc_size + extra_alloc_size, 8,
+               VK_SYSTEM_ALLOCATION_SCOPE_COMMAND);
+
+   /* Copy the old list to the new allocation and free the old list */
+   if (prev_count > 0) {
+      memcpy(wait_info->signal_semaphores, prev_list, prev_alloc_size);
+      vk_free(&device->alloc, prev_list);
+   }
+
+   /* Add the new semaphores to the list */
+   memcpy(wait_info->signal_semaphores + prev_count,
+          pSubmit->pSignalSemaphores, extra_alloc_size);
+}
+
  static VkResult
  queue_submit_cmd_buffer_batch(struct v3dv_queue *queue,
                                const VkSubmitInfo *pSubmit,
-                              VkFence fence)
+                              struct v3dv_queue_submit_wait_info **wait_info)
  {
     VkResult result = VK_SUCCESS;
+   bool has_wait_threads = false;
  
     /* Even if we don't have any actual work to submit we still need to wait
      * on the wait semaphores and signal the signal semaphores and fence, so
@@ -479,9 +794,24 @@ queue_submit_cmd_buffer_batch(struct v3dv_queue *queue,
        result = queue_submit_noop_job(queue, pSubmit);
     } else {
        for (uint32_t i = 0; i < pSubmit->commandBufferCount; i++) {
+         pthread_t wait_thread;
           struct v3dv_cmd_buffer *cmd_buffer =
              v3dv_cmd_buffer_from_handle(pSubmit->pCommandBuffers[i]);
-         result = queue_submit_cmd_buffer(queue, cmd_buffer, pSubmit);
+         result = queue_submit_cmd_buffer(queue, cmd_buffer, pSubmit,
+                                          &wait_thread);
+
+         /* We get VK_NOT_READY if we had to spawn a wait thread for the
+          * command buffer. In that scenario, we want to continue submitting
+          * any pending command buffers in the batch, but we don't want to
+          * process any signal semaphores for the batch until we know we have
+          * submitted every job for every command buffer in the batch.
+          */
+         if (result == VK_NOT_READY) {
+            result = VK_SUCCESS;
+            add_wait_thread_to_list(queue->device, wait_thread, wait_info);
+            has_wait_threads = true;
+         }
+
           if (result != VK_SUCCESS)
              break;
        }
@@ -490,13 +820,78 @@ queue_submit_cmd_buffer_batch(struct v3dv_queue *queue,
     if (result != VK_SUCCESS)
        return result;
  
-   result = process_semaphores_to_signal(queue->device,
-                                         pSubmit->signalSemaphoreCount,
-                                         pSubmit->pSignalSemaphores);
+   /* If had to emit any wait threads in this submit we need to wait for all
+    * of them to complete before we can signal any semaphores.
+    */
+   if (!has_wait_threads) {
+      return process_semaphores_to_signal(queue->device,
+                                          pSubmit->signalSemaphoreCount,
+                                          pSubmit->pSignalSemaphores);
+   } else {
+      assert(*wait_info);
+      add_signal_semaphores_to_wait_list(queue->device, pSubmit, *wait_info);
+      return VK_NOT_READY;
+   }
+}
+
+static void *
+master_wait_thread_func(void *_wait_info)
+{
+   struct v3dv_queue_submit_wait_info *wait_info =
+      (struct v3dv_queue_submit_wait_info *) _wait_info;
+
+   struct v3dv_queue *queue = &wait_info->device->queue;
+
+   /* Wait for all command buffer wait threads to complete */
+   for (uint32_t i = 0; i < wait_info->wait_thread_count; i++) {
+      int res = pthread_join(wait_info->wait_threads[i].thread, NULL);
+      if (res != 0)
+         fprintf(stderr, "Wait thread failed to join.\n");
+   }
+
+   /* Signal semaphores and fences */
+   VkResult result;
+   result = process_semaphores_to_signal(wait_info->device,
+                                         wait_info->signal_semaphore_count,
+                                         wait_info->signal_semaphores);
     if (result != VK_SUCCESS)
-      return result;
+      fprintf(stderr, "Wait thread semaphore signaling failed.");
  
-   return VK_SUCCESS;
+   result = process_fence_to_signal(wait_info->device, wait_info->fence);
+   if (result != VK_SUCCESS)
+      fprintf(stderr, "Wait thread fence signaling failed.");
+
+   /* Release wait_info */
+   mtx_lock(&queue->mutex);
+   list_del(&wait_info->list_link);
+   mtx_unlock(&queue->mutex);
+
+   vk_free(&wait_info->device->alloc, wait_info->signal_semaphores);
+   vk_free(&wait_info->device->alloc, wait_info);
+
+   return NULL;
+}
+
+
+static VkResult
+spawn_master_wait_thread(struct v3dv_queue *queue,
+                         struct v3dv_queue_submit_wait_info *wait_info)
+
+{
+   VkResult result = VK_SUCCESS;
+
+   mtx_lock(&queue->mutex);
+   if (pthread_create(&wait_info->master_wait_thread, NULL,
+                      master_wait_thread_func, wait_info)) {
+      result = vk_error(queue->device->instance, VK_ERROR_DEVICE_LOST);
+      goto done;
+   }
+
+   list_addtail(&wait_info->list_link, &queue->submit_wait_list);
+
+done:
+   mtx_unlock(&queue->mutex);
+   return result;
  }
  
  VkResult
@@ -507,18 +902,31 @@ v3dv_QueueSubmit(VkQueue _queue,
  {
     V3DV_FROM_HANDLE(v3dv_queue, queue, _queue);
  
+   struct v3dv_queue_submit_wait_info *wait_info = NULL;
+
     VkResult result = VK_SUCCESS;
     for (uint32_t i = 0; i < submitCount; i++) {
-      result = queue_submit_cmd_buffer_batch(queue, &pSubmits[i], fence);
-      if (result != VK_SUCCESS)
-         return result;
+      result = queue_submit_cmd_buffer_batch(queue, &pSubmits[i], &wait_info);
+      if (result != VK_SUCCESS && result != VK_NOT_READY)
+         goto done;
     }
  
-   result = process_fence_to_signal(queue->device, fence);
-   if (result != VK_SUCCESS)
-      return result;
+   if (!wait_info) {
+      assert(result != VK_NOT_READY);
+      result = process_fence_to_signal(queue->device, fence);
+      goto done;
+   }
  
-   return VK_SUCCESS;
+   /* We emitted wait threads, so we have to spwan a master thread for this
+    * queue submission that waits for all other threads to complete and then
+    * will signal any semaphores and fences.
+    */
+   assert(wait_info);
+   wait_info->fence = fence;
+   result = spawn_master_wait_thread(queue, wait_info);
+
+done:
+   return result;
  }
  
  VkResult
author	Iago Toral Quiroga <itoral@igalia.com>
	Mon, 18 May 2020 08:41:11 +0000 (10:41 +0200)
committer	Marge Bot <eric+marge@anholt.net>
	Tue, 13 Oct 2020 21:21:30 +0000 (21:21 +0000)
src/broadcom/vulkan/v3dv_cmd_buffer.c		patch \| blob \| history
src/broadcom/vulkan/v3dv_device.c		patch \| blob \| history
src/broadcom/vulkan/v3dv_private.h		patch \| blob \| history
src/broadcom/vulkan/v3dv_queue.c		patch \| blob \| history