From 5c7e60362cbf26e331b5621fac643813558b6493 Mon Sep 17 00:00:00 2001
From: Juston Li
Date: Mon, 6 Feb 2023 13:44:39 -0800
Subject: [PATCH] venus: enable timeline semaphore feedback

At vkQueueSubmit time, for each batch with timeline semaphores to
signal, append cmd buffers with feedback cmds that update the counter
values in their respective feedback slots.

Since multiple signals on the same semaphore can be pending at the same
time across batches/vkQueueSubmits, src slots and commands are
allocated on demand. These src slots can be reused after they have
signaled (i.e. the current semaphore counter is greater than or equal
to the src value) and are cleaned up on vkDestroySemaphore.

Signed-off-by: Juston Li
Part-of:
---
 src/virtio/vulkan/vn_feedback.c |   6 +-
 src/virtio/vulkan/vn_queue.c    | 612 ++++++++++++++++++++++++++++++++++++---
 src/virtio/vulkan/vn_queue.h    |  32 +++
 3 files changed, 607 insertions(+), 43 deletions(-)

diff --git a/src/virtio/vulkan/vn_feedback.c b/src/virtio/vulkan/vn_feedback.c
index e7275ec..0fc5f84 100644
--- a/src/virtio/vulkan/vn_feedback.c
+++ b/src/virtio/vulkan/vn_feedback.c
@@ -418,7 +418,7 @@ vn_feedback_cmd_record(VkCommandBuffer cmd_handle,
    /* slot size is 8 bytes for timeline semaphore and 4 bytes fence.
     * src slot is non-null for timeline semaphore.
     */
-   VkDeviceSize buf_size = src_slot ? 8 : 4;
+   const VkDeviceSize buf_size = src_slot ? 8 : 4;
 
    static const VkCommandBufferBeginInfo begin_info = {
       .sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_BEGIN_INFO,
@@ -503,7 +503,7 @@
 VkResult
 vn_feedback_cmd_alloc(VkDevice dev_handle,
                       struct vn_feedback_cmd_pool *pool,
-                      struct vn_feedback_slot *slot,
+                      struct vn_feedback_slot *dst_slot,
                       struct vn_feedback_slot *src_slot,
                       VkCommandBuffer *out_cmd_handle)
 {
@@ -522,7 +522,7 @@ vn_feedback_cmd_alloc(VkDevice dev_handle,
    if (result != VK_SUCCESS)
       goto out_unlock;
 
-   result = vn_feedback_cmd_record(cmd_handle, slot, src_slot);
+   result = vn_feedback_cmd_record(cmd_handle, dst_slot, src_slot);
    if (result != VK_SUCCESS) {
       vn_FreeCommandBuffers(dev_handle, pool->pool, 1, &cmd_handle);
       goto out_unlock;
diff --git a/src/virtio/vulkan/vn_queue.c b/src/virtio/vulkan/vn_queue.c
index 55da4fc..3d8f58e 100644
--- a/src/virtio/vulkan/vn_queue.c
+++ b/src/virtio/vulkan/vn_queue.c
@@ -59,21 +59,33 @@ struct vn_queue_submission {
    bool synchronous;
 
    bool has_feedback_fence;
+   bool has_feedback_semaphore;
 
    const struct vn_device_memory *wsi_mem;
-
+   uint32_t sem_cmd_buffer_count;
+
+   /* Temporary storage allocation for submission
+    * A single alloc for storage is performed and the offsets inside
+    * storage are set as below:
+    * batches
+    *  - copy of SubmitInfos
+    *  - an extra SubmitInfo for appending fence feedback
+    * cmds
+    *  - copy of cmd buffers for any batch with sem feedback, plus
+    *    additional cmd buffers for each signal semaphore that uses
+    *    feedback
+    *  - an extra cmd buffer info for appending fence feedback
+    *    when using SubmitInfo2
+    */
    struct {
       void *storage;
 
-      /* ptr offset to cmd buffer info needed by SubmitInfo2
-       * for fence feedback */
-      VkCommandBufferSubmitInfo *fence_feedback_cmd_info;
-
-      /* ptr offsets to batches in storage */
       union {
          void *batches;
         VkSubmitInfo *submit_batches;
         VkSubmitInfo2 *submit_batches2;
      };
+
+      void *cmds;
   } temp;
};

@@ -133,6 +145,53 @@ vn_get_signal_semaphore(struct vn_queue_submission *submit,
             .semaphore;
 }
 
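+/* Helpers that dispatch on submit->batch_type so the feedback logic below
+ * can handle VkSubmitInfo and VkSubmitInfo2 batches uniformly.
+ */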
+static inline uint32_t
+vn_get_cmd_buffer_count(struct vn_queue_submission *submit,
+                        uint32_t batch_index)
+{
+   assert((submit->batch_type == VK_STRUCTURE_TYPE_SUBMIT_INFO) ||
+          (submit->batch_type == VK_STRUCTURE_TYPE_SUBMIT_INFO_2));
+
+   return submit->batch_type == VK_STRUCTURE_TYPE_SUBMIT_INFO
+             ? submit->submit_batches[batch_index].commandBufferCount
+             : submit->submit_batches2[batch_index].commandBufferInfoCount;
+}
+
+static inline const void *
+vn_get_cmd_buffer_ptr(struct vn_queue_submission *submit,
+                      uint32_t batch_index)
+{
+   assert((submit->batch_type == VK_STRUCTURE_TYPE_SUBMIT_INFO) ||
+          (submit->batch_type == VK_STRUCTURE_TYPE_SUBMIT_INFO_2));
+
+   return submit->batch_type == VK_STRUCTURE_TYPE_SUBMIT_INFO
+             ? (const void *)submit->submit_batches[batch_index]
+                  .pCommandBuffers
+             : (const void *)submit->submit_batches2[batch_index]
+                  .pCommandBufferInfos;
+}
+
+static uint64_t
+vn_get_signal_semaphore_counter(struct vn_queue_submission *submit,
+                                uint32_t batch_index,
+                                uint32_t semaphore_index)
+{
+   switch (submit->batch_type) {
+   case VK_STRUCTURE_TYPE_SUBMIT_INFO: {
+      const struct VkTimelineSemaphoreSubmitInfo *timeline_semaphore_info =
+         vk_find_struct_const(submit->submit_batches[batch_index].pNext,
+                              TIMELINE_SEMAPHORE_SUBMIT_INFO);
+      return timeline_semaphore_info->pSignalSemaphoreValues[semaphore_index];
+   }
+   case VK_STRUCTURE_TYPE_SUBMIT_INFO_2:
+      return submit->submit_batches2[batch_index]
+         .pSignalSemaphoreInfos[semaphore_index]
+         .value;
+   default:
+      unreachable("unexpected batch type");
+   }
+}
+
 static VkResult
 vn_queue_submission_fix_batch_semaphores(struct vn_queue_submission *submit,
                                          uint32_t batch_index)
@@ -166,14 +225,27 @@ vn_queue_submission_fix_batch_semaphores(struct vn_queue_submission *submit,
          dev->instance, vn_device_to_handle(dev), &res_info);
    }
 
+   bool batch_has_sem_feedback = false;
    for (uint32_t i = 0; i < signal_count; i++) {
       struct vn_semaphore *sem = vn_semaphore_from_handle(
          vn_get_signal_semaphore(submit, batch_index, i));
 
       /* see vn_queue_submission_prepare */
       submit->synchronous |= sem->is_external;
+
+      if (sem->feedback.slot) {
+         batch_has_sem_feedback = true;
+         submit->sem_cmd_buffer_count++;
+      }
+   }
+
+   if (batch_has_sem_feedback) {
+      submit->sem_cmd_buffer_count +=
+         vn_get_cmd_buffer_count(submit, batch_index);
    }
 
+   submit->has_feedback_semaphore |= batch_has_sem_feedback;
+
    return VK_SUCCESS;
 }
 
@@ -226,9 +298,9 @@ vn_queue_submission_alloc_storage(struct vn_queue_submission *submit)
    size_t batch_size = 0;
    size_t cmd_size = 0;
    size_t alloc_size = 0;
-   size_t batch_offset = 0;
+   size_t cmd_offset = 0;
 
-   if (!submit->has_feedback_fence)
+   if (!submit->has_feedback_fence && !submit->has_feedback_semaphore)
       return VK_SUCCESS;
 
    switch (submit->batch_type) {
@@ -239,23 +311,32 @@ vn_queue_submission_alloc_storage(struct vn_queue_submission *submit)
    case VK_STRUCTURE_TYPE_SUBMIT_INFO_2:
       batch_size = sizeof(VkSubmitInfo2);
       cmd_size = sizeof(VkCommandBufferSubmitInfo);
+      break;
+   default:
+      unreachable("unexpected batch type");
+   }
+
+   /* space for copied batches */
+   alloc_size = batch_size * submit->batch_count;
+   cmd_offset = alloc_size;
+
+   if (submit->has_feedback_fence) {
+      /* add space for an additional batch for fence feedback
+       * and move cmd offset
+       */
+      alloc_size += batch_size;
+      cmd_offset = alloc_size;
 
       /* SubmitInfo2 needs a cmd buffer info struct for the fence
        * feedback cmd
        */
-      if (submit->has_feedback_fence)
+      if (submit->batch_type == VK_STRUCTURE_TYPE_SUBMIT_INFO_2)
          alloc_size += cmd_size;
-
-      break;
-   default:
-      unreachable("unexpected batch type");
    }
 
-   /* offset/size for batches */
-   batch_offset = alloc_size;
-   alloc_size += batch_size * submit->batch_count;
-
-   /* add space for an additional batch for fence feedback */
-   if (submit->has_feedback_fence)
-      alloc_size += batch_size;
+
+   /* space for copied cmds and sem feedback cmds */
+   if (submit->has_feedback_semaphore)
+      alloc_size += submit->sem_cmd_buffer_count * cmd_size;
 
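+   /* Illustrative layout of the single allocation:
+    *   batches: [copied batches][fence feedback batch, if any]
+    *   cmds:    [per-batch copied cmds + sem feedback cmds]
+    *            [fence feedback cmd info, SubmitInfo2 only]
+    */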
    submit->temp.storage = vk_alloc(alloc, alloc_size, VN_DEFAULT_ALIGN,
                                    VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
 
    if (!submit->temp.storage)
       return VK_ERROR_OUT_OF_HOST_MEMORY;
 
-   submit->temp.fence_feedback_cmd_info = submit->temp.storage;
-   submit->temp.batches = submit->temp.storage + batch_offset;
+   submit->temp.batches = submit->temp.storage;
+   submit->temp.cmds = submit->temp.storage + cmd_offset;
+
+   return VK_SUCCESS;
+}
+
+struct vn_feedback_src {
+   struct vn_feedback_slot *src_slot;
+   VkCommandBuffer *commands;
+
+   struct list_head head;
+};
+
+static VkResult
+vn_timeline_semaphore_feedback_src_init(struct vn_device *dev,
+                                        struct vn_feedback_slot *slot,
+                                        struct vn_feedback_src *feedback_src,
+                                        const VkAllocationCallbacks *alloc)
+{
+   VkResult result;
+   VkDevice dev_handle = vn_device_to_handle(dev);
+
+   feedback_src->src_slot = vn_feedback_pool_alloc(
+      &dev->feedback_pool, VN_FEEDBACK_TYPE_TIMELINE_SEMAPHORE);
+
+   if (!feedback_src->src_slot)
+      return VK_ERROR_OUT_OF_HOST_MEMORY;
+
+   feedback_src->commands = vk_zalloc(
+      alloc, sizeof(*feedback_src->commands) * dev->queue_family_count,
+      VN_DEFAULT_ALIGN, VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
+
+   if (!feedback_src->commands) {
+      vn_feedback_pool_free(&dev->feedback_pool, feedback_src->src_slot);
+      return VK_ERROR_OUT_OF_HOST_MEMORY;
+   }
+
+   for (uint32_t i = 0; i < dev->queue_family_count; i++) {
+      result = vn_feedback_cmd_alloc(dev_handle, &dev->cmd_pools[i], slot,
+                                     feedback_src->src_slot,
+                                     &feedback_src->commands[i]);
+      if (result != VK_SUCCESS) {
+         for (uint32_t j = 0; j < i; j++) {
+            vn_feedback_cmd_free(dev_handle, &dev->cmd_pools[j],
+                                 feedback_src->commands[j]);
+         }
+         vk_free(alloc, feedback_src->commands);
+         vn_feedback_pool_free(&dev->feedback_pool, feedback_src->src_slot);
+         return result;
+      }
+   }
+
+   return VK_SUCCESS;
+}
+
+static VkResult
+vn_set_sem_feedback_cmd(struct vn_queue *queue,
+                        struct vn_semaphore *sem,
+                        uint64_t counter,
+                        VkCommandBuffer *cmd_handle)
+{
+   VkResult result;
+   struct vn_device *dev = queue->device;
+   const VkAllocationCallbacks *alloc = &dev->base.base.alloc;
+   struct vn_feedback_src *free_feedback_src = NULL;
+
+   assert(sem->feedback.slot);
+
+   simple_mtx_lock(&sem->feedback.src_lists_mtx);
+   if (!list_is_empty(&sem->feedback.free_src_list)) {
+      free_feedback_src = list_first_entry(&sem->feedback.free_src_list,
+                                           struct vn_feedback_src, head);
+      list_move_to(&free_feedback_src->head, &sem->feedback.pending_src_list);
+   }
+   simple_mtx_unlock(&sem->feedback.src_lists_mtx);
+
+   if (!free_feedback_src) {
+      /* allocate a new src slot if none are free */
+      free_feedback_src =
+         vk_zalloc(alloc, sizeof(*free_feedback_src), VN_DEFAULT_ALIGN,
+                   VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
+
+      if (!free_feedback_src)
+         return VK_ERROR_OUT_OF_HOST_MEMORY;
+
+      result = vn_timeline_semaphore_feedback_src_init(
+         dev, sem->feedback.slot, free_feedback_src, alloc);
+      if (result != VK_SUCCESS) {
+         vk_free(alloc, free_feedback_src);
+         return result;
+      }
+
+      simple_mtx_lock(&sem->feedback.src_lists_mtx);
+      list_add(&free_feedback_src->head, &sem->feedback.pending_src_list);
+      simple_mtx_unlock(&sem->feedback.src_lists_mtx);
+   }
+
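+   /* Stamp the value into the src slot; the feedback cmd recorded against
+    * this src slot copies it into the semaphore's feedback slot when the
+    * batch executes.
+    */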
+   vn_feedback_set_counter(free_feedback_src->src_slot, counter);
+
+   for (uint32_t i = 0; i < queue->device->queue_family_count; i++) {
+      if (queue->device->queue_families[i] == queue->family) {
+         *cmd_handle = free_feedback_src->commands[i];
+         return VK_SUCCESS;
+      }
+   }
+
+   unreachable("bad feedback sem");
+}
+
+struct vn_feedback_cmds {
+   union {
+      void *cmds;
+      VkCommandBuffer *cmd_buffers;
+      VkCommandBufferSubmitInfo *cmd_buffer_infos;
+   };
+};
+
+static inline VkCommandBuffer *
+vn_get_cmd_handle(struct vn_queue_submission *submit,
+                  struct vn_feedback_cmds *feedback_cmds,
+                  uint32_t cmd_index)
+{
+   assert((submit->batch_type == VK_STRUCTURE_TYPE_SUBMIT_INFO) ||
+          (submit->batch_type == VK_STRUCTURE_TYPE_SUBMIT_INFO_2));
+
+   return submit->batch_type == VK_STRUCTURE_TYPE_SUBMIT_INFO
+             ? &feedback_cmds->cmd_buffers[cmd_index]
+             : &feedback_cmds->cmd_buffer_infos[cmd_index].commandBuffer;
+}
+
+static VkResult
+vn_queue_submission_add_semaphore_feedback(
+   struct vn_queue_submission *submit,
+   uint32_t batch_index,
+   uint32_t cmd_buffer_count,
+   uint32_t sem_feedback_count,
+   struct vn_feedback_cmds *feedback_cmds)
+{
+   struct vn_queue *queue = vn_queue_from_handle(submit->queue_handle);
+   uint32_t signal_semaphore_count =
+      vn_get_signal_semaphore_count(submit, batch_index);
+   VkResult result;
+
+   /* Update the SubmitInfo to use our copy of cmd buffers with the sem
+    * feedback cmds appended and update the cmd buffer count.
+    * SubmitInfo2 also needs its cmd buffer info structs initialized.
+    */
+   switch (submit->batch_type) {
+   case VK_STRUCTURE_TYPE_SUBMIT_INFO: {
+      VkSubmitInfo *submit_info = &submit->temp.submit_batches[batch_index];
+
+      submit_info->pCommandBuffers = feedback_cmds->cmd_buffers;
+      submit_info->commandBufferCount = cmd_buffer_count + sem_feedback_count;
+      break;
+   }
+   case VK_STRUCTURE_TYPE_SUBMIT_INFO_2: {
+      VkSubmitInfo2 *submit_info2 =
+         &submit->temp.submit_batches2[batch_index];
+
+      for (uint32_t i = cmd_buffer_count;
+           i < cmd_buffer_count + sem_feedback_count; i++) {
+         VkCommandBufferSubmitInfo *cmd_buffer_info =
+            &feedback_cmds->cmd_buffer_infos[i];
+
+         cmd_buffer_info->sType =
+            VK_STRUCTURE_TYPE_COMMAND_BUFFER_SUBMIT_INFO;
+         cmd_buffer_info->pNext = NULL;
+         cmd_buffer_info->deviceMask = 0;
+      }
+
+      submit_info2->pCommandBufferInfos = feedback_cmds->cmd_buffer_infos;
+      submit_info2->commandBufferInfoCount =
+         cmd_buffer_count + sem_feedback_count;
+      break;
+   }
+   default:
+      unreachable("unexpected batch type");
+   }
+
+   /* Fill the cmd slots we appended to our copy of the cmd buffers with
+    * feedback cmds that write each signal semaphore's counter value.
+    */
+   uint32_t cmd_index = cmd_buffer_count;
+   for (uint32_t i = 0; i < signal_semaphore_count; i++) {
+      struct vn_semaphore *sem = vn_semaphore_from_handle(
+         vn_get_signal_semaphore(submit, batch_index, i));
+
+      if (sem->feedback.slot) {
+         VkCommandBuffer *cmd_handle =
+            vn_get_cmd_handle(submit, feedback_cmds, cmd_index);
+
+         uint64_t counter =
+            vn_get_signal_semaphore_counter(submit, batch_index, i);
+
+         result = vn_set_sem_feedback_cmd(queue, sem, counter, cmd_handle);
+         if (result != VK_SUCCESS)
+            return result;
+
+         cmd_index++;
+      }
+   }
 
    return VK_SUCCESS;
 }
 
@@ -283,7 +563,9 @@ vn_get_fence_feedback_cmd(struct vn_queue *queue, struct vn_fence *fence)
 }
 
 static void
-vn_queue_submission_add_fence_feedback(struct vn_queue_submission *submit)
+vn_queue_submission_add_fence_feedback(
+   struct vn_queue_submission *submit,
+   VkCommandBufferSubmitInfo *fence_feedback_cmd)
 {
    struct vn_queue *queue = vn_queue_from_handle(submit->queue_handle);
    struct vn_fence *fence = vn_fence_from_handle(submit->fence_handle);
@@ -307,10 +589,7 @@ vn_queue_submission_add_fence_feedback(struct vn_queue_submission *submit)
       break;
    }
    case VK_STRUCTURE_TYPE_SUBMIT_INFO_2: {
-      VkCommandBufferSubmitInfo *cmd_buffer_info =
-         submit->temp.fence_feedback_cmd_info;
-
-      *cmd_buffer_info = (VkCommandBufferSubmitInfo){
+      *fence_feedback_cmd = (VkCommandBufferSubmitInfo){
          .sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_SUBMIT_INFO,
          .commandBuffer = *cmd_handle,
       };
@@ -321,7 +600,7 @@ vn_queue_submission_add_fence_feedback(struct vn_queue_submission *submit)
       *submit_info2 = (VkSubmitInfo2){
          .sType = VK_STRUCTURE_TYPE_SUBMIT_INFO_2,
          .commandBufferInfoCount = 1,
-         .pCommandBufferInfos = cmd_buffer_info,
+         .pCommandBufferInfos = fence_feedback_cmd,
       };
       break;
    }
@@ -335,17 +614,21 @@
 static VkResult
 vn_queue_submission_setup_batches(struct vn_queue_submission *submit)
 {
+   VkResult result;
    size_t batch_size = 0;
+   size_t cmd_size = 0;
 
-   if (!submit->has_feedback_fence)
+   if (!submit->has_feedback_fence && !submit->has_feedback_semaphore)
       return VK_SUCCESS;
 
    switch (submit->batch_type) {
    case VK_STRUCTURE_TYPE_SUBMIT_INFO:
       batch_size = sizeof(VkSubmitInfo);
+      cmd_size = sizeof(VkCommandBuffer);
       break;
    case VK_STRUCTURE_TYPE_SUBMIT_INFO_2:
       batch_size = sizeof(VkSubmitInfo2);
+      cmd_size = sizeof(VkCommandBufferSubmitInfo);
       break;
    default:
       unreachable("unexpected batch type");
@@ -361,8 +644,55 @@
               batch_size * submit->batch_count);
    }
 
-   if (submit->has_feedback_fence)
-      vn_queue_submission_add_fence_feedback(submit);
+   /* For any batch with semaphore feedback, copy the original
+    * cmd buffer handles and append the feedback cmds.
+    */
+   uint32_t cmd_offset = 0;
+   for (uint32_t batch_index = 0; batch_index < submit->batch_count;
+        batch_index++) {
+      uint32_t cmd_buffer_count =
+         vn_get_cmd_buffer_count(submit, batch_index);
+      uint32_t signal_count =
+         vn_get_signal_semaphore_count(submit, batch_index);
+
+      uint32_t sem_feedback_count = 0;
+      for (uint32_t i = 0; i < signal_count; i++) {
+         struct vn_semaphore *sem = vn_semaphore_from_handle(
+            vn_get_signal_semaphore(submit, batch_index, i));
+
+         if (sem->feedback.slot)
+            sem_feedback_count++;
+      }
+
+      if (sem_feedback_count) {
+         struct vn_feedback_cmds feedback_cmds = {
+            .cmds = submit->temp.cmds + cmd_offset,
+         };
+
+         size_t cmd_buffer_size = cmd_buffer_count * cmd_size;
+         /* copy only needed for non-empty batches */
+         if (cmd_buffer_size) {
+            memcpy(feedback_cmds.cmds,
+                   vn_get_cmd_buffer_ptr(submit, batch_index),
+                   cmd_buffer_size);
+         }
+
+         result = vn_queue_submission_add_semaphore_feedback(
+            submit, batch_index, cmd_buffer_count, sem_feedback_count,
+            &feedback_cmds);
+         if (result != VK_SUCCESS)
+            return result;
+
+         /* set offset to the next batch's cmd buffers */
+         cmd_offset += cmd_buffer_size + (sem_feedback_count * cmd_size);
+      }
+   }
+
+   if (submit->has_feedback_fence) {
+      VkCommandBufferSubmitInfo *fence_feedback_cmd =
+         submit->temp.cmds + cmd_offset;
+      vn_queue_submission_add_fence_feedback(submit, fence_feedback_cmd);
+   }
 
    submit->submit_batches = submit->temp.submit_batches;
 
@@ -370,12 +700,67 @@
 static void
+vn_queue_sem_recycle_src_feedback(VkDevice dev_handle, VkSemaphore sem_handle)
+{
+   struct vn_semaphore *sem = vn_semaphore_from_handle(sem_handle);
+
+   if (!sem->feedback.slot)
+      return;
+
+   uint64_t curr_counter = 0;
+   vn_GetSemaphoreCounterValue(dev_handle, sem_handle, &curr_counter);
+
+   /* search the pending src list for already-signaled values */
+   simple_mtx_lock(&sem->feedback.src_lists_mtx);
+   list_for_each_entry_safe(struct vn_feedback_src, feedback_src,
+                            &sem->feedback.pending_src_list, head) {
+      if (curr_counter >= vn_feedback_get_counter(feedback_src->src_slot)) {
+         list_move_to(&feedback_src->head, &sem->feedback.free_src_list);
+      }
+   }
+   simple_mtx_unlock(&sem->feedback.src_lists_mtx);
+}
+
+static void
+vn_queue_recycle_src_feedback(struct vn_queue_submission *submit)
+{
+   struct vn_queue *queue = vn_queue_from_handle(submit->queue_handle);
+   struct vn_device *dev = queue->device;
+   VkDevice dev_handle = vn_device_to_handle(dev);
+
+   for (uint32_t batch_index = 0; batch_index < submit->batch_count;
+        batch_index++) {
+      uint32_t wait_count = vn_get_wait_semaphore_count(submit, batch_index);
+      uint32_t signal_count =
+         vn_get_signal_semaphore_count(submit, batch_index);
+
+      for (uint32_t i = 0; i < wait_count; i++) {
+         VkSemaphore sem_handle =
+            vn_get_wait_semaphore(submit, batch_index, i);
+         vn_queue_sem_recycle_src_feedback(dev_handle, sem_handle);
+      }
+
+      for (uint32_t i = 0; i < signal_count; i++) {
+         VkSemaphore sem_handle =
+            vn_get_signal_semaphore(submit, batch_index, i);
+         vn_queue_sem_recycle_src_feedback(dev_handle, sem_handle);
+      }
+   }
+}
+
+static void
 vn_queue_submission_cleanup(struct vn_queue_submission *submit)
 {
    struct vn_queue *queue = vn_queue_from_handle(submit->queue_handle);
    const VkAllocationCallbacks *alloc = &queue->device->base.base.alloc;
 
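+   /* Return already-signaled src slots of this submission's wait and
+    * signal semaphores to their free lists for reuse.
+    */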
-   if (submit->has_feedback_fence)
+   /* TODO clean up pending src feedbacks on failure? */
+   if (submit->has_feedback_semaphore)
+      vn_queue_recycle_src_feedback(submit);
+
+   if (submit->has_feedback_fence || submit->has_feedback_semaphore)
       vk_free(alloc, submit->temp.storage);
 }
 
@@ -816,8 +1201,8 @@ vn_GetFenceStatus(VkDevice device, VkFence _fence)
    if (result == VK_SUCCESS) {
       /* When fence feedback slot gets signaled, the real fence
        * signal operation follows after but the signaling isr can be
-       * deferred or preempted. To avoid theoretical racing, we let
-       * the renderer wait for the fence. This also helps resolve
+       * deferred or preempted. To avoid racing, we let the
+       * renderer wait for the fence. This also helps resolve
        * synchronization validation errors, because the layer no
        * longer sees any fence status checks and falsely believes the
        * caller does not sync.
@@ -1094,6 +1479,84 @@ vn_semaphore_signal_wsi(struct vn_device *dev, struct vn_semaphore *sem)
    sem->payload = temp;
 }
 
+static VkResult
+vn_timeline_semaphore_feedback_init(struct vn_device *dev,
+                                    struct vn_semaphore *sem,
+                                    uint64_t initial_value,
+                                    const VkAllocationCallbacks *alloc)
+{
+   struct vn_feedback_slot *slot;
+
+   assert(sem->type == VK_SEMAPHORE_TYPE_TIMELINE);
+
+   if (sem->is_external)
+      return VK_SUCCESS;
+
+   if (VN_PERF(NO_TIMELINE_SEM_FEEDBACK))
+      return VK_SUCCESS;
+
+   slot = vn_feedback_pool_alloc(&dev->feedback_pool,
+                                 VN_FEEDBACK_TYPE_TIMELINE_SEMAPHORE);
+   if (!slot)
+      return VK_ERROR_OUT_OF_HOST_MEMORY;
+
+   list_inithead(&sem->feedback.pending_src_list);
+   list_inithead(&sem->feedback.free_src_list);
+
+   vn_feedback_set_counter(slot, initial_value);
+
+   simple_mtx_init(&sem->feedback.src_lists_mtx, mtx_plain);
+   simple_mtx_init(&sem->feedback.async_wait_mtx, mtx_plain);
+
+   sem->feedback.signaled_counter = initial_value;
+   sem->feedback.slot = slot;
+
+   return VK_SUCCESS;
+}
+
+static void
+vn_timeline_semaphore_feedback_free(struct vn_device *dev,
+                                    struct vn_feedback_src *feedback_src)
+{
+   VkDevice dev_handle = vn_device_to_handle(dev);
+   const VkAllocationCallbacks *alloc = &dev->base.base.alloc;
+
+   for (uint32_t i = 0; i < dev->queue_family_count; i++) {
+      vn_feedback_cmd_free(dev_handle, &dev->cmd_pools[i],
+                           feedback_src->commands[i]);
+   }
+   vk_free(alloc, feedback_src->commands);
+
+   vn_feedback_pool_free(&dev->feedback_pool, feedback_src->src_slot);
+   /* feedback_src was allocated lazily at submission time using the
+    * device-level allocator, not the one passed to vkCreateSemaphore
+    */
+   vk_free(alloc, feedback_src);
+}
+
+static void
+vn_timeline_semaphore_feedback_fini(struct vn_device *dev,
+                                    struct vn_semaphore *sem)
+{
+   if (!sem->feedback.slot)
+      return;
+
+   list_for_each_entry_safe(struct vn_feedback_src, feedback_src,
+                            &sem->feedback.free_src_list, head) {
+      vn_timeline_semaphore_feedback_free(dev, feedback_src);
+   }
+
+   list_for_each_entry_safe(struct vn_feedback_src, feedback_src,
+                            &sem->feedback.pending_src_list, head) {
+      vn_timeline_semaphore_feedback_free(dev, feedback_src);
+   }
+
+   simple_mtx_destroy(&sem->feedback.src_lists_mtx);
+   simple_mtx_destroy(&sem->feedback.async_wait_mtx);
+
+   vn_feedback_pool_free(&dev->feedback_pool, sem->feedback.slot);
+}
+
 VkResult
 vn_CreateSemaphore(VkDevice device,
                    const VkSemaphoreCreateInfo *pCreateInfo,
@@ -1127,10 +1590,14 @@ vn_CreateSemaphore(VkDevice device,
    sem->is_external = export_info && export_info->handleTypes;
 
    VkResult result = vn_semaphore_init_payloads(dev, sem, initial_val, alloc);
-   if (result != VK_SUCCESS) {
-      vn_object_base_fini(&sem->base);
-      vk_free(alloc, sem);
-      return vn_error(dev->instance, result);
+   if (result != VK_SUCCESS)
+      goto out_object_base_fini;
+
+   if (sem->type == VK_SEMAPHORE_TYPE_TIMELINE) {
+      result =
+         vn_timeline_semaphore_feedback_init(dev, sem, initial_val, alloc);
+      if (result != VK_SUCCESS)
+         goto out_payloads_fini;
    }
 
    VkSemaphore sem_handle = vn_semaphore_to_handle(sem);
@@ -1140,6 +1607,15 @@
    *pSemaphore = sem_handle;
 
    return VK_SUCCESS;
+
+out_payloads_fini:
+   vn_sync_payload_release(dev, &sem->permanent);
+   vn_sync_payload_release(dev, &sem->temporary);
+
+out_object_base_fini:
+   vn_object_base_fini(&sem->base);
+   vk_free(alloc, sem);
+   return vn_error(dev->instance, result);
 }
 
 void
@@ -1158,6 +1634,9 @@ vn_DestroySemaphore(VkDevice device,
 
    vn_async_vkDestroySemaphore(dev->instance, device, semaphore, NULL);
 
+   if (sem->type == VK_SEMAPHORE_TYPE_TIMELINE)
+      vn_timeline_semaphore_feedback_fini(dev, sem);
+
    vn_sync_payload_release(dev, &sem->permanent);
    vn_sync_payload_release(dev, &sem->temporary);
 
@@ -1170,14 +1649,53 @@ vn_GetSemaphoreCounterValue(VkDevice device,
                             VkSemaphore semaphore,
                             uint64_t *pValue)
 {
-   VN_TRACE_FUNC();
    struct vn_device *dev = vn_device_from_handle(device);
    struct vn_semaphore *sem = vn_semaphore_from_handle(semaphore);
    ASSERTED struct vn_sync_payload *payload = sem->payload;
 
    assert(payload->type == VN_SYNC_TYPE_DEVICE_ONLY);
-   return vn_call_vkGetSemaphoreCounterValue(dev->instance, device, semaphore,
-                                             pValue);
+
+   if (sem->feedback.slot) {
+      simple_mtx_lock(&sem->feedback.async_wait_mtx);
+
+      *pValue = vn_feedback_get_counter(sem->feedback.slot);
+
+      if (sem->feedback.signaled_counter < *pValue) {
+         /* When the timeline semaphore feedback slot gets signaled, the real
+          * semaphore signal operation follows after but the signaling isr can
+          * be deferred or preempted. To avoid racing, we let the renderer
+          * wait for the semaphore by sending an asynchronous wait call for
+          * the feedback value.
+          * We also cache the counter value so the async call is only sent
+          * once per counter value, preventing redundant async wait calls.
+          * The cached counter value requires a lock to ensure multiple
+          * threads querying for the same value are guaranteed to encode
+          * after the async wait call.
+          *
+          * This also helps resolve synchronization validation errors, because
+          * the layer no longer sees any semaphore status checks and falsely
+          * believes the caller does not sync.
+          */
+         VkSemaphoreWaitInfo wait_info = {
+            .sType = VK_STRUCTURE_TYPE_SEMAPHORE_WAIT_INFO,
+            .pNext = NULL,
+            .flags = 0,
+            .semaphoreCount = 1,
+            .pSemaphores = &semaphore,
+            .pValues = pValue,
+         };
+
+         vn_async_vkWaitSemaphores(dev->instance, device, &wait_info,
+                                   UINT64_MAX);
+         sem->feedback.signaled_counter = *pValue;
+      }
+      simple_mtx_unlock(&sem->feedback.async_wait_mtx);
+
+      return VK_SUCCESS;
+   } else {
+      return vn_call_vkGetSemaphoreCounterValue(dev->instance, device,
+                                                semaphore, pValue);
+   }
 }
 
 VkResult
@@ -1185,6 +1703,8 @@ vn_SignalSemaphore(VkDevice device, const VkSemaphoreSignalInfo *pSignalInfo)
 {
    VN_TRACE_FUNC();
    struct vn_device *dev = vn_device_from_handle(device);
+   struct vn_semaphore *sem =
+      vn_semaphore_from_handle(pSignalInfo->semaphore);
 
    /* TODO if the semaphore is shared-by-ref, this needs to be synchronous */
    if (false)
@@ -1192,6 +1712,18 @@
    else
       vn_async_vkSignalSemaphore(dev->instance, device, pSignalInfo);
 
+   if (sem->feedback.slot) {
+      simple_mtx_lock(&sem->feedback.async_wait_mtx);
+
+      vn_feedback_set_counter(sem->feedback.slot, pSignalInfo->value);
+      /* Update the cached counter too: since we are the signaler, we are
+       * aligned with the renderer.
+       */
+      sem->feedback.signaled_counter = pSignalInfo->value;
+
+      simple_mtx_unlock(&sem->feedback.async_wait_mtx);
+   }
+
    return VK_SUCCESS;
 }
 
diff --git a/src/virtio/vulkan/vn_queue.h b/src/virtio/vulkan/vn_queue.h
index e34ab98..cdfc46b 100644
--- a/src/virtio/vulkan/vn_queue.h
+++ b/src/virtio/vulkan/vn_queue.h
@@ -88,6 +88,38 @@ struct vn_semaphore {
    struct vn_sync_payload permanent;
    struct vn_sync_payload temporary;
 
+   struct {
+      /* non-NULL if VN_PERF_NO_TIMELINE_SEM_FEEDBACK is disabled */
+      struct vn_feedback_slot *slot;
+
+      /* Lists of allocated vn_feedback_src
+       * The pending_src_list tracks vn_feedback_src slots that have
+       * not been signaled since the last submission cleanup.
+       * The free_src_list tracks vn_feedback_src slots that have
+       * signaled and can be reused.
+       * On submission prepare, used vn_feedback_src are moved from
+       * the free list to the pending list. On submission cleanup,
+       * the vn_feedback_src of any associated semaphores are checked
+       * and moved to the free list if they have signaled.
+       * vn_feedback_src slots are allocated on demand if the
+       * free_src_list is empty.
+       */
+      struct list_head pending_src_list;
+      struct list_head free_src_list;
+
+      /* Lock for accessing the free/pending src lists */
+      simple_mtx_t src_lists_mtx;
+
+      /* Cached counter value to track if an async sem wait call is needed */
+      uint64_t signaled_counter;
+
+      /* Lock for checking if an async sem wait call is needed based on
+       * the current counter value and signaled_counter, to ensure async
+       * wait order across threads.
+       */
+      simple_mtx_t async_wait_mtx;
+   } feedback;
+
    bool is_external;
 
    /* ring_idx of the last queue submission (only used for permanent
-- 
2.7.4