radv: Upload shaders to invisible VRAM on small BAR systems.
author    Tatsuyuki Ishi <ishitatsuyuki@gmail.com>
Fri, 24 Feb 2023 05:56:20 +0000 (14:56 +0900)
committer Marge Bot <emma+marge@anholt.net>
Thu, 16 Mar 2023 18:02:57 +0000 (18:02 +0000)
Following PAL's implementation, this patch avoids allocating shader code
buffers in BAR and instead uses SDMA to upload them directly to invisible
VRAM.

For some games like HZD, shaders can take as much as 400MB, which exceeds
the non-resizable BAR size (256MB) and causes inconsistent spilling
behavior. The kernel will normally move these buffers to invisible VRAM on
its own, but there are a few cases where this does not happen reliably.
This patch does the move explicitly in the driver to ensure predictable
results.

In this patch, shaders are uploaded synchronously, so the shader is ready
as soon as vkCreate*Pipelines returns. A follow-up patch will make the
upload asynchronous and only block once the pipeline is first used.
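
Condensed, the synchronous path added to radv_shader_create is roughly the
following (error handling omitted; memcpy stands in for
radv_shader_binary_upload writing into the staging buffer):

   /* Grab a recycled staging buffer + SDMA command buffer. */
   submission = radv_shader_dma_get_submission(device, shader->bo, shader->va,
                                               shader->code_size);
   /* CPU writes go to host-visible GTT memory, not through the BAR. */
   memcpy(submission->ptr, code, code_size);
   /* NULL = block on the timeline semaphore until the SDMA copy completes. */
   radv_shader_dma_submit(device, submission, NULL);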

As a side effect, when SQTT is used the shaders are now stored in a
cacheable buffer, which should speed up writing the trace to disk.
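
The new path is opt-in; it can be enabled with the dmashaders perftest
flag, e.g.:

   RADV_PERFTEST=dmashaders <game>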

Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/16271>

docs/envvars.rst
src/amd/vulkan/layers/radv_sqtt_layer.c
src/amd/vulkan/radv_constants.h
src/amd/vulkan/radv_debug.h
src/amd/vulkan/radv_device.c
src/amd/vulkan/radv_instance.c
src/amd/vulkan/radv_private.h
src/amd/vulkan/radv_shader.c
src/amd/vulkan/radv_shader.h

index 39a3c55..3f2862d 100644 (file)
@@ -1161,6 +1161,8 @@ RADV driver environment variables
       enable wave32 for compute shaders (GFX10+)
    ``dccmsaa``
       enable DCC for MSAA images
+   ``dmashaders``
+      upload shaders to invisible VRAM (might be useful for non-resizable BAR systems)
    ``emulate_rt``
       forces ray-tracing to be emulated in software on GFX10_3+ and enables
       rt extensions with older hardware.
index 3dc7f44..4b65f85 100644 (file)
@@ -144,6 +144,7 @@ static VkResult
 radv_sqtt_reloc_graphics_shaders(struct radv_device *device,
                                  struct radv_graphics_pipeline *pipeline)
 {
+   struct radv_shader_dma_submission *submission = NULL;
    struct radv_sqtt_shaders_reloc *reloc;
    uint32_t code_size = 0;
 
@@ -170,21 +171,38 @@ radv_sqtt_reloc_graphics_shaders(struct radv_device *device,
    reloc->bo = reloc->alloc->arena->bo;
 
    /* Relocate shader binaries to be contiguous in memory as requested by RGP. */
-   uint64_t slab_va = radv_buffer_get_va(reloc->bo);
-   uint32_t slab_offset = reloc->alloc->offset;
-   char *slab_ptr = reloc->alloc->arena->ptr;
+   uint64_t slab_va = radv_buffer_get_va(reloc->bo) + reloc->alloc->offset;
+   char *slab_ptr = reloc->alloc->arena->ptr + reloc->alloc->offset;
+   uint64_t offset = 0;
+
+   if (device->shader_use_invisible_vram) {
+      submission =
+         radv_shader_dma_get_submission(device, reloc->bo, slab_va, code_size);
+      if (!submission)
+         return VK_ERROR_UNKNOWN;
+   }
 
    for (int i = 0; i < MESA_VULKAN_SHADER_STAGES; ++i) {
       const struct radv_shader *shader = pipeline->base.shaders[i];
+      void *dest_ptr;
       if (!shader)
          continue;
 
-      reloc->va[i] = slab_va + slab_offset;
+      reloc->va[i] = slab_va + offset;
+
+      if (device->shader_use_invisible_vram)
+         dest_ptr = submission->ptr + offset;
+      else
+         dest_ptr = slab_ptr + offset;
 
-      void *dest_ptr = slab_ptr + slab_offset;
       memcpy(dest_ptr, shader->code, shader->code_size);
 
-      slab_offset += align(shader->code_size, RADV_SHADER_ALLOC_ALIGNMENT);
+      offset += align(shader->code_size, RADV_SHADER_ALLOC_ALIGNMENT);
+   }
+
+   if (device->shader_use_invisible_vram) {
+      if (!radv_shader_dma_submit(device, submission, NULL))
+         return VK_ERROR_UNKNOWN;
    }
 
    pipeline->sqtt_shaders_reloc = reloc;
index 2cb211a..946c66c 100644 (file)
 #define PERF_CTR_BO_LOCK_OFFSET  0
 #define PERF_CTR_BO_FENCE_OFFSET 8
 
+/* The maximum number of in-flight uploads (radv_shader_dma_submission) when asynchronous shader
+ * upload is used.
+ */
+#define RADV_SHADER_UPLOAD_CS_COUNT 32
+
 /* NGG GDS counters:
  *   offset  0| 4| 8|12  - reserved for NGG streamout counters
  *   offset 16           - pipeline statistics counter for all streams
index bd0e53c..07d3033 100644 (file)
@@ -87,6 +87,7 @@ enum {
    RADV_PERFTEST_GPL = 1u << 13,
    RADV_PERFTEST_NGG_STREAMOUT = 1u << 14,
    RADV_PERFTEST_VIDEO_DECODE = 1u << 15,
+   RADV_PERFTEST_DMA_SHADERS = 1u << 16,
 };
 
 bool radv_init_trace(struct radv_device *device);
index b392909..8895b89 100644 (file)
@@ -50,6 +50,8 @@
 #include "radv_private.h"
 #include "radv_shader.h"
 #include "vk_util.h"
+#include "vk_common_entrypoints.h"
+#include "vk_semaphore.h"
 #ifdef _WIN32
 typedef void *drmDevicePtr;
 #include <io.h>
@@ -805,7 +807,7 @@ radv_CreateDevice(VkPhysicalDevice physicalDevice, const VkDeviceCreateInfo *pCr
 
       result = device->ws->ctx_create(device->ws, priority, &device->hw_ctx[priority]);
       if (result != VK_SUCCESS)
-         goto fail;
+         goto fail_queue;
    }
 
    for (unsigned i = 0; i < pCreateInfo->queueCreateInfoCount; i++) {
@@ -819,7 +821,7 @@ radv_CreateDevice(VkPhysicalDevice physicalDevice, const VkDeviceCreateInfo *pCr
                   VK_SYSTEM_ALLOCATION_SCOPE_DEVICE);
       if (!device->queues[qfi]) {
          result = VK_ERROR_OUT_OF_HOST_MEMORY;
-         goto fail;
+         goto fail_queue;
       }
 
       memset(device->queues[qfi], 0, queue_create->queueCount * sizeof(struct radv_queue));
@@ -829,11 +831,19 @@ radv_CreateDevice(VkPhysicalDevice physicalDevice, const VkDeviceCreateInfo *pCr
       for (unsigned q = 0; q < queue_create->queueCount; q++) {
          result = radv_queue_init(device, &device->queues[qfi][q], q, queue_create, global_priority);
          if (result != VK_SUCCESS)
-            goto fail;
+            goto fail_queue;
       }
    }
    device->private_sdma_queue = VK_NULL_HANDLE;
 
+   device->shader_use_invisible_vram =
+      (device->instance->perftest_flags & RADV_PERFTEST_DMA_SHADERS) &&
+      /* SDMA buffer copy is only implemented for GFX7+. */
+      device->physical_device->rad_info.gfx_level >= GFX7;
+   result = radv_init_shader_upload_queue(device);
+   if (result != VK_SUCCESS)
+      goto fail;
+
    device->pbb_allowed = device->physical_device->rad_info.gfx_level >= GFX9 &&
                          !(device->instance->debug_flags & RADV_DEBUG_NOBINNING);
 
@@ -1081,6 +1091,9 @@ fail:
    radv_device_finish_ps_epilogs(device);
    radv_device_finish_border_color(device);
 
+   radv_destroy_shader_upload_queue(device);
+
+fail_queue:
    for (unsigned i = 0; i < RADV_MAX_QUEUE_FAMILIES; i++) {
       for (unsigned q = 0; q < device->queue_count[i]; q++)
          radv_queue_finish(&device->queues[i][q]);
@@ -1093,6 +1106,8 @@ fail:
          device->ws->ctx_destroy(device->hw_ctx[i]);
    }
 
+   radv_destroy_shader_arenas(device);
+
    _mesa_hash_table_destroy(device->rt_handles, NULL);
 
    simple_mtx_destroy(&device->pstate_mtx);
@@ -1154,6 +1169,8 @@ radv_DestroyDevice(VkDevice _device, const VkAllocationCallbacks *pAllocator)
    VkPipelineCache pc = radv_pipeline_cache_to_handle(device->mem_cache);
    radv_DestroyPipelineCache(radv_device_to_handle(device), pc, NULL);
 
+   radv_destroy_shader_upload_queue(device);
+
    radv_trap_handler_finish(device);
    radv_finish_trace(device);
 
index c65ff5c..14aa0c9 100644 (file)
@@ -100,6 +100,7 @@ static const struct debug_control radv_perftest_options[] = {
    {"gpl", RADV_PERFTEST_GPL},
    {"ngg_streamout", RADV_PERFTEST_NGG_STREAMOUT},
    {"video_decode", RADV_PERFTEST_VIDEO_DECODE},
+   {"dmashaders", RADV_PERFTEST_DMA_SHADERS},
    {NULL, 0}};
 
 const char *
index bdb9ef7..adb3a1c 100644 (file)
@@ -850,6 +850,18 @@ void radv_queue_finish(struct radv_queue *queue);
 enum radeon_ctx_priority
 radv_get_queue_global_priority(const VkDeviceQueueGlobalPriorityCreateInfoKHR *pObj);
 
+struct radv_shader_dma_submission {
+   struct list_head list;
+
+   struct radeon_cmdbuf *cs;
+   struct radeon_winsys_bo *bo;
+   uint64_t bo_size;
+   char *ptr;
+
+   /* The semaphore value to wait for before reusing this submission. */
+   uint64_t seq;
+};
+
 #define RADV_BORDER_COLOR_COUNT       4096
 #define RADV_BORDER_COLOR_BUFFER_SIZE (sizeof(VkClearColorValue) * RADV_BORDER_COLOR_COUNT)
 
@@ -982,6 +994,17 @@ struct radv_device {
    struct list_head shader_block_obj_pool;
    mtx_t shader_arena_mutex;
 
+   mtx_t shader_upload_hw_ctx_mutex;
+   struct radeon_winsys_ctx *shader_upload_hw_ctx;
+   VkSemaphore shader_upload_sem;
+   uint64_t shader_upload_seq;
+   struct list_head shader_dma_submissions;
+   mtx_t shader_dma_submission_list_mutex;
+   cnd_t shader_dma_submission_list_cond;
+
+   /* Whether to DMA shaders to invisible VRAM or to upload directly through BAR. */
+   bool shader_use_invisible_vram;
+
    /* For detecting VM faults reported by dmesg. */
    uint64_t dmesg_timestamp;
 
index 631fc79..7c3c8b0 100644 (file)
@@ -34,6 +34,7 @@
 #include "util/mesa-sha1.h"
 #include "util/u_atomic.h"
 #include "util/streaming-load-memcpy.h"
+#include "radv_cs.h"
 #include "radv_debug.h"
 #include "radv_meta.h"
 #include "radv_private.h"
@@ -48,6 +49,8 @@
 #include "aco_interface.h"
 #include "sid.h"
 #include "vk_format.h"
+#include "vk_sync.h"
+#include "vk_semaphore.h"
 
 #include "aco_shader_info.h"
 #include "radv_aco_shader_info.h"
@@ -1470,6 +1473,22 @@ free_block_obj(struct radv_device *device, union radv_shader_arena_block *block)
    list_add(&block->pool, &device->shader_block_obj_pool);
 }
 
+VkResult
+radv_shader_wait_for_upload(struct radv_device *device, uint64_t seq)
+{
+   if (!seq)
+      return VK_SUCCESS;
+
+   const VkSemaphoreWaitInfo wait_info = {
+      .sType = VK_STRUCTURE_TYPE_SEMAPHORE_WAIT_INFO,
+      .pSemaphores = &device->shader_upload_sem,
+      .semaphoreCount = 1,
+      .pValues = &seq,
+   };
+   return device->vk.dispatch_table.WaitSemaphores(radv_device_to_handle(device), &wait_info,
+                                                   UINT64_MAX);
+}
+
 /* Segregated fit allocator, implementing a good-fit allocation policy.
  *
  * This is an variation of sequential fit allocation with several lists of free blocks ("holes")
@@ -1545,21 +1564,29 @@ radv_alloc_shader_memory(struct radv_device *device, uint32_t size, void *ptr)
       MAX2(RADV_SHADER_ALLOC_MIN_ARENA_SIZE
               << MIN2(RADV_SHADER_ALLOC_MAX_ARENA_SIZE_SHIFT, device->shader_arena_shift),
            size);
-   VkResult result = device->ws->buffer_create(
-      device->ws, arena_size, RADV_SHADER_ALLOC_ALIGNMENT, RADEON_DOMAIN_VRAM,
-      RADEON_FLAG_NO_INTERPROCESS_SHARING | RADEON_FLAG_32BIT |
+   enum radeon_bo_flag flags = RADEON_FLAG_NO_INTERPROCESS_SHARING | RADEON_FLAG_32BIT;
+   if (device->shader_use_invisible_vram)
+      flags |= RADEON_FLAG_NO_CPU_ACCESS;
+   else
+      flags |=
          (device->physical_device->rad_info.cpdma_prefetch_writes_memory ? 0
-                                                                         : RADEON_FLAG_READ_ONLY),
-      RADV_BO_PRIORITY_SHADER, 0, &arena->bo);
+                                                                         : RADEON_FLAG_READ_ONLY);
+
+   VkResult result;
+   result =
+      device->ws->buffer_create(device->ws, arena_size, RADV_SHADER_ALLOC_ALIGNMENT,
+                                RADEON_DOMAIN_VRAM, flags, RADV_BO_PRIORITY_SHADER, 0, &arena->bo);
    if (result != VK_SUCCESS)
       goto fail;
    radv_rmv_log_bo_allocate(device, arena->bo, arena_size, true);
 
    list_inithead(&arena->entries);
 
-   arena->ptr = (char *)device->ws->buffer_map(arena->bo);
-   if (!arena->ptr)
-      goto fail;
+   if (!(flags & RADEON_FLAG_NO_CPU_ACCESS)) {
+      arena->ptr = (char *)device->ws->buffer_map(arena->bo);
+      if (!arena->ptr)
+         goto fail;
+   }
 
    alloc = alloc_block_obj(device);
    hole = arena_size - size > 0 ? alloc_block_obj(device) : alloc;
@@ -1685,6 +1712,84 @@ radv_destroy_shader_arenas(struct radv_device *device)
    mtx_destroy(&device->shader_arena_mutex);
 }
 
+VkResult
+radv_init_shader_upload_queue(struct radv_device *device)
+{
+   if (!device->shader_use_invisible_vram)
+      return VK_SUCCESS;
+
+   VkDevice vk_device = radv_device_to_handle(device);
+   struct radeon_winsys *ws = device->ws;
+
+   const struct vk_device_dispatch_table *disp = &device->vk.dispatch_table;
+   VkResult result = VK_SUCCESS;
+
+   result = ws->ctx_create(ws, RADEON_CTX_PRIORITY_MEDIUM, &device->shader_upload_hw_ctx);
+   if (result != VK_SUCCESS)
+      return result;
+   mtx_init(&device->shader_upload_hw_ctx_mutex, mtx_plain);
+
+   mtx_init(&device->shader_dma_submission_list_mutex, mtx_plain);
+   cnd_init(&device->shader_dma_submission_list_cond);
+   list_inithead(&device->shader_dma_submissions);
+
+   for (unsigned i = 0; i < RADV_SHADER_UPLOAD_CS_COUNT; i++) {
+      struct radv_shader_dma_submission *submission = calloc(1, sizeof(struct radv_shader_dma_submission));
+      submission->cs = ws->cs_create(ws, AMD_IP_SDMA);
+      if (!submission->cs)
+         return VK_ERROR_OUT_OF_HOST_MEMORY;
+      list_addtail(&submission->list, &device->shader_dma_submissions);
+   }
+
+   const VkSemaphoreTypeCreateInfo sem_type = {
+      .sType = VK_STRUCTURE_TYPE_SEMAPHORE_TYPE_CREATE_INFO,
+      .semaphoreType = VK_SEMAPHORE_TYPE_TIMELINE,
+      .initialValue = 0,
+   };
+   const VkSemaphoreCreateInfo sem_create = {
+      .sType = VK_STRUCTURE_TYPE_SEMAPHORE_CREATE_INFO,
+      .pNext = &sem_type,
+   };
+   result = disp->CreateSemaphore(vk_device, &sem_create, NULL, &device->shader_upload_sem);
+   if (result != VK_SUCCESS)
+      return result;
+
+   return VK_SUCCESS;
+}
+
+void
+radv_destroy_shader_upload_queue(struct radv_device *device)
+{
+   if (!device->shader_use_invisible_vram)
+      return;
+
+   struct vk_device_dispatch_table *disp = &device->vk.dispatch_table;
+   struct radeon_winsys *ws = device->ws;
+
+   /* Upload queue should be idle assuming that pipelines are not leaked */
+   if (device->shader_upload_sem)
+      disp->DestroySemaphore(radv_device_to_handle(device), device->shader_upload_sem, NULL);
+
+   list_for_each_entry_safe(struct radv_shader_dma_submission, submission,
+                            &device->shader_dma_submissions, list)
+   {
+      if (submission->cs)
+         ws->cs_destroy(submission->cs);
+      if (submission->bo)
+         ws->buffer_destroy(ws, submission->bo);
+      list_del(&submission->list);
+      free(submission);
+   }
+
+   cnd_destroy(&device->shader_dma_submission_list_cond);
+   mtx_destroy(&device->shader_dma_submission_list_mutex);
+
+   if (device->shader_upload_hw_ctx) {
+      mtx_destroy(&device->shader_upload_hw_ctx_mutex);
+      ws->ctx_destroy(device->shader_upload_hw_ctx);
+   }
+}
+
 /* For the UMR disassembler. */
 #define DEBUGGER_END_OF_CODE_MARKER 0xbf9f0000 /* invalid instruction */
 #define DEBUGGER_NUM_MARKERS        5
@@ -2036,19 +2141,8 @@ radv_open_rtld_binary(struct radv_device *device, const struct radv_shader *shad
 
 static bool
 radv_shader_binary_upload(struct radv_device *device, const struct radv_shader_binary *binary,
-                          struct radv_shader *shader)
+                          struct radv_shader *shader, void *dest_ptr)
 {
-   void *dest_ptr;
-
-   shader->alloc = radv_alloc_shader_memory(device, shader->code_size, shader);
-   if (!shader->alloc)
-      return false;
-
-   shader->bo = shader->alloc->arena->bo;
-   shader->va = radv_buffer_get_va(shader->bo) + shader->alloc->offset;
-
-   dest_ptr = shader->alloc->arena->ptr + shader->alloc->offset;
-
    if (device->thread_trace.bo) {
       shader->code = calloc(shader->code_size, 1);
       if (!shader->code) {
@@ -2106,6 +2200,153 @@ radv_shader_binary_upload(struct radv_device *device, const struct radv_shader_b
    return true;
 }
 
+static VkResult
+radv_shader_dma_resize_upload_buf(struct radv_shader_dma_submission *submission,
+                                  struct radeon_winsys *ws, uint64_t size)
+{
+   if (submission->bo)
+      ws->buffer_destroy(ws, submission->bo);
+
+   VkResult result =
+      ws->buffer_create(ws, size, RADV_SHADER_ALLOC_ALIGNMENT, RADEON_DOMAIN_GTT,
+                        RADEON_FLAG_CPU_ACCESS | RADEON_FLAG_NO_INTERPROCESS_SHARING |
+                           RADEON_FLAG_32BIT | RADEON_FLAG_GTT_WC,
+                        RADV_BO_PRIORITY_UPLOAD_BUFFER, 0, &submission->bo);
+   if (result != VK_SUCCESS)
+      return result;
+
+   submission->ptr = ws->buffer_map(submission->bo);
+   submission->bo_size = size;
+
+   return VK_SUCCESS;
+}
+
+struct radv_shader_dma_submission *
+radv_shader_dma_pop_submission(struct radv_device *device)
+{
+   struct radv_shader_dma_submission *submission;
+
+   mtx_lock(&device->shader_dma_submission_list_mutex);
+
+   while (list_is_empty(&device->shader_dma_submissions))
+      cnd_wait(&device->shader_dma_submission_list_cond, &device->shader_dma_submission_list_mutex);
+
+   submission =
+      list_first_entry(&device->shader_dma_submissions, struct radv_shader_dma_submission, list);
+   list_del(&submission->list);
+
+   mtx_unlock(&device->shader_dma_submission_list_mutex);
+
+   return submission;
+}
+
+void
+radv_shader_dma_push_submission(struct radv_device *device,
+                                struct radv_shader_dma_submission *submission, uint64_t seq)
+{
+   submission->seq = seq;
+
+   mtx_lock(&device->shader_dma_submission_list_mutex);
+
+   list_addtail(&submission->list, &device->shader_dma_submissions);
+   cnd_signal(&device->shader_dma_submission_list_cond);
+
+   mtx_unlock(&device->shader_dma_submission_list_mutex);
+}
+
+struct radv_shader_dma_submission *
+radv_shader_dma_get_submission(struct radv_device *device, struct radeon_winsys_bo *bo, uint64_t va,
+                               uint64_t size)
+{
+   struct radv_shader_dma_submission *submission = radv_shader_dma_pop_submission(device);
+   struct radeon_cmdbuf *cs = submission->cs;
+   struct radeon_winsys *ws = device->ws;
+   VkResult result;
+
+   /* Wait for potentially in-flight submission to settle */
+   result = radv_shader_wait_for_upload(device, submission->seq);
+   if (result != VK_SUCCESS)
+      goto fail;
+
+   ws->cs_reset(cs);
+
+   if (submission->bo_size < size) {
+      result = radv_shader_dma_resize_upload_buf(submission, ws, size);
+      if (result != VK_SUCCESS)
+         goto fail;
+   }
+
+   radv_sdma_copy_buffer(device, cs, radv_buffer_get_va(submission->bo), va, size);
+   radv_cs_add_buffer(ws, cs, submission->bo);
+   radv_cs_add_buffer(ws, cs, bo);
+
+   result = ws->cs_finalize(cs);
+   if (result != VK_SUCCESS)
+      goto fail;
+
+   return submission;
+
+fail:
+   radv_shader_dma_push_submission(device, submission, 0);
+
+   return NULL;
+}
+
+/*
+ * If upload_seq_out is NULL, this function blocks until the DMA is complete. Otherwise, the
+ * semaphore value to wait on device->shader_upload_sem is stored in *upload_seq_out.
+ */
+bool
+radv_shader_dma_submit(struct radv_device *device, struct radv_shader_dma_submission *submission,
+                       uint64_t *upload_seq_out)
+{
+   struct radeon_cmdbuf *cs = submission->cs;
+   struct radeon_winsys *ws = device->ws;
+   VkResult result;
+
+   mtx_lock(&device->shader_upload_hw_ctx_mutex);
+
+   uint64_t upload_seq = device->shader_upload_seq + 1;
+
+   struct vk_semaphore *semaphore = vk_semaphore_from_handle(device->shader_upload_sem);
+   struct vk_sync *sync = vk_semaphore_get_active_sync(semaphore);
+   const struct vk_sync_signal signal_info = {
+      .sync = sync,
+      .signal_value = upload_seq,
+      .stage_mask = VK_PIPELINE_STAGE_2_ALL_COMMANDS_BIT,
+   };
+
+   struct radv_winsys_submit_info submit = {
+      .ip_type = AMD_IP_SDMA,
+      .queue_index = 0,
+      .cs_array = &cs,
+      .cs_count = 1,
+   };
+
+   result = ws->cs_submit(device->shader_upload_hw_ctx, &submit, 0, NULL, 1, &signal_info, false);
+   if (result != VK_SUCCESS)
+   {
+      mtx_unlock(&device->shader_upload_hw_ctx_mutex);
+      radv_shader_dma_push_submission(device, submission, 0);
+      return false;
+   }
+   device->shader_upload_seq = upload_seq;
+   mtx_unlock(&device->shader_upload_hw_ctx_mutex);
+
+   radv_shader_dma_push_submission(device, submission, upload_seq);
+
+   if (upload_seq_out) {
+      *upload_seq_out = upload_seq;
+   } else {
+      result = radv_shader_wait_for_upload(device, upload_seq);
+      if (result != VK_SUCCESS)
+         return false;
+   }
+
+   return true;
+}
+
+
 struct radv_shader *
 radv_shader_create(struct radv_device *device, const struct radv_shader_binary *binary,
                    bool keep_shader_info, bool from_cache, const struct radv_shader_args *args)
@@ -2215,9 +2456,33 @@ radv_shader_create(struct radv_device *device, const struct radv_shader_binary *
       }
    }
 
-   if (!radv_shader_binary_upload(device, binary, shader))
+   shader->alloc = radv_alloc_shader_memory(device, shader->code_size, shader);
+   if (!shader->alloc)
       return NULL;
 
+   shader->bo = shader->alloc->arena->bo;
+   shader->va = radv_buffer_get_va(shader->bo) + shader->alloc->offset;
+
+   if (device->shader_use_invisible_vram) {
+      struct radv_shader_dma_submission *submission =
+         radv_shader_dma_get_submission(device, shader->bo, shader->va, shader->code_size);
+      if (!submission)
+         return NULL;
+
+      if (!radv_shader_binary_upload(device, binary, shader, submission->ptr)) {
+         radv_shader_dma_push_submission(device, submission, 0);
+         return NULL;
+      }
+
+      if (!radv_shader_dma_submit(device, submission, NULL))
+         return NULL;
+   } else {
+      void *dest_ptr = shader->alloc->arena->ptr + shader->alloc->offset;
+
+      if (!radv_shader_binary_upload(device, binary, shader, dest_ptr))
+         return NULL;
+   }
+
    return shader;
 }
 
@@ -2243,15 +2508,38 @@ radv_shader_part_create(struct radv_shader_part_binary *binary, unsigned wave_si
    return shader_part;
 }
 
-void
-radv_shader_part_binary_upload(const struct radv_shader_part_binary *binary, void *dest_ptr)
+bool
+radv_shader_part_binary_upload(struct radv_device *device, struct radv_shader_part *shader_part)
 {
-   memcpy(dest_ptr, binary->data, binary->code_size);
+   const struct radv_shader_part_binary *bin = shader_part->binary;
+   uint32_t code_size = radv_get_shader_binary_size(bin->code_size);
+   struct radv_shader_dma_submission *submission = NULL;
+   void *dest_ptr;
 
+   if (device->shader_use_invisible_vram) {
+      uint64_t va = radv_buffer_get_va(shader_part->alloc->arena->bo) + shader_part->alloc->offset;
+      submission =
+         radv_shader_dma_get_submission(device, shader_part->alloc->arena->bo, va, code_size);
+      if (!submission)
+         return false;
+
+      dest_ptr = submission->ptr;
+   } else {
+      dest_ptr = shader_part->alloc->arena->ptr + shader_part->alloc->offset;
+   }
+
+   memcpy(dest_ptr, bin->data, bin->code_size);
    /* Add end-of-code markers for the UMR disassembler. */
-   uint32_t *ptr32 = (uint32_t *)dest_ptr + binary->code_size / 4;
+   uint32_t *ptr32 = (uint32_t *)dest_ptr + code_size / 4;
    for (unsigned i = 0; i < DEBUGGER_NUM_MARKERS; i++)
       ptr32[i] = DEBUGGER_END_OF_CODE_MARKER;
+
+   if (device->shader_use_invisible_vram) {
+      if (!radv_shader_dma_submit(device, submission, NULL))
+         return false;
+   }
+
+   return true;
 }
 
 static char *
@@ -2633,8 +2921,8 @@ radv_create_vs_prolog(struct radv_device *device, const struct radv_vs_prolog_ke
    prolog->bo = prolog->alloc->arena->bo;
    prolog->va = radv_buffer_get_va(prolog->bo) + prolog->alloc->offset;
 
-   void *dest_ptr = prolog->alloc->arena->ptr + prolog->alloc->offset;
-   radv_shader_part_binary_upload(binary, dest_ptr);
+   if (!radv_shader_part_binary_upload(device, prolog))
+      goto fail_alloc;
 
    if (options.dump_shader) {
       fprintf(stderr, "Vertex prolog");
@@ -2698,8 +2986,8 @@ radv_create_ps_epilog(struct radv_device *device, const struct radv_ps_epilog_ke
    epilog->bo = epilog->alloc->arena->bo;
    epilog->va = radv_buffer_get_va(epilog->bo) + epilog->alloc->offset;
 
-   void *dest_ptr = epilog->alloc->arena->ptr + epilog->alloc->offset;
-   radv_shader_part_binary_upload(binary, dest_ptr);
+   if (!radv_shader_part_binary_upload(device, epilog))
+      goto fail_alloc;
 
    if (options.dump_shader) {
       fprintf(stderr, "Fragment epilog");
index fa2805a..9b0b91f 100644 (file)
@@ -563,6 +563,8 @@ bool radv_nir_lower_vs_inputs(nir_shader *shader, const struct radv_pipeline_sta
 
 void radv_init_shader_arenas(struct radv_device *device);
 void radv_destroy_shader_arenas(struct radv_device *device);
+VkResult radv_init_shader_upload_queue(struct radv_device *device);
+void radv_destroy_shader_upload_queue(struct radv_device *device);
 
 struct radv_shader_args;
 
@@ -575,7 +577,25 @@ struct radv_shader *radv_shader_nir_to_asm(
    int shader_count, const struct radv_pipeline_key *key, bool keep_shader_info, bool keep_statistic_info,
    struct radv_shader_binary **binary_out);
 
-void radv_shader_part_binary_upload(const struct radv_shader_part_binary *binary, void *dest_ptr);
+VkResult radv_shader_wait_for_upload(struct radv_device *device, uint64_t seq);
+
+bool radv_shader_part_binary_upload(struct radv_device *device,
+                                    struct radv_shader_part *shader_part);
+
+struct radv_shader_dma_submission *
+radv_shader_dma_pop_submission(struct radv_device *device);
+
+void radv_shader_dma_push_submission(struct radv_device *device,
+                                     struct radv_shader_dma_submission *submission,
+                                     uint64_t seq);
+
+struct radv_shader_dma_submission *radv_shader_dma_get_submission(struct radv_device *device,
+                                                                  struct radeon_winsys_bo *bo,
+                                                                  uint64_t va, uint64_t size);
+
+bool radv_shader_dma_submit(struct radv_device *device,
+                            struct radv_shader_dma_submission *submission,
+                            uint64_t *upload_seq_out);
 
 union radv_shader_arena_block *radv_alloc_shader_memory(struct radv_device *device, uint32_t size,
                                                         void *ptr);