radv: Upload shaders to invisible VRAM on small BAR systems.
author    Tatsuyuki Ishi <ishitatsuyuki@gmail.com>
Fri, 24 Feb 2023 05:56:20 +0000 (14:56 +0900)
committer Marge Bot <emma+marge@anholt.net>
Thu, 16 Mar 2023 18:02:57 +0000 (18:02 +0000)
Following PAL's implementation, this patch avoids allocating shader code
buffers in BAR and instead uses SDMA to upload them directly to invisible
VRAM.

For some games like HZD, shaders can take as much as 400MB, which exceeds
the non-resizable BAR size (256MB) and causes inconsistent spilling
behavior. The kernel will normally move these buffers to invisible VRAM on
its own, but there are a few cases where this does not happen reliably.
This patch does the move explicitly in the driver to ensure predictable
results.

In this patch, shaders are uploaded synchronously, so the shader is ready
as soon as vkCreate*Pipelines returns. A follow-up patch will make the
upload asynchronous and only block once the pipeline is first used.
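
Condensed, the synchronous path added to radv_shader_create is roughly the
following (error handling omitted; memcpy stands in for
radv_shader_binary_upload writing into the staging buffer):

   /* Grab a recycled staging buffer + SDMA command buffer. */
   submission = radv_shader_dma_get_submission(device, shader->bo, shader->va,
                                               shader->code_size);
   /* CPU writes go to host-visible GTT memory, not through the BAR. */
   memcpy(submission->ptr, code, code_size);
   /* NULL = block on the timeline semaphore until the SDMA copy completes. */
   radv_shader_dma_submit(device, submission, NULL);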

As a side effect, when SQTT is used the shaders are now stored in a
cacheable buffer, which should speed up writing the trace to disk.
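
The new path is opt-in; it can be enabled with the dmashaders perftest
flag, e.g.:

   RADV_PERFTEST=dmashaders <game>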

Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/16271>

docs/envvars.rst
src/amd/vulkan/layers/radv_sqtt_layer.c
src/amd/vulkan/radv_constants.h
src/amd/vulkan/radv_debug.h
src/amd/vulkan/radv_device.c
src/amd/vulkan/radv_instance.c
src/amd/vulkan/radv_private.h
src/amd/vulkan/radv_shader.c
src/amd/vulkan/radv_shader.h

index 39a3c55..3f2862d 100644 (file)
@@ -1161,6 +1161,8 @@ RADV driver environment variables
       enable wave32 for compute shaders (GFX10+)
    ``dccmsaa``
       enable DCC for MSAA images
+   ``dmashaders``
+      upload shaders to invisible VRAM (might be useful for non-resizable BAR systems)
    ``emulate_rt``
       forces ray-tracing to be emulated in software on GFX10_3+ and enables
       rt extensions with older hardware.
index 3dc7f44..4b65f85 100644 (file)
@@ -144,6 +144,7 @@ static VkResult
 radv_sqtt_reloc_graphics_shaders(struct radv_device *device,
                                  struct radv_graphics_pipeline *pipeline)
 {
+   struct radv_shader_dma_submission *submission = NULL;
    struct radv_sqtt_shaders_reloc *reloc;
    uint32_t code_size = 0;
 
@@ -170,21 +171,38 @@ radv_sqtt_reloc_graphics_shaders(struct radv_device *device,
    reloc->bo = reloc->alloc->arena->bo;
 
    /* Relocate shader binaries to be contiguous in memory as requested by RGP. */
-   uint64_t slab_va = radv_buffer_get_va(reloc->bo);
-   uint32_t slab_offset = reloc->alloc->offset;
-   char *slab_ptr = reloc->alloc->arena->ptr;
+   uint64_t slab_va = radv_buffer_get_va(reloc->bo) + reloc->alloc->offset;
+   char *slab_ptr = reloc->alloc->arena->ptr + reloc->alloc->offset;
+   uint64_t offset = 0;
+
+   if (device->shader_use_invisible_vram) {
+      submission =
+         radv_shader_dma_get_submission(device, reloc->bo, slab_va, code_size);
+      if (!submission)
+         return VK_ERROR_UNKNOWN;
+   }
 
    for (int i = 0; i < MESA_VULKAN_SHADER_STAGES; ++i) {
       const struct radv_shader *shader = pipeline->base.shaders[i];
+      void *dest_ptr;
       if (!shader)
          continue;
 
-      reloc->va[i] = slab_va + slab_offset;
+      reloc->va[i] = slab_va + offset;
+
+      if (device->shader_use_invisible_vram)
+         dest_ptr = submission->ptr + offset;
+      else
+         dest_ptr = slab_ptr + offset;
 
-      void *dest_ptr = slab_ptr + slab_offset;
       memcpy(dest_ptr, shader->code, shader->code_size);
 
-      slab_offset += align(shader->code_size, RADV_SHADER_ALLOC_ALIGNMENT);
+      offset += align(shader->code_size, RADV_SHADER_ALLOC_ALIGNMENT);
+   }
+
+   if (device->shader_use_invisible_vram) {
+      if (!radv_shader_dma_submit(device, submission, NULL))
+         return VK_ERROR_UNKNOWN;
    }
 
    pipeline->sqtt_shaders_reloc = reloc;
index 2cb211a..946c66c 100644 (file)
 #define PERF_CTR_BO_LOCK_OFFSET  0
 #define PERF_CTR_BO_FENCE_OFFSET 8
 
+/* The maximum number of in-flight uploads (radv_shader_dma_submission) when asynchronous shader
+ * upload is used.
+ */
+#define RADV_SHADER_UPLOAD_CS_COUNT 32
+
 /* NGG GDS counters:
  *   offset  0| 4| 8|12  - reserved for NGG streamout counters
  *   offset 16           - pipeline statistics counter for all streams
index bd0e53c..07d3033 100644 (file)
@@ -87,6 +87,7 @@ enum {
    RADV_PERFTEST_GPL = 1u << 13,
    RADV_PERFTEST_NGG_STREAMOUT = 1u << 14,
    RADV_PERFTEST_VIDEO_DECODE = 1u << 15,
+   RADV_PERFTEST_DMA_SHADERS = 1u << 16,
 };
 
 bool radv_init_trace(struct radv_device *device);
index b392909..8895b89 100644 (file)
@@ -50,6 +50,8 @@
 #include "radv_private.h"
 #include "radv_shader.h"
 #include "vk_util.h"
+#include "vk_common_entrypoints.h"
+#include "vk_semaphore.h"
 #ifdef _WIN32
 typedef void *drmDevicePtr;
 #include <io.h>
@@ -805,7 +807,7 @@ radv_CreateDevice(VkPhysicalDevice physicalDevice, const VkDeviceCreateInfo *pCr
 
       result = device->ws->ctx_create(device->ws, priority, &device->hw_ctx[priority]);
       if (result != VK_SUCCESS)
-         goto fail;
+         goto fail_queue;
    }
 
    for (unsigned i = 0; i < pCreateInfo->queueCreateInfoCount; i++) {
@@ -819,7 +821,7 @@ radv_CreateDevice(VkPhysicalDevice physicalDevice, const VkDeviceCreateInfo *pCr
                   VK_SYSTEM_ALLOCATION_SCOPE_DEVICE);
       if (!device->queues[qfi]) {
          result = VK_ERROR_OUT_OF_HOST_MEMORY;
-         goto fail;
+         goto fail_queue;
       }
 
       memset(device->queues[qfi], 0, queue_create->queueCount * sizeof(struct radv_queue));
@@ -829,11 +831,19 @@ radv_CreateDevice(VkPhysicalDevice physicalDevice, const VkDeviceCreateInfo *pCr
       for (unsigned q = 0; q < queue_create->queueCount; q++) {
          result = radv_queue_init(device, &device->queues[qfi][q], q, queue_create, global_priority);
          if (result != VK_SUCCESS)
-            goto fail;
+            goto fail_queue;
       }
    }
    device->private_sdma_queue = VK_NULL_HANDLE;
 
+   device->shader_use_invisible_vram =
+      (device->instance->perftest_flags & RADV_PERFTEST_DMA_SHADERS) &&
+      /* SDMA buffer copy is only implemented for GFX7+. */
+      device->physical_device->rad_info.gfx_level >= GFX7;
+   result = radv_init_shader_upload_queue(device);
+   if (result != VK_SUCCESS)
+      goto fail;
+
    device->pbb_allowed = device->physical_device->rad_info.gfx_level >= GFX9 &&
                          !(device->instance->debug_flags & RADV_DEBUG_NOBINNING);
 
@@ -1081,6 +1091,9 @@ fail:
    radv_device_finish_ps_epilogs(device);
    radv_device_finish_border_color(device);
 
+   radv_destroy_shader_upload_queue(device);
+
+fail_queue:
    for (unsigned i = 0; i < RADV_MAX_QUEUE_FAMILIES; i++) {
       for (unsigned q = 0; q < device->queue_count[i]; q++)
          radv_queue_finish(&device->queues[i][q]);
@@ -1093,6 +1106,8 @@ fail:
          device->ws->ctx_destroy(device->hw_ctx[i]);
    }
 
+   radv_destroy_shader_arenas(device);
+
    _mesa_hash_table_destroy(device->rt_handles, NULL);
 
    simple_mtx_destroy(&device->pstate_mtx);
@@ -1154,6 +1169,8 @@ radv_DestroyDevice(VkDevice _device, const VkAllocationCallbacks *pAllocator)
    VkPipelineCache pc = radv_pipeline_cache_to_handle(device->mem_cache);
    radv_DestroyPipelineCache(radv_device_to_handle(device), pc, NULL);
 
+   radv_destroy_shader_upload_queue(device);
+
    radv_trap_handler_finish(device);
    radv_finish_trace(device);
 
index c65ff5c..14aa0c9 100644 (file)
@@ -100,6 +100,7 @@ static const struct debug_control radv_perftest_options[] = {
    {"gpl", RADV_PERFTEST_GPL},
    {"ngg_streamout", RADV_PERFTEST_NGG_STREAMOUT},
    {"video_decode", RADV_PERFTEST_VIDEO_DECODE},
+   {"dmashaders", RADV_PERFTEST_DMA_SHADERS},
    {NULL, 0}};
 
 const char *
index bdb9ef7..adb3a1c 100644 (file)
@@ -850,6 +850,18 @@ void radv_queue_finish(struct radv_queue *queue);
 enum radeon_ctx_priority
 radv_get_queue_global_priority(const VkDeviceQueueGlobalPriorityCreateInfoKHR *pObj);
 
+struct radv_shader_dma_submission {
+   struct list_head list;
+
+   struct radeon_cmdbuf *cs;
+   struct radeon_winsys_bo *bo;
+   uint64_t bo_size;
+   char *ptr;
+
+   /* The semaphore value to wait for before reusing this submission. */
+   uint64_t seq;
+};
+
 #define RADV_BORDER_COLOR_COUNT       4096
 #define RADV_BORDER_COLOR_BUFFER_SIZE (sizeof(VkClearColorValue) * RADV_BORDER_COLOR_COUNT)
 
@@ -982,6 +994,17 @@ struct radv_device {
    struct list_head shader_block_obj_pool;
    mtx_t shader_arena_mutex;
 
+   mtx_t shader_upload_hw_ctx_mutex;
+   struct radeon_winsys_ctx *shader_upload_hw_ctx;
+   VkSemaphore shader_upload_sem;
+   uint64_t shader_upload_seq;
+   struct list_head shader_dma_submissions;
+   mtx_t shader_dma_submission_list_mutex;
+   cnd_t shader_dma_submission_list_cond;
+
+   /* Whether to DMA shaders to invisible VRAM or to upload directly through BAR. */
+   bool shader_use_invisible_vram;
+
    /* For detecting VM faults reported by dmesg. */
    uint64_t dmesg_timestamp;
 
index 631fc79..7c3c8b0 100644 (file)
@@ -34,6 +34,7 @@
 #include "util/mesa-sha1.h"
 #include "util/u_atomic.h"
 #include "util/streaming-load-memcpy.h"
+#include "radv_cs.h"
 #include "radv_debug.h"
 #include "radv_meta.h"
 #include "radv_private.h"
@@ -48,6 +49,8 @@
 #include "aco_interface.h"
 #include "sid.h"
 #include "vk_format.h"
+#include "vk_sync.h"
+#include "vk_semaphore.h"
 
 #include "aco_shader_info.h"
 #include "radv_aco_shader_info.h"
@@ -1470,6 +1473,22 @@ free_block_obj(struct radv_device *device, union radv_shader_arena_block *block)
    list_add(&block->pool, &device->shader_block_obj_pool);
 }
 
+VkResult
+radv_shader_wait_for_upload(struct radv_device *device, uint64_t seq)
+{
+   if (!seq)
+      return VK_SUCCESS;
+
+   const VkSemaphoreWaitInfo wait_info = {
+      .sType = VK_STRUCTURE_TYPE_SEMAPHORE_WAIT_INFO,
+      .pSemaphores = &device->shader_upload_sem,
+      .semaphoreCount = 1,
+      .pValues = &seq,
+   };
+   return device->vk.dispatch_table.WaitSemaphores(radv_device_to_handle(device), &wait_info,
+                                                   UINT64_MAX);
+}
+
 /* Segregated fit allocator, implementing a good-fit allocation policy.
  *
  * This is an variation of sequential fit allocation with several lists of free blocks ("holes")
@@ -1545,21 +1564,29 @@ radv_alloc_shader_memory(struct radv_device *device, uint32_t size, void *ptr)
       MAX2(RADV_SHADER_ALLOC_MIN_ARENA_SIZE
               << MIN2(RADV_SHADER_ALLOC_MAX_ARENA_SIZE_SHIFT, device->shader_arena_shift),
            size);
-   VkResult result = device->ws->buffer_create(
-      device->ws, arena_size, RADV_SHADER_ALLOC_ALIGNMENT, RADEON_DOMAIN_VRAM,
-      RADEON_FLAG_NO_INTERPROCESS_SHARING | RADEON_FLAG_32BIT |
+   enum radeon_bo_flag flags = RADEON_FLAG_NO_INTERPROCESS_SHARING | RADEON_FLAG_32BIT;
+   if (device->shader_use_invisible_vram)
+      flags |= RADEON_FLAG_NO_CPU_ACCESS;
+   else
+      flags |=
          (device->physical_device->rad_info.cpdma_prefetch_writes_memory ? 0
-                                                                         : RADEON_FLAG_READ_ONLY),
-      RADV_BO_PRIORITY_SHADER, 0, &arena->bo);
+                                                                         : RADEON_FLAG_READ_ONLY);
+
+   VkResult result;
+   result =
+      device->ws->buffer_create(device->ws, arena_size, RADV_SHADER_ALLOC_ALIGNMENT,
+                                RADEON_DOMAIN_VRAM, flags, RADV_BO_PRIORITY_SHADER, 0, &arena->bo);
    if (result != VK_SUCCESS)
       goto fail;
    radv_rmv_log_bo_allocate(device, arena->bo, arena_size, true);
 
    list_inithead(&arena->entries);
 
-   arena->ptr = (char *)device->ws->buffer_map(arena->bo);
-   if (!arena->ptr)
-      goto fail;
+   if (!(flags & RADEON_FLAG_NO_CPU_ACCESS)) {
+      arena->ptr = (char *)device->ws->buffer_map(arena->bo);
+      if (!arena->ptr)
+         goto fail;
+   }
 
    alloc = alloc_block_obj(device);
    hole = arena_size - size > 0 ? alloc_block_obj(device) : alloc;
@@ -1685,6 +1712,84 @@ radv_destroy_shader_arenas(struct radv_device *device)
    mtx_destroy(&device->shader_arena_mutex);
 }
 
+VkResult
+radv_init_shader_upload_queue(struct radv_device *device)
+{
+   if (!device->shader_use_invisible_vram)
+      return VK_SUCCESS;
+
+   VkDevice vk_device = radv_device_to_handle(device);
+   struct radeon_winsys *ws = device->ws;
+
+   const struct vk_device_dispatch_table *disp = &device->vk.dispatch_table;
+   VkResult result = VK_SUCCESS;
+
+   result = ws->ctx_create(ws, RADEON_CTX_PRIORITY_MEDIUM, &device->shader_upload_hw_ctx);
+   if (result != VK_SUCCESS)
+      return result;
+   mtx_init(&device->shader_upload_hw_ctx_mutex, mtx_plain);
+
+   mtx_init(&device->shader_dma_submission_list_mutex, mtx_plain);
+   cnd_init(&device->shader_dma_submission_list_cond);
+   list_inithead(&device->shader_dma_submissions);
+
+   for (unsigned i = 0; i < RADV_SHADER_UPLOAD_CS_COUNT; i++) {
+      struct radv_shader_dma_submission *submission = calloc(1, sizeof(struct radv_shader_dma_submission));
+      submission->cs = ws->cs_create(ws, AMD_IP_SDMA);
+      if (!submission->cs)
+         return VK_ERROR_OUT_OF_HOST_MEMORY;
+      list_addtail(&submission->list, &device->shader_dma_submissions);
+   }
+
+   const VkSemaphoreTypeCreateInfo sem_type = {
+      .sType = VK_STRUCTURE_TYPE_SEMAPHORE_TYPE_CREATE_INFO,
+      .semaphoreType = VK_SEMAPHORE_TYPE_TIMELINE,
+      .initialValue = 0,
+   };
+   const VkSemaphoreCreateInfo sem_create = {
+      .sType = VK_STRUCTURE_TYPE_SEMAPHORE_CREATE_INFO,
+      .pNext = &sem_type,
+   };
+   result = disp->CreateSemaphore(vk_device, &sem_create, NULL, &device->shader_upload_sem);
+   if (result != VK_SUCCESS)
+      return result;
+
+   return VK_SUCCESS;
+}
+
+void
+radv_destroy_shader_upload_queue(struct radv_device *device)
+{
+   if (!device->shader_use_invisible_vram)
+      return;
+
+   struct vk_device_dispatch_table *disp = &device->vk.dispatch_table;
+   struct radeon_winsys *ws = device->ws;
+
+   /* Upload queue should be idle assuming that pipelines are not leaked */
+   if (device->shader_upload_sem)
+      disp->DestroySemaphore(radv_device_to_handle(device), device->shader_upload_sem, NULL);
+
+   list_for_each_entry_safe(struct radv_shader_dma_submission, submission,
+                            &device->shader_dma_submissions, list)
+   {
+      if (submission->cs)
+         ws->cs_destroy(submission->cs);
+      if (submission->bo)
+         ws->buffer_destroy(ws, submission->bo);
+      list_del(&submission->list);
+      free(submission);
+   }
+
+   cnd_destroy(&device->shader_dma_submission_list_cond);
+   mtx_destroy(&device->shader_dma_submission_list_mutex);
+
+   if (device->shader_upload_hw_ctx) {
+      mtx_destroy(&device->shader_upload_hw_ctx_mutex);
+      ws->ctx_destroy(device->shader_upload_hw_ctx);
+   }
+}
+
 /* For the UMR disassembler. */
 #define DEBUGGER_END_OF_CODE_MARKER 0xbf9f0000 /* invalid instruction */
 #define DEBUGGER_NUM_MARKERS        5
@@ -2036,19 +2141,8 @@ radv_open_rtld_binary(struct radv_device *device, const struct radv_shader *shad
 
 static bool
 radv_shader_binary_upload(struct radv_device *device, const struct radv_shader_binary *binary,
-                          struct radv_shader *shader)
+                          struct radv_shader *shader, void *dest_ptr)
 {
-   void *dest_ptr;
-
-   shader->alloc = radv_alloc_shader_memory(device, shader->code_size, shader);
-   if (!shader->alloc)
-      return false;
-
-   shader->bo = shader->alloc->arena->bo;
-   shader->va = radv_buffer_get_va(shader->bo) + shader->alloc->offset;
-
-   dest_ptr = shader->alloc->arena->ptr + shader->alloc->offset;
-
    if (device->thread_trace.bo) {
       shader->code = calloc(shader->code_size, 1);
       if (!shader->code) {
@@ -2106,6 +2200,153 @@ radv_shader_binary_upload(struct radv_device *device, const struct radv_shader_b
    return true;
 }
 
+static VkResult
+radv_shader_dma_resize_upload_buf(struct radv_shader_dma_submission *submission,
+                                  struct radeon_winsys *ws, uint64_t size)
+{
+   if (submission->bo)
+      ws->buffer_destroy(ws, submission->bo);
+
+   VkResult result =
+      ws->buffer_create(ws, size, RADV_SHADER_ALLOC_ALIGNMENT, RADEON_DOMAIN_GTT,
+                        RADEON_FLAG_CPU_ACCESS | RADEON_FLAG_NO_INTERPROCESS_SHARING |
+                           RADEON_FLAG_32BIT | RADEON_FLAG_GTT_WC,
+                        RADV_BO_PRIORITY_UPLOAD_BUFFER, 0, &submission->bo);
+   if (result != VK_SUCCESS)
+      return result;
+
+   submission->ptr = ws->buffer_map(submission->bo);
+   submission->bo_size = size;
+
+   return VK_SUCCESS;
+}
+
+struct radv_shader_dma_submission *
+radv_shader_dma_pop_submission(struct radv_device *device)
+{
+   struct radv_shader_dma_submission *submission;
+
+   mtx_lock(&device->shader_dma_submission_list_mutex);
+
+   while (list_is_empty(&device->shader_dma_submissions))
+      cnd_wait(&device->shader_dma_submission_list_cond, &device->shader_dma_submission_list_mutex);
+
+   submission =
+      list_first_entry(&device->shader_dma_submissions, struct radv_shader_dma_submission, list);
+   list_del(&submission->list);
+
+   mtx_unlock(&device->shader_dma_submission_list_mutex);
+
+   return submission;
+}
+
+void
+radv_shader_dma_push_submission(struct radv_device *device,
+                                struct radv_shader_dma_submission *submission, uint64_t seq)
+{
+   submission->seq = seq;
+
+   mtx_lock(&device->shader_dma_submission_list_mutex);
+
+   list_addtail(&submission->list, &device->shader_dma_submissions);
+   cnd_signal(&device->shader_dma_submission_list_cond);
+
+   mtx_unlock(&device->shader_dma_submission_list_mutex);
+}
+
+struct radv_shader_dma_submission *
+radv_shader_dma_get_submission(struct radv_device *device, struct radeon_winsys_bo *bo, uint64_t va,
+                               uint64_t size)
+{
+   struct radv_shader_dma_submission *submission = radv_shader_dma_pop_submission(device);
+   struct radeon_cmdbuf *cs = submission->cs;
+   struct radeon_winsys *ws = device->ws;
+   VkResult result;
+
+   /* Wait for potentially in-flight submission to settle */
+   result = radv_shader_wait_for_upload(device, submission->seq);
+   if (result != VK_SUCCESS)
+      goto fail;
+
+   ws->cs_reset(cs);
+
+   if (submission->bo_size < size) {
+      result = radv_shader_dma_resize_upload_buf(submission, ws, size);
+      if (result != VK_SUCCESS)
+         goto fail;
+   }
+
+   radv_sdma_copy_buffer(device, cs, radv_buffer_get_va(submission->bo), va, size);
+   radv_cs_add_buffer(ws, cs, submission->bo);
+   radv_cs_add_buffer(ws, cs, bo);
+
+   result = ws->cs_finalize(cs);
+   if (result != VK_SUCCESS)
+      goto fail;
+
+   return submission;
+
+fail:
+   radv_shader_dma_push_submission(device, submission, 0);
+
+   return NULL;
+}
+
+/*
+ * If upload_seq_out is NULL, this function blocks until the DMA is complete. Otherwise, the
+ * semaphore value to wait on device->shader_upload_sem is stored in *upload_seq_out.
+ */
+bool
+radv_shader_dma_submit(struct radv_device *device, struct radv_shader_dma_submission *submission,
+                       uint64_t *upload_seq_out)
+{
+   struct radeon_cmdbuf *cs = submission->cs;
+   struct radeon_winsys *ws = device->ws;
+   VkResult result;
+
+   mtx_lock(&device->shader_upload_hw_ctx_mutex);
+
+   uint64_t upload_seq = device->shader_upload_seq + 1;
+
+   struct vk_semaphore *semaphore = vk_semaphore_from_handle(device->shader_upload_sem);
+   struct vk_sync *sync = vk_semaphore_get_active_sync(semaphore);
+   const struct vk_sync_signal signal_info = {
+      .sync = sync,
+      .signal_value = upload_seq,
+      .stage_mask = VK_PIPELINE_STAGE_2_ALL_COMMANDS_BIT,
+   };
+
+   struct radv_winsys_submit_info submit = {
+      .ip_type = AMD_IP_SDMA,
+      .queue_index = 0,
+      .cs_array = &cs,
+      .cs_count = 1,
+   };
+
+   result = ws->cs_submit(device->shader_upload_hw_ctx, &submit, 0, NULL, 1, &signal_info, false);
+   if (result != VK_SUCCESS)
+   {
+      mtx_unlock(&device->shader_upload_hw_ctx_mutex);
+      radv_shader_dma_push_submission(device, submission, 0);
+      return false;
+   }
+   device->shader_upload_seq = upload_seq;
+   mtx_unlock(&device->shader_upload_hw_ctx_mutex);
+
+   radv_shader_dma_push_submission(device, submission, upload_seq);
+
+   if (upload_seq_out) {
+      *upload_seq_out = upload_seq;
+   } else {
+      result = radv_shader_wait_for_upload(device, upload_seq);
+      if (result != VK_SUCCESS)
+         return false;
+   }
+
+   return true;
+}
+
+
 struct radv_shader *
 radv_shader_create(struct radv_device *device, const struct radv_shader_binary *binary,
                    bool keep_shader_info, bool from_cache, const struct radv_shader_args *args)
@@ -2215,9 +2456,33 @@ radv_shader_create(struct radv_device *device, const struct radv_shader_binary *
       }
    }
 
-   if (!radv_shader_binary_upload(device, binary, shader))
+   shader->alloc = radv_alloc_shader_memory(device, shader->code_size, shader);
+   if (!shader->alloc)
       return NULL;
 
+   shader->bo = shader->alloc->arena->bo;
+   shader->va = radv_buffer_get_va(shader->bo) + shader->alloc->offset;
+
+   if (device->shader_use_invisible_vram) {
+      struct radv_shader_dma_submission *submission =
+         radv_shader_dma_get_submission(device, shader->bo, shader->va, shader->code_size);
+      if (!submission)
+         return NULL;
+
+      if (!radv_shader_binary_upload(device, binary, shader, submission->ptr)) {
+         radv_shader_dma_push_submission(device, submission, 0);
+         return NULL;
+      }
+
+      if (!radv_shader_dma_submit(device, submission, NULL))
+         return NULL;
+   } else {
+      void *dest_ptr = shader->alloc->arena->ptr + shader->alloc->offset;
+
+      if (!radv_shader_binary_upload(device, binary, shader, dest_ptr))
+         return NULL;
+   }
+
    return shader;
 }
 
@@ -2243,15 +2508,38 @@ radv_shader_part_create(struct radv_shader_part_binary *binary, unsigned wave_si
    return shader_part;
 }
 
-void
-radv_shader_part_binary_upload(const struct radv_shader_part_binary *binary, void *dest_ptr)
+bool
+radv_shader_part_binary_upload(struct radv_device *device, struct radv_shader_part *shader_part)
 {
-   memcpy(dest_ptr, binary->data, binary->code_size);
+   const struct radv_shader_part_binary *bin = shader_part->binary;
+   uint32_t code_size = radv_get_shader_binary_size(bin->code_size);
+   struct radv_shader_dma_submission *submission = NULL;
+   void *dest_ptr;
 
+   if (device->shader_use_invisible_vram) {
+      uint64_t va = radv_buffer_get_va(shader_part->alloc->arena->bo) + shader_part->alloc->offset;
+      submission =
+         radv_shader_dma_get_submission(device, shader_part->alloc->arena->bo, va, code_size);
+      if (!submission)
+         return false;
+
+      dest_ptr = submission->ptr;
+   } else {
+      dest_ptr = shader_part->alloc->arena->ptr + shader_part->alloc->offset;
+   }
+
+   memcpy(dest_ptr, bin->data, bin->code_size);
    /* Add end-of-code markers for the UMR disassembler. */
-   uint32_t *ptr32 = (uint32_t *)dest_ptr + binary->code_size / 4;
+   uint32_t *ptr32 = (uint32_t *)dest_ptr + code_size / 4;
    for (unsigned i = 0; i < DEBUGGER_NUM_MARKERS; i++)
       ptr32[i] = DEBUGGER_END_OF_CODE_MARKER;
+
+   if (device->shader_use_invisible_vram) {
+      if (!radv_shader_dma_submit(device, submission, NULL))
+         return false;
+   }
+
+   return true;
 }
 
 static char *
@@ -2633,8 +2921,8 @@ radv_create_vs_prolog(struct radv_device *device, const struct radv_vs_prolog_ke
    prolog->bo = prolog->alloc->arena->bo;
    prolog->va = radv_buffer_get_va(prolog->bo) + prolog->alloc->offset;
 
-   void *dest_ptr = prolog->alloc->arena->ptr + prolog->alloc->offset;
-   radv_shader_part_binary_upload(binary, dest_ptr);
+   if (!radv_shader_part_binary_upload(device, prolog))
+      goto fail_alloc;
 
    if (options.dump_shader) {
       fprintf(stderr, "Vertex prolog");
@@ -2698,8 +2986,8 @@ radv_create_ps_epilog(struct radv_device *device, const struct radv_ps_epilog_ke
    epilog->bo = epilog->alloc->arena->bo;
    epilog->va = radv_buffer_get_va(epilog->bo) + epilog->alloc->offset;
 
-   void *dest_ptr = epilog->alloc->arena->ptr + epilog->alloc->offset;
-   radv_shader_part_binary_upload(binary, dest_ptr);
+   if (!radv_shader_part_binary_upload(device, epilog))
+      goto fail_alloc;
 
    if (options.dump_shader) {
       fprintf(stderr, "Fragment epilog");
index fa2805a..9b0b91f 100644 (file)
@@ -563,6 +563,8 @@ bool radv_nir_lower_vs_inputs(nir_shader *shader, const struct radv_pipeline_sta
 
 void radv_init_shader_arenas(struct radv_device *device);
 void radv_destroy_shader_arenas(struct radv_device *device);
+VkResult radv_init_shader_upload_queue(struct radv_device *device);
+void radv_destroy_shader_upload_queue(struct radv_device *device);
 
 struct radv_shader_args;
 
@@ -575,7 +577,25 @@ struct radv_shader *radv_shader_nir_to_asm(
    int shader_count, const struct radv_pipeline_key *key, bool keep_shader_info, bool keep_statistic_info,
    struct radv_shader_binary **binary_out);
 
-void radv_shader_part_binary_upload(const struct radv_shader_part_binary *binary, void *dest_ptr);
+VkResult radv_shader_wait_for_upload(struct radv_device *device, uint64_t seq);
+
+bool radv_shader_part_binary_upload(struct radv_device *device,
+                                    struct radv_shader_part *shader_part);
+
+struct radv_shader_dma_submission *
+radv_shader_dma_pop_submission(struct radv_device *device);
+
+void radv_shader_dma_push_submission(struct radv_device *device,
+                                     struct radv_shader_dma_submission *submission,
+                                     uint64_t seq);
+
+struct radv_shader_dma_submission *radv_shader_dma_get_submission(struct radv_device *device,
+                                                                  struct radeon_winsys_bo *bo,
+                                                                  uint64_t va, uint64_t size);
+
+bool radv_shader_dma_submit(struct radv_device *device,
+                            struct radv_shader_dma_submission *submission,
+                            uint64_t *upload_seq_out);
 
 union radv_shader_arena_block *radv_alloc_shader_memory(struct radv_device *device, uint32_t size,
                                                         void *ptr);