nvk: add support for preamble and tls allocation.
authorDave Airlie <airlied@redhat.com>
Mon, 13 Jun 2022 23:14:05 +0000 (09:14 +1000)
committerMarge Bot <emma+marge@anholt.net>
Fri, 4 Aug 2023 21:31:54 +0000 (21:31 +0000)
This adds support for per-queue allocations that are reallocated
upwards if the submitted cmd needs more space.

This is used to implement the tls allocations for compute.

Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/24326>

src/nouveau/vulkan/nvk_cmd_buffer.c
src/nouveau/vulkan/nvk_cmd_buffer.h
src/nouveau/vulkan/nvk_device.c
src/nouveau/vulkan/nvk_device.h

index c3f26df..91024b7 100644 (file)
@@ -6,6 +6,8 @@
 
 #include "nouveau_push.h"
 
+#include "nvk_cla0c0.h"
+
 static void
 nvk_destroy_cmd_buffer(struct nvk_cmd_buffer *cmd_buffer)
 {
@@ -217,6 +219,30 @@ nvk_ResetCommandBuffer(VkCommandBuffer commandBuffer,
    return nvk_reset_cmd_buffer(cmd_buffer);
 }
 
+static uint64_t
+calc_tls_size(struct nvk_device *device,
+              uint32_t lpos, uint32_t lneg, uint32_t cstack)
+{
+   uint64_t size = (lpos + lneg) * 32 + cstack;
+
+   assert (size < (1 << 20));
+
+   size *= 64; /* max warps */
+   size  = align(size, 0x8000);
+   size *= device->pdev->dev->mp_count;
+
+   size = align(size, 1 << 17);
+   return size;
+}
+
+static void
+nve4_begin_compute(struct nvk_cmd_buffer *cmd)
+{
+   struct nvk_device *dev = (struct nvk_device *)cmd->vk.base.device;
+
+   cmd->tls_space_needed = calc_tls_size(dev, 128 * 16, 0, 0x200);
+}
+
 VKAPI_ATTR VkResult VKAPI_CALL
 nvk_BeginCommandBuffer(VkCommandBuffer commandBuffer,
                        const VkCommandBufferBeginInfo *pBeginInfo)
@@ -230,6 +256,13 @@ nvk_BeginCommandBuffer(VkCommandBuffer commandBuffer,
    else
       cmd->reset_on_submit = false;
 
+   struct nvk_device *dev = (struct nvk_device *)cmd->vk.base.device;
+   struct nvk_physical_device *pdev = dev->pdev;
+
+   /* this could be made optional for non-compute cmdbuffers */
+   if (pdev->dev->cls >= 0xa0)
+      nve4_begin_compute(cmd);
+
    return VK_SUCCESS;
 }
 
index 3145ee2..96b9ac4 100644 (file)
@@ -52,6 +52,8 @@ struct nvk_cmd_buffer {
 
    struct nouveau_ws_push *push;
    bool reset_on_submit;
+
+   uint64_t tls_space_needed;
 };
 
 VkResult nvk_reset_cmd_buffer(struct nvk_cmd_buffer *cmd_buffer);
index 884a946..b3aa6a5 100644 (file)
 
 #include "vulkan/wsi/wsi_common.h"
 
+#include "nvk_cla0c0.h"
+#include "cla1c0.h"
+#include "nvk_clc3c0.h"
+
+static VkResult
+nvk_update_preamble_push(struct nvk_queue_state *qs, struct nvk_device *dev,
+                         const struct nvk_queue_alloc_info *needs)
+{
+   struct nouveau_ws_bo *tls_bo = qs->tls_bo;
+   VkResult result;
+   if (needs->tls_size > qs->alloc_info.tls_size) {
+      tls_bo = nouveau_ws_bo_new(dev->pdev->dev,
+                                 needs->tls_size, (1 << 17), NOUVEAU_WS_BO_LOCAL);
+      if (!tls_bo) {
+         result = VK_ERROR_OUT_OF_DEVICE_MEMORY;
+         goto fail;
+      }
+   }
+
+   if (tls_bo != qs->tls_bo) {
+      if (qs->tls_bo)
+         nouveau_ws_bo_destroy(qs->tls_bo);
+      qs->tls_bo = tls_bo;
+   }
+
+   struct nouveau_ws_push *push = nouveau_ws_push_new(dev->pdev->dev, 256);
+
+   nouveau_ws_push_ref(push, qs->tls_bo, NOUVEAU_WS_BO_RDWR);
+   P_MTHD(push, NVA0C0, SET_SHADER_LOCAL_MEMORY_A);
+   P_NVA0C0_SET_SHADER_LOCAL_MEMORY_A(push, qs->tls_bo->offset >> 32);
+   P_NVA0C0_SET_SHADER_LOCAL_MEMORY_B(push, qs->tls_bo->offset & 0xffffffff);
+
+   uint64_t temp_size = qs->tls_bo->size / dev->pdev->dev->mp_count;
+   P_MTHD(push, NVA0C0, SET_SHADER_LOCAL_MEMORY_NON_THROTTLED_A);
+   P_NVA0C0_SET_SHADER_LOCAL_MEMORY_NON_THROTTLED_A(push, temp_size >> 32);
+   P_NVA0C0_SET_SHADER_LOCAL_MEMORY_NON_THROTTLED_B(push, temp_size & ~0x7fff);
+   P_NVA0C0_SET_SHADER_LOCAL_MEMORY_NON_THROTTLED_C(push, 0xff);
+
+   if (dev->pdev->dev->cls < 0xc3) {
+      P_MTHD(push, NVA0C0, SET_SHADER_LOCAL_MEMORY_THROTTLED_A);
+      P_NVA0C0_SET_SHADER_LOCAL_MEMORY_THROTTLED_A(push, temp_size >> 32);
+      P_NVA0C0_SET_SHADER_LOCAL_MEMORY_THROTTLED_B(push, temp_size & ~0x7fff);
+      P_NVA0C0_SET_SHADER_LOCAL_MEMORY_THROTTLED_C(push, 0xff);
+
+      P_MTHD(push, NVA0C0, SET_SHADER_LOCAL_MEMORY_WINDOW);
+      P_NVA0C0_SET_SHADER_LOCAL_MEMORY_WINDOW(push, 0xff << 24);
+
+      P_MTHD(push, NVA0C0, SET_SHADER_SHARED_MEMORY_WINDOW);
+      P_NVA0C0_SET_SHADER_SHARED_MEMORY_WINDOW(push, 0xfe << 24);
+
+      // TODO CODE_ADDRESS_HIGH
+   } else {
+      uint64_t temp = 0xfeULL << 24;
+
+      P_MTHD(push, NVC3C0, SET_SHADER_SHARED_MEMORY_WINDOW_A);
+      P_NVC3C0_SET_SHADER_SHARED_MEMORY_WINDOW_A(push, temp >> 32);
+      P_NVC3C0_SET_SHADER_SHARED_MEMORY_WINDOW_B(push, temp & 0xffffffff);
+
+      temp = 0xffULL << 24;
+      P_MTHD(push, NVC3C0, SET_SHADER_LOCAL_MEMORY_WINDOW_A);
+      P_NVC3C0_SET_SHADER_LOCAL_MEMORY_WINDOW_A(push, temp >> 32);
+      P_NVC3C0_SET_SHADER_LOCAL_MEMORY_WINDOW_B(push, temp & 0xffffffff);
+   }
+
+   P_MTHD(push, NVA0C0, SET_SPA_VERSION);
+   P_NVA0C0_SET_SPA_VERSION(push, { .major = dev->pdev->dev->cls >= 0xa1 ? 0x4 : 0x3 });
+
+   if (qs->push)
+      nouveau_ws_push_destroy(qs->push);
+   qs->push = push;
+   return 0;
+ fail:
+   return vk_error(qs, result);
+}
+
 static VkResult
-nvk_queue_submit(struct vk_queue *queue, struct vk_queue_submit *submission)
+nvk_update_preambles(struct nvk_queue_state *qs, struct nvk_device *device,
+                     struct vk_command_buffer *const *cmd_buffers, uint32_t cmd_buffer_count)
 {
-   struct nvk_device *device = container_of(queue->base.device, struct nvk_device, vk);
+   struct nvk_queue_alloc_info needs = { 0 };
+   for (uint32_t i = 0; i < cmd_buffer_count; i++) {
+      struct nvk_cmd_buffer *cmd = (struct nvk_cmd_buffer *)cmd_buffers[i];
+      needs.tls_size = MAX2(needs.tls_size, cmd->tls_space_needed);
+   }
+
+   if (needs.tls_size == qs->alloc_info.tls_size)
+      return VK_SUCCESS;
+   return nvk_update_preamble_push(qs, device, &needs);
+}
+
+static VkResult
+nvk_queue_submit(struct vk_queue *vkqueue, struct vk_queue_submit *submission)
+{
+   struct nvk_device *device = container_of(vkqueue->base.device, struct nvk_device, vk);
+   struct nvk_queue *queue = container_of(vkqueue, struct nvk_queue, vk);
+   VkResult result;
+
+   result = nvk_update_preambles(&queue->state, device, submission->command_buffers,
+                                 submission->command_buffer_count);
+   if (result != VK_SUCCESS)
+      return result;
 
    pthread_mutex_lock(&device->mutex);
+
+   if (queue->state.push)
+      nouveau_ws_push_submit(queue->state.push, device->pdev->dev, device->ctx);
    for (unsigned i = 0; i < submission->command_buffer_count; i++) {
       struct nvk_cmd_buffer *cmd = (struct nvk_cmd_buffer *)submission->command_buffers[i];
 
@@ -77,7 +177,7 @@ nvk_CreateDevice(VkPhysicalDevice physicalDevice,
       goto fail_init;
    }
 
-   result = vk_queue_init(&device->queue, &device->vk, &pCreateInfo->pQueueCreateInfos[0], 0);
+   result = vk_queue_init(&device->queue.vk, &device->vk, &pCreateInfo->pQueueCreateInfos[0], 0);
    if (result != VK_SUCCESS)
       goto fail_ctx;
 
@@ -103,7 +203,7 @@ nvk_CreateDevice(VkPhysicalDevice physicalDevice,
    }
    pthread_condattr_destroy(&condattr);
 
-   device->queue.driver_submit = nvk_queue_submit;
+   device->queue.vk.driver_submit = nvk_queue_submit;
 
    device->pdev = physical_device;
 
@@ -114,7 +214,7 @@ nvk_CreateDevice(VkPhysicalDevice physicalDevice,
 fail_mutex:
    pthread_mutex_destroy(&device->mutex);
 fail_queue:
-   vk_queue_finish(&device->queue);
+   vk_queue_finish(&device->queue.vk);
 fail_ctx:
    nouveau_ws_context_destroy(device->ctx);
 fail_init:
@@ -132,9 +232,13 @@ nvk_DestroyDevice(VkDevice _device, const VkAllocationCallbacks *pAllocator)
    if (!device)
       return;
 
+   if (device->queue.state.push)
+      nouveau_ws_push_destroy(device->queue.state.push);
+   if (device->queue.state.tls_bo)
+      nouveau_ws_bo_destroy(device->queue.state.tls_bo);
    pthread_cond_destroy(&device->queue_submit);
    pthread_mutex_destroy(&device->mutex);
-   vk_queue_finish(&device->queue);
+   vk_queue_finish(&device->queue.vk);
    vk_device_finish(&device->vk);
    nouveau_ws_context_destroy(device->ctx);
    vk_free(&device->vk.alloc, device);
index d32444c..e561ebb 100644 (file)
@@ -9,13 +9,30 @@
 struct novueau_ws_context;
 struct nvk_physical_device;
 
+struct nvk_queue_alloc_info {
+   uint64_t tls_size;
+};
+
+struct nvk_queue_state {
+   struct nvk_queue_alloc_info alloc_info;
+   struct nouveau_ws_bo *tls_bo;
+
+   struct nouveau_ws_push *push;
+};
+
+struct nvk_queue {
+   struct vk_queue vk;
+
+   struct nvk_queue_state state;
+};
+
 struct nvk_device {
    struct vk_device vk;
    struct nvk_physical_device *pdev;
 
    struct nouveau_ws_context *ctx;
 
-   struct vk_queue queue;
+   struct nvk_queue queue;
 
    pthread_mutex_t mutex;
    pthread_cond_t queue_submit;