nvk: Rework TLS/SLM and image/sampler table handling
author    Faith Ekstrand <faith.ekstrand@collabora.com>
          Tue, 31 Jan 2023 02:11:56 +0000 (20:11 -0600)
committer Marge Bot <emma+marge@anholt.net>
          Fri, 4 Aug 2023 21:31:58 +0000 (21:31 +0000)
When Dave originally added SLM, he did so with a preamble pushbuf that
sets up the SLM area before any actual command buffers execute.  This
has the advantage that the SLM BO is never directly referenced by any
command buffer, so we can swap it out even after command buffers have
been built.  When I added the image/sampler tables, I sort of went
along with that approach but then did something different for 3D.  This
commit reworks both so that SLM and the image/sampler tables all go
through a single per-queue preamble pushbuf that is updated at submit
time.

Thanks to buffer reference counting, we can employ a fairly simple
locking scheme.  Each of the globals (images, samplers, and SLM) has a
lock which gets taken whenever it is modified.  When we go to do a
submit, we take each lock in turn, take a queue-local reference to the
BO, and grab a copy of its size parameters under that lock.  If
anything has changed, we update the queue-local preamble pushbuf.
Because the queue also holds a reference, we can safely proceed to
submit command buffers which reference those global BOs without holding
the individual global locks.  If SLM or one of the descriptor tables is
resized while we're mid-submit, our reference to the BO remains valid
until either the destruction of the queue or the next submit.
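
As an illustration only (pthreads and toy types stand in for simple_mtx
and nouveau_ws_bo; the real helpers added by this commit are
nvk_descriptor_table_get_bo_ref() and nvk_slm_area_get_bo_ref()), the
take-a-reference-under-the-lock step at submit time looks roughly like:

   #include <pthread.h>
   #include <stdint.h>

   struct bo {
      int refcnt;                  /* toy refcount, not nouveau_ws_bo */
   };

   struct table {
      pthread_mutex_t mutex;
      struct bo *bo;               /* swapped out under the lock on resize */
      uint32_t alloc;              /* current size, read under the lock */
   };

   /* Take a queue-local reference plus a matching size snapshot, then
    * drop the lock before the submit actually happens.
    */
   static struct bo *
   table_get_bo_ref(struct table *t, uint32_t *alloc_out)
   {
      pthread_mutex_lock(&t->mutex);
      struct bo *bo = t->bo;
      if (bo)
         bo->refcnt++;             /* queue-local reference */
      *alloc_out = t->alloc;
      pthread_mutex_unlock(&t->mutex);
      return bo;
   }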

The one small bit of threaded cleverness (one always has to be careful
with that) is that the SLM size can be checked without taking the lock
because it only ever increases.
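
A rough sketch of that fast path, again with stand-in types rather than
the driver's (the real function is nvk_slm_area_ensure() in nvk_device.c
below):

   #include <pthread.h>
   #include <stdint.h>
   #include <stdlib.h>

   struct slm_area {
      pthread_mutex_t mutex;
      void *mem;                   /* stand-in for the SLM BO */
      uint32_t bytes_per_mp;       /* only ever increases */
   };

   static int
   slm_area_ensure(struct slm_area *area, uint32_t bytes_per_mp,
                   uint32_t mp_count)
   {
      /* Unlocked fast path: because bytes_per_mp only ever increases, a
       * stale read here can only send us down the slow path, never skip
       * a required resize.
       */
      if (bytes_per_mp <= area->bytes_per_mp)
         return 0;

      /* Allocate outside the lock, then re-check under it. */
      void *mem = malloc((size_t)bytes_per_mp * mp_count);
      if (mem == NULL)
         return -1;

      void *old;
      pthread_mutex_lock(&area->mutex);
      if (bytes_per_mp <= area->bytes_per_mp) {
         old = mem;                /* lost the race; throw away our copy */
      } else {
         old = area->mem;
         area->mem = mem;
         area->bytes_per_mp = bytes_per_mp;
      }
      pthread_mutex_unlock(&area->mutex);

      free(old);
      return 0;
   }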

Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/24326>

src/nouveau/vulkan/nvk_cmd_buffer.c
src/nouveau/vulkan/nvk_cmd_dispatch.c
src/nouveau/vulkan/nvk_cmd_draw.c
src/nouveau/vulkan/nvk_compute_pipeline.c
src/nouveau/vulkan/nvk_descriptor_table.h
src/nouveau/vulkan/nvk_device.c
src/nouveau/vulkan/nvk_device.h
src/nouveau/vulkan/nvk_shader.c
src/nouveau/vulkan/nvk_shader.h

diff --git a/src/nouveau/vulkan/nvk_cmd_buffer.c b/src/nouveau/vulkan/nvk_cmd_buffer.c
index 43300af..29e0d50 100644
@@ -381,6 +381,7 @@ nvk_CmdBindPipeline(VkCommandBuffer commandBuffer,
 {
    VK_FROM_HANDLE(nvk_cmd_buffer, cmd, commandBuffer);
    VK_FROM_HANDLE(nvk_pipeline, pipeline, _pipeline);
+   struct nvk_device *dev = nvk_cmd_buffer_device(cmd);
 
    for (unsigned s = 0; s < ARRAY_SIZE(pipeline->shaders); s++) {
       if (!pipeline->shaders[s].bo)
@@ -388,6 +389,9 @@ nvk_CmdBindPipeline(VkCommandBuffer commandBuffer,
 
       nouveau_ws_push_ref(cmd->push, pipeline->shaders[s].bo,
                           NOUVEAU_WS_BO_RD);
+
+      if (pipeline->shaders[s].slm_size)
+         nvk_device_ensure_slm(dev, pipeline->shaders[s].slm_size);
    }
 
    switch (pipelineBindPoint) {
diff --git a/src/nouveau/vulkan/nvk_cmd_dispatch.c b/src/nouveau/vulkan/nvk_cmd_dispatch.c
index 89bf942..698dc5c 100644
 #define NVC3C0_QMDV02_02_VAL_SET(p,a...) NVVAL_MW_SET((p), NVC3C0, QMDV02_02, ##a)
 #define NVC3C0_QMDV02_02_DEF_SET(p,a...) NVDEF_MW_SET((p), NVC3C0, QMDV02_02, ##a)
 
-static uint64_t
-calc_tls_size(struct nvk_device *device,
-              uint32_t lpos, uint32_t lneg, uint32_t cstack)
-{
-   uint64_t size = (lpos + lneg) * 32 + cstack;
-
-   assert (size < (1 << 20));
-
-   size *= 64; /* max warps */
-   size  = align(size, 0x8000);
-   size *= device->pdev->dev->mp_count;
-
-   size = align(size, 1 << 17);
-   return size;
-}
-
 void
 nvk_cmd_buffer_begin_compute(struct nvk_cmd_buffer *cmd,
                              const VkCommandBufferBeginInfo *pBeginInfo)
-{
-   struct nvk_device *dev = (struct nvk_device *)cmd->vk.base.device;
-
-   if (dev->ctx->compute.cls < 0xa0c0)
-      return;
-
-   cmd->tls_space_needed = calc_tls_size(dev, 128 * 16, 0, 0x200);
-}
+{ }
 
 static void
 gv100_compute_setup_launch_desc(uint32_t *qmd,
diff --git a/src/nouveau/vulkan/nvk_cmd_draw.c b/src/nouveau/vulkan/nvk_cmd_draw.c
index 2301c0a..ca942d2 100644
@@ -171,7 +171,6 @@ void
 nvk_cmd_buffer_begin_graphics(struct nvk_cmd_buffer *cmd,
                               const VkCommandBufferBeginInfo *pBeginInfo)
 {
-   struct nvk_device *dev = nvk_cmd_buffer_device(cmd);
    struct nouveau_ws_push *p = cmd->push;
 
    P_MTHD(p, NV9097, SET_OBJECT);
@@ -215,25 +214,6 @@ nvk_cmd_buffer_begin_graphics(struct nvk_cmd_buffer *cmd,
    /* TODO: Vertex runout */
    /* TODO: temp */
 
-   P_IMMD(p, NV9097, SET_SHADER_LOCAL_MEMORY_WINDOW, 0xff000000); /* TODO */
-
-   nvk_push_descriptor_table_ref(p, &dev->samplers);
-   uint64_t tsp_addr = nvk_descriptor_table_base_address(&dev->samplers);
-   P_MTHD(p, NV9097, SET_TEX_SAMPLER_POOL_A);
-   P_NV9097_SET_TEX_SAMPLER_POOL_A(p, tsp_addr >> 32);
-   P_NV9097_SET_TEX_SAMPLER_POOL_B(p, tsp_addr);
-   P_NV9097_SET_TEX_SAMPLER_POOL_C(p, dev->samplers.alloc - 1);
-
-   nvk_push_descriptor_table_ref(p, &dev->images);
-   uint64_t thp_addr = nvk_descriptor_table_base_address(&dev->images);
-   P_MTHD(p, NV9097, SET_TEX_HEADER_POOL_A);
-   P_NV9097_SET_TEX_HEADER_POOL_A(p, thp_addr >> 32);
-   P_NV9097_SET_TEX_HEADER_POOL_B(p, thp_addr & 0xffffffff);
-   P_NV9097_SET_TEX_HEADER_POOL_C(p, dev->images.alloc - 1);
-
-   /* TODO: TIC */
-   /* TODO: TSC */
-
    P_IMMD(p, NV9097, SET_WINDOW_ORIGIN, {
       .mode    = MODE_UPPER_LEFT,
       .flip_y  = FLIP_Y_FALSE,
diff --git a/src/nouveau/vulkan/nvk_compute_pipeline.c b/src/nouveau/vulkan/nvk_compute_pipeline.c
index 69afd42..8bb2def 100644
@@ -40,8 +40,7 @@ gv100_compute_setup_launch_desc_template(uint32_t *qmd,
    NVC3C0_QMDV02_02_VAL_SET(qmd, SHARED_MEMORY_SIZE,
                             align(shader->cp.smem_size, 0x100));
    NVC3C0_QMDV02_02_VAL_SET(qmd, SHADER_LOCAL_MEMORY_LOW_SIZE,
-                            (shader->hdr[1] & 0xfffff0) +
-                            align(shader->cp.lmem_size, 0x10));
+                            align(shader->slm_size, 0x10));
    NVC3C0_QMDV02_02_VAL_SET(qmd, SHADER_LOCAL_MEMORY_HIGH_SIZE, 0);
    NVC3C0_QMDV02_02_VAL_SET(qmd, MIN_SM_CONFIG_SHARED_MEM_SIZE,
                             gv100_sm_config_smem_size(8 * 1024));
diff --git a/src/nouveau/vulkan/nvk_descriptor_table.h b/src/nouveau/vulkan/nvk_descriptor_table.h
index 970caef..f3197f1 100644
@@ -41,18 +41,18 @@ void nvk_descriptor_table_free(struct nvk_device *device,
                                struct nvk_descriptor_table *table,
                                uint32_t index);
 
-static inline void
-nvk_push_descriptor_table_ref(struct nouveau_ws_push *push,
-                              const struct nvk_descriptor_table *table)
+static inline struct nouveau_ws_bo *
+nvk_descriptor_table_get_bo_ref(struct nvk_descriptor_table *table,
+                                uint32_t *alloc_count_out)
 {
-   if (table->bo)
-      nouveau_ws_push_ref(push, table->bo, NOUVEAU_WS_BO_RD);
-}
-
-static inline uint64_t
-nvk_descriptor_table_base_address(const struct nvk_descriptor_table *table)
-{
-   return table->bo->offset;
+   simple_mtx_lock(&table->mutex);
+   struct nouveau_ws_bo *bo = table->bo;
+   if (bo)
+      nouveau_ws_bo_ref(bo);
+   *alloc_count_out = table->alloc;
+   simple_mtx_unlock(&table->mutex);
+
+   return bo;
 }
 
 #endif
diff --git a/src/nouveau/vulkan/nvk_device.c b/src/nouveau/vulkan/nvk_device.c
index 8a03f31..b78f9cd 100644
 
 #include "vulkan/wsi/wsi_common.h"
 
+#include "nvk_cl9097.h"
 #include "nvk_cl90b5.h"
 #include "nvk_cla0c0.h"
 #include "cla1c0.h"
 #include "nvk_clc3c0.h"
 
+static void
+nvk_slm_area_init(struct nvk_slm_area *area)
+{
+   memset(area, 0, sizeof(*area));
+   simple_mtx_init(&area->mutex, mtx_plain);
+}
+
+static void
+nvk_slm_area_finish(struct nvk_slm_area *area)
+{
+   simple_mtx_destroy(&area->mutex);
+   if (area->bo)
+      nouveau_ws_bo_destroy(area->bo);
+}
+
+static struct nouveau_ws_bo *
+nvk_slm_area_get_bo_ref(struct nvk_slm_area *area,
+                        uint32_t *bytes_per_warp_out,
+                        uint32_t *bytes_per_mp_out)
+{
+   simple_mtx_lock(&area->mutex);
+   struct nouveau_ws_bo *bo = area->bo;
+   if (bo)
+      nouveau_ws_bo_ref(bo);
+   *bytes_per_warp_out = area->bytes_per_warp;
+   *bytes_per_mp_out = area->bytes_per_mp;
+   simple_mtx_unlock(&area->mutex);
+
+   return bo;
+}
+
 static VkResult
-nvk_update_preamble_push(struct nvk_queue_state *qs, struct nvk_device *dev,
-                         const struct nvk_queue_alloc_info *needs)
+nvk_slm_area_ensure(struct nvk_device *dev,
+                    struct nvk_slm_area *area,
+                    uint32_t bytes_per_thread)
 {
-   struct nouveau_ws_bo *tls_bo = qs->tls_bo;
-   VkResult result;
-   if (needs->tls_size > qs->alloc_info.tls_size) {
-      tls_bo = nouveau_ws_bo_new(dev->pdev->dev,
-                                 needs->tls_size, (1 << 17), NOUVEAU_WS_BO_LOCAL);
-      if (!tls_bo) {
-         result = VK_ERROR_OUT_OF_DEVICE_MEMORY;
-         goto fail;
-      }
-   }
+   assert(bytes_per_thread < (1 << 24));
+
+   /* TODO: Volta+ doesn't use a CRS (call/return stack) */
+   const uint32_t crs_size = 0;
+
+   uint64_t bytes_per_warp = bytes_per_thread * 32 + crs_size;
+
+   /* The hardware seems to require this alignment for
+    * NV9097_SET_SHADER_LOCAL_MEMORY_E_DEFAULT_SIZE_PER_WARP
+    */
+   bytes_per_warp = ALIGN(bytes_per_warp, 0x200);
+
+   uint64_t bytes_per_mp = bytes_per_warp * 64; /* max warps */
+
+   /* The hardware seems to require this alignment for
+    * NVA0C0_SET_SHADER_LOCAL_MEMORY_NON_THROTTLED_A_SIZE_LOWER.
+    *
+    * Fortunately, this is just the alignment for bytes_per_warp multiplied
+    * by the number of warps, 64.  It might matter for real on a GPU with 48
+    * warps but we don't support any of those yet.
+    */
+   assert(bytes_per_mp == ALIGN(bytes_per_mp, 0x8000));
+
+   /* nvk_slm_area::bytes_per_mp only ever increases so we can check this
+    * outside the lock and exit early in the common case.  We only need to
+    * take the lock if we're actually going to resize.
+    *
+    * Also, we only care about bytes_per_mp and not bytes_per_warp because
+    * they are integer multiples of each other.
+    */
+   if (likely(bytes_per_mp <= area->bytes_per_mp))
+      return VK_SUCCESS;
 
-   if (tls_bo != qs->tls_bo) {
-      if (qs->tls_bo)
-         nouveau_ws_bo_destroy(qs->tls_bo);
-      qs->tls_bo = tls_bo;
-   }
+   uint64_t size = bytes_per_mp * dev->pdev->dev->mp_count;
 
-   struct nouveau_ws_push *push = nouveau_ws_push_new(dev->pdev->dev, 256);
-
-   nouveau_ws_push_ref(push, qs->tls_bo, NOUVEAU_WS_BO_RDWR);
-   P_MTHD(push, NVA0C0, SET_SHADER_LOCAL_MEMORY_A);
-   P_NVA0C0_SET_SHADER_LOCAL_MEMORY_A(push, qs->tls_bo->offset >> 32);
-   P_NVA0C0_SET_SHADER_LOCAL_MEMORY_B(push, qs->tls_bo->offset & 0xffffffff);
-
-   nvk_push_descriptor_table_ref(push, &dev->samplers);
-   uint64_t tsp_addr = nvk_descriptor_table_base_address(&dev->samplers);
-   P_MTHD(push, NVA0C0, SET_TEX_SAMPLER_POOL_A);
-   P_NVA0C0_SET_TEX_SAMPLER_POOL_A(push, tsp_addr >> 32);
-   P_NVA0C0_SET_TEX_SAMPLER_POOL_B(push, tsp_addr & 0xffffffff);
-   P_NVA0C0_SET_TEX_SAMPLER_POOL_C(push, dev->samplers.alloc - 1);
-
-   nvk_push_descriptor_table_ref(push, &dev->images);
-   uint64_t thp_addr = nvk_descriptor_table_base_address(&dev->images);
-   P_MTHD(push, NVA0C0, SET_TEX_HEADER_POOL_A);
-   P_NVA0C0_SET_TEX_HEADER_POOL_A(push, thp_addr >> 32);
-   P_NVA0C0_SET_TEX_HEADER_POOL_B(push, thp_addr & 0xffffffff);
-   P_NVA0C0_SET_TEX_HEADER_POOL_C(push, dev->images.alloc - 1);
-
-   uint64_t temp_size = qs->tls_bo->size / dev->pdev->dev->mp_count;
-   P_MTHD(push, NVA0C0, SET_SHADER_LOCAL_MEMORY_NON_THROTTLED_A);
-   P_NVA0C0_SET_SHADER_LOCAL_MEMORY_NON_THROTTLED_A(push, temp_size >> 32);
-   P_NVA0C0_SET_SHADER_LOCAL_MEMORY_NON_THROTTLED_B(push, temp_size & ~0x7fff);
-   P_NVA0C0_SET_SHADER_LOCAL_MEMORY_NON_THROTTLED_C(push, 0xff);
-
-   if (dev->ctx->compute.cls < 0xc3c0) {
-      P_MTHD(push, NVA0C0, SET_SHADER_LOCAL_MEMORY_THROTTLED_A);
-      P_NVA0C0_SET_SHADER_LOCAL_MEMORY_THROTTLED_A(push, temp_size >> 32);
-      P_NVA0C0_SET_SHADER_LOCAL_MEMORY_THROTTLED_B(push, temp_size & ~0x7fff);
-      P_NVA0C0_SET_SHADER_LOCAL_MEMORY_THROTTLED_C(push, 0xff);
-
-      P_MTHD(push, NVA0C0, SET_SHADER_LOCAL_MEMORY_WINDOW);
-      P_NVA0C0_SET_SHADER_LOCAL_MEMORY_WINDOW(push, 0xff << 24);
-
-      P_MTHD(push, NVA0C0, SET_SHADER_SHARED_MEMORY_WINDOW);
-      P_NVA0C0_SET_SHADER_SHARED_MEMORY_WINDOW(push, 0xfe << 24);
+   struct nouveau_ws_bo *bo =
+      nouveau_ws_bo_new(dev->pdev->dev, size, 0, NOUVEAU_WS_BO_LOCAL);
+   if (bo == NULL)
+      return vk_error(dev, VK_ERROR_OUT_OF_DEVICE_MEMORY);
 
-      // TODO CODE_ADDRESS_HIGH
+   struct nouveau_ws_bo *unref_bo;
+   simple_mtx_lock(&area->mutex);
+   if (bytes_per_mp <= area->bytes_per_mp) {
+      /* We lost the race, throw away our BO */
+      assert(area->bytes_per_warp == bytes_per_warp);
+      unref_bo = bo;
    } else {
-      uint64_t temp = 0xfeULL << 24;
+      unref_bo = area->bo;
+      area->bo = bo;
+      area->bytes_per_warp = bytes_per_warp;
+      area->bytes_per_mp = bytes_per_mp;
+   }
+   simple_mtx_unlock(&area->mutex);
 
-      P_MTHD(push, NVC3C0, SET_SHADER_SHARED_MEMORY_WINDOW_A);
-      P_NVC3C0_SET_SHADER_SHARED_MEMORY_WINDOW_A(push, temp >> 32);
-      P_NVC3C0_SET_SHADER_SHARED_MEMORY_WINDOW_B(push, temp & 0xffffffff);
+   if (unref_bo)
+      nouveau_ws_bo_destroy(unref_bo);
 
-      temp = 0xffULL << 24;
-      P_MTHD(push, NVC3C0, SET_SHADER_LOCAL_MEMORY_WINDOW_A);
-      P_NVC3C0_SET_SHADER_LOCAL_MEMORY_WINDOW_A(push, temp >> 32);
-      P_NVC3C0_SET_SHADER_LOCAL_MEMORY_WINDOW_B(push, temp & 0xffffffff);
-   }
+   return VK_SUCCESS;
+}
 
-   P_MTHD(push, NVA0C0, SET_SPA_VERSION);
-   P_NVA0C0_SET_SPA_VERSION(push, { .major = dev->ctx->compute.cls >= 0xa1c0 ? 0x4 : 0x3 });
+static void
+nvk_queue_state_init(struct nvk_queue_state *qs)
+{
+   memset(qs, 0, sizeof(*qs));
+}
 
+static void
+nvk_queue_state_finish(struct nvk_device *dev,
+                       struct nvk_queue_state *qs)
+{
+   if (qs->images.bo)
+      nouveau_ws_bo_destroy(qs->images.bo);
+   if (qs->samplers.bo)
+      nouveau_ws_bo_destroy(qs->samplers.bo);
+   if (qs->slm.bo)
+      nouveau_ws_bo_destroy(qs->slm.bo);
    if (qs->push)
       nouveau_ws_push_destroy(qs->push);
-   qs->push = push;
-   return 0;
- fail:
-   return vk_error(qs, result);
+}
+
+static void
+nvk_queue_state_ref(struct nouveau_ws_push *push,
+                    struct nvk_queue_state *qs)
+{
+   if (qs->images.bo)
+      nouveau_ws_push_ref(push, qs->images.bo, NOUVEAU_WS_BO_RD);
+   if (qs->samplers.bo)
+      nouveau_ws_push_ref(push, qs->samplers.bo, NOUVEAU_WS_BO_RD);
+   if (qs->slm.bo)
+      nouveau_ws_push_ref(push, qs->slm.bo, NOUVEAU_WS_BO_RDWR);
 }
 
 static VkResult
-nvk_update_preambles(struct nvk_queue_state *qs, struct nvk_device *device,
-                     struct vk_command_buffer *const *cmd_buffers, uint32_t cmd_buffer_count)
+nvk_queue_state_update(struct nvk_device *dev,
+                       struct nvk_queue_state *qs)
 {
-   struct nvk_queue_alloc_info needs = { 0 };
-   for (uint32_t i = 0; i < cmd_buffer_count; i++) {
-      struct nvk_cmd_buffer *cmd = (struct nvk_cmd_buffer *)cmd_buffers[i];
-      needs.tls_size = MAX2(needs.tls_size, cmd->tls_space_needed);
+   struct nouveau_ws_bo *bo;
+   uint32_t alloc_count, bytes_per_warp, bytes_per_mp;
+   bool dirty = false;
+
+   bo = nvk_descriptor_table_get_bo_ref(&dev->images, &alloc_count);
+   if (qs->images.bo != bo || qs->images.alloc_count != alloc_count) {
+      if (qs->images.bo)
+         nouveau_ws_bo_destroy(qs->images.bo);
+      qs->images.bo = bo;
+      qs->images.alloc_count = alloc_count;
+      dirty = true;
+   } else {
+      /* No change */
+      if (bo)
+         nouveau_ws_bo_destroy(bo);
    }
 
-   if (needs.tls_size == qs->alloc_info.tls_size)
+   bo = nvk_descriptor_table_get_bo_ref(&dev->samplers, &alloc_count);
+   if (qs->samplers.bo != bo || qs->samplers.alloc_count != alloc_count) {
+      if (qs->samplers.bo)
+         nouveau_ws_bo_destroy(qs->samplers.bo);
+      qs->samplers.bo = bo;
+      qs->samplers.alloc_count = alloc_count;
+      dirty = true;
+   } else {
+      /* No change */
+      if (bo)
+         nouveau_ws_bo_destroy(bo);
+   }
+
+   bo = nvk_slm_area_get_bo_ref(&dev->slm, &bytes_per_warp, &bytes_per_mp);
+   if (qs->slm.bo != bo || qs->slm.bytes_per_warp != bytes_per_warp ||
+       qs->slm.bytes_per_mp != bytes_per_mp) {
+      if (qs->slm.bo)
+         nouveau_ws_bo_destroy(qs->slm.bo);
+      qs->slm.bo = bo;
+      qs->slm.bytes_per_warp = bytes_per_warp;
+      qs->slm.bytes_per_mp = bytes_per_mp;
+      dirty = true;
+   } else {
+      /* No change */
+      if (bo)
+         nouveau_ws_bo_destroy(bo);
+   }
+
+   /* TODO: We're currently depending on kernel reference counting to protect
+    * us here.  If we ever stop reference counting in the kernel, we will
+    * either need to delay destruction or hold on to our extra BO references
+    * and insert a GPU stall here if anything has changed before dropping our
+    * old references.
+    */
+
+   if (!dirty)
       return VK_SUCCESS;
-   return nvk_update_preamble_push(qs, device, &needs);
+
+   struct nouveau_ws_push *p = nouveau_ws_push_new(dev->pdev->dev, 256);
+   if (p == NULL)
+      return vk_error(dev, VK_ERROR_OUT_OF_DEVICE_MEMORY);
+
+   if (qs->images.bo) {
+      nouveau_ws_push_ref(p, qs->images.bo, NOUVEAU_WS_BO_RD);
+
+      /* Compute */
+      P_MTHD(p, NVA0C0, SET_TEX_HEADER_POOL_A);
+      P_NVA0C0_SET_TEX_HEADER_POOL_A(p, qs->images.bo->offset >> 32);
+      P_NVA0C0_SET_TEX_HEADER_POOL_B(p, qs->images.bo->offset);
+      P_NVA0C0_SET_TEX_HEADER_POOL_C(p, qs->images.alloc_count - 1);
+
+      /* 3D */
+      P_MTHD(p, NV9097, SET_TEX_HEADER_POOL_A);
+      P_NV9097_SET_TEX_HEADER_POOL_A(p, qs->images.bo->offset >> 32);
+      P_NV9097_SET_TEX_HEADER_POOL_B(p, qs->images.bo->offset);
+      P_NV9097_SET_TEX_HEADER_POOL_C(p, qs->images.alloc_count - 1);
+   }
+
+   if (qs->samplers.bo) {
+      nouveau_ws_push_ref(p, qs->samplers.bo, NOUVEAU_WS_BO_RD);
+
+      /* Compute */
+      P_MTHD(p, NVA0C0, SET_TEX_SAMPLER_POOL_A);
+      P_NVA0C0_SET_TEX_SAMPLER_POOL_A(p, qs->samplers.bo->offset >> 32);
+      P_NVA0C0_SET_TEX_SAMPLER_POOL_B(p, qs->samplers.bo->offset);
+      P_NVA0C0_SET_TEX_SAMPLER_POOL_C(p, qs->samplers.alloc_count - 1);
+
+      /* 3D */
+      P_MTHD(p, NV9097, SET_TEX_SAMPLER_POOL_A);
+      P_NV9097_SET_TEX_SAMPLER_POOL_A(p, qs->samplers.bo->offset >> 32);
+      P_NV9097_SET_TEX_SAMPLER_POOL_B(p, qs->samplers.bo->offset);
+      P_NV9097_SET_TEX_SAMPLER_POOL_C(p, qs->samplers.alloc_count - 1);
+   }
+
+   if (qs->slm.bo) {
+      nouveau_ws_push_ref(p, qs->slm.bo, NOUVEAU_WS_BO_RDWR);
+      const uint64_t slm_addr = qs->slm.bo->offset;
+      const uint64_t slm_size = qs->slm.bo->size;
+      const uint64_t slm_per_warp = qs->slm.bytes_per_warp;
+      const uint64_t slm_per_mp = qs->slm.bytes_per_mp;
+      assert(!(slm_per_mp & 0x7fff));
+
+      /* Compute */
+      P_MTHD(p, NVA0C0, SET_SHADER_LOCAL_MEMORY_A);
+      P_NVA0C0_SET_SHADER_LOCAL_MEMORY_A(p, slm_addr >> 32);
+      P_NVA0C0_SET_SHADER_LOCAL_MEMORY_B(p, slm_addr);
+
+      P_MTHD(p, NVA0C0, SET_SHADER_LOCAL_MEMORY_NON_THROTTLED_A);
+      P_NVA0C0_SET_SHADER_LOCAL_MEMORY_NON_THROTTLED_A(p, slm_per_mp >> 32);
+      P_NVA0C0_SET_SHADER_LOCAL_MEMORY_NON_THROTTLED_B(p, slm_per_mp);
+      P_NVA0C0_SET_SHADER_LOCAL_MEMORY_NON_THROTTLED_C(p, 0xff);
+
+      if (dev->ctx->compute.cls < VOLTA_COMPUTE_A) {
+         P_MTHD(p, NVA0C0, SET_SHADER_LOCAL_MEMORY_THROTTLED_A);
+         P_NVA0C0_SET_SHADER_LOCAL_MEMORY_THROTTLED_A(p, slm_per_mp >> 32);
+         P_NVA0C0_SET_SHADER_LOCAL_MEMORY_THROTTLED_B(p, slm_per_mp);
+         P_NVA0C0_SET_SHADER_LOCAL_MEMORY_THROTTLED_C(p, 0xff);
+      }
+
+      /* 3D */
+      P_MTHD(p, NV9097, SET_SHADER_LOCAL_MEMORY_A);
+      P_NV9097_SET_SHADER_LOCAL_MEMORY_A(p, slm_addr >> 32);
+      P_NV9097_SET_SHADER_LOCAL_MEMORY_B(p, slm_addr);
+      P_NV9097_SET_SHADER_LOCAL_MEMORY_C(p, slm_size >> 32);
+      P_NV9097_SET_SHADER_LOCAL_MEMORY_D(p, slm_size);
+      P_NV9097_SET_SHADER_LOCAL_MEMORY_E(p, slm_per_warp);
+   }
+
+   /* We set memory windows unconditionally.  Otherwise, the memory window
+    * might be in a random place and cause us to fault off into nowhere.
+    */
+   if (dev->ctx->compute.cls >= VOLTA_COMPUTE_A) {
+      uint64_t temp = 0xfeULL << 24;
+      P_MTHD(p, NVC3C0, SET_SHADER_SHARED_MEMORY_WINDOW_A);
+      P_NVC3C0_SET_SHADER_SHARED_MEMORY_WINDOW_A(p, temp >> 32);
+      P_NVC3C0_SET_SHADER_SHARED_MEMORY_WINDOW_B(p, temp & 0xffffffff);
+
+      temp = 0xffULL << 24;
+      P_MTHD(p, NVC3C0, SET_SHADER_LOCAL_MEMORY_WINDOW_A);
+      P_NVC3C0_SET_SHADER_LOCAL_MEMORY_WINDOW_A(p, temp >> 32);
+      P_NVC3C0_SET_SHADER_LOCAL_MEMORY_WINDOW_B(p, temp & 0xffffffff);
+   } else {
+      P_MTHD(p, NVA0C0, SET_SHADER_LOCAL_MEMORY_WINDOW);
+      P_NVA0C0_SET_SHADER_LOCAL_MEMORY_WINDOW(p, 0xff << 24);
+
+      P_MTHD(p, NVA0C0, SET_SHADER_SHARED_MEMORY_WINDOW);
+      P_NVA0C0_SET_SHADER_SHARED_MEMORY_WINDOW(p, 0xfe << 24);
+
+      // TODO CODE_ADDRESS_HIGH
+   }
+
+   /* From nvc0_screen.c:
+    *
+    *    "Reduce likelihood of collision with real buffers by placing the
+    *    hole at the top of the 4G area. This will have to be dealt with
+    *    for real eventually by blocking off that area from the VM."
+    *
+    * Really?!?  TODO: Fix this for realz.  Annoyingly, we only have a
+    * 32-bit pointer for this in 3D rather than a full 48 like we have for
+    * compute.
+    */
+   P_IMMD(p, NV9097, SET_SHADER_LOCAL_MEMORY_WINDOW, 0xff << 24);
+
+   if (qs->push)
+      nouveau_ws_push_destroy(qs->push);
+   qs->push = p;
+
+   return VK_SUCCESS;
 }
 
 static VkResult
@@ -127,18 +327,17 @@ nvk_queue_init(struct nvk_device *dev, struct nvk_queue *queue,
    if (result != VK_SUCCESS)
       return result;
 
+   nvk_queue_state_init(&queue->state);
+
    return VK_SUCCESS;
 }
 
 static void
 nvk_queue_finish(struct nvk_device *dev, struct nvk_queue *queue)
 {
-   if (queue->state.push)
-      nouveau_ws_push_destroy(queue->state.push);
+   nvk_queue_state_finish(dev, &queue->state);
    if (queue->empty_push)
       nouveau_ws_push_destroy(queue->empty_push);
-   if (queue->state.tls_bo)
-      nouveau_ws_bo_destroy(queue->state.tls_bo);
    vk_queue_finish(&queue->vk);
 }
 
@@ -155,8 +354,8 @@ nvk_queue_submit(struct vk_queue *vkqueue, struct vk_queue_submit *submission)
       P_MTHD(queue->empty_push, NV90B5, NOP);
       P_NV90B5_NOP(queue->empty_push, 0);
    }
-   result = nvk_update_preambles(&queue->state, device, submission->command_buffers,
-                                 submission->command_buffer_count);
+
+   result = nvk_queue_state_update(device, &queue->state);
    if (result != VK_SUCCESS)
       return result;
 
@@ -184,6 +383,8 @@ nvk_queue_submit(struct vk_queue *vkqueue, struct vk_queue_submit *submission)
          nouveau_ws_push_ref(cmd->push, bo_sync->bo, NOUVEAU_WS_BO_RDWR);
       }
 
+      nvk_queue_state_ref(cmd->push, &queue->state);
+
       simple_mtx_lock(&device->memory_objects_lock);
       list_for_each_entry(struct nvk_device_memory, mem,
                           &device->memory_objects, link) {
@@ -267,10 +468,12 @@ nvk_CreateDevice(VkPhysicalDevice physicalDevice,
    if (result != VK_SUCCESS)
       goto fail_images;
 
+   nvk_slm_area_init(&device->slm);
+
    result = nvk_queue_init(device, &device->queue,
                            &pCreateInfo->pQueueCreateInfos[0], 0);
    if (result != VK_SUCCESS)
-      goto fail_samplers;
+      goto fail_slm;
 
    if (pthread_mutex_init(&device->mutex, NULL) != 0) {
       result = vk_error(device, VK_ERROR_INITIALIZATION_FAILED);
@@ -312,7 +515,8 @@ fail_mutex:
    pthread_mutex_destroy(&device->mutex);
 fail_queue:
    nvk_queue_finish(device, &device->queue);
-fail_samplers:
+fail_slm:
+   nvk_slm_area_finish(&device->slm);
    nvk_descriptor_table_finish(device, &device->samplers);
 fail_images:
    nvk_descriptor_table_finish(device, &device->images);
@@ -340,6 +544,7 @@ nvk_DestroyDevice(VkDevice _device, const VkAllocationCallbacks *pAllocator)
    pthread_mutex_destroy(&device->mutex);
    nvk_queue_finish(device, &device->queue);
    vk_device_finish(&device->vk);
+   nvk_slm_area_finish(&device->slm);
    nvk_descriptor_table_finish(device, &device->samplers);
    nvk_descriptor_table_finish(device, &device->images);
    assert(list_is_empty(&device->memory_objects));
@@ -347,3 +552,10 @@ nvk_DestroyDevice(VkDevice _device, const VkAllocationCallbacks *pAllocator)
    nouveau_ws_context_destroy(device->ctx);
    vk_free(&device->vk.alloc, device);
 }
+
+VkResult
+nvk_device_ensure_slm(struct nvk_device *dev,
+                      uint32_t bytes_per_thread)
+{
+   return nvk_slm_area_ensure(dev, &dev->slm, bytes_per_thread);
+}
diff --git a/src/nouveau/vulkan/nvk_device.h b/src/nouveau/vulkan/nvk_device.h
index 01b085e..144969a 100644
 struct novueau_ws_context;
 struct nvk_physical_device;
 
-struct nvk_queue_alloc_info {
-   uint64_t tls_size;
+struct nvk_slm_area {
+   simple_mtx_t mutex;
+   struct nouveau_ws_bo *bo;
+   uint32_t bytes_per_warp;
+   uint32_t bytes_per_mp;
 };
 
 struct nvk_queue_state {
-   struct nvk_queue_alloc_info alloc_info;
-   struct nouveau_ws_bo *tls_bo;
+   struct {
+      struct nouveau_ws_bo *bo;
+      uint32_t alloc_count;
+   } images;
+
+   struct {
+      struct nouveau_ws_bo *bo;
+      uint32_t alloc_count;
+   } samplers;
+
+   struct {
+      struct nouveau_ws_bo *bo;
+      uint32_t bytes_per_warp;
+      uint32_t bytes_per_mp;
+   } slm;
 
    struct nouveau_ws_push *push;
 };
@@ -41,6 +57,7 @@ struct nvk_device {
 
    struct nvk_descriptor_table images;
    struct nvk_descriptor_table samplers;
+   struct nvk_slm_area slm;
 
    struct nvk_queue queue;
 
@@ -52,6 +69,9 @@ struct nvk_device {
 
 VK_DEFINE_HANDLE_CASTS(nvk_device, vk.base, VkDevice, VK_OBJECT_TYPE_DEVICE)
 
+VkResult nvk_device_ensure_slm(struct nvk_device *dev,
+                               uint32_t bytes_per_thread);
+
 static struct nvk_physical_device *
 nvk_device_physical(struct nvk_device *device)
 {
diff --git a/src/nouveau/vulkan/nvk_shader.c b/src/nouveau/vulkan/nvk_shader.c
index f6ce2ba..3ea63ff 100644
@@ -652,7 +652,7 @@ nvk_compile_nir(struct nvk_physical_device *device, nir_shader *nir,
       assert(info_out.bin.tlsSpace < (1 << 24));
       shader->hdr[0] |= 1 << 26;
       shader->hdr[1] |= align(info_out.bin.tlsSpace, 0x10); /* l[] size */
-      shader->need_tls = true;
+      shader->slm_size = info_out.bin.tlsSpace;
    }
 
    if (info_out.io.globalAccess)
diff --git a/src/nouveau/vulkan/nvk_shader.h b/src/nouveau/vulkan/nvk_shader.h
index 023e592..e6fbfde 100644
@@ -22,9 +22,9 @@ struct nvk_shader {
    uint8_t *code_ptr;
    uint32_t code_size;
 
-   bool need_tls;
    uint8_t num_gprs;
    uint8_t num_barriers;
+   uint32_t slm_size;
 
    uint32_t hdr[NVC0_MAX_SHADER_HEADER_SIZE/4];
    uint32_t flags[2];
@@ -53,7 +53,6 @@ struct nvk_shader {
    } fs;
 
    struct {
-      uint32_t lmem_size; /* local memory (TGSI PRIVATE resource) size */
       uint32_t smem_size; /* shared memory (TGSI LOCAL resource) size */
       uint32_t block_size[3];
    } cp;