#include "vulkan/wsi/wsi_common.h"
+#include "nvk_cl9097.h"
#include "nvk_cl90b5.h"
#include "nvk_cla0c0.h"
#include "cla1c0.h"
#include "nvk_clc3c0.h"
+static void
+nvk_slm_area_init(struct nvk_slm_area *area)
+{
+ memset(area, 0, sizeof(*area));
+ simple_mtx_init(&area->mutex, mtx_plain);
+}
+
+static void
+nvk_slm_area_finish(struct nvk_slm_area *area)
+{
+ simple_mtx_destroy(&area->mutex);
+ if (area->bo)
+ nouveau_ws_bo_destroy(area->bo);
+}
+
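+/* Returns a reference to the current SLM BO (or NULL), together with the
+ * sizes for which it was allocated.  All three values are read under the
+ * area mutex so they are consistent with one another.  The caller owns the
+ * returned reference and must drop it with nouveau_ws_bo_destroy().
+ */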
+static struct nouveau_ws_bo *
+nvk_slm_area_get_bo_ref(struct nvk_slm_area *area,
+ uint32_t *bytes_per_warp_out,
+ uint32_t *bytes_per_mp_out)
+{
+ simple_mtx_lock(&area->mutex);
+ struct nouveau_ws_bo *bo = area->bo;
+ if (bo)
+ nouveau_ws_bo_ref(bo);
+ *bytes_per_warp_out = area->bytes_per_warp;
+ *bytes_per_mp_out = area->bytes_per_mp;
+ simple_mtx_unlock(&area->mutex);
+
+ return bo;
+}
+
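+/* Grows the SLM area so it can back at least bytes_per_thread of shader
+ * local memory per thread.  This uses double-checked locking: the common
+ * "already big enough" case is decided without taking the mutex, the new
+ * BO is allocated outside the lock, and whichever thread loses the race
+ * simply drops its BO again.
+ */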
static VkResult
-nvk_update_preamble_push(struct nvk_queue_state *qs, struct nvk_device *dev,
- const struct nvk_queue_alloc_info *needs)
+nvk_slm_area_ensure(struct nvk_device *dev,
+ struct nvk_slm_area *area,
+ uint32_t bytes_per_thread)
{
- struct nouveau_ws_bo *tls_bo = qs->tls_bo;
- VkResult result;
- if (needs->tls_size > qs->alloc_info.tls_size) {
- tls_bo = nouveau_ws_bo_new(dev->pdev->dev,
- needs->tls_size, (1 << 17), NOUVEAU_WS_BO_LOCAL);
- if (!tls_bo) {
- result = VK_ERROR_OUT_OF_DEVICE_MEMORY;
- goto fail;
- }
- }
+ assert(bytes_per_thread < (1 << 24));
+
+ /* TODO: Volta+ doesn't use a CRS (call/return stack) */
+ const uint32_t crs_size = 0;
+
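+ /* A warp is 32 threads, each of which gets bytes_per_thread of local
+ * memory, followed by the call/return stack, if any.
+ */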
+ uint64_t bytes_per_warp = bytes_per_thread * 32 + crs_size;
+
+ /* The hardware seems to require this alignment for
+ * NV9097_SET_SHADER_LOCAL_MEMORY_E_DEFAULT_SIZE_PER_WARP
+ */
+ bytes_per_warp = ALIGN(bytes_per_warp, 0x200);
+
+ uint64_t bytes_per_mp = bytes_per_warp * 64; /* max warps */
+
+ /* The hardware seems to require this alignment for
+ * NVA0C0_SET_SHADER_LOCAL_MEMORY_NON_THROTTLED_A_SIZE_LOWER.
+ *
+ * Fortunately, this is just the alignment for bytes_per_warp multiplied
+ * by the number of warps, 64. It might matter for real on a GPU with 48
+ * warps but we don't support any of those yet.
+ */
+ assert(bytes_per_mp == ALIGN(bytes_per_mp, 0x8000));
+
+ /* nvk_slm_area::bytes_per_mp only ever increases so we can check this
+ * outside the lock and exit early in the common case. We only need to
+ * take the lock if we're actually going to resize.
+ *
+ * Also, we only care about bytes_per_mp and not bytes_per_warp because
+ * bytes_per_mp is an integer multiple of bytes_per_warp.
+ */
+ if (likely(bytes_per_mp <= area->bytes_per_mp))
+ return VK_SUCCESS;
- if (tls_bo != qs->tls_bo) {
- if (qs->tls_bo)
- nouveau_ws_bo_destroy(qs->tls_bo);
- qs->tls_bo = tls_bo;
- }
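+ /* The BO is carved into one region of bytes_per_mp per multiprocessor */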
+ uint64_t size = bytes_per_mp * dev->pdev->dev->mp_count;
- struct nouveau_ws_push *push = nouveau_ws_push_new(dev->pdev->dev, 256);
-
- nouveau_ws_push_ref(push, qs->tls_bo, NOUVEAU_WS_BO_RDWR);
- P_MTHD(push, NVA0C0, SET_SHADER_LOCAL_MEMORY_A);
- P_NVA0C0_SET_SHADER_LOCAL_MEMORY_A(push, qs->tls_bo->offset >> 32);
- P_NVA0C0_SET_SHADER_LOCAL_MEMORY_B(push, qs->tls_bo->offset & 0xffffffff);
-
- nvk_push_descriptor_table_ref(push, &dev->samplers);
- uint64_t tsp_addr = nvk_descriptor_table_base_address(&dev->samplers);
- P_MTHD(push, NVA0C0, SET_TEX_SAMPLER_POOL_A);
- P_NVA0C0_SET_TEX_SAMPLER_POOL_A(push, tsp_addr >> 32);
- P_NVA0C0_SET_TEX_SAMPLER_POOL_B(push, tsp_addr & 0xffffffff);
- P_NVA0C0_SET_TEX_SAMPLER_POOL_C(push, dev->samplers.alloc - 1);
-
- nvk_push_descriptor_table_ref(push, &dev->images);
- uint64_t thp_addr = nvk_descriptor_table_base_address(&dev->images);
- P_MTHD(push, NVA0C0, SET_TEX_HEADER_POOL_A);
- P_NVA0C0_SET_TEX_HEADER_POOL_A(push, thp_addr >> 32);
- P_NVA0C0_SET_TEX_HEADER_POOL_B(push, thp_addr & 0xffffffff);
- P_NVA0C0_SET_TEX_HEADER_POOL_C(push, dev->images.alloc - 1);
-
- uint64_t temp_size = qs->tls_bo->size / dev->pdev->dev->mp_count;
- P_MTHD(push, NVA0C0, SET_SHADER_LOCAL_MEMORY_NON_THROTTLED_A);
- P_NVA0C0_SET_SHADER_LOCAL_MEMORY_NON_THROTTLED_A(push, temp_size >> 32);
- P_NVA0C0_SET_SHADER_LOCAL_MEMORY_NON_THROTTLED_B(push, temp_size & ~0x7fff);
- P_NVA0C0_SET_SHADER_LOCAL_MEMORY_NON_THROTTLED_C(push, 0xff);
-
- if (dev->ctx->compute.cls < 0xc3c0) {
- P_MTHD(push, NVA0C0, SET_SHADER_LOCAL_MEMORY_THROTTLED_A);
- P_NVA0C0_SET_SHADER_LOCAL_MEMORY_THROTTLED_A(push, temp_size >> 32);
- P_NVA0C0_SET_SHADER_LOCAL_MEMORY_THROTTLED_B(push, temp_size & ~0x7fff);
- P_NVA0C0_SET_SHADER_LOCAL_MEMORY_THROTTLED_C(push, 0xff);
-
- P_MTHD(push, NVA0C0, SET_SHADER_LOCAL_MEMORY_WINDOW);
- P_NVA0C0_SET_SHADER_LOCAL_MEMORY_WINDOW(push, 0xff << 24);
-
- P_MTHD(push, NVA0C0, SET_SHADER_SHARED_MEMORY_WINDOW);
- P_NVA0C0_SET_SHADER_SHARED_MEMORY_WINDOW(push, 0xfe << 24);
+ struct nouveau_ws_bo *bo =
+ nouveau_ws_bo_new(dev->pdev->dev, size, 0, NOUVEAU_WS_BO_LOCAL);
+ if (bo == NULL)
+ return vk_error(dev, VK_ERROR_OUT_OF_DEVICE_MEMORY);
- // TODO CODE_ADDRESS_HIGH
+ struct nouveau_ws_bo *unref_bo;
+ simple_mtx_lock(&area->mutex);
+ if (bytes_per_mp <= area->bytes_per_mp) {
+ /* We lost the race, throw away our BO */
+ assert(area->bytes_per_warp == bytes_per_warp);
+ unref_bo = bo;
} else {
- uint64_t temp = 0xfeULL << 24;
+ unref_bo = area->bo;
+ area->bo = bo;
+ area->bytes_per_warp = bytes_per_warp;
+ area->bytes_per_mp = bytes_per_mp;
+ }
+ simple_mtx_unlock(&area->mutex);
- P_MTHD(push, NVC3C0, SET_SHADER_SHARED_MEMORY_WINDOW_A);
- P_NVC3C0_SET_SHADER_SHARED_MEMORY_WINDOW_A(push, temp >> 32);
- P_NVC3C0_SET_SHADER_SHARED_MEMORY_WINDOW_B(push, temp & 0xffffffff);
+ if (unref_bo)
+ nouveau_ws_bo_destroy(unref_bo);
- temp = 0xffULL << 24;
- P_MTHD(push, NVC3C0, SET_SHADER_LOCAL_MEMORY_WINDOW_A);
- P_NVC3C0_SET_SHADER_LOCAL_MEMORY_WINDOW_A(push, temp >> 32);
- P_NVC3C0_SET_SHADER_LOCAL_MEMORY_WINDOW_B(push, temp & 0xffffffff);
- }
+ return VK_SUCCESS;
+}
- P_MTHD(push, NVA0C0, SET_SPA_VERSION);
- P_NVA0C0_SET_SPA_VERSION(push, { .major = dev->ctx->compute.cls >= 0xa1c0 ? 0x4 : 0x3 });
+
+static void
+nvk_queue_state_init(struct nvk_queue_state *qs)
+{
+ memset(qs, 0, sizeof(*qs));
+}
+
+static void
+nvk_queue_state_finish(struct nvk_device *dev,
+ struct nvk_queue_state *qs)
+{
+ if (qs->images.bo)
+ nouveau_ws_bo_destroy(qs->images.bo);
+ if (qs->samplers.bo)
+ nouveau_ws_bo_destroy(qs->samplers.bo);
+ if (qs->slm.bo)
+ nouveau_ws_bo_destroy(qs->slm.bo);
if (qs->push)
nouveau_ws_push_destroy(qs->push);
- qs->push = push;
- return 0;
- fail:
- return vk_error(qs, result);
+}
+
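+/* Add the queue state's BOs to a push so they are referenced by the
+ * submission and stay live for as long as it executes.
+ */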
+static void
+nvk_queue_state_ref(struct nouveau_ws_push *push,
+ struct nvk_queue_state *qs)
+{
+ if (qs->images.bo)
+ nouveau_ws_push_ref(push, qs->images.bo, NOUVEAU_WS_BO_RD);
+ if (qs->samplers.bo)
+ nouveau_ws_push_ref(push, qs->samplers.bo, NOUVEAU_WS_BO_RD);
+ if (qs->slm.bo)
+ nouveau_ws_push_ref(push, qs->slm.bo, NOUVEAU_WS_BO_RDWR);
}
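+/* Refresh the queue state's references to the device's image and sampler
+ * descriptor tables and the SLM area.  If nothing has changed, this is a
+ * no-op; otherwise, the preamble push that binds them is rebuilt.
+ */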
static VkResult
-nvk_update_preambles(struct nvk_queue_state *qs, struct nvk_device *device,
- struct vk_command_buffer *const *cmd_buffers, uint32_t cmd_buffer_count)
+nvk_queue_state_update(struct nvk_device *dev,
+ struct nvk_queue_state *qs)
{
- struct nvk_queue_alloc_info needs = { 0 };
- for (uint32_t i = 0; i < cmd_buffer_count; i++) {
- struct nvk_cmd_buffer *cmd = (struct nvk_cmd_buffer *)cmd_buffers[i];
- needs.tls_size = MAX2(needs.tls_size, cmd->tls_space_needed);
+ struct nouveau_ws_bo *bo;
+ uint32_t alloc_count, bytes_per_warp, bytes_per_mp;
+ bool dirty = false;
+
+ bo = nvk_descriptor_table_get_bo_ref(&dev->images, &alloc_count);
+ if (qs->images.bo != bo || qs->images.alloc_count != alloc_count) {
+ if (qs->images.bo)
+ nouveau_ws_bo_destroy(qs->images.bo);
+ qs->images.bo = bo;
+ qs->images.alloc_count = alloc_count;
+ dirty = true;
+ } else {
+ /* No change; drop the extra reference we just took */
+ if (bo)
+ nouveau_ws_bo_destroy(bo);
}
- if (needs.tls_size == qs->alloc_info.tls_size)
+ bo = nvk_descriptor_table_get_bo_ref(&dev->samplers, &alloc_count);
+ if (qs->samplers.bo != bo || qs->samplers.alloc_count != alloc_count) {
+ if (qs->samplers.bo)
+ nouveau_ws_bo_destroy(qs->samplers.bo);
+ qs->samplers.bo = bo;
+ qs->samplers.alloc_count = alloc_count;
+ dirty = true;
+ } else {
+ /* No change; drop the extra reference we just took */
+ if (bo)
+ nouveau_ws_bo_destroy(bo);
+ }
+
+ bo = nvk_slm_area_get_bo_ref(&dev->slm, &bytes_per_warp, &bytes_per_mp);
+ if (qs->slm.bo != bo || qs->slm.bytes_per_warp != bytes_per_warp ||
+ qs->slm.bytes_per_mp != bytes_per_mp) {
+ if (qs->slm.bo)
+ nouveau_ws_bo_destroy(qs->slm.bo);
+ qs->slm.bo = bo;
+ qs->slm.bytes_per_warp = bytes_per_warp;
+ qs->slm.bytes_per_mp = bytes_per_mp;
+ dirty = true;
+ } else {
+ /* No change; drop the extra reference we just took */
+ if (bo)
+ nouveau_ws_bo_destroy(bo);
+ }
+
+ /* TODO: We're currently depending on kernel reference counting to protect
+ * us here. If we ever stop reference counting in the kernel, we will
+ * either need to delay destruction or hold on to our extra BO references
+ * and insert a GPU stall here if anything has changed before dropping our
+ * old references.
+ */
+
+ if (!dirty)
return VK_SUCCESS;
- return nvk_update_preamble_push(qs, device, &needs);
+
+ struct nouveau_ws_push *p = nouveau_ws_push_new(dev->pdev->dev, 256);
+ if (p == NULL)
+ return vk_error(dev, VK_ERROR_OUT_OF_DEVICE_MEMORY);
+
+ if (qs->images.bo) {
+ nouveau_ws_push_ref(p, qs->images.bo, NOUVEAU_WS_BO_RD);
+
+ /* Compute */
+ P_MTHD(p, NVA0C0, SET_TEX_HEADER_POOL_A);
+ P_NVA0C0_SET_TEX_HEADER_POOL_A(p, qs->images.bo->offset >> 32);
+ P_NVA0C0_SET_TEX_HEADER_POOL_B(p, qs->images.bo->offset);
+ P_NVA0C0_SET_TEX_HEADER_POOL_C(p, qs->images.alloc_count - 1);
+
+ /* 3D */
+ P_MTHD(p, NV9097, SET_TEX_HEADER_POOL_A);
+ P_NV9097_SET_TEX_HEADER_POOL_A(p, qs->images.bo->offset >> 32);
+ P_NV9097_SET_TEX_HEADER_POOL_B(p, qs->images.bo->offset);
+ P_NV9097_SET_TEX_HEADER_POOL_C(p, qs->images.alloc_count - 1);
+ }
+
+ if (qs->samplers.bo) {
+ nouveau_ws_push_ref(p, qs->samplers.bo, NOUVEAU_WS_BO_RD);
+
+ /* Compute */
+ P_MTHD(p, NVA0C0, SET_TEX_SAMPLER_POOL_A);
+ P_NVA0C0_SET_TEX_SAMPLER_POOL_A(p, qs->samplers.bo->offset >> 32);
+ P_NVA0C0_SET_TEX_SAMPLER_POOL_B(p, qs->samplers.bo->offset);
+ P_NVA0C0_SET_TEX_SAMPLER_POOL_C(p, qs->samplers.alloc_count - 1);
+
+ /* 3D */
+ P_MTHD(p, NV9097, SET_TEX_SAMPLER_POOL_A);
+ P_NV9097_SET_TEX_SAMPLER_POOL_A(p, qs->samplers.bo->offset >> 32);
+ P_NV9097_SET_TEX_SAMPLER_POOL_B(p, qs->samplers.bo->offset);
+ P_NV9097_SET_TEX_SAMPLER_POOL_C(p, qs->samplers.alloc_count - 1);
+ }
+
+ if (qs->slm.bo) {
+ nouveau_ws_push_ref(p, qs->slm.bo, NOUVEAU_WS_BO_RDWR);
+ const uint64_t slm_addr = qs->slm.bo->offset;
+ const uint64_t slm_size = qs->slm.bo->size;
+ const uint64_t slm_per_warp = qs->slm.bytes_per_warp;
+ const uint64_t slm_per_mp = qs->slm.bytes_per_mp;
+ assert(!(slm_per_mp & 0x7fff));
+
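+ /* Compute takes the per-MP size while 3D takes the total size along
+ * with the per-warp size.
+ */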
+ /* Compute */
+ P_MTHD(p, NVA0C0, SET_SHADER_LOCAL_MEMORY_A);
+ P_NVA0C0_SET_SHADER_LOCAL_MEMORY_A(p, slm_addr >> 32);
+ P_NVA0C0_SET_SHADER_LOCAL_MEMORY_B(p, slm_addr);
+
+ P_MTHD(p, NVA0C0, SET_SHADER_LOCAL_MEMORY_NON_THROTTLED_A);
+ P_NVA0C0_SET_SHADER_LOCAL_MEMORY_NON_THROTTLED_A(p, slm_per_mp >> 32);
+ P_NVA0C0_SET_SHADER_LOCAL_MEMORY_NON_THROTTLED_B(p, slm_per_mp);
+ P_NVA0C0_SET_SHADER_LOCAL_MEMORY_NON_THROTTLED_C(p, 0xff);
+
+ if (dev->ctx->compute.cls < VOLTA_COMPUTE_A) {
+ P_MTHD(p, NVA0C0, SET_SHADER_LOCAL_MEMORY_THROTTLED_A);
+ P_NVA0C0_SET_SHADER_LOCAL_MEMORY_THROTTLED_A(p, slm_per_mp >> 32);
+ P_NVA0C0_SET_SHADER_LOCAL_MEMORY_THROTTLED_B(p, slm_per_mp);
+ P_NVA0C0_SET_SHADER_LOCAL_MEMORY_THROTTLED_C(p, 0xff);
+ }
+
+ /* 3D */
+ P_MTHD(p, NV9097, SET_SHADER_LOCAL_MEMORY_A);
+ P_NV9097_SET_SHADER_LOCAL_MEMORY_A(p, slm_addr >> 32);
+ P_NV9097_SET_SHADER_LOCAL_MEMORY_B(p, slm_addr);
+ P_NV9097_SET_SHADER_LOCAL_MEMORY_C(p, slm_size >> 32);
+ P_NV9097_SET_SHADER_LOCAL_MEMORY_D(p, slm_size);
+ P_NV9097_SET_SHADER_LOCAL_MEMORY_E(p, slm_per_warp);
+ }
+
+ /* We set memory windows unconditionally. Otherwise, the memory window
+ * might be in a random place and cause us to fault off into nowhere.
+ */
+ if (dev->ctx->compute.cls >= VOLTA_COMPUTE_A) {
+ uint64_t temp = 0xfeULL << 24;
+ P_MTHD(p, NVC3C0, SET_SHADER_SHARED_MEMORY_WINDOW_A);
+ P_NVC3C0_SET_SHADER_SHARED_MEMORY_WINDOW_A(p, temp >> 32);
+ P_NVC3C0_SET_SHADER_SHARED_MEMORY_WINDOW_B(p, temp & 0xffffffff);
+
+ temp = 0xffULL << 24;
+ P_MTHD(p, NVC3C0, SET_SHADER_LOCAL_MEMORY_WINDOW_A);
+ P_NVC3C0_SET_SHADER_LOCAL_MEMORY_WINDOW_A(p, temp >> 32);
+ P_NVC3C0_SET_SHADER_LOCAL_MEMORY_WINDOW_B(p, temp & 0xffffffff);
+ } else {
+ P_MTHD(p, NVA0C0, SET_SHADER_LOCAL_MEMORY_WINDOW);
+ P_NVA0C0_SET_SHADER_LOCAL_MEMORY_WINDOW(p, 0xff << 24);
+
+ P_MTHD(p, NVA0C0, SET_SHADER_SHARED_MEMORY_WINDOW);
+ P_NVA0C0_SET_SHADER_SHARED_MEMORY_WINDOW(p, 0xfe << 24);
+
+ // TODO CODE_ADDRESS_HIGH
+ }
+
+ /* From nvc0_screen.c:
+ *
+ * "Reduce likelihood of collision with real buffers by placing the
+ * hole at the top of the 4G area. This will have to be dealt with
+ * for real eventually by blocking off that area from the VM."
+ *
+ * Really?!? TODO: Fix this for realz. Annoyingly, we only have a
+ * 32-bit pointer for this in 3D rather than a full 48 like we have for
+ * compute.
+ */
+ P_IMMD(p, NV9097, SET_SHADER_LOCAL_MEMORY_WINDOW, 0xff << 24);
+
+ if (qs->push)
+ nouveau_ws_push_destroy(qs->push);
+ qs->push = p;
+
+ return VK_SUCCESS;
}
static VkResult
if (result != VK_SUCCESS)
return result;
+ nvk_queue_state_init(&queue->state);
+
return VK_SUCCESS;
}
static void
nvk_queue_finish(struct nvk_device *dev, struct nvk_queue *queue)
{
- if (queue->state.push)
- nouveau_ws_push_destroy(queue->state.push);
+ nvk_queue_state_finish(dev, &queue->state);
if (queue->empty_push)
nouveau_ws_push_destroy(queue->empty_push);
- if (queue->state.tls_bo)
- nouveau_ws_bo_destroy(queue->state.tls_bo);
vk_queue_finish(&queue->vk);
}
P_MTHD(queue->empty_push, NV90B5, NOP);
P_NV90B5_NOP(queue->empty_push, 0);
}
- result = nvk_update_preambles(&queue->state, device, submission->command_buffers,
- submission->command_buffer_count);
+
+ result = nvk_queue_state_update(device, &queue->state);
if (result != VK_SUCCESS)
return result;
nouveau_ws_push_ref(cmd->push, bo_sync->bo, NOUVEAU_WS_BO_RDWR);
}
+ nvk_queue_state_ref(cmd->push, &queue->state);
+
simple_mtx_lock(&device->memory_objects_lock);
list_for_each_entry(struct nvk_device_memory, mem,
&device->memory_objects, link) {
if (result != VK_SUCCESS)
goto fail_images;
+ nvk_slm_area_init(&device->slm);
+
result = nvk_queue_init(device, &device->queue,
&pCreateInfo->pQueueCreateInfos[0], 0);
if (result != VK_SUCCESS)
- goto fail_samplers;
+ goto fail_slm;
if (pthread_mutex_init(&device->mutex, NULL) != 0) {
result = vk_error(device, VK_ERROR_INITIALIZATION_FAILED);
pthread_mutex_destroy(&device->mutex);
fail_queue:
nvk_queue_finish(device, &device->queue);
-fail_samplers:
+fail_slm:
+ nvk_slm_area_finish(&device->slm);
nvk_descriptor_table_finish(device, &device->samplers);
fail_images:
nvk_descriptor_table_finish(device, &device->images);
pthread_mutex_destroy(&device->mutex);
nvk_queue_finish(device, &device->queue);
vk_device_finish(&device->vk);
+ nvk_slm_area_finish(&device->slm);
nvk_descriptor_table_finish(device, &device->samplers);
nvk_descriptor_table_finish(device, &device->images);
assert(list_is_empty(&device->memory_objects));
nouveau_ws_context_destroy(device->ctx);
vk_free(&device->vk.alloc, device);
}
+
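+/* Device-level entry point so callers elsewhere in the driver can grow the
+ * SLM area without knowing where it lives; queues pick up a new BO the next
+ * time their state is updated.
+ */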
+VkResult
+nvk_device_ensure_slm(struct nvk_device *dev,
+ uint32_t bytes_per_thread)
+{
+ return nvk_slm_area_ensure(dev, &dev->slm, bytes_per_thread);
+}