#include "nouveau_push.h"
+#include "nvk_cla0c0.h"
+
static void
nvk_destroy_cmd_buffer(struct nvk_cmd_buffer *cmd_buffer)
{
return nvk_reset_cmd_buffer(cmd_buffer);
}
+static uint64_t
+calc_tls_size(struct nvk_device *device,
+ uint32_t lpos, uint32_t lneg, uint32_t cstack)
+{
+ uint64_t size = (lpos + lneg) * 32 + cstack;
+
+ assert (size < (1 << 20));
+
+ size *= 64; /* max warps */
+ size = align(size, 0x8000);
+ size *= device->pdev->dev->mp_count;
+
+ size = align(size, 1 << 17);
+ return size;
+}
+
+static void
+nve4_begin_compute(struct nvk_cmd_buffer *cmd)
+{
+ struct nvk_device *dev = (struct nvk_device *)cmd->vk.base.device;
+
+ cmd->tls_space_needed = calc_tls_size(dev, 128 * 16, 0, 0x200);
+}
+
VKAPI_ATTR VkResult VKAPI_CALL
nvk_BeginCommandBuffer(VkCommandBuffer commandBuffer,
const VkCommandBufferBeginInfo *pBeginInfo)
else
cmd->reset_on_submit = false;
+ struct nvk_device *dev = (struct nvk_device *)cmd->vk.base.device;
+ struct nvk_physical_device *pdev = dev->pdev;
+
+ /* this could be made optional for non-compute cmdbuffers */
+ if (pdev->dev->cls >= 0xa0)
+ nve4_begin_compute(cmd);
+
return VK_SUCCESS;
}
#include "vulkan/wsi/wsi_common.h"
+#include "nvk_cla0c0.h"
+#include "cla1c0.h"
+#include "nvk_clc3c0.h"
+
+static VkResult
+nvk_update_preamble_push(struct nvk_queue_state *qs, struct nvk_device *dev,
+ const struct nvk_queue_alloc_info *needs)
+{
+ struct nouveau_ws_bo *tls_bo = qs->tls_bo;
+ VkResult result;
+ if (needs->tls_size > qs->alloc_info.tls_size) {
+ tls_bo = nouveau_ws_bo_new(dev->pdev->dev,
+ needs->tls_size, (1 << 17), NOUVEAU_WS_BO_LOCAL);
+ if (!tls_bo) {
+ result = VK_ERROR_OUT_OF_DEVICE_MEMORY;
+ goto fail;
+ }
+ }
+
+ if (tls_bo != qs->tls_bo) {
+ if (qs->tls_bo)
+ nouveau_ws_bo_destroy(qs->tls_bo);
+ qs->tls_bo = tls_bo;
+ }
+
+ struct nouveau_ws_push *push = nouveau_ws_push_new(dev->pdev->dev, 256);
+
+ nouveau_ws_push_ref(push, qs->tls_bo, NOUVEAU_WS_BO_RDWR);
+ P_MTHD(push, NVA0C0, SET_SHADER_LOCAL_MEMORY_A);
+ P_NVA0C0_SET_SHADER_LOCAL_MEMORY_A(push, qs->tls_bo->offset >> 32);
+ P_NVA0C0_SET_SHADER_LOCAL_MEMORY_B(push, qs->tls_bo->offset & 0xffffffff);
+
+ uint64_t temp_size = qs->tls_bo->size / dev->pdev->dev->mp_count;
+ P_MTHD(push, NVA0C0, SET_SHADER_LOCAL_MEMORY_NON_THROTTLED_A);
+ P_NVA0C0_SET_SHADER_LOCAL_MEMORY_NON_THROTTLED_A(push, temp_size >> 32);
+ P_NVA0C0_SET_SHADER_LOCAL_MEMORY_NON_THROTTLED_B(push, temp_size & ~0x7fff);
+ P_NVA0C0_SET_SHADER_LOCAL_MEMORY_NON_THROTTLED_C(push, 0xff);
+
+ if (dev->pdev->dev->cls < 0xc3) {
+ P_MTHD(push, NVA0C0, SET_SHADER_LOCAL_MEMORY_THROTTLED_A);
+ P_NVA0C0_SET_SHADER_LOCAL_MEMORY_THROTTLED_A(push, temp_size >> 32);
+ P_NVA0C0_SET_SHADER_LOCAL_MEMORY_THROTTLED_B(push, temp_size & ~0x7fff);
+ P_NVA0C0_SET_SHADER_LOCAL_MEMORY_THROTTLED_C(push, 0xff);
+
+ P_MTHD(push, NVA0C0, SET_SHADER_LOCAL_MEMORY_WINDOW);
+ P_NVA0C0_SET_SHADER_LOCAL_MEMORY_WINDOW(push, 0xff << 24);
+
+ P_MTHD(push, NVA0C0, SET_SHADER_SHARED_MEMORY_WINDOW);
+ P_NVA0C0_SET_SHADER_SHARED_MEMORY_WINDOW(push, 0xfe << 24);
+
+ // TODO CODE_ADDRESS_HIGH
+ } else {
+ uint64_t temp = 0xfeULL << 24;
+
+ P_MTHD(push, NVC3C0, SET_SHADER_SHARED_MEMORY_WINDOW_A);
+ P_NVC3C0_SET_SHADER_SHARED_MEMORY_WINDOW_A(push, temp >> 32);
+ P_NVC3C0_SET_SHADER_SHARED_MEMORY_WINDOW_B(push, temp & 0xffffffff);
+
+ temp = 0xffULL << 24;
+ P_MTHD(push, NVC3C0, SET_SHADER_LOCAL_MEMORY_WINDOW_A);
+ P_NVC3C0_SET_SHADER_LOCAL_MEMORY_WINDOW_A(push, temp >> 32);
+ P_NVC3C0_SET_SHADER_LOCAL_MEMORY_WINDOW_B(push, temp & 0xffffffff);
+ }
+
+ P_MTHD(push, NVA0C0, SET_SPA_VERSION);
+ P_NVA0C0_SET_SPA_VERSION(push, { .major = dev->pdev->dev->cls >= 0xa1 ? 0x4 : 0x3 });
+
+ if (qs->push)
+ nouveau_ws_push_destroy(qs->push);
+ qs->push = push;
+ return 0;
+ fail:
+ return vk_error(qs, result);
+}
+
static VkResult
-nvk_queue_submit(struct vk_queue *queue, struct vk_queue_submit *submission)
+nvk_update_preambles(struct nvk_queue_state *qs, struct nvk_device *device,
+ struct vk_command_buffer *const *cmd_buffers, uint32_t cmd_buffer_count)
{
- struct nvk_device *device = container_of(queue->base.device, struct nvk_device, vk);
+ struct nvk_queue_alloc_info needs = { 0 };
+ for (uint32_t i = 0; i < cmd_buffer_count; i++) {
+ struct nvk_cmd_buffer *cmd = (struct nvk_cmd_buffer *)cmd_buffers[i];
+ needs.tls_size = MAX2(needs.tls_size, cmd->tls_space_needed);
+ }
+
+ if (needs.tls_size == qs->alloc_info.tls_size)
+ return VK_SUCCESS;
+ return nvk_update_preamble_push(qs, device, &needs);
+}
+
+static VkResult
+nvk_queue_submit(struct vk_queue *vkqueue, struct vk_queue_submit *submission)
+{
+ struct nvk_device *device = container_of(vkqueue->base.device, struct nvk_device, vk);
+ struct nvk_queue *queue = container_of(vkqueue, struct nvk_queue, vk);
+ VkResult result;
+
+ result = nvk_update_preambles(&queue->state, device, submission->command_buffers,
+ submission->command_buffer_count);
+ if (result != VK_SUCCESS)
+ return result;
pthread_mutex_lock(&device->mutex);
+
+ if (queue->state.push)
+ nouveau_ws_push_submit(queue->state.push, device->pdev->dev, device->ctx);
for (unsigned i = 0; i < submission->command_buffer_count; i++) {
struct nvk_cmd_buffer *cmd = (struct nvk_cmd_buffer *)submission->command_buffers[i];
goto fail_init;
}
- result = vk_queue_init(&device->queue, &device->vk, &pCreateInfo->pQueueCreateInfos[0], 0);
+ result = vk_queue_init(&device->queue.vk, &device->vk, &pCreateInfo->pQueueCreateInfos[0], 0);
if (result != VK_SUCCESS)
goto fail_ctx;
}
pthread_condattr_destroy(&condattr);
- device->queue.driver_submit = nvk_queue_submit;
+ device->queue.vk.driver_submit = nvk_queue_submit;
device->pdev = physical_device;
fail_mutex:
pthread_mutex_destroy(&device->mutex);
fail_queue:
- vk_queue_finish(&device->queue);
+ vk_queue_finish(&device->queue.vk);
fail_ctx:
nouveau_ws_context_destroy(device->ctx);
fail_init:
if (!device)
return;
+ if (device->queue.state.push)
+ nouveau_ws_push_destroy(device->queue.state.push);
+ if (device->queue.state.tls_bo)
+ nouveau_ws_bo_destroy(device->queue.state.tls_bo);
pthread_cond_destroy(&device->queue_submit);
pthread_mutex_destroy(&device->mutex);
- vk_queue_finish(&device->queue);
+ vk_queue_finish(&device->queue.vk);
vk_device_finish(&device->vk);
nouveau_ws_context_destroy(device->ctx);
vk_free(&device->vk.alloc, device);