From: Dave Airlie Date: Tue, 14 Jun 2022 00:27:57 +0000 (+1000) Subject: nvk: add basic nve4+ compute support. X-Git-Tag: upstream/23.3.3~4529 X-Git-Url: http://review.tizen.org/git/?a=commitdiff_plain;h=a0d64f2770b725dc0664bed60439609793d69317;p=platform%2Fupstream%2Fmesa.git nvk: add basic nve4+ compute support. This can do most basic compute execution, except indirect compute. The headless demo runs, and some CTS tests pass. Part-of: --- diff --git a/src/nouveau/vulkan/meson.build b/src/nouveau/vulkan/meson.build index 4b4e085..971adc5 100644 --- a/src/nouveau/vulkan/meson.build +++ b/src/nouveau/vulkan/meson.build @@ -8,6 +8,8 @@ nvk_files = files( 'nvk_cmd_buffer.c', 'nvk_cmd_buffer.h', 'nvk_cmd_copy.c', + 'nvk_cmd_dispatch.c', + 'nvk_compute_pipeline.c', 'nvk_descriptor_set.h', 'nvk_descriptor_set.c', 'nvk_descriptor_set_layout.c', diff --git a/src/nouveau/vulkan/nvk_cmd_buffer.c b/src/nouveau/vulkan/nvk_cmd_buffer.c index 134f65c..c81f7ec 100644 --- a/src/nouveau/vulkan/nvk_cmd_buffer.c +++ b/src/nouveau/vulkan/nvk_cmd_buffer.c @@ -2,9 +2,13 @@ #include "nvk_descriptor_set.h" #include "nvk_device.h" +#include "nvk_pipeline.h" #include "nvk_physical_device.h" #include "nouveau_push.h" +#include "nouveau_context.h" + +#include "nouveau/nouveau.h" #include "nvk_cla0c0.h" @@ -290,30 +294,6 @@ nvk_ResetCommandBuffer(VkCommandBuffer commandBuffer, return nvk_reset_cmd_buffer(cmd_buffer); } -static uint64_t -calc_tls_size(struct nvk_device *device, - uint32_t lpos, uint32_t lneg, uint32_t cstack) -{ - uint64_t size = (lpos + lneg) * 32 + cstack; - - assert (size < (1 << 20)); - - size *= 64; /* max warps */ - size = align(size, 0x8000); - size *= device->pdev->dev->mp_count; - - size = align(size, 1 << 17); - return size; -} - -static void -nve4_begin_compute(struct nvk_cmd_buffer *cmd) -{ - struct nvk_device *dev = (struct nvk_device *)cmd->vk.base.device; - - cmd->tls_space_needed = calc_tls_size(dev, 128 * 16, 0, 0x200); -} - VKAPI_ATTR VkResult VKAPI_CALL 
nvk_BeginCommandBuffer(VkCommandBuffer commandBuffer, const VkCommandBufferBeginInfo *pBeginInfo) @@ -327,12 +307,7 @@ nvk_BeginCommandBuffer(VkCommandBuffer commandBuffer, else cmd->reset_on_submit = false; - struct nvk_device *dev = (struct nvk_device *)cmd->vk.base.device; - struct nvk_physical_device *pdev = dev->pdev; - - /* this could be made optional for non-compute cmdbuffers */ - if (pdev->dev->cls >= 0xa0) - nve4_begin_compute(cmd); + nvk_cmd_buffer_begin_compute(cmd, pBeginInfo); return VK_SUCCESS; } @@ -349,6 +324,26 @@ nvk_CmdPipelineBarrier2(VkCommandBuffer commandBuffer, const VkDependencyInfo *pDependencyInfo) { } +VKAPI_ATTR void VKAPI_CALL +nvk_CmdBindPipeline(VkCommandBuffer commandBuffer, + VkPipelineBindPoint pipelineBindPoint, + VkPipeline _pipeline) +{ + VK_FROM_HANDLE(nvk_cmd_buffer, cmd, commandBuffer); + VK_FROM_HANDLE(nvk_pipeline, pipeline, _pipeline); + + switch (pipelineBindPoint) { + case VK_PIPELINE_BIND_POINT_COMPUTE: + assert(pipeline->type == NVK_PIPELINE_COMPUTE); + nouveau_ws_push_ref(cmd->push, + pipeline->shaders[MESA_SHADER_COMPUTE].bo, + NOUVEAU_WS_BO_RD); + cmd->state.cs.pipeline = (struct nvk_compute_pipeline *)pipeline; + break; + default: + unreachable("Unhandled bind point"); + } +} VKAPI_ATTR void VKAPI_CALL nvk_CmdBindDescriptorSets(VkCommandBuffer commandBuffer, diff --git a/src/nouveau/vulkan/nvk_cmd_buffer.h b/src/nouveau/vulkan/nvk_cmd_buffer.h index 181146c..433ed8e 100644 --- a/src/nouveau/vulkan/nvk_cmd_buffer.h +++ b/src/nouveau/vulkan/nvk_cmd_buffer.h @@ -3,6 +3,8 @@ #include "nvk_private.h" +#include "nouveau_push.h" + #include "vulkan/runtime/vk_command_buffer.h" #include "vulkan/runtime/vk_command_pool.h" @@ -21,6 +23,14 @@ VK_DEFINE_NONDISP_HANDLE_CASTS(nvk_cmd_pool, vk.base, VkCommandPool, /** Root descriptor table. 
This gets pushed to the GPU directly */ struct nvk_root_descriptor_table { + union { + struct { + uint32_t block_size[3]; + uint32_t grid_size[3]; + uint32_t _pad[2]; + } cs; + }; + /* Client push constants */ uint8_t push[128]; @@ -37,6 +47,7 @@ struct nvk_descriptor_state { }; struct nvk_compute_state { + struct nvk_compute_pipeline *pipeline; struct nvk_descriptor_state descriptors; }; @@ -68,6 +79,9 @@ struct nvk_cmd_buffer { }; VkResult nvk_reset_cmd_buffer(struct nvk_cmd_buffer *cmd_buffer); +void nvk_cmd_buffer_begin_compute(struct nvk_cmd_buffer *cmd, + const VkCommandBufferBeginInfo *pBeginInfo); + VK_DEFINE_HANDLE_CASTS(nvk_cmd_buffer, vk.base, VkCommandBuffer, VK_OBJECT_TYPE_COMMAND_BUFFER) @@ -87,4 +101,5 @@ nvk_get_descriptors_state(struct nvk_cmd_buffer *cmd, bool nvk_cmd_buffer_upload_alloc(struct nvk_cmd_buffer *cmd_buffer, unsigned size, uint64_t *addr, void **ptr); + #endif diff --git a/src/nouveau/vulkan/nvk_cmd_dispatch.c b/src/nouveau/vulkan/nvk_cmd_dispatch.c new file mode 100644 index 0000000..dd26db3 --- /dev/null +++ b/src/nouveau/vulkan/nvk_cmd_dispatch.c @@ -0,0 +1,132 @@ +#include "nvk_cmd_buffer.h" +#include "nvk_descriptor_set.h" +#include "nvk_device.h" +#include "nvk_physical_device.h" +#include "nvk_pipeline.h" + +#include "classes/cla0b5.h" + +#include "nvk_cla0c0.h" +#include "cla1c0.h" +#include "nvk_clc3c0.h" + +#include "drf.h" +#include "cla0c0qmd.h" +#include "clc0c0qmd.h" +#include "clc3c0qmd.h" + +#define NVA0C0_QMDV00_06_VAL_SET(p,a...) NVVAL_MW_SET((p), NVA0C0, QMDV00_06, ##a) +#define NVA0C0_QMDV00_06_DEF_SET(p,a...) NVDEF_MW_SET((p), NVA0C0, QMDV00_06, ##a) +#define NVC0C0_QMDV02_01_VAL_SET(p,a...) NVVAL_MW_SET((p), NVC0C0, QMDV02_01, ##a) +#define NVC0C0_QMDV02_01_DEF_SET(p,a...) NVDEF_MW_SET((p), NVC0C0, QMDV02_01, ##a) +#define NVC3C0_QMDV02_02_VAL_SET(p,a...) NVVAL_MW_SET((p), NVC3C0, QMDV02_02, ##a) +#define NVC3C0_QMDV02_02_DEF_SET(p,a...) 
NVDEF_MW_SET((p), NVC3C0, QMDV02_02, ##a) + +static uint64_t +calc_tls_size(struct nvk_device *device, + uint32_t lpos, uint32_t lneg, uint32_t cstack) +{ + uint64_t size = (lpos + lneg) * 32 + cstack; + + assert (size < (1 << 20)); + + size *= 64; /* max warps */ + size = align(size, 0x8000); + size *= device->pdev->dev->mp_count; + + size = align(size, 1 << 17); + return size; +} + +void +nvk_cmd_buffer_begin_compute(struct nvk_cmd_buffer *cmd, + const VkCommandBufferBeginInfo *pBeginInfo) +{ + struct nvk_device *dev = (struct nvk_device *)cmd->vk.base.device; + struct nvk_physical_device *pdev = dev->pdev; + + if (pdev->dev->cls < 0xa0) + return; + + cmd->tls_space_needed = calc_tls_size(dev, 128 * 16, 0, 0x200); +} + +static void +gv100_compute_setup_launch_desc(uint32_t *qmd, + uint32_t x, uint32_t y, uint32_t z) +{ + NVC3C0_QMDV02_02_VAL_SET(qmd, CTA_RASTER_WIDTH, x); + NVC3C0_QMDV02_02_VAL_SET(qmd, CTA_RASTER_HEIGHT, y); + NVC3C0_QMDV02_02_VAL_SET(qmd, CTA_RASTER_DEPTH, z); +} + +static inline void +gp100_cp_launch_desc_set_cb(uint32_t *qmd, unsigned index, + uint32_t size, uint64_t address) +{ + NVC0C0_QMDV02_01_VAL_SET(qmd, CONSTANT_BUFFER_ADDR_LOWER, index, address); + NVC0C0_QMDV02_01_VAL_SET(qmd, CONSTANT_BUFFER_ADDR_UPPER, index, address >> 32); + NVC0C0_QMDV02_01_VAL_SET(qmd, CONSTANT_BUFFER_SIZE_SHIFTED4, index, + DIV_ROUND_UP(size, 16)); + NVC0C0_QMDV02_01_DEF_SET(qmd, CONSTANT_BUFFER_VALID, index, TRUE); +} + +VKAPI_ATTR void VKAPI_CALL +nvk_CmdDispatch(VkCommandBuffer commandBuffer, + uint32_t groupCountX, + uint32_t groupCountY, + uint32_t groupCountZ) +{ + VK_FROM_HANDLE(nvk_cmd_buffer, cmd, commandBuffer); + const struct nvk_compute_pipeline *pipeline = cmd->state.cs.pipeline; + const struct nvk_shader *shader = + &pipeline->base.shaders[MESA_SHADER_COMPUTE]; + struct nvk_descriptor_state *desc = &cmd->state.cs.descriptors; + + desc->root.cs.block_size[0] = shader->cp.block_size[0]; + desc->root.cs.block_size[1] = shader->cp.block_size[1]; + 
desc->root.cs.block_size[2] = shader->cp.block_size[2]; + desc->root.cs.grid_size[0] = groupCountX; + desc->root.cs.grid_size[1] = groupCountY; + desc->root.cs.grid_size[2] = groupCountZ; + + uint32_t root_table_size = sizeof(desc->root); + void *root_table_map; + uint64_t root_table_addr; + if (!nvk_cmd_buffer_upload_alloc(cmd, root_table_size, &root_table_addr, + &root_table_map)) + return; /* TODO: Error */ + + P_MTHD(cmd->push, NVA0C0, OFFSET_OUT_UPPER); + P_NVA0C0_OFFSET_OUT_UPPER(cmd->push, root_table_addr >> 32); + P_NVA0C0_OFFSET_OUT(cmd->push, root_table_addr & 0xffffffff); + P_MTHD(cmd->push, NVA0C0, LINE_LENGTH_IN); + P_NVA0C0_LINE_LENGTH_IN(cmd->push, root_table_size); + P_NVA0C0_LINE_COUNT(cmd->push, 0x1); + + P_1INC(cmd->push, NVA0C0, LAUNCH_DMA); + P_NVA0C0_LAUNCH_DMA(cmd->push, + { .dst_memory_layout = DST_MEMORY_LAYOUT_PITCH, + .sysmembar_disable = SYSMEMBAR_DISABLE_TRUE }); + P_INLINE_ARRAY(cmd->push, (uint32_t *)&desc->root, root_table_size / 4); + + uint32_t *qmd; + uint64_t qmd_addr; + if (!nvk_cmd_buffer_upload_alloc(cmd, 512, &qmd_addr, (void **)&qmd)) + return; /* TODO: Error */ + + memcpy(qmd, pipeline->qmd_template, 256); + gv100_compute_setup_launch_desc(qmd, groupCountX, groupCountY, groupCountZ); + + gp100_cp_launch_desc_set_cb(qmd, 0, 256, root_table_addr); + gp100_cp_launch_desc_set_cb(qmd, 1, 256, root_table_addr); + + P_MTHD(cmd->push, NVA0C0, INVALIDATE_SHADER_CACHES_NO_WFI); + P_NVA0C0_INVALIDATE_SHADER_CACHES_NO_WFI(cmd->push, { .constant = CONSTANT_TRUE }); + + P_MTHD(cmd->push, NVA0C0, SEND_PCAS_A); + P_NVA0C0_SEND_PCAS_A(cmd->push, qmd_addr >> 8); + P_IMMD(cmd->push, NVA0C0, SEND_SIGNALING_PCAS_B, + { .invalidate = INVALIDATE_TRUE, + .schedule = SCHEDULE_TRUE }); +} diff --git a/src/nouveau/vulkan/nvk_compute_pipeline.c b/src/nouveau/vulkan/nvk_compute_pipeline.c new file mode 100644 index 0000000..ff5991d --- /dev/null +++ b/src/nouveau/vulkan/nvk_compute_pipeline.c @@ -0,0 +1,109 @@ +#include "nvk_private.h" +#include 
"nvk_device.h" +#include "nvk_pipeline.h" +#include "nvk_pipeline_layout.h" +#include "nvk_shader.h" + +#include "nouveau_bo.h" +#include "vk_shader_module.h" + +#include "drf.h" +#include "cla0c0qmd.h" +#include "clc0c0qmd.h" +#include "clc3c0qmd.h" +#define NVA0C0_QMDV00_06_VAL_SET(p,a...) NVVAL_MW_SET((p), NVA0C0, QMDV00_06, ##a) +#define NVA0C0_QMDV00_06_DEF_SET(p,a...) NVDEF_MW_SET((p), NVA0C0, QMDV00_06, ##a) +#define NVC0C0_QMDV02_01_VAL_SET(p,a...) NVVAL_MW_SET((p), NVC0C0, QMDV02_01, ##a) +#define NVC0C0_QMDV02_01_DEF_SET(p,a...) NVDEF_MW_SET((p), NVC0C0, QMDV02_01, ##a) +#define NVC3C0_QMDV02_02_VAL_SET(p,a...) NVVAL_MW_SET((p), NVC3C0, QMDV02_02, ##a) +#define NVC3C0_QMDV02_02_DEF_SET(p,a...) NVDEF_MW_SET((p), NVC3C0, QMDV02_02, ##a) + +static int +gv100_sm_config_smem_size(uint32_t size) +{ + if (size > 64 * 1024) size = 96 * 1024; + else if (size > 32 * 1024) size = 64 * 1024; + else if (size > 16 * 1024) size = 32 * 1024; + else if (size > 8 * 1024) size = 16 * 1024; + else size = 8 * 1024; + return (size / 4096) + 1; +} + +static void +gv100_compute_setup_launch_desc_template(uint32_t *qmd, + struct nvk_shader *shader) +{ + NVC3C0_QMDV02_02_VAL_SET(qmd, SM_GLOBAL_CACHING_ENABLE, 1); + NVC3C0_QMDV02_02_DEF_SET(qmd, API_VISIBLE_CALL_LIMIT, NO_CHECK); + NVC3C0_QMDV02_02_DEF_SET(qmd, SAMPLER_INDEX, INDEPENDENTLY); + NVC3C0_QMDV02_02_VAL_SET(qmd, SHARED_MEMORY_SIZE, + align(shader->cp.smem_size, 0x100)); + NVC3C0_QMDV02_02_VAL_SET(qmd, SHADER_LOCAL_MEMORY_LOW_SIZE, + (shader->hdr[1] & 0xfffff0) + + align(shader->cp.lmem_size, 0x10)); + NVC3C0_QMDV02_02_VAL_SET(qmd, SHADER_LOCAL_MEMORY_HIGH_SIZE, 0); + NVC3C0_QMDV02_02_VAL_SET(qmd, MIN_SM_CONFIG_SHARED_MEM_SIZE, + gv100_sm_config_smem_size(8 * 1024)); + NVC3C0_QMDV02_02_VAL_SET(qmd, MAX_SM_CONFIG_SHARED_MEM_SIZE, + gv100_sm_config_smem_size(96 * 1024)); + NVC3C0_QMDV02_02_VAL_SET(qmd, QMD_VERSION, 2); + NVC3C0_QMDV02_02_VAL_SET(qmd, QMD_MAJOR_VERSION, 2); + NVC3C0_QMDV02_02_VAL_SET(qmd, 
TARGET_SM_CONFIG_SHARED_MEM_SIZE, + gv100_sm_config_smem_size(shader->cp.smem_size)); + + NVC3C0_QMDV02_02_VAL_SET(qmd, CTA_THREAD_DIMENSION0, shader->cp.block_size[0]); + NVC3C0_QMDV02_02_VAL_SET(qmd, CTA_THREAD_DIMENSION1, shader->cp.block_size[1]); + NVC3C0_QMDV02_02_VAL_SET(qmd, CTA_THREAD_DIMENSION2, shader->cp.block_size[2]); + NVC3C0_QMDV02_02_VAL_SET(qmd, REGISTER_COUNT_V, shader->num_gprs); + NVC3C0_QMDV02_02_VAL_SET(qmd, BARRIER_COUNT, shader->num_barriers); + + uint64_t entry = shader->bo->offset; + NVC3C0_QMDV02_02_VAL_SET(qmd, PROGRAM_ADDRESS_LOWER, entry & 0xffffffff); + NVC3C0_QMDV02_02_VAL_SET(qmd, PROGRAM_ADDRESS_UPPER, entry >> 32); + +} + +VkResult +nvk_compute_pipeline_create(struct nvk_device *device, + struct vk_pipeline_cache *cache, + const VkComputePipelineCreateInfo *pCreateInfo, + const VkAllocationCallbacks *pAllocator, + VkPipeline *pPipeline) +{ + VK_FROM_HANDLE(nvk_pipeline_layout, pipeline_layout, pCreateInfo->layout); + struct nvk_physical_device *pdevice = nvk_device_physical(device); + struct nvk_compute_pipeline *pipeline; + VkResult result; + + pipeline = vk_object_zalloc(&device->vk, pAllocator, sizeof(*pipeline), + VK_OBJECT_TYPE_PIPELINE); + if (pipeline == NULL) + return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY); + + pipeline->base.type = NVK_PIPELINE_COMPUTE; + + assert(pCreateInfo->stage.stage == VK_SHADER_STAGE_COMPUTE_BIT); + VK_FROM_HANDLE(vk_shader_module, module, pCreateInfo->stage.module); + + nir_shader *nir; + result = nvk_shader_compile_to_nir(device, module, + pCreateInfo->stage.pName, + MESA_SHADER_COMPUTE, + pCreateInfo->stage.pSpecializationInfo, + pipeline_layout, &nir); + if (result != VK_SUCCESS) + goto fail; + + result = nvk_compile_nir(pdevice, nir, &pipeline->base.shaders[MESA_SHADER_COMPUTE]); + if (result != VK_SUCCESS) + goto fail; + + nvk_shader_upload(pdevice, &pipeline->base.shaders[MESA_SHADER_COMPUTE]); + gv100_compute_setup_launch_desc_template(pipeline->qmd_template, 
&pipeline->base.shaders[MESA_SHADER_COMPUTE]); + *pPipeline = nvk_pipeline_to_handle(&pipeline->base); + return VK_SUCCESS; + +fail: + vk_object_free(&device->vk, pAllocator, pipeline); + return result; +} diff --git a/src/nouveau/vulkan/nvk_physical_device.c b/src/nouveau/vulkan/nvk_physical_device.c index b285d4c..311b7e7 100644 --- a/src/nouveau/vulkan/nvk_physical_device.c +++ b/src/nouveau/vulkan/nvk_physical_device.c @@ -5,12 +5,27 @@ #include "nvk_format.h" #include "nvk_image.h" #include "nvk_instance.h" +#include "nvk_shader.h" #include "nvk_wsi.h" #include "vulkan/runtime/vk_device.h" #include "vulkan/util/vk_enum_defines.h" #include "vulkan/wsi/wsi_common.h" +#include "vulkan/util/vk_enum_defines.h" + +#include "cl90c0.h" +#include "cl91c0.h" +#include "cla0c0.h" +#include "cla1c0.h" +#include "clb0c0.h" +#include "clb1c0.h" +#include "clc0c0.h" +#include "clc1c0.h" +#include "clc3c0.h" +#include "clc5c0.h" + + VKAPI_ATTR void VKAPI_CALL nvk_GetPhysicalDeviceFeatures2(VkPhysicalDevice physicalDevice, VkPhysicalDeviceFeatures2 *pFeatures) diff --git a/src/nouveau/vulkan/nvk_pipeline.c b/src/nouveau/vulkan/nvk_pipeline.c index 6e7db2c..c2ec25f 100644 --- a/src/nouveau/vulkan/nvk_pipeline.c +++ b/src/nouveau/vulkan/nvk_pipeline.c @@ -24,16 +24,6 @@ nvk_graphics_pipeline_create(struct nvk_device *device, unreachable("Graphics pipelines not yet implemented"); } -static VkResult -nvk_compute_pipeline_create(struct nvk_device *device, - struct vk_pipeline_cache *cache, - const VkComputePipelineCreateInfo *pCreateInfo, - const VkAllocationCallbacks *pAllocator, - VkPipeline *pPipeline) -{ - unreachable("Compute pipelines not yet implemented"); -} - VKAPI_ATTR VkResult VKAPI_CALL nvk_CreateGraphicsPipelines(VkDevice _device, VkPipelineCache pipelineCache, diff --git a/src/nouveau/vulkan/nvk_pipeline.h b/src/nouveau/vulkan/nvk_pipeline.h index 50da9be..f7e0aa2 100644 --- a/src/nouveau/vulkan/nvk_pipeline.h +++ b/src/nouveau/vulkan/nvk_pipeline.h @@ -4,6 +4,8 @@ 
#include "nvk_shader.h" #include "vk_object.h" +struct vk_pipeline_cache; + enum nvk_pipeline_type { NVK_PIPELINE_GRAPHICS, NVK_PIPELINE_COMPUTE, @@ -19,4 +21,18 @@ struct nvk_pipeline { VK_DEFINE_NONDISP_HANDLE_CASTS(nvk_pipeline, base, VkPipeline, VK_OBJECT_TYPE_PIPELINE) + +struct nvk_compute_pipeline { + struct nvk_pipeline base; + + uint32_t qmd_template[64]; +}; + +VkResult +nvk_compute_pipeline_create(struct nvk_device *device, + struct vk_pipeline_cache *cache, + const VkComputePipelineCreateInfo *pCreateInfo, + const VkAllocationCallbacks *pAllocator, + VkPipeline *pPipeline); + #endif