nvk: add basic nve4+ compute support.
author Dave Airlie <airlied@redhat.com>
Tue, 14 Jun 2022 00:27:57 +0000 (10:27 +1000)
committer Marge Bot <emma+marge@anholt.net>
Fri, 4 Aug 2023 21:31:54 +0000 (21:31 +0000)
This can do most basic compute execution, except indirect dispatch
(vkCmdDispatchIndirect is not implemented yet).

computeheadless demo runs, and some CTS tests pass.

Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/24326>

src/nouveau/vulkan/meson.build
src/nouveau/vulkan/nvk_cmd_buffer.c
src/nouveau/vulkan/nvk_cmd_buffer.h
src/nouveau/vulkan/nvk_cmd_dispatch.c [new file with mode: 0644]
src/nouveau/vulkan/nvk_compute_pipeline.c [new file with mode: 0644]
src/nouveau/vulkan/nvk_physical_device.c
src/nouveau/vulkan/nvk_pipeline.c
src/nouveau/vulkan/nvk_pipeline.h

index 4b4e085..971adc5 100644 (file)
@@ -8,6 +8,8 @@ nvk_files = files(
   'nvk_cmd_buffer.c',
   'nvk_cmd_buffer.h',
   'nvk_cmd_copy.c',
+  'nvk_cmd_dispatch.c',
+  'nvk_compute_pipeline.c',
   'nvk_descriptor_set.h',
   'nvk_descriptor_set.c',
   'nvk_descriptor_set_layout.c',
index 134f65c..c81f7ec 100644 (file)
@@ -2,9 +2,13 @@
 
 #include "nvk_descriptor_set.h"
 #include "nvk_device.h"
+#include "nvk_pipeline.h"
 #include "nvk_physical_device.h"
 
 #include "nouveau_push.h"
+#include "nouveau_context.h"
+
+#include "nouveau/nouveau.h"
 
 #include "nvk_cla0c0.h"
 
@@ -290,30 +294,6 @@ nvk_ResetCommandBuffer(VkCommandBuffer commandBuffer,
    return nvk_reset_cmd_buffer(cmd_buffer);
 }
 
-static uint64_t
-calc_tls_size(struct nvk_device *device,
-              uint32_t lpos, uint32_t lneg, uint32_t cstack)
-{
-   uint64_t size = (lpos + lneg) * 32 + cstack;
-
-   assert (size < (1 << 20));
-
-   size *= 64; /* max warps */
-   size  = align(size, 0x8000);
-   size *= device->pdev->dev->mp_count;
-
-   size = align(size, 1 << 17);
-   return size;
-}
-
-static void
-nve4_begin_compute(struct nvk_cmd_buffer *cmd)
-{
-   struct nvk_device *dev = (struct nvk_device *)cmd->vk.base.device;
-
-   cmd->tls_space_needed = calc_tls_size(dev, 128 * 16, 0, 0x200);
-}
-
 VKAPI_ATTR VkResult VKAPI_CALL
 nvk_BeginCommandBuffer(VkCommandBuffer commandBuffer,
                        const VkCommandBufferBeginInfo *pBeginInfo)
@@ -327,12 +307,7 @@ nvk_BeginCommandBuffer(VkCommandBuffer commandBuffer,
    else
       cmd->reset_on_submit = false;
 
-   struct nvk_device *dev = (struct nvk_device *)cmd->vk.base.device;
-   struct nvk_physical_device *pdev = dev->pdev;
-
-   /* this could be made optional for non-compute cmdbuffers */
-   if (pdev->dev->cls >= 0xa0)
-      nve4_begin_compute(cmd);
+   nvk_cmd_buffer_begin_compute(cmd, pBeginInfo);
 
    return VK_SUCCESS;
 }
@@ -349,6 +324,26 @@ nvk_CmdPipelineBarrier2(VkCommandBuffer commandBuffer,
                         const VkDependencyInfo *pDependencyInfo)
 { }
 
+VKAPI_ATTR void VKAPI_CALL
+nvk_CmdBindPipeline(VkCommandBuffer commandBuffer,
+                    VkPipelineBindPoint pipelineBindPoint,
+                    VkPipeline _pipeline)
+{
+   VK_FROM_HANDLE(nvk_cmd_buffer, cmd, commandBuffer);
+   VK_FROM_HANDLE(nvk_pipeline, pipeline, _pipeline);
+
+   switch (pipelineBindPoint) {
+   case VK_PIPELINE_BIND_POINT_COMPUTE:
+      assert(pipeline->type == NVK_PIPELINE_COMPUTE);
+      nouveau_ws_push_ref(cmd->push,
+                          pipeline->shaders[MESA_SHADER_COMPUTE].bo,
+                          NOUVEAU_WS_BO_RD);
+      cmd->state.cs.pipeline = (struct nvk_compute_pipeline *)pipeline;
+      break;
+   default:
+      unreachable("Unhandled bind point");
+   }
+}
 
 VKAPI_ATTR void VKAPI_CALL
 nvk_CmdBindDescriptorSets(VkCommandBuffer commandBuffer,
index 181146c..433ed8e 100644 (file)
@@ -3,6 +3,8 @@
 
 #include "nvk_private.h"
 
+#include "nouveau_push.h"
+
 #include "vulkan/runtime/vk_command_buffer.h"
 #include "vulkan/runtime/vk_command_pool.h"
 
@@ -21,6 +23,14 @@ VK_DEFINE_NONDISP_HANDLE_CASTS(nvk_cmd_pool, vk.base, VkCommandPool,
 
 /** Root descriptor table.  This gets pushed to the GPU directly */
 struct nvk_root_descriptor_table {
+   union {
+      struct {
+         uint32_t block_size[3];
+         uint32_t grid_size[3];
+         uint32_t _pad[2];
+      } cs;
+   };
+
    /* Client push constants */
    uint8_t push[128];
 
@@ -37,6 +47,7 @@ struct nvk_descriptor_state {
 };
 
 struct nvk_compute_state {
+   struct nvk_compute_pipeline *pipeline;
    struct nvk_descriptor_state descriptors;
 };
 
@@ -68,6 +79,9 @@ struct nvk_cmd_buffer {
 };
 
 VkResult nvk_reset_cmd_buffer(struct nvk_cmd_buffer *cmd_buffer);
+void nvk_cmd_buffer_begin_compute(struct nvk_cmd_buffer *cmd,
+                                  const VkCommandBufferBeginInfo *pBeginInfo);
+
 
 VK_DEFINE_HANDLE_CASTS(nvk_cmd_buffer, vk.base, VkCommandBuffer,
                        VK_OBJECT_TYPE_COMMAND_BUFFER)
@@ -87,4 +101,5 @@ nvk_get_descriptors_state(struct nvk_cmd_buffer *cmd,
 bool
 nvk_cmd_buffer_upload_alloc(struct nvk_cmd_buffer *cmd_buffer, unsigned size,
                             uint64_t *addr, void **ptr);
+
 #endif
diff --git a/src/nouveau/vulkan/nvk_cmd_dispatch.c b/src/nouveau/vulkan/nvk_cmd_dispatch.c
new file mode 100644 (file)
index 0000000..dd26db3
--- /dev/null
@@ -0,0 +1,132 @@
+#include "nvk_cmd_buffer.h"
+#include "nvk_descriptor_set.h"
+#include "nvk_device.h"
+#include "nvk_physical_device.h"
+#include "nvk_pipeline.h"
+
+#include "classes/cla0b5.h"
+
+#include "nvk_cla0c0.h"
+#include "cla1c0.h"
+#include "nvk_clc3c0.h"
+
+#include "drf.h"
+#include "cla0c0qmd.h"
+#include "clc0c0qmd.h"
+#include "clc3c0qmd.h"
+
+#define NVA0C0_QMDV00_06_VAL_SET(p,a...) NVVAL_MW_SET((p), NVA0C0, QMDV00_06, ##a)
+#define NVA0C0_QMDV00_06_DEF_SET(p,a...) NVDEF_MW_SET((p), NVA0C0, QMDV00_06, ##a)
+#define NVC0C0_QMDV02_01_VAL_SET(p,a...) NVVAL_MW_SET((p), NVC0C0, QMDV02_01, ##a)
+#define NVC0C0_QMDV02_01_DEF_SET(p,a...) NVDEF_MW_SET((p), NVC0C0, QMDV02_01, ##a)
+#define NVC3C0_QMDV02_02_VAL_SET(p,a...) NVVAL_MW_SET((p), NVC3C0, QMDV02_02, ##a)
+#define NVC3C0_QMDV02_02_DEF_SET(p,a...) NVDEF_MW_SET((p), NVC3C0, QMDV02_02, ##a)
+
+static uint64_t
+calc_tls_size(struct nvk_device *device,
+              uint32_t lpos, uint32_t lneg, uint32_t cstack)
+{
+   uint64_t size = (lpos + lneg) * 32 + cstack;
+
+   assert (size < (1 << 20));
+
+   size *= 64; /* max warps */
+   size  = align(size, 0x8000);
+   size *= device->pdev->dev->mp_count;
+
+   size = align(size, 1 << 17);
+   return size;
+}
+
+void
+nvk_cmd_buffer_begin_compute(struct nvk_cmd_buffer *cmd,
+                             const VkCommandBufferBeginInfo *pBeginInfo)
+{
+   struct nvk_device *dev = (struct nvk_device *)cmd->vk.base.device;
+   struct nvk_physical_device *pdev = dev->pdev;
+
+   if (pdev->dev->cls < 0xa0)
+      return;
+
+   cmd->tls_space_needed = calc_tls_size(dev, 128 * 16, 0, 0x200);
+}
+
+static void
+gv100_compute_setup_launch_desc(uint32_t *qmd,
+                                uint32_t x, uint32_t y, uint32_t z)
+{
+   NVC3C0_QMDV02_02_VAL_SET(qmd, CTA_RASTER_WIDTH, x);
+   NVC3C0_QMDV02_02_VAL_SET(qmd, CTA_RASTER_HEIGHT, y);
+   NVC3C0_QMDV02_02_VAL_SET(qmd, CTA_RASTER_DEPTH, z);
+}
+
+static inline void
+gp100_cp_launch_desc_set_cb(uint32_t *qmd, unsigned index,
+                            uint32_t size, uint64_t address)
+{
+   NVC0C0_QMDV02_01_VAL_SET(qmd, CONSTANT_BUFFER_ADDR_LOWER, index, address);
+   NVC0C0_QMDV02_01_VAL_SET(qmd, CONSTANT_BUFFER_ADDR_UPPER, index, address >> 32);
+   NVC0C0_QMDV02_01_VAL_SET(qmd, CONSTANT_BUFFER_SIZE_SHIFTED4, index,
+                                 DIV_ROUND_UP(size, 16));
+   NVC0C0_QMDV02_01_DEF_SET(qmd, CONSTANT_BUFFER_VALID, index, TRUE);
+}
+
+VKAPI_ATTR void VKAPI_CALL
+nvk_CmdDispatch(VkCommandBuffer commandBuffer,
+                uint32_t groupCountX,
+                uint32_t groupCountY,
+                uint32_t groupCountZ)
+{
+   VK_FROM_HANDLE(nvk_cmd_buffer, cmd, commandBuffer);
+   const struct nvk_compute_pipeline *pipeline = cmd->state.cs.pipeline;
+   const struct nvk_shader *shader =
+      &pipeline->base.shaders[MESA_SHADER_COMPUTE];
+   struct nvk_descriptor_state *desc = &cmd->state.cs.descriptors;
+
+   desc->root.cs.block_size[0] = shader->cp.block_size[0];
+   desc->root.cs.block_size[1] = shader->cp.block_size[1];
+   desc->root.cs.block_size[2] = shader->cp.block_size[2];
+   desc->root.cs.grid_size[0] = groupCountX;
+   desc->root.cs.grid_size[1] = groupCountY;
+   desc->root.cs.grid_size[2] = groupCountZ;
+
+   uint32_t root_table_size = sizeof(desc->root);
+   void *root_table_map;
+   uint64_t root_table_addr;
+   if (!nvk_cmd_buffer_upload_alloc(cmd, root_table_size, &root_table_addr,
+                                    &root_table_map))
+      return; /* TODO: Error */
+
+   P_MTHD(cmd->push, NVA0C0, OFFSET_OUT_UPPER);
+   P_NVA0C0_OFFSET_OUT_UPPER(cmd->push, root_table_addr >> 32);
+   P_NVA0C0_OFFSET_OUT(cmd->push, root_table_addr & 0xffffffff);
+   P_MTHD(cmd->push, NVA0C0, LINE_LENGTH_IN);
+   P_NVA0C0_LINE_LENGTH_IN(cmd->push, root_table_size);
+   P_NVA0C0_LINE_COUNT(cmd->push, 0x1);
+
+   P_1INC(cmd->push, NVA0C0, LAUNCH_DMA);
+   P_NVA0C0_LAUNCH_DMA(cmd->push,
+                       { .dst_memory_layout = DST_MEMORY_LAYOUT_PITCH,
+                         .sysmembar_disable = SYSMEMBAR_DISABLE_TRUE });
+   P_INLINE_ARRAY(cmd->push, (uint32_t *)&desc->root, root_table_size / 4);
+
+   uint32_t *qmd;
+   uint64_t qmd_addr;
+   if (!nvk_cmd_buffer_upload_alloc(cmd, 512, &qmd_addr, (void **)&qmd))
+      return; /* TODO: Error */
+
+   memcpy(qmd, pipeline->qmd_template, 256);
+   gv100_compute_setup_launch_desc(qmd, groupCountX, groupCountY, groupCountZ);
+
+   gp100_cp_launch_desc_set_cb(qmd, 0, 256, root_table_addr);
+   gp100_cp_launch_desc_set_cb(qmd, 1, 256, root_table_addr);
+
+   P_MTHD(cmd->push, NVA0C0, INVALIDATE_SHADER_CACHES_NO_WFI);
+   P_NVA0C0_INVALIDATE_SHADER_CACHES_NO_WFI(cmd->push, { .constant = CONSTANT_TRUE });
+
+   P_MTHD(cmd->push, NVA0C0, SEND_PCAS_A);
+   P_NVA0C0_SEND_PCAS_A(cmd->push, qmd_addr >> 8);
+   P_IMMD(cmd->push, NVA0C0, SEND_SIGNALING_PCAS_B,
+          { .invalidate = INVALIDATE_TRUE,
+            .schedule = SCHEDULE_TRUE });
+}
diff --git a/src/nouveau/vulkan/nvk_compute_pipeline.c b/src/nouveau/vulkan/nvk_compute_pipeline.c
new file mode 100644 (file)
index 0000000..ff5991d
--- /dev/null
@@ -0,0 +1,109 @@
+#include "nvk_private.h"
+#include "nvk_device.h"
+#include "nvk_pipeline.h"
+#include "nvk_pipeline_layout.h"
+#include "nvk_shader.h"
+
+#include "nouveau_bo.h"
+#include "vk_shader_module.h"
+
+#include "drf.h"
+#include "cla0c0qmd.h"
+#include "clc0c0qmd.h"
+#include "clc3c0qmd.h"
+#define NVA0C0_QMDV00_06_VAL_SET(p,a...) NVVAL_MW_SET((p), NVA0C0, QMDV00_06, ##a)
+#define NVA0C0_QMDV00_06_DEF_SET(p,a...) NVDEF_MW_SET((p), NVA0C0, QMDV00_06, ##a)
+#define NVC0C0_QMDV02_01_VAL_SET(p,a...) NVVAL_MW_SET((p), NVC0C0, QMDV02_01, ##a)
+#define NVC0C0_QMDV02_01_DEF_SET(p,a...) NVDEF_MW_SET((p), NVC0C0, QMDV02_01, ##a)
+#define NVC3C0_QMDV02_02_VAL_SET(p,a...) NVVAL_MW_SET((p), NVC3C0, QMDV02_02, ##a)
+#define NVC3C0_QMDV02_02_DEF_SET(p,a...) NVDEF_MW_SET((p), NVC3C0, QMDV02_02, ##a)
+
+static int
+gv100_sm_config_smem_size(uint32_t size)
+{
+   if      (size > 64 * 1024) size = 96 * 1024;
+   else if (size > 32 * 1024) size = 64 * 1024;
+   else if (size > 16 * 1024) size = 32 * 1024;
+   else if (size >  8 * 1024) size = 16 * 1024;
+   else                       size =  8 * 1024;
+   return (size / 4096) + 1;
+}
+
+static void
+gv100_compute_setup_launch_desc_template(uint32_t *qmd,
+                                         struct nvk_shader *shader)
+{
+   NVC3C0_QMDV02_02_VAL_SET(qmd, SM_GLOBAL_CACHING_ENABLE, 1);
+   NVC3C0_QMDV02_02_DEF_SET(qmd, API_VISIBLE_CALL_LIMIT, NO_CHECK);
+   NVC3C0_QMDV02_02_DEF_SET(qmd, SAMPLER_INDEX, INDEPENDENTLY);
+   NVC3C0_QMDV02_02_VAL_SET(qmd, SHARED_MEMORY_SIZE,
+                            align(shader->cp.smem_size, 0x100));
+   NVC3C0_QMDV02_02_VAL_SET(qmd, SHADER_LOCAL_MEMORY_LOW_SIZE,
+                            (shader->hdr[1] & 0xfffff0) +
+                            align(shader->cp.lmem_size, 0x10));
+   NVC3C0_QMDV02_02_VAL_SET(qmd, SHADER_LOCAL_MEMORY_HIGH_SIZE, 0);
+   NVC3C0_QMDV02_02_VAL_SET(qmd, MIN_SM_CONFIG_SHARED_MEM_SIZE,
+                            gv100_sm_config_smem_size(8 * 1024));
+   NVC3C0_QMDV02_02_VAL_SET(qmd, MAX_SM_CONFIG_SHARED_MEM_SIZE,
+                            gv100_sm_config_smem_size(96 * 1024));
+   NVC3C0_QMDV02_02_VAL_SET(qmd, QMD_VERSION, 2);
+   NVC3C0_QMDV02_02_VAL_SET(qmd, QMD_MAJOR_VERSION, 2);
+   NVC3C0_QMDV02_02_VAL_SET(qmd, TARGET_SM_CONFIG_SHARED_MEM_SIZE,
+                            gv100_sm_config_smem_size(shader->cp.smem_size));
+
+   NVC3C0_QMDV02_02_VAL_SET(qmd, CTA_THREAD_DIMENSION0, shader->cp.block_size[0]);
+   NVC3C0_QMDV02_02_VAL_SET(qmd, CTA_THREAD_DIMENSION1, shader->cp.block_size[1]);
+   NVC3C0_QMDV02_02_VAL_SET(qmd, CTA_THREAD_DIMENSION2, shader->cp.block_size[2]);
+   NVC3C0_QMDV02_02_VAL_SET(qmd, REGISTER_COUNT_V, shader->num_gprs);
+   NVC3C0_QMDV02_02_VAL_SET(qmd, BARRIER_COUNT, shader->num_barriers);
+
+   uint64_t entry = shader->bo->offset;
+   NVC3C0_QMDV02_02_VAL_SET(qmd, PROGRAM_ADDRESS_LOWER, entry & 0xffffffff);
+   NVC3C0_QMDV02_02_VAL_SET(qmd, PROGRAM_ADDRESS_UPPER, entry >> 32);
+
+}
+
+VkResult
+nvk_compute_pipeline_create(struct nvk_device *device,
+                            struct vk_pipeline_cache *cache,
+                            const VkComputePipelineCreateInfo *pCreateInfo,
+                            const VkAllocationCallbacks *pAllocator,
+                            VkPipeline *pPipeline)
+{
+   VK_FROM_HANDLE(nvk_pipeline_layout, pipeline_layout, pCreateInfo->layout);
+   struct nvk_physical_device *pdevice = nvk_device_physical(device);
+   struct nvk_compute_pipeline *pipeline;
+   VkResult result;
+
+   pipeline = vk_object_zalloc(&device->vk, pAllocator, sizeof(*pipeline),
+                               VK_OBJECT_TYPE_PIPELINE);
+   if (pipeline == NULL)
+      return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
+
+   pipeline->base.type = NVK_PIPELINE_COMPUTE;
+
+   assert(pCreateInfo->stage.stage == VK_SHADER_STAGE_COMPUTE_BIT);
+   VK_FROM_HANDLE(vk_shader_module, module, pCreateInfo->stage.module);
+
+   nir_shader *nir;
+   result = nvk_shader_compile_to_nir(device, module,
+                                      pCreateInfo->stage.pName,
+                                      MESA_SHADER_COMPUTE,
+                                      pCreateInfo->stage.pSpecializationInfo,
+                                      pipeline_layout, &nir);
+   if (result != VK_SUCCESS)
+      goto fail;
+
+   result = nvk_compile_nir(pdevice, nir, &pipeline->base.shaders[MESA_SHADER_COMPUTE]);
+   if (result != VK_SUCCESS)
+      goto fail;
+
+   nvk_shader_upload(pdevice, &pipeline->base.shaders[MESA_SHADER_COMPUTE]);
+   gv100_compute_setup_launch_desc_template(pipeline->qmd_template, &pipeline->base.shaders[MESA_SHADER_COMPUTE]);
+   *pPipeline = nvk_pipeline_to_handle(&pipeline->base);
+   return VK_SUCCESS;
+
+fail:
+   vk_object_free(&device->vk, pAllocator, pipeline);
+   return result;
+}
index b285d4c..311b7e7 100644 (file)
@@ -5,12 +5,27 @@
 #include "nvk_format.h"
 #include "nvk_image.h"
 #include "nvk_instance.h"
+#include "nvk_shader.h"
 #include "nvk_wsi.h"
 
 #include "vulkan/runtime/vk_device.h"
 #include "vulkan/util/vk_enum_defines.h"
 #include "vulkan/wsi/wsi_common.h"
 
+#include "vulkan/util/vk_enum_defines.h"
+
+#include "cl90c0.h"
+#include "cl91c0.h"
+#include "cla0c0.h"
+#include "cla1c0.h"
+#include "clb0c0.h"
+#include "clb1c0.h"
+#include "clc0c0.h"
+#include "clc1c0.h"
+#include "clc3c0.h"
+#include "clc5c0.h"
+
+
 VKAPI_ATTR void VKAPI_CALL
 nvk_GetPhysicalDeviceFeatures2(VkPhysicalDevice physicalDevice,
    VkPhysicalDeviceFeatures2 *pFeatures)
index 6e7db2c..c2ec25f 100644 (file)
@@ -24,16 +24,6 @@ nvk_graphics_pipeline_create(struct nvk_device *device,
    unreachable("Graphics pipelines not yet implemented");
 }
 
-static VkResult
-nvk_compute_pipeline_create(struct nvk_device *device,
-                            struct vk_pipeline_cache *cache,
-                            const VkComputePipelineCreateInfo *pCreateInfo,
-                            const VkAllocationCallbacks *pAllocator,
-                            VkPipeline *pPipeline)
-{
-   unreachable("Compute pipelines not yet implemented");
-}
-
 VKAPI_ATTR VkResult VKAPI_CALL
 nvk_CreateGraphicsPipelines(VkDevice _device,
                             VkPipelineCache pipelineCache,
index 50da9be..f7e0aa2 100644 (file)
@@ -4,6 +4,8 @@
 #include "nvk_shader.h"
 #include "vk_object.h"
 
+struct vk_pipeline_cache;
+
 enum nvk_pipeline_type {
    NVK_PIPELINE_GRAPHICS,
    NVK_PIPELINE_COMPUTE,
@@ -19,4 +21,18 @@ struct nvk_pipeline {
 
 VK_DEFINE_NONDISP_HANDLE_CASTS(nvk_pipeline, base, VkPipeline,
                                VK_OBJECT_TYPE_PIPELINE)
+
+struct nvk_compute_pipeline {
+   struct nvk_pipeline base;
+
+   uint32_t qmd_template[64];
+};
+
+VkResult
+nvk_compute_pipeline_create(struct nvk_device *device,
+                            struct vk_pipeline_cache *cache,
+                            const VkComputePipelineCreateInfo *pCreateInfo,
+                            const VkAllocationCallbacks *pAllocator,
+                            VkPipeline *pPipeline);
+
 #endif