nvk: Implement VK_KHR_draw_indirect_count on Turing+
author Thomas H.P. Andersen <phomes@gmail.com>
Thu, 3 Nov 2022 22:30:53 +0000 (23:30 +0100)
committer Marge Bot <emma+marge@anholt.net>
Fri, 4 Aug 2023 21:32:04 +0000 (21:32 +0000)
v2: handle maxDrawCount parameter
v3: assert on pre-Turing. Free registers after use.
v4: less register pressure. Update to pass new tests

Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/24326>

src/nouveau/mme/mme_builder.h
src/nouveau/vulkan/nvk_cmd_draw.c
src/nouveau/vulkan/nvk_mme.c
src/nouveau/vulkan/nvk_mme.h
src/nouveau/vulkan/nvk_physical_device.c

index 9f27e99..bc5ea11 100644 (file)
@@ -111,6 +111,13 @@ mme_free_reg(struct mme_builder *b, struct mme_value val)
 }
 
 static inline void
+mme_free_reg64(struct mme_builder *b, struct mme_value64 val)
+{  /* Release both registers of a 64-bit value back to b->reg_alloc. */
+   mme_reg_alloc_free(&b->reg_alloc, val.lo);   /* the .lo register */
+   mme_reg_alloc_free(&b->reg_alloc, val.hi);   /* the .hi register */
+}
+
+static inline void
 mme_alu_to(struct mme_builder *b,
            struct mme_value dst,
            enum mme_alu_op op,
index 55d7920..0fd57fe 100644 (file)
@@ -1973,3 +1973,155 @@ nvk_CmdDrawIndexedIndirect(VkCommandBuffer commandBuffer,
       }
    }
 }
+/* MME macro builder for indirect draws whose draw count comes from a buffer.
+ *
+ * Emits nothing when the 3D engine class is older than TURING_A.  Otherwise
+ * the macro consumes its parameters in this order: begin word, 64-bit draw
+ * record address, 64-bit count address, maximum draw count, stride.  That
+ * order has to match the inline data pushed by nvk_CmdDrawIndirectCount().
+ */
+void
+nvk_mme_draw_indirect_count(struct mme_builder *b)
+{
+   if (b->devinfo->cls_eng3d < TURING_A)
+      return;
+
+   struct mme_value begin = mme_load(b);
+   struct mme_value64 draw_addr = mme_load_addr64(b);
+   struct mme_value64 draw_count_addr = mme_load_addr64(b);
+   struct mme_value draw_max = mme_load(b);
+   struct mme_value stride = mme_load(b);
+   /* NOTE(review): mme_tu104_read_fifoed() presumably queues a read of one
+    * dword at draw_count_addr so that the mme_load() below returns it --
+    * confirm against the MME builder.  The two address registers are
+    * released right after the read has been issued.
+    */
+   mme_tu104_read_fifoed(b, draw_count_addr, mme_imm(1));
+   mme_free_reg64(b, draw_count_addr);
+   struct mme_value draw_count_buf = mme_load(b);
+   /* When the fetched count satisfies "ule" against draw_max (presumably an
+    * unsigned <= compare) it replaces draw_max, so draw_max ends up holding
+    * the smaller of the two values.
+    */
+   mme_if(b, ule, draw_count_buf, draw_max) {
+      mme_mov_to(b, draw_max, draw_count_buf);
+   }
+   mme_free_reg(b, draw_count_buf);
+   /* Loop while draw < draw_max.  Each iteration requests 4 units at
+    * draw_addr (presumably the 4 dwords of one VkDrawIndirectCommand --
+    * confirm), builds a draw, then advances draw by 1 and draw_addr by
+    * stride.
+    */
+   struct mme_value draw = mme_mov(b, mme_zero());
+   mme_while(b, ult, draw, draw_max) {
+      mme_tu104_read_fifoed(b, draw_addr, mme_imm(4));
+
+      nvk_mme_build_draw(b, begin, draw);
+
+      mme_add_to(b, draw, draw, mme_imm(1));
+      mme_add64_to(b, draw_addr, draw_addr, mme_value64(stride, mme_zero()));
+   }
+}
+/* Command-buffer entry point for count-based indirect draws
+ * (VK_KHR_draw_indirect_count, non-indexed variant).
+ *
+ * After flushing graphics state, a BEGIN word is built from the dynamic
+ * primitive topology and handed to the NVK_MME_DRAW_INDIRECT_COUNT macro
+ * together with the draw buffer address, the count buffer address,
+ * maxDrawCount and stride.  Requires a 3D class of at least TURING_A, which
+ * is only checked by an assert.
+ */
+VKAPI_ATTR void VKAPI_CALL
+nvk_CmdDrawIndirectCount(VkCommandBuffer commandBuffer,
+                         VkBuffer _buffer,
+                         VkDeviceSize offset,
+                         VkBuffer countBuffer,
+                         VkDeviceSize countBufferOffset,
+                         uint32_t maxDrawCount,
+                         uint32_t stride)
+{
+   VK_FROM_HANDLE(nvk_cmd_buffer, cmd, commandBuffer);
+   VK_FROM_HANDLE(nvk_buffer, buffer, _buffer);
+   VK_FROM_HANDLE(nvk_buffer, count_buffer, countBuffer);
+
+   const struct vk_dynamic_graphics_state *dyn =
+      &cmd->vk.dynamic_graphics_state;
+
+   /* TODO: Indirect count draw pre-Turing */
+   assert(nvk_cmd_buffer_3d_cls(cmd) >= TURING_A);
+
+   nvk_flush_gfx_state(cmd);
+   /* Pack the BEGIN value that the macro receives as its first parameter. */
+   uint32_t begin;
+   V_NV9097_BEGIN(begin, {
+      .op = vk_to_nv9097_primitive_topology(dyn->ia.primitive_topology),
+      .primitive_id = NV9097_BEGIN_PRIMITIVE_ID_FIRST,
+      .instance_id = NV9097_BEGIN_INSTANCE_ID_FIRST,
+      .split_mode = SPLIT_MODE_NORMAL_BEGIN_NORMAL_END,
+   });
+   /* The inline data below must stay in the order the macro builder
+    * nvk_mme_draw_indirect_count() loads it: begin, draw address, count
+    * address, maxDrawCount, stride.  Each 64-bit address is pushed with its
+    * upper 32 bits first.
+    */
+   struct nv_push *p = nvk_cmd_buffer_push(cmd, 10);
+   P_IMMD(p, NVC597, SET_MME_DATA_FIFO_CONFIG, FIFO_SIZE_SIZE_4KB);
+   P_1INC(p, NV9097, CALL_MME_MACRO(NVK_MME_DRAW_INDIRECT_COUNT));
+   P_INLINE_DATA(p, begin);
+   uint64_t draw_addr = nvk_buffer_address(buffer, offset);
+   P_INLINE_DATA(p, draw_addr >> 32);
+   P_INLINE_DATA(p, draw_addr);   /* presumably truncated to the low 32 bits -- confirm */
+   uint64_t draw_count_addr = nvk_buffer_address(count_buffer,
+                                                 countBufferOffset);
+   P_INLINE_DATA(p, draw_count_addr >> 32);
+   P_INLINE_DATA(p, draw_count_addr);
+   P_INLINE_DATA(p, maxDrawCount);
+   P_INLINE_DATA(p, stride);
+}
+/* MME macro builder for indexed indirect draws whose draw count comes from a
+ * buffer.
+ *
+ * Emits nothing when the 3D engine class is older than TURING_A.  Otherwise
+ * the macro consumes its parameters in this order: begin word, 64-bit draw
+ * record address, 64-bit count address, maximum draw count, stride.  That
+ * order has to match the inline data pushed by
+ * nvk_CmdDrawIndexedIndirectCount().
+ */
+void
+nvk_mme_draw_indexed_indirect_count(struct mme_builder *b)
+{
+   if (b->devinfo->cls_eng3d < TURING_A)
+      return;
+
+   struct mme_value begin = mme_load(b);
+   struct mme_value64 draw_addr = mme_load_addr64(b);
+   struct mme_value64 draw_count_addr = mme_load_addr64(b);
+   struct mme_value draw_max = mme_load(b);
+   struct mme_value stride = mme_load(b);
+   /* NOTE(review): mme_tu104_read_fifoed() presumably queues a read of one
+    * dword at draw_count_addr so that the mme_load() below returns it --
+    * confirm against the MME builder.  The two address registers are
+    * released right after the read has been issued.
+    */
+   mme_tu104_read_fifoed(b, draw_count_addr, mme_imm(1));
+   mme_free_reg64(b, draw_count_addr);
+   struct mme_value draw_count_buf = mme_load(b);
+   /* When the fetched count satisfies "ule" against draw_max (presumably an
+    * unsigned <= compare) it replaces draw_max, so draw_max ends up holding
+    * the smaller of the two values.
+    */
+   mme_if(b, ule, draw_count_buf, draw_max) {
+      mme_mov_to(b, draw_max, draw_count_buf);
+   }
+   mme_free_reg(b, draw_count_buf);
+   /* Loop while draw < draw_max.  Each iteration requests 5 units at
+    * draw_addr (presumably the 5 dwords of one VkDrawIndexedIndirectCommand
+    * -- confirm), builds an indexed draw, then advances draw by 1 and
+    * draw_addr by stride.
+    */
+   struct mme_value draw = mme_mov(b, mme_zero());
+   mme_while(b, ult, draw, draw_max) {
+      mme_tu104_read_fifoed(b, draw_addr, mme_imm(5));
+
+      nvk_mme_build_draw_indexed(b, begin, draw);
+
+      mme_add_to(b, draw, draw, mme_imm(1));
+      mme_add64_to(b, draw_addr, draw_addr, mme_value64(stride, mme_zero()));
+   }
+}
+/* Command-buffer entry point for count-based indexed indirect draws
+ * (VK_KHR_draw_indirect_count, indexed variant).
+ *
+ * After flushing graphics state, a BEGIN word is built from the dynamic
+ * primitive topology and handed to the NVK_MME_DRAW_INDEXED_INDIRECT_COUNT
+ * macro together with the draw buffer address, the count buffer address,
+ * maxDrawCount and stride.  Requires a 3D class of at least TURING_A, which
+ * is only checked by an assert.
+ */
+VKAPI_ATTR void VKAPI_CALL
+nvk_CmdDrawIndexedIndirectCount(VkCommandBuffer commandBuffer,
+                                VkBuffer _buffer,
+                                VkDeviceSize offset,
+                                VkBuffer countBuffer,
+                                VkDeviceSize countBufferOffset,
+                                uint32_t maxDrawCount,
+                                uint32_t stride)
+{
+   VK_FROM_HANDLE(nvk_cmd_buffer, cmd, commandBuffer);
+   VK_FROM_HANDLE(nvk_buffer, buffer, _buffer);
+   VK_FROM_HANDLE(nvk_buffer, count_buffer, countBuffer);
+
+   const struct vk_dynamic_graphics_state *dyn =
+      &cmd->vk.dynamic_graphics_state;
+
+   /* TODO: Indexed indirect count draw pre-Turing */
+   assert(nvk_cmd_buffer_3d_cls(cmd) >= TURING_A);
+
+   nvk_flush_gfx_state(cmd);
+   /* Pack the BEGIN value that the macro receives as its first parameter. */
+   uint32_t begin;
+   V_NV9097_BEGIN(begin, {
+      .op = vk_to_nv9097_primitive_topology(dyn->ia.primitive_topology),
+      .primitive_id = NV9097_BEGIN_PRIMITIVE_ID_FIRST,
+      .instance_id = NV9097_BEGIN_INSTANCE_ID_FIRST,
+      .split_mode = SPLIT_MODE_NORMAL_BEGIN_NORMAL_END,
+   });
+   /* The inline data below must stay in the order the macro builder
+    * nvk_mme_draw_indexed_indirect_count() loads it: begin, draw address,
+    * count address, maxDrawCount, stride.  Each 64-bit address is pushed
+    * with its upper 32 bits first.
+    */
+   struct nv_push *p = nvk_cmd_buffer_push(cmd, 10);
+   P_IMMD(p, NVC597, SET_MME_DATA_FIFO_CONFIG, FIFO_SIZE_SIZE_4KB);
+   P_1INC(p, NV9097, CALL_MME_MACRO(NVK_MME_DRAW_INDEXED_INDIRECT_COUNT));
+   P_INLINE_DATA(p, begin);
+   uint64_t draw_addr = nvk_buffer_address(buffer, offset);
+   P_INLINE_DATA(p, draw_addr >> 32);
+   P_INLINE_DATA(p, draw_addr);   /* presumably truncated to the low 32 bits -- confirm */
+   uint64_t draw_count_addr = nvk_buffer_address(count_buffer,
+                                                 countBufferOffset);
+   P_INLINE_DATA(p, draw_count_addr >> 32);
+   P_INLINE_DATA(p, draw_count_addr);
+   P_INLINE_DATA(p, maxDrawCount);
+   P_INLINE_DATA(p, stride);
+}
index d96c688..dd24e0e 100644 (file)
@@ -3,16 +3,18 @@
 #include "nvk_private.h"
 
 static const nvk_mme_builder_func mme_builders[NVK_MME_COUNT] = {
-   [NVK_MME_CLEAR_VIEWS]            = nvk_mme_clear_views,
-   [NVK_MME_CLEAR_LAYERS]           = nvk_mme_clear_layers,
-   [NVK_MME_DRAW]                   = nvk_mme_draw,
-   [NVK_MME_DRAW_INDEXED]           = nvk_mme_draw_indexed,
-   [NVK_MME_DRAW_INDIRECT]          = nvk_mme_draw_indirect,
-   [NVK_MME_DRAW_INDEXED_INDIRECT]  = nvk_mme_draw_indexed_indirect,
-   [NVK_MME_ADD_CS_INVOCATIONS]     = nvk_mme_add_cs_invocations,
-   [NVK_MME_DISPATCH_INDIRECT]      = nvk_mme_dispatch_indirect,
-   [NVK_MME_WRITE_CS_INVOCATIONS]   = nvk_mme_write_cs_invocations,
-   [NVK_MME_COPY_QUERIES]           = nvk_mme_copy_queries,
+   [NVK_MME_CLEAR_VIEWS]                 = nvk_mme_clear_views,
+   [NVK_MME_CLEAR_LAYERS]                = nvk_mme_clear_layers,
+   [NVK_MME_DRAW]                        = nvk_mme_draw,
+   [NVK_MME_DRAW_INDEXED]                = nvk_mme_draw_indexed,
+   [NVK_MME_DRAW_INDIRECT]               = nvk_mme_draw_indirect,
+   [NVK_MME_DRAW_INDEXED_INDIRECT]       = nvk_mme_draw_indexed_indirect,
+   [NVK_MME_DRAW_INDIRECT_COUNT]         = nvk_mme_draw_indirect_count,
+   [NVK_MME_DRAW_INDEXED_INDIRECT_COUNT] = nvk_mme_draw_indexed_indirect_count,
+   [NVK_MME_ADD_CS_INVOCATIONS]          = nvk_mme_add_cs_invocations,
+   [NVK_MME_DISPATCH_INDIRECT]           = nvk_mme_dispatch_indirect,
+   [NVK_MME_WRITE_CS_INVOCATIONS]        = nvk_mme_write_cs_invocations,
+   [NVK_MME_COPY_QUERIES]                = nvk_mme_copy_queries,
 };
 
 uint32_t *
index 40c0c40..ef14b3c 100644 (file)
@@ -12,6 +12,8 @@ enum nvk_mme {
    NVK_MME_DRAW_INDEXED,
    NVK_MME_DRAW_INDIRECT,
    NVK_MME_DRAW_INDEXED_INDIRECT,
+   NVK_MME_DRAW_INDIRECT_COUNT,
+   NVK_MME_DRAW_INDEXED_INDIRECT_COUNT,
    NVK_MME_ADD_CS_INVOCATIONS,
    NVK_MME_DISPATCH_INDIRECT,
    NVK_MME_WRITE_CS_INVOCATIONS,
@@ -40,6 +42,8 @@ void nvk_mme_draw(struct mme_builder *b);
 void nvk_mme_draw_indexed(struct mme_builder *b);
 void nvk_mme_draw_indirect(struct mme_builder *b);
 void nvk_mme_draw_indexed_indirect(struct mme_builder *b);
+void nvk_mme_draw_indirect_count(struct mme_builder *b);
+void nvk_mme_draw_indexed_indirect_count(struct mme_builder *b);
 void nvk_mme_add_cs_invocations(struct mme_builder *b);
 void nvk_mme_dispatch_indirect(struct mme_builder *b);
 void nvk_mme_write_cs_invocations(struct mme_builder *b);
index 86e0cc0..785896f 100644 (file)
@@ -22,6 +22,7 @@
 #include "clc0c0.h"
 #include "clc1c0.h"
 #include "clc3c0.h"
+#include "clc597.h"
 #include "clc5c0.h"
 
 
@@ -29,7 +30,7 @@ VKAPI_ATTR void VKAPI_CALL
 nvk_GetPhysicalDeviceFeatures2(VkPhysicalDevice physicalDevice,
                                VkPhysicalDeviceFeatures2 *pFeatures)
 {
-   // VK_FROM_HANDLE(nvk_physical_device, pdev, physicalDevice);
+   VK_FROM_HANDLE(nvk_physical_device, pdev, physicalDevice);
 
    pFeatures->features = (VkPhysicalDeviceFeatures) {
       .robustBufferAccess = true,
@@ -103,6 +104,7 @@ nvk_GetPhysicalDeviceFeatures2(VkPhysicalDevice physicalDevice,
       .bufferDeviceAddress = true,
       .bufferDeviceAddressCaptureReplay = false,
       .bufferDeviceAddressMultiDevice = false,
+      .drawIndirectCount = pdev->info.cls_eng3d >= TURING_A,
    };
 
    VkPhysicalDeviceVulkan13Features core_1_3 = {
@@ -483,6 +485,7 @@ nvk_get_device_extensions(const struct nvk_physical_device *pdev,
       .KHR_depth_stencil_resolve = true,
       .KHR_descriptor_update_template = true,
       .KHR_device_group = true,
+      .KHR_draw_indirect_count = pdev->info.cls_eng3d >= TURING_A,
       .KHR_driver_properties = true,
       .KHR_dynamic_rendering = true,
       .KHR_format_feature_flags2 = true,