pvr: Implement vkCmdDrawIndirect API.
authorRajnesh Kanwal <rajnesh.kanwal@imgtec.com>
Fri, 5 Aug 2022 09:45:10 +0000 (10:45 +0100)
committerMarge Bot <emma+marge@anholt.net>
Tue, 13 Sep 2022 12:33:13 +0000 (12:33 +0000)
Signed-off-by: Rajnesh Kanwal <rajnesh.kanwal@imgtec.com>
Reviewed-by: Karmjit Mahil <Karmjit.Mahil@imgtec.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/18486>

src/imagination/vulkan/pvr_cmd_buffer.c

index 280b5d5..b6c0773 100644 (file)
@@ -4769,6 +4769,187 @@ static uint32_t pvr_get_hw_primitive_topology(VkPrimitiveTopology topology)
    }
 }
 
+/* TODO: Rewrite this in terms of ALIGN_POT() and pvr_cmd_length(). */
+/* Aligned to 128 bit for PDS loads / stores */
+#define DUMMY_VDM_CONTROL_STREAM_BLOCK_SIZE 8
+
+static VkResult
+pvr_write_draw_indirect_vdm_stream(struct pvr_cmd_buffer *cmd_buffer,
+                                   struct pvr_csb *const csb,
+                                   pvr_dev_addr_t idx_buffer_addr,
+                                   uint32_t idx_stride,
+                                   struct PVRX(VDMCTRL_INDEX_LIST0) * list_hdr,
+                                   struct pvr_buffer *buffer,
+                                   VkDeviceSize offset,
+                                   uint32_t count,
+                                   uint32_t stride)
+{
+   struct pvr_pds_drawindirect_program pds_prog = { 0 };
+   uint32_t word0;
+
+   /* Draw indirect always has index offset and instance count. */
+   list_hdr->index_offset_present = true;
+   list_hdr->index_instance_count_present = true;
+
+   pvr_cmd_pack(VDMCTRL_INDEX_LIST0)(&word0, list_hdr);
+
+   pds_prog.support_base_instance = true;
+   pds_prog.arg_buffer = buffer->dev_addr.addr + offset;
+   pds_prog.index_buffer = idx_buffer_addr.addr;
+   pds_prog.index_block_header = word0;
+   pds_prog.index_stride = idx_stride;
+   pds_prog.num_views = 1U;
+
+   /* TODO: See if we can pre-upload the code section of all the pds programs
+    * and reuse them here.
+    */
+   /* Generate and upload the PDS programs (code + data). */
+   for (uint32_t i = 0U; i < count; i++) {
+      const struct pvr_device_info *dev_info =
+         &cmd_buffer->device->pdevice->dev_info;
+      struct pvr_cmd_buffer_state *state = &cmd_buffer->state;
+      struct pvr_bo *dummy_bo;
+      uint32_t *dummy_stream;
+      struct pvr_bo *pds_bo;
+      uint32_t *pds_base;
+      uint32_t pds_size;
+      VkResult result;
+
+      pds_prog.increment_draw_id = (i != 0);
+
+      if (state->draw_state.draw_indexed) {
+         pvr_pds_generate_draw_elements_indirect(&pds_prog,
+                                                 0,
+                                                 PDS_GENERATE_SIZES,
+                                                 dev_info);
+      } else {
+         pvr_pds_generate_draw_arrays_indirect(&pds_prog,
+                                               0,
+                                               PDS_GENERATE_SIZES,
+                                               dev_info);
+      }
+
+      pds_size = (pds_prog.program.data_size_aligned +
+                  pds_prog.program.code_size_aligned)
+                 << 2;
+
+      result = pvr_cmd_buffer_alloc_mem(cmd_buffer,
+                                        cmd_buffer->device->heaps.pds_heap,
+                                        pds_size,
+                                        PVR_BO_ALLOC_FLAG_CPU_MAPPED,
+                                        &pds_bo);
+      if (result != VK_SUCCESS)
+         return result;
+
+      pds_base = pds_bo->bo->map;
+      memcpy(pds_base,
+             pds_prog.program.code,
+             pds_prog.program.code_size_aligned << 2);
+
+      if (state->draw_state.draw_indexed) {
+         pvr_pds_generate_draw_elements_indirect(
+            &pds_prog,
+            pds_base + pds_prog.program.code_size_aligned,
+            PDS_GENERATE_DATA_SEGMENT,
+            dev_info);
+      } else {
+         pvr_pds_generate_draw_arrays_indirect(
+            &pds_prog,
+            pds_base + pds_prog.program.code_size_aligned,
+            PDS_GENERATE_DATA_SEGMENT,
+            dev_info);
+      }
+
+      pvr_bo_cpu_unmap(cmd_buffer->device, pds_bo);
+
+      /* Write the VDM state update. */
+      pvr_csb_emit (csb, VDMCTRL_PDS_STATE0, state0) {
+         state0.usc_target = PVRX(VDMCTRL_USC_TARGET_ANY);
+
+         state0.pds_temp_size =
+            DIV_ROUND_UP(pds_prog.program.temp_size_aligned << 2,
+                         PVRX(VDMCTRL_PDS_STATE0_PDS_TEMP_SIZE_UNIT_SIZE));
+
+         state0.pds_data_size =
+            DIV_ROUND_UP(pds_prog.program.data_size_aligned << 2,
+                         PVRX(VDMCTRL_PDS_STATE0_PDS_DATA_SIZE_UNIT_SIZE));
+      }
+
+      pvr_csb_emit (csb, VDMCTRL_PDS_STATE1, state1) {
+         const uint32_t data_offset =
+            pds_bo->vma->dev_addr.addr + (pds_prog.program.code_size << 2) -
+            cmd_buffer->device->heaps.pds_heap->base_addr.addr;
+
+         state1.pds_data_addr = PVR_DEV_ADDR(data_offset);
+         state1.sd_type = PVRX(VDMCTRL_SD_TYPE_PDS);
+         state1.sd_next_type = PVRX(VDMCTRL_SD_TYPE_NONE);
+      }
+
+      pvr_csb_emit (csb, VDMCTRL_PDS_STATE2, state2) {
+         const uint32_t code_offset =
+            pds_bo->vma->dev_addr.addr -
+            cmd_buffer->device->heaps.pds_heap->base_addr.addr;
+
+         state2.pds_code_addr = PVR_DEV_ADDR(code_offset);
+      }
+
+      /* Sync task to ensure the VDM doesn't start reading the dummy blocks
+       * before they are ready.
+       */
+      pvr_csb_emit (csb, VDMCTRL_INDEX_LIST0, list0) {
+         list0.primitive_topology = PVRX(VDMCTRL_PRIMITIVE_TOPOLOGY_TRI_LIST);
+      }
+
+      result = pvr_cmd_buffer_alloc_mem(cmd_buffer,
+                                        cmd_buffer->device->heaps.general_heap,
+                                        DUMMY_VDM_CONTROL_STREAM_BLOCK_SIZE,
+                                        PVR_BO_ALLOC_FLAG_CPU_MAPPED,
+                                        &dummy_bo);
+      if (result != VK_SUCCESS)
+         return result;
+
+      dummy_stream = dummy_bo->bo->map;
+
+      /* For indexed draw cmds fill in the dummy's header (as it won't change
+       * based on the indirect args) and increment by the in-use size of each
+       * dummy block.
+       */
+      if (!state->draw_state.draw_indexed) {
+         dummy_stream[0] = word0;
+         dummy_stream += 4;
+      } else {
+         dummy_stream += 5;
+      }
+
+      /* clang-format off */
+      pvr_csb_pack (dummy_stream, VDMCTRL_STREAM_RETURN, word);
+      /* clang-format on */
+
+      pvr_bo_cpu_unmap(cmd_buffer->device, dummy_bo);
+
+      /* Stream link to the first dummy which forces the VDM to discard any
+       * prefetched (dummy) control stream.
+       */
+      pvr_csb_emit (csb, VDMCTRL_STREAM_LINK0, link) {
+         link.with_return = true;
+         link.link_addrmsb = dummy_bo->vma->dev_addr;
+      }
+
+      pvr_csb_emit (csb, VDMCTRL_STREAM_LINK1, link) {
+         link.link_addrlsb = dummy_bo->vma->dev_addr;
+      }
+
+      /* Point the pds program to the next argument buffer and the next VDM
+       * dummy buffer.
+       */
+      pds_prog.arg_buffer += stride;
+   }
+
+   return VK_SUCCESS;
+}
+
+#undef DUMMY_VDM_CONTROL_STREAM_BLOCK_SIZE
+
 static void pvr_emit_vdm_index_list(struct pvr_cmd_buffer *cmd_buffer,
                                     struct pvr_sub_cmd_gfx *const sub_cmd,
                                     VkPrimitiveTopology topology,
@@ -4776,64 +4957,88 @@ static void pvr_emit_vdm_index_list(struct pvr_cmd_buffer *cmd_buffer,
                                     uint32_t vertex_count,
                                     uint32_t first_index,
                                     uint32_t index_count,
-                                    uint32_t instance_count)
+                                    uint32_t instance_count,
+                                    struct pvr_buffer *buffer,
+                                    VkDeviceSize offset,
+                                    uint32_t count,
+                                    uint32_t stride)
 {
    struct pvr_cmd_buffer_state *state = &cmd_buffer->state;
-   struct pvr_csb *const csb = &sub_cmd->control_stream;
+   const bool vertex_shader_has_side_effects =
+      state->gfx_pipeline->vertex_shader_state.stage_state.has_side_effects;
    struct PVRX(VDMCTRL_INDEX_LIST0)
       list_hdr = { pvr_cmd_header(VDMCTRL_INDEX_LIST0) };
    pvr_dev_addr_t index_buffer_addr = PVR_DEV_ADDR_INVALID;
+   struct pvr_csb *const csb = &sub_cmd->control_stream;
    unsigned int index_stride = 0;
 
-   pvr_csb_emit (csb, VDMCTRL_INDEX_LIST0, list0) {
-      const bool vertex_shader_has_side_effects =
-         cmd_buffer->state.gfx_pipeline->vertex_shader_state.stage_state
-            .has_side_effects;
-
-      list0.primitive_topology = pvr_get_hw_primitive_topology(topology);
+   list_hdr.primitive_topology = pvr_get_hw_primitive_topology(topology);
 
-      /* First instance is not handled in the VDM state, it's implemented as
-       * an addition in the PDS vertex fetch.
-       */
-      list0.index_count_present = true;
+   /* firstInstance is not handled here in the VDM state, it's implemented as
+    * an addition in the PDS vertex fetch using
+    * PVR_PDS_CONST_MAP_ENTRY_TYPE_BASE_INSTANCE entry type.
+    */
 
-      if (instance_count > 1)
-         list0.index_instance_count_present = true;
+   list_hdr.index_count_present = true;
 
-      if (first_vertex != 0)
-         list0.index_offset_present = true;
+   if (instance_count > 1)
+      list_hdr.index_instance_count_present = true;
 
-      if (state->draw_state.draw_indexed) {
-         struct pvr_buffer *buffer = state->index_buffer_binding.buffer;
+   if (first_vertex != 0)
+      list_hdr.index_offset_present = true;
 
-         switch (state->index_buffer_binding.type) {
-         case VK_INDEX_TYPE_UINT32:
-            list0.index_size = PVRX(VDMCTRL_INDEX_SIZE_B32);
-            index_stride = 4;
-            break;
+   if (state->draw_state.draw_indexed) {
+      struct pvr_buffer *buffer = state->index_buffer_binding.buffer;
 
-         case VK_INDEX_TYPE_UINT16:
-            list0.index_size = PVRX(VDMCTRL_INDEX_SIZE_B16);
-            index_stride = 2;
-            break;
+      switch (state->index_buffer_binding.type) {
+      case VK_INDEX_TYPE_UINT32:
+         list_hdr.index_size = PVRX(VDMCTRL_INDEX_SIZE_B32);
+         index_stride = 4;
+         break;
 
-         default:
-            unreachable("Invalid index type");
-         }
+      case VK_INDEX_TYPE_UINT16:
+         list_hdr.index_size = PVRX(VDMCTRL_INDEX_SIZE_B16);
+         index_stride = 2;
+         break;
 
-         list0.index_addr_present = true;
-         index_buffer_addr = PVR_DEV_ADDR_OFFSET(
-            buffer->dev_addr,
-            state->index_buffer_binding.offset + first_index * index_stride);
-         list0.index_base_addrmsb = index_buffer_addr;
+      default:
+         unreachable("Invalid index type");
       }
 
-      list0.degen_cull_enable =
-         PVR_HAS_FEATURE(&cmd_buffer->device->pdevice->dev_info,
-                         vdm_degenerate_culling) &&
-         !vertex_shader_has_side_effects;
+      index_buffer_addr = PVR_DEV_ADDR_OFFSET(
+         buffer->dev_addr,
+         state->index_buffer_binding.offset + first_index * index_stride);
 
-      list_hdr = list0;
+      list_hdr.index_addr_present = true;
+
+      /* For indirect draw calls, index buffer address is not embedded into VDM
+       * control stream.
+       */
+      if (!state->draw_state.draw_indirect)
+         list_hdr.index_base_addrmsb = index_buffer_addr;
+   }
+
+   list_hdr.degen_cull_enable =
+      PVR_HAS_FEATURE(&cmd_buffer->device->pdevice->dev_info,
+                      vdm_degenerate_culling) &&
+      !vertex_shader_has_side_effects;
+
+   if (state->draw_state.draw_indirect) {
+      assert(buffer);
+      pvr_write_draw_indirect_vdm_stream(cmd_buffer,
+                                         csb,
+                                         index_buffer_addr,
+                                         index_stride,
+                                         &list_hdr,
+                                         buffer,
+                                         offset,
+                                         count,
+                                         stride);
+      return;
+   }
+
+   pvr_csb_emit (csb, VDMCTRL_INDEX_LIST0, list0) {
+      list0 = list_hdr;
    }
 
    if (list_hdr.index_addr_present) {
@@ -4869,17 +5074,16 @@ void pvr_CmdDraw(VkCommandBuffer commandBuffer,
                  uint32_t firstVertex,
                  uint32_t firstInstance)
 {
+   const struct pvr_cmd_buffer_draw_state draw_state = {
+      .base_vertex = firstVertex,
+      .base_instance = firstInstance,
+   };
    PVR_FROM_HANDLE(pvr_cmd_buffer, cmd_buffer, commandBuffer);
    struct pvr_cmd_buffer_state *state = &cmd_buffer->state;
-   struct pvr_cmd_buffer_draw_state draw_state;
    VkResult result;
 
    PVR_CHECK_COMMAND_BUFFER_BUILDING_STATE(cmd_buffer);
 
-   draw_state.base_vertex = firstVertex;
-   draw_state.base_instance = firstInstance;
-   draw_state.draw_indirect = false;
-   draw_state.draw_indexed = false;
    pvr_update_draw_state(state, &draw_state);
 
    result = pvr_validate_draw_state(cmd_buffer);
@@ -4894,7 +5098,11 @@ void pvr_CmdDraw(VkCommandBuffer commandBuffer,
                            vertexCount,
                            0U,
                            0U,
-                           instanceCount);
+                           instanceCount,
+                           NULL,
+                           0U,
+                           0U,
+                           0U);
 }
 
 void pvr_CmdDrawIndexed(VkCommandBuffer commandBuffer,
@@ -4904,17 +5112,17 @@ void pvr_CmdDrawIndexed(VkCommandBuffer commandBuffer,
                         int32_t vertexOffset,
                         uint32_t firstInstance)
 {
+   const struct pvr_cmd_buffer_draw_state draw_state = {
+      .base_vertex = vertexOffset,
+      .base_instance = firstInstance,
+      .draw_indexed = true,
+   };
    PVR_FROM_HANDLE(pvr_cmd_buffer, cmd_buffer, commandBuffer);
    struct pvr_cmd_buffer_state *state = &cmd_buffer->state;
-   struct pvr_cmd_buffer_draw_state draw_state;
    VkResult result;
 
    PVR_CHECK_COMMAND_BUFFER_BUILDING_STATE(cmd_buffer);
 
-   draw_state.base_vertex = vertexOffset;
-   draw_state.base_instance = firstInstance;
-   draw_state.draw_indirect = false;
-   draw_state.draw_indexed = true;
    pvr_update_draw_state(state, &draw_state);
 
    result = pvr_validate_draw_state(cmd_buffer);
@@ -4929,7 +5137,11 @@ void pvr_CmdDrawIndexed(VkCommandBuffer commandBuffer,
                            0,
                            firstIndex,
                            indexCount,
-                           instanceCount);
+                           instanceCount,
+                           NULL,
+                           0U,
+                           0U,
+                           0U);
 }
 
 void pvr_CmdDrawIndexedIndirect(VkCommandBuffer commandBuffer,
@@ -4947,7 +5159,35 @@ void pvr_CmdDrawIndirect(VkCommandBuffer commandBuffer,
                          uint32_t drawCount,
                          uint32_t stride)
 {
-   assert(!"Unimplemented");
+   const struct pvr_cmd_buffer_draw_state draw_state = {
+      .draw_indirect = true,
+   };
+   PVR_FROM_HANDLE(pvr_cmd_buffer, cmd_buffer, commandBuffer);
+   struct pvr_cmd_buffer_state *state = &cmd_buffer->state;
+   PVR_FROM_HANDLE(pvr_buffer, buffer, _buffer);
+   VkResult result;
+
+   PVR_CHECK_COMMAND_BUFFER_BUILDING_STATE(cmd_buffer);
+
+   pvr_update_draw_state(state, &draw_state);
+
+   result = pvr_validate_draw_state(cmd_buffer);
+   if (result != VK_SUCCESS)
+      return;
+
+   /* Write the VDM control stream for the primitive. */
+   pvr_emit_vdm_index_list(cmd_buffer,
+                           &state->current_sub_cmd->gfx,
+                           state->gfx_pipeline->input_asm_state.topology,
+                           0,
+                           0,
+                           0,
+                           0,
+                           0,
+                           buffer,
+                           offset,
+                           drawCount,
+                           stride);
 }
 
 static VkResult