radv: move emitting the strmout buffer to CmdDrawIndirectByteCountEXT()
author Samuel Pitoiset <samuel.pitoiset@gmail.com>
Mon, 12 Dec 2022 15:56:42 +0000 (16:56 +0100)
committer Marge Bot <emma+marge@anholt.net>
Tue, 3 Jan 2023 16:58:13 +0000 (16:58 +0000)
This doesn't need to be in the generic draw path because only one
draw command uses it.

Signed-off-by: Samuel Pitoiset <samuel.pitoiset@gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/20299>

src/amd/vulkan/radv_cmd_buffer.c

index 359ada4..005e0f4 100644 (file)
@@ -5058,38 +5058,6 @@ radv_emit_draw_registers(struct radv_cmd_buffer *cmd_buffer, const struct radv_d
       }
    }
 
-   if (draw_info->strmout_buffer) {
-      uint64_t va = radv_buffer_get_va(draw_info->strmout_buffer->bo);
-
-      va += draw_info->strmout_buffer->offset + draw_info->strmout_buffer_offset;
-
-      radeon_set_context_reg(cs, R_028B30_VGT_STRMOUT_DRAW_OPAQUE_VERTEX_STRIDE, draw_info->stride);
-
-      if (info->gfx_level >= GFX10) {
-         /* Emitting a COPY_DATA packet should be enough because RADV doesn't support preemption
-          * (shadow memory) but for unknown reasons, it can lead to GPU hangs on GFX10+.
-          */
-         radeon_emit(cs, PKT3(PKT3_PFP_SYNC_ME, 0, 0));
-         radeon_emit(cs, 0);
-
-         radeon_emit(cs, PKT3(PKT3_LOAD_CONTEXT_REG_INDEX, 3, 0));
-         radeon_emit(cs, va);
-         radeon_emit(cs, va >> 32);
-         radeon_emit(cs, (R_028B2C_VGT_STRMOUT_DRAW_OPAQUE_BUFFER_FILLED_SIZE - SI_CONTEXT_REG_OFFSET) >> 2);
-         radeon_emit(cs, 1); /* 1 DWORD */
-      } else {
-         radeon_emit(cs, PKT3(PKT3_COPY_DATA, 4, 0));
-         radeon_emit(cs, COPY_DATA_SRC_SEL(COPY_DATA_SRC_MEM) | COPY_DATA_DST_SEL(COPY_DATA_REG) |
-                         COPY_DATA_WR_CONFIRM);
-         radeon_emit(cs, va);
-         radeon_emit(cs, va >> 32);
-         radeon_emit(cs, R_028B2C_VGT_STRMOUT_DRAW_OPAQUE_BUFFER_FILLED_SIZE >> 2);
-         radeon_emit(cs, 0); /* unused */
-      }
-
-      radv_cs_add_buffer(cmd_buffer->device->ws, cs, draw_info->strmout_buffer->bo);
-   }
-
    /* RDNA2 is affected by a hardware bug when instance packing is enabled for adjacent primitive
     * topologies and instance_count > 1, pipeline stats generated by GE are incorrect. It needs to
     * be applied for indexed and non-indexed draws.
@@ -10859,6 +10827,42 @@ radv_CmdEndTransformFeedbackEXT(VkCommandBuffer commandBuffer, uint32_t firstCou
    radv_set_streamout_enable(cmd_buffer, false);
 }
 
+static void
+radv_emit_strmout_buffer(struct radv_cmd_buffer *cmd_buffer, const struct radv_draw_info *draw_info)
+{
+   const enum amd_gfx_level gfx_level = cmd_buffer->device->physical_device->rad_info.gfx_level;
+   uint64_t va = radv_buffer_get_va(draw_info->strmout_buffer->bo);
+   struct radeon_cmdbuf *cs = cmd_buffer->cs;
+
+   va += draw_info->strmout_buffer->offset + draw_info->strmout_buffer_offset;
+
+   radeon_set_context_reg(cs, R_028B30_VGT_STRMOUT_DRAW_OPAQUE_VERTEX_STRIDE, draw_info->stride);
+
+   if (gfx_level >= GFX10) {
+      /* Emitting a COPY_DATA packet should be enough because RADV doesn't support preemption
+       * (shadow memory) but for unknown reasons, it can lead to GPU hangs on GFX10+.
+       */
+      radeon_emit(cs, PKT3(PKT3_PFP_SYNC_ME, 0, 0));
+      radeon_emit(cs, 0);
+
+      radeon_emit(cs, PKT3(PKT3_LOAD_CONTEXT_REG_INDEX, 3, 0));
+      radeon_emit(cs, va);
+      radeon_emit(cs, va >> 32);
+      radeon_emit(cs, (R_028B2C_VGT_STRMOUT_DRAW_OPAQUE_BUFFER_FILLED_SIZE - SI_CONTEXT_REG_OFFSET) >> 2);
+      radeon_emit(cs, 1); /* 1 DWORD */
+   } else {
+      radeon_emit(cs, PKT3(PKT3_COPY_DATA, 4, 0));
+      radeon_emit(cs, COPY_DATA_SRC_SEL(COPY_DATA_SRC_MEM) | COPY_DATA_DST_SEL(COPY_DATA_REG) |
+                      COPY_DATA_WR_CONFIRM);
+      radeon_emit(cs, va);
+      radeon_emit(cs, va >> 32);
+      radeon_emit(cs, R_028B2C_VGT_STRMOUT_DRAW_OPAQUE_BUFFER_FILLED_SIZE >> 2);
+      radeon_emit(cs, 0); /* unused */
+   }
+
+   radv_cs_add_buffer(cmd_buffer->device->ws, cs, draw_info->strmout_buffer->bo);
+}
+
 VKAPI_ATTR void VKAPI_CALL
 radv_CmdDrawIndirectByteCountEXT(VkCommandBuffer commandBuffer, uint32_t instanceCount,
                                  uint32_t firstInstance, VkBuffer _counterBuffer,
@@ -10881,6 +10885,7 @@ radv_CmdDrawIndirectByteCountEXT(VkCommandBuffer commandBuffer, uint32_t instanc
    if (!radv_before_draw(cmd_buffer, &info, 1))
       return;
    struct VkMultiDrawInfoEXT minfo = { 0, 0 };
+   radv_emit_strmout_buffer(cmd_buffer, &info);
    radv_emit_direct_draw_packets(cmd_buffer, &info, 1, &minfo, S_0287F0_USE_OPAQUE(1), 0);
    radv_after_draw(cmd_buffer);
 }