From 93cd96b5237ba2618fb6688fa9ab49e1b5cc5df8 Mon Sep 17 00:00:00 2001
From: =?utf8?q?Marek=20Ol=C5=A1=C3=A1k?=
Date: Sun, 14 Aug 2022 09:12:03 -0400
Subject: [PATCH] radeonsi: remove 1 draw packet order codepath, keep the
 first one

Multi-mode multi-draws will make it more complicated, so let's start
with simpler code.

I changed the order a little: I put the VBO update next to
emit_draw_packets.

Reviewed-by: Pierre-Eric Pelloux-Prayer
Part-of:
---
 src/gallium/drivers/radeonsi/si_state_draw.cpp | 133 ++++++++++++---------------------
 1 file changed, 49 insertions(+), 84 deletions(-)

diff --git a/src/gallium/drivers/radeonsi/si_state_draw.cpp b/src/gallium/drivers/radeonsi/si_state_draw.cpp
index b755cbd..3773770 100644
--- a/src/gallium/drivers/radeonsi/si_state_draw.cpp
+++ b/src/gallium/drivers/radeonsi/si_state_draw.cpp
@@ -2422,11 +2422,22 @@ static void si_draw(struct pipe_context *ctx,
       return;
    }
 
+   /* This is the optimal packet order:
+    * Set all states first, so that all SET packets are processed in parallel with previous
+    * draw calls. Then flush caches and wait if needed. Then draw and prefetch at the end.
+    * It's better to draw before prefetches because we want to start fetching indices before
+    * shaders. The idea is to minimize the time when the CUs are idle.
+    */
+   unsigned masked_atoms = 0;
+   if (unlikely(sctx->flags & SI_CONTEXT_FLUSH_FOR_RENDER_COND)) {
+      /* The render condition state should be emitted after cache flushes. */
+      masked_atoms |= si_get_atom_bit(sctx, &sctx->atoms.s.render_cond);
+   }
+
    /* Vega10/Raven scissor bug workaround. When any context register is
     * written (i.e. the GPU rolls the context), PA_SC_VPORT_SCISSOR
     * registers must be written too.
     */
-   unsigned masked_atoms = 0;
    bool gfx9_scissor_bug = false;
 
    if (GFX_VERSION == GFX9 && sctx->screen->info.has_gfx9_scissor_bug) {
@@ -2441,95 +2452,49 @@ static void si_draw(struct pipe_context *ctx,
 
    bool primitive_restart = !IS_DRAW_VERTEX_STATE && info->primitive_restart;
 
-   /* Use optimal packet order based on whether we need to sync the pipeline. */
-   if (unlikely(sctx->flags & (SI_CONTEXT_FLUSH_AND_INV_CB | SI_CONTEXT_FLUSH_AND_INV_DB |
-                               SI_CONTEXT_PS_PARTIAL_FLUSH | SI_CONTEXT_CS_PARTIAL_FLUSH |
-                               SI_CONTEXT_VS_PARTIAL_FLUSH | SI_CONTEXT_VGT_FLUSH))) {
-      /* If we have to wait for idle, set all states first, so that all
-       * SET packets are processed in parallel with previous draw calls.
-       * Then draw and prefetch at the end. This ensures that the time
-       * the CUs are idle is very short.
-       */
-      if (unlikely(sctx->flags & SI_CONTEXT_FLUSH_FOR_RENDER_COND))
-         masked_atoms |= si_get_atom_bit(sctx, &sctx->atoms.s.render_cond);
-
-      /* Emit all states except possibly render condition. */
-      si_emit_all_states
-            (sctx, info, indirect, prim, instance_count, min_direct_count,
-             primitive_restart, masked_atoms);
+   /* Emit all states except possibly render condition. */
+   si_emit_all_states
+         (sctx, info, indirect, prim, instance_count, min_direct_count,
+          primitive_restart, masked_atoms);
+
+   if (sctx->flags)
       sctx->emit_cache_flush(sctx, &sctx->gfx_cs);
-      /* <-- CUs are idle here. */
-
-      /* This uploads VBO descriptors, sets user SGPRs, and executes the L2 prefetch.
-       * It should done after cache flushing.
-       */
-      if (unlikely((!si_upload_and_prefetch_VB_descriptors
-                       (sctx, state, partial_velem_mask)))) {
-         DRAW_CLEANUP;
-         return;
-      }
-
-      if (si_is_atom_dirty(sctx, &sctx->atoms.s.render_cond)) {
-         sctx->atoms.s.render_cond.emit(sctx);
-         sctx->dirty_atoms &= ~si_get_atom_bit(sctx, &sctx->atoms.s.render_cond);
-      }
-
-      if (GFX_VERSION == GFX9 && gfx9_scissor_bug &&
-          (sctx->context_roll || si_is_atom_dirty(sctx, &sctx->atoms.s.scissors))) {
-         sctx->atoms.s.scissors.emit(sctx);
-         sctx->dirty_atoms &= ~si_get_atom_bit(sctx, &sctx->atoms.s.scissors);
-      }
-      assert(sctx->dirty_atoms == 0);
-
-      si_emit_draw_packets
-            (sctx, info, drawid_offset, indirect, draws, num_draws, indexbuf,
-             index_size, index_offset, instance_count);
-      /* <-- CUs are busy here. */
-
-      /* Start prefetches after the draw has been started. Both will run
-       * in parallel, but starting the draw first is more important.
-       */
-      si_prefetch_shaders(sctx);
-   } else {
-      /* If we don't wait for idle, start prefetches first, then set
-       * states, and draw at the end.
-       */
-      if (sctx->flags)
-         sctx->emit_cache_flush(sctx, &sctx->gfx_cs);
+   /* <-- CUs are idle here if we waited. */
 
-      /* Only prefetch the API VS and VBO descriptors. */
-      si_prefetch_shaders(sctx);
-
-      /* This uploads VBO descriptors, sets user SGPRs, and executes the L2 prefetch.
-       * It should done after cache flushing and after the VS prefetch.
-       */
-      if (unlikely((!si_upload_and_prefetch_VB_descriptors
-                       (sctx, state, partial_velem_mask)))) {
-         DRAW_CLEANUP;
-         return;
-      }
+   /* If we haven't emitted the render condition state (because it depends on cache flushes),
+    * do it now.
+    */
+   if (si_is_atom_dirty(sctx, &sctx->atoms.s.render_cond)) {
+      sctx->atoms.s.render_cond.emit(sctx);
+      sctx->dirty_atoms &= ~si_get_atom_bit(sctx, &sctx->atoms.s.render_cond);
+   }
 
-      si_emit_all_states
-            (sctx, info, indirect, prim, instance_count, min_direct_count,
-             primitive_restart, masked_atoms);
+   /* This needs to be done after cache flushes because ACQUIRE_MEM rolls the context. */
+   if (GFX_VERSION == GFX9 && gfx9_scissor_bug &&
+       (sctx->context_roll || si_is_atom_dirty(sctx, &sctx->atoms.s.scissors))) {
+      sctx->atoms.s.scissors.emit(sctx);
+      sctx->dirty_atoms &= ~si_get_atom_bit(sctx, &sctx->atoms.s.scissors);
+   }
+   assert(sctx->dirty_atoms == 0);
 
-      if (GFX_VERSION == GFX9 && gfx9_scissor_bug &&
-          (sctx->context_roll || si_is_atom_dirty(sctx, &sctx->atoms.s.scissors))) {
-         sctx->atoms.s.scissors.emit(sctx);
-         sctx->dirty_atoms &= ~si_get_atom_bit(sctx, &sctx->atoms.s.scissors);
-      }
-      assert(sctx->dirty_atoms == 0);
+   /* This uploads VBO descriptors, sets user SGPRs, and executes the L2 prefetch.
+    * It should be done after cache flushing.
+    */
+   if (unlikely((!si_upload_and_prefetch_VB_descriptors
+                    (sctx, state, partial_velem_mask)))) {
+      DRAW_CLEANUP;
+      return;
+   }
 
-      si_emit_draw_packets
-            (sctx, info, drawid_offset, indirect, draws, num_draws, indexbuf,
-             index_size, index_offset, instance_count);
+   si_emit_draw_packets
+         (sctx, info, drawid_offset, indirect, draws, num_draws, indexbuf,
+          index_size, index_offset, instance_count);
+   /* <-- CUs start to get busy here if we waited. */
 
-      /* Prefetch the remaining shaders after the draw has been
-       * started. */
-      si_prefetch_shaders(sctx);
-   }
+   /* Start prefetches after the draw has been started. Both will run
+    * in parallel, but starting the draw first is more important.
+    */
+   si_prefetch_shaders(sctx);
 
    /* Clear the context roll flag after the draw call.
    * Only used by the gfx9 scissor bug.
-- 
2.7.4
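
For reading convenience, below is a condensed sketch of the single codepath this
patch keeps, assembled from the added lines above. It is a reading aid under
stated assumptions, not compilable code: the real calls are templated (the
template argument lists are omitted here), and the DRAW_CLEANUP error path and
the gfx9 scissor-bug workaround are elided.

   /* Optimal packet order: states -> cache flush -> late atoms ->
    * VBO descriptors + L2 prefetch -> draw -> shader prefetch. */
   si_emit_all_states(sctx, info, indirect, prim, instance_count,
                      min_direct_count, primitive_restart, masked_atoms);

   if (sctx->flags)
      sctx->emit_cache_flush(sctx, &sctx->gfx_cs); /* CUs are idle here if we waited */

   /* The render condition atom was masked earlier so that it can be
    * emitted after the cache flush. */
   if (si_is_atom_dirty(sctx, &sctx->atoms.s.render_cond))
      sctx->atoms.s.render_cond.emit(sctx);

   /* Upload VBO descriptors and start the L2 prefetch before the draw. */
   si_upload_and_prefetch_VB_descriptors(sctx, state, partial_velem_mask);

   /* Draw first so that index fetching starts before shader prefetches;
    * the prefetches then run in parallel with the draw. */
   si_emit_draw_packets(sctx, info, drawid_offset, indirect, draws, num_draws,
                        indexbuf, index_size, index_offset, instance_count);
   si_prefetch_shaders(sctx);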