From 93cd96b5237ba2618fb6688fa9ab49e1b5cc5df8 Mon Sep 17 00:00:00 2001
From: =?utf8?q?Marek=20Ol=C5=A1=C3=A1k?=
Date: Sun, 14 Aug 2022 09:12:03 -0400
Subject: [PATCH] radeonsi: remove 1 draw packet order codepath, keep the
 first one

Multi-mode multi-draws will make it more complicated, so let's start
with simpler code.

I changed the order a little: I put the VBO update next to
emit_draw_packets.

Reviewed-by: Pierre-Eric Pelloux-Prayer
Part-of:
---
 src/gallium/drivers/radeonsi/si_state_draw.cpp | 133 ++++++++++++---------------------
 1 file changed, 49 insertions(+), 84 deletions(-)

diff --git a/src/gallium/drivers/radeonsi/si_state_draw.cpp b/src/gallium/drivers/radeonsi/si_state_draw.cpp
index b755cbd..3773770 100644
--- a/src/gallium/drivers/radeonsi/si_state_draw.cpp
+++ b/src/gallium/drivers/radeonsi/si_state_draw.cpp
@@ -2422,11 +2422,22 @@ static void si_draw(struct pipe_context *ctx,
       return;
    }
 
+   /* This is the optimal packet order:
+    * Set all states first, so that all SET packets are processed in parallel with previous
+    * draw calls. Then flush caches and wait if needed. Then draw and prefetch at the end.
+    * It's better to draw before prefetches because we want to start fetching indices before
+    * shaders. The idea is to minimize the time when the CUs are idle.
+    */
+   unsigned masked_atoms = 0;
+   if (unlikely(sctx->flags & SI_CONTEXT_FLUSH_FOR_RENDER_COND)) {
+      /* The render condition state should be emitted after cache flushes. */
+      masked_atoms |= si_get_atom_bit(sctx, &sctx->atoms.s.render_cond);
+   }
+
    /* Vega10/Raven scissor bug workaround. When any context register is
     * written (i.e. the GPU rolls the context), PA_SC_VPORT_SCISSOR
     * registers must be written too.
     */
-   unsigned masked_atoms = 0;
    bool gfx9_scissor_bug = false;
 
    if (GFX_VERSION == GFX9 && sctx->screen->info.has_gfx9_scissor_bug) {
@@ -2441,95 +2452,49 @@ static void si_draw(struct pipe_context *ctx,
 
    bool primitive_restart = !IS_DRAW_VERTEX_STATE && info->primitive_restart;
 
-   /* Use optimal packet order based on whether we need to sync the pipeline. */
-   if (unlikely(sctx->flags & (SI_CONTEXT_FLUSH_AND_INV_CB | SI_CONTEXT_FLUSH_AND_INV_DB |
-                               SI_CONTEXT_PS_PARTIAL_FLUSH | SI_CONTEXT_CS_PARTIAL_FLUSH |
-                               SI_CONTEXT_VS_PARTIAL_FLUSH | SI_CONTEXT_VGT_FLUSH))) {
-      /* If we have to wait for idle, set all states first, so that all
-       * SET packets are processed in parallel with previous draw calls.
-       * Then draw and prefetch at the end. This ensures that the time
-       * the CUs are idle is very short.
-       */
-      if (unlikely(sctx->flags & SI_CONTEXT_FLUSH_FOR_RENDER_COND))
-         masked_atoms |= si_get_atom_bit(sctx, &sctx->atoms.s.render_cond);
-
-      /* Emit all states except possibly render condition. */
-      si_emit_all_states
-            (sctx, info, indirect, prim, instance_count, min_direct_count,
-             primitive_restart, masked_atoms);
+   /* Emit all states except possibly render condition. */
+   si_emit_all_states
+         (sctx, info, indirect, prim, instance_count, min_direct_count,
+          primitive_restart, masked_atoms);
+
+   if (sctx->flags)
       sctx->emit_cache_flush(sctx, &sctx->gfx_cs);
-      /* <-- CUs are idle here. */
-
-      /* This uploads VBO descriptors, sets user SGPRs, and executes the L2 prefetch.
-       * It should done after cache flushing.
-       */
-      if (unlikely((!si_upload_and_prefetch_VB_descriptors
-                       (sctx, state, partial_velem_mask)))) {
-         DRAW_CLEANUP;
-         return;
-      }
-
-      if (si_is_atom_dirty(sctx, &sctx->atoms.s.render_cond)) {
-         sctx->atoms.s.render_cond.emit(sctx);
-         sctx->dirty_atoms &= ~si_get_atom_bit(sctx, &sctx->atoms.s.render_cond);
-      }
-
-      if (GFX_VERSION == GFX9 && gfx9_scissor_bug &&
-          (sctx->context_roll || si_is_atom_dirty(sctx, &sctx->atoms.s.scissors))) {
-         sctx->atoms.s.scissors.emit(sctx);
-         sctx->dirty_atoms &= ~si_get_atom_bit(sctx, &sctx->atoms.s.scissors);
-      }
-      assert(sctx->dirty_atoms == 0);
-
-      si_emit_draw_packets
-            (sctx, info, drawid_offset, indirect, draws, num_draws, indexbuf,
-             index_size, index_offset, instance_count);
-      /* <-- CUs are busy here. */
-
-      /* Start prefetches after the draw has been started. Both will run
-       * in parallel, but starting the draw first is more important.
-       */
-      si_prefetch_shaders(sctx);
-   } else {
-      /* If we don't wait for idle, start prefetches first, then set
-       * states, and draw at the end.
-       */
-      if (sctx->flags)
-         sctx->emit_cache_flush(sctx, &sctx->gfx_cs);
+   /* <-- CUs are idle here if we waited. */
 
-      /* Only prefetch the API VS and VBO descriptors. */
-      si_prefetch_shaders(sctx);
-
-      /* This uploads VBO descriptors, sets user SGPRs, and executes the L2 prefetch.
-       * It should done after cache flushing and after the VS prefetch.
-       */
-      if (unlikely((!si_upload_and_prefetch_VB_descriptors
-                       (sctx, state, partial_velem_mask)))) {
-         DRAW_CLEANUP;
-         return;
-      }
+   /* If we haven't emitted the render condition state (because it depends on cache flushes),
+    * do it now.
+    */
+   if (si_is_atom_dirty(sctx, &sctx->atoms.s.render_cond)) {
+      sctx->atoms.s.render_cond.emit(sctx);
+      sctx->dirty_atoms &= ~si_get_atom_bit(sctx, &sctx->atoms.s.render_cond);
+   }
 
-      si_emit_all_states
-            (sctx, info, indirect, prim, instance_count, min_direct_count,
-             primitive_restart, masked_atoms);
+   /* This needs to be done after cache flushes because ACQUIRE_MEM rolls the context. */
+   if (GFX_VERSION == GFX9 && gfx9_scissor_bug &&
+       (sctx->context_roll || si_is_atom_dirty(sctx, &sctx->atoms.s.scissors))) {
+      sctx->atoms.s.scissors.emit(sctx);
+      sctx->dirty_atoms &= ~si_get_atom_bit(sctx, &sctx->atoms.s.scissors);
+   }
+   assert(sctx->dirty_atoms == 0);
 
-      if (GFX_VERSION == GFX9 && gfx9_scissor_bug &&
-          (sctx->context_roll || si_is_atom_dirty(sctx, &sctx->atoms.s.scissors))) {
-         sctx->atoms.s.scissors.emit(sctx);
-         sctx->dirty_atoms &= ~si_get_atom_bit(sctx, &sctx->atoms.s.scissors);
-      }
-      assert(sctx->dirty_atoms == 0);
+   /* This uploads VBO descriptors, sets user SGPRs, and executes the L2 prefetch.
+    * It should be done after cache flushing.
+    */
+   if (unlikely((!si_upload_and_prefetch_VB_descriptors
+                    (sctx, state, partial_velem_mask)))) {
+      DRAW_CLEANUP;
+      return;
+   }
 
-      si_emit_draw_packets
-            (sctx, info, drawid_offset, indirect, draws, num_draws, indexbuf,
-             index_size, index_offset, instance_count);
+   si_emit_draw_packets
+         (sctx, info, drawid_offset, indirect, draws, num_draws, indexbuf,
+          index_size, index_offset, instance_count);
+   /* <-- CUs start to get busy here if we waited. */
 
-      /* Prefetch the remaining shaders after the draw has been
-       * started. */
-      si_prefetch_shaders(sctx);
-   }
+   /* Start prefetches after the draw has been started. Both will run
+    * in parallel, but starting the draw first is more important.
+    */
+   si_prefetch_shaders(sctx);
 
    /* Clear the context roll flag after the draw call.
    * Only used by the gfx9 scissor bug.
-- 
2.7.4
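
For reading convenience, below is a condensed sketch of the single codepath this
patch keeps, assembled from the added lines above. It is a reading aid under
stated assumptions, not compilable code: the real calls are templated (the
template argument lists are omitted here), and the DRAW_CLEANUP error path and
the gfx9 scissor-bug workaround are elided.

   /* Optimal packet order: states -> cache flush -> late atoms ->
    * VBO descriptors + L2 prefetch -> draw -> shader prefetch. */
   si_emit_all_states(sctx, info, indirect, prim, instance_count,
                      min_direct_count, primitive_restart, masked_atoms);

   if (sctx->flags)
      sctx->emit_cache_flush(sctx, &sctx->gfx_cs); /* CUs are idle here if we waited */

   /* The render condition atom was masked earlier so that it can be
    * emitted after the cache flush. */
   if (si_is_atom_dirty(sctx, &sctx->atoms.s.render_cond))
      sctx->atoms.s.render_cond.emit(sctx);

   /* Upload VBO descriptors and start the L2 prefetch before the draw. */
   si_upload_and_prefetch_VB_descriptors(sctx, state, partial_velem_mask);

   /* Draw first so that index fetching starts before shader prefetches;
    * the prefetches then run in parallel with the draw. */
   si_emit_draw_packets(sctx, info, drawid_offset, indirect, draws, num_draws,
                        indexbuf, index_size, index_offset, instance_count);
   si_prefetch_shaders(sctx);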