From cb7dcdcea02d776431d1e1b614523deeb2dda781 Mon Sep 17 00:00:00 2001 From: =?utf8?q?Marek=20Ol=C5=A1=C3=A1k?= Date: Tue, 11 Jul 2023 05:38:50 -0400 Subject: [PATCH] radeonsi: handle draw user SGPRs as tracked registers instead of this custom code doing the same thing. This tracks changes to LS, ES, and VS user SGPRs separately, so that we can skip more redundant register changes when enabling/disabling GS and tess. The perf impact should be neutral. Reviewed-by: Pierre-Eric Pelloux-Prayer Part-of: --- src/gallium/drivers/radeonsi/si_gfx_cs.c | 2 +- src/gallium/drivers/radeonsi/si_pipe.h | 27 +--- src/gallium/drivers/radeonsi/si_state.h | 19 +++ .../drivers/radeonsi/si_state_draw.cpp | 132 ++++++++---------- 4 files changed, 85 insertions(+), 95 deletions(-) diff --git a/src/gallium/drivers/radeonsi/si_gfx_cs.c b/src/gallium/drivers/radeonsi/si_gfx_cs.c index bec1c1efc40..8bbe4ca5aa7 100644 --- a/src/gallium/drivers/radeonsi/si_gfx_cs.c +++ b/src/gallium/drivers/radeonsi/si_gfx_cs.c @@ -532,7 +532,7 @@ void si_begin_new_gfx_cs(struct si_context *ctx, bool first_cs) /* Invalidate various draw states so that they are emitted before * the first draw call. */ - si_invalidate_draw_constants(ctx); + ctx->last_instance_count = SI_INSTANCE_COUNT_UNKNOWN; ctx->last_index_size = -1; /* Primitive restart is set to false by the gfx preamble on GFX11+. */ ctx->last_primitive_restart_en = ctx->gfx_level >= GFX11 ? false : -1; diff --git a/src/gallium/drivers/radeonsi/si_pipe.h b/src/gallium/drivers/radeonsi/si_pipe.h index 154ab2bf138..57f9b62cb68 100644 --- a/src/gallium/drivers/radeonsi/si_pipe.h +++ b/src/gallium/drivers/radeonsi/si_pipe.h @@ -31,12 +31,10 @@ struct ac_llvm_compiler; /* special primitive types */ #define SI_PRIM_RECTANGLE_LIST MESA_PRIM_COUNT -/* The base vertex and primitive restart can be any number, but we must pick - * one which will mean "unknown" for the purpose of state tracking and - * the number shouldn't be a commonly-used one. 
*/ -#define SI_BASE_VERTEX_UNKNOWN INT_MIN -#define SI_START_INSTANCE_UNKNOWN ((unsigned)INT_MIN) -#define SI_DRAW_ID_UNKNOWN ((unsigned)INT_MIN) +/* The primitive restart index can be any number, but we must pick one which will + * mean "unknown" for the purpose of state tracking and the number shouldn't + * be a commonly-used one. + */ #define SI_RESTART_INDEX_UNKNOWN ((unsigned)INT_MIN) #define SI_INSTANCE_COUNT_UNKNOWN ((unsigned)INT_MIN) #define SI_NUM_SMOOTH_AA_SAMPLES 4 @@ -1167,11 +1165,7 @@ struct si_context { bool disable_instance_packing : 1; uint16_t ngg_culling; unsigned last_index_size; - int last_base_vertex; - unsigned last_start_instance; unsigned last_instance_count; - unsigned last_drawid; - unsigned last_sh_base_reg; int last_primitive_restart_en; unsigned last_restart_index; unsigned last_prim; @@ -1753,19 +1747,6 @@ static inline void si_context_add_resource_size(struct si_context *sctx, struct } } -static inline void si_invalidate_draw_sh_constants(struct si_context *sctx) -{ - sctx->last_base_vertex = SI_BASE_VERTEX_UNKNOWN; - sctx->last_start_instance = SI_START_INSTANCE_UNKNOWN; - sctx->last_drawid = SI_DRAW_ID_UNKNOWN; -} - -static inline void si_invalidate_draw_constants(struct si_context *sctx) -{ - si_invalidate_draw_sh_constants(sctx); - sctx->last_instance_count = SI_INSTANCE_COUNT_UNKNOWN; -} - static inline unsigned si_get_atom_bit(struct si_context *sctx, struct si_atom *atom) { return 1 << (atom - sctx->atoms.array); diff --git a/src/gallium/drivers/radeonsi/si_state.h b/src/gallium/drivers/radeonsi/si_state.h index 3b8284c8e69..eba46d752c2 100644 --- a/src/gallium/drivers/radeonsi/si_state.h +++ b/src/gallium/drivers/radeonsi/si_state.h @@ -341,6 +341,18 @@ enum si_tracked_other_reg { SI_TRACKED_SPI_SHADER_USER_DATA_HS__TCS_OFFCHIP_ADDR, SI_TRACKED_SPI_SHADER_USER_DATA_HS__VS_STATE_BITS, /* GFX6-8 */ + SI_TRACKED_SPI_SHADER_USER_DATA_LS__BASE_VERTEX, + SI_TRACKED_SPI_SHADER_USER_DATA_LS__DRAWID, + 
SI_TRACKED_SPI_SHADER_USER_DATA_LS__START_INSTANCE, + + SI_TRACKED_SPI_SHADER_USER_DATA_ES__BASE_VERTEX, + SI_TRACKED_SPI_SHADER_USER_DATA_ES__DRAWID, + SI_TRACKED_SPI_SHADER_USER_DATA_ES__START_INSTANCE, + + SI_TRACKED_SPI_SHADER_USER_DATA_VS__BASE_VERTEX, /* GFX6-10 */ + SI_TRACKED_SPI_SHADER_USER_DATA_VS__DRAWID, /* GFX6-10 */ + SI_TRACKED_SPI_SHADER_USER_DATA_VS__START_INSTANCE, /* GFX6-10 */ + SI_TRACKED_COMPUTE_RESOURCE_LIMITS, SI_TRACKED_COMPUTE_NUM_THREAD_X, SI_TRACKED_COMPUTE_NUM_THREAD_Y, @@ -359,6 +371,13 @@ enum si_tracked_other_reg { SI_NUM_TRACKED_OTHER_REGS, }; +/* For 3 draw constants: BaseVertex, DrawID, StartInstance */ +#define BASEVERTEX_MASK 0x1 +#define DRAWID_MASK 0x2 +#define STARTINSTANCE_MASK 0x4 +#define BASEVERTEX_DRAWID_MASK (BASEVERTEX_MASK | DRAWID_MASK) +#define BASEVERTEX_DRAWID_STARTINSTANCE_MASK (BASEVERTEX_MASK | DRAWID_MASK | STARTINSTANCE_MASK) + struct si_tracked_regs { uint64_t context_reg_saved_mask; uint32_t context_reg_value[SI_NUM_TRACKED_CONTEXT_REGS]; diff --git a/src/gallium/drivers/radeonsi/si_state_draw.cpp b/src/gallium/drivers/radeonsi/si_state_draw.cpp index f36f3030481..0911407d3b6 100644 --- a/src/gallium/drivers/radeonsi/si_state_draw.cpp +++ b/src/gallium/drivers/radeonsi/si_state_draw.cpp @@ -901,18 +901,24 @@ static void si_emit_tess_io_layout_state(struct si_context *sctx) unsigned tes_sh_base = sctx->shader_pointers.sh_base[PIPE_SHADER_TESS_EVAL]; assert(tes_sh_base); - /* These can't be optimized because the user data SGPRs may have different meaning - * without tessellation. (they are VS and ES/GS user data SGPRs) + /* TES (as ES or VS) reuses the BaseVertex and DrawID user SGPRs that are used when + * tessellation is disabled. That's because those user SGPRs are only set in LS + * for tessellation. 
*/ if (sctx->screen->info.has_set_pairs_packets) { - radeon_push_gfx_sh_reg(tes_sh_base + SI_SGPR_TES_OFFCHIP_LAYOUT * 4, - sctx->tcs_offchip_layout); - radeon_push_gfx_sh_reg(tes_sh_base + SI_SGPR_TES_OFFCHIP_ADDR * 4, - sctx->tes_offchip_ring_va_sgpr); + radeon_opt_push_gfx_sh_reg(tes_sh_base + SI_SGPR_TES_OFFCHIP_LAYOUT * 4, + SI_TRACKED_SPI_SHADER_USER_DATA_ES__BASE_VERTEX, + sctx->tcs_offchip_layout); + radeon_opt_push_gfx_sh_reg(tes_sh_base + SI_SGPR_TES_OFFCHIP_ADDR * 4, + SI_TRACKED_SPI_SHADER_USER_DATA_ES__DRAWID, + sctx->tes_offchip_ring_va_sgpr); } else { - radeon_set_sh_reg_seq(tes_sh_base + SI_SGPR_TES_OFFCHIP_LAYOUT * 4, 2); - radeon_emit(sctx->tcs_offchip_layout); - radeon_emit(sctx->tes_offchip_ring_va_sgpr); + bool has_gs = sctx->ngg || sctx->shader.gs.cso; + + radeon_opt_set_sh_reg2(sctx, tes_sh_base + SI_SGPR_TES_OFFCHIP_LAYOUT * 4, + has_gs ? SI_TRACKED_SPI_SHADER_USER_DATA_ES__BASE_VERTEX + : SI_TRACKED_SPI_SHADER_USER_DATA_VS__BASE_VERTEX, + sctx->tcs_offchip_layout, sctx->tes_offchip_ring_va_sgpr); } radeon_end(); @@ -1678,6 +1684,14 @@ static void si_emit_draw_packets(struct si_context *sctx, const struct pipe_draw unsigned sh_base_reg = si_get_user_data_base(GFX_VERSION, HAS_TESS, HAS_GS, NGG, PIPE_SHADER_VERTEX); bool render_cond_bit = sctx->render_cond_enabled; + unsigned tracked_base_vertex_reg; + + if (HAS_TESS) + tracked_base_vertex_reg = SI_TRACKED_SPI_SHADER_USER_DATA_LS__BASE_VERTEX; + else if (HAS_GS || NGG) + tracked_base_vertex_reg = SI_TRACKED_SPI_SHADER_USER_DATA_ES__BASE_VERTEX; + else + tracked_base_vertex_reg = SI_TRACKED_SPI_SHADER_USER_DATA_VS__BASE_VERTEX; if (!IS_DRAW_VERTEX_STATE && indirect) { assert(num_draws == 1); @@ -1692,7 +1706,10 @@ static void si_emit_draw_packets(struct si_context *sctx, const struct pipe_draw radeon_begin_again(cs); } - si_invalidate_draw_constants(sctx); + /* Invalidate tracked draw constants because DrawIndirect overwrites them. 
*/ + sctx->tracked_regs.other_reg_saved_mask &= + ~(BASEVERTEX_DRAWID_STARTINSTANCE_MASK << tracked_base_vertex_reg); + sctx->last_instance_count = SI_INSTANCE_COUNT_UNKNOWN; radeon_emit(PKT3(PKT3_SET_BASE, 2, 0)); radeon_emit(1); @@ -1766,62 +1783,28 @@ static void si_emit_draw_packets(struct si_context *sctx, const struct pipe_draw if (!is_blit) { /* Prefer SET_SH_REG_PAIRS_PACKED* on Gfx11+. */ if (HAS_PAIRS) { - bool shader_switch = sh_base_reg != sctx->last_sh_base_reg; - - if (shader_switch || - base_vertex != sctx->last_base_vertex || - sctx->last_base_vertex == SI_BASE_VERTEX_UNKNOWN) { - radeon_push_gfx_sh_reg(sh_base_reg + SI_SGPR_BASE_VERTEX * 4, base_vertex); - sctx->last_base_vertex = base_vertex; - } - - if (set_draw_id && - (shader_switch || - drawid_base != sctx->last_drawid || - sctx->last_drawid == SI_DRAW_ID_UNKNOWN)) { - radeon_push_gfx_sh_reg(sh_base_reg + SI_SGPR_DRAWID * 4, drawid_base); - sctx->last_drawid = drawid_base; + radeon_opt_push_gfx_sh_reg(sh_base_reg + SI_SGPR_BASE_VERTEX * 4, + tracked_base_vertex_reg, base_vertex); + if (set_draw_id) { + radeon_opt_push_gfx_sh_reg(sh_base_reg + SI_SGPR_DRAWID * 4, + tracked_base_vertex_reg + 1, drawid_base); } - - if (set_base_instance && - (shader_switch || - info->start_instance != sctx->last_start_instance || - sctx->last_start_instance == SI_START_INSTANCE_UNKNOWN)) { - radeon_push_gfx_sh_reg(sh_base_reg + SI_SGPR_START_INSTANCE * 4, - info->start_instance); - sctx->last_start_instance = info->start_instance; + if (set_base_instance) { + radeon_opt_push_gfx_sh_reg(sh_base_reg + SI_SGPR_START_INSTANCE * 4, + tracked_base_vertex_reg + 2, info->start_instance); } - - sctx->last_sh_base_reg = sh_base_reg; - } else if (base_vertex != sctx->last_base_vertex || - sctx->last_base_vertex == SI_BASE_VERTEX_UNKNOWN || - (set_base_instance && - (info->start_instance != sctx->last_start_instance || - sctx->last_start_instance == SI_START_INSTANCE_UNKNOWN)) || - (set_draw_id && - (drawid_base != 
sctx->last_drawid || - sctx->last_drawid == SI_DRAW_ID_UNKNOWN)) || - sh_base_reg != sctx->last_sh_base_reg) { + } else { if (set_base_instance) { - radeon_set_sh_reg_seq(sh_base_reg + SI_SGPR_BASE_VERTEX * 4, 3); - radeon_emit(base_vertex); - radeon_emit(drawid_base); - radeon_emit(info->start_instance); - - sctx->last_start_instance = info->start_instance; - sctx->last_drawid = drawid_base; + radeon_opt_set_sh_reg3(sctx, sh_base_reg + SI_SGPR_BASE_VERTEX * 4, + tracked_base_vertex_reg, base_vertex, drawid_base, + info->start_instance); } else if (set_draw_id) { - radeon_set_sh_reg_seq(sh_base_reg + SI_SGPR_BASE_VERTEX * 4, 2); - radeon_emit(base_vertex); - radeon_emit(drawid_base); - - sctx->last_drawid = drawid_base; + radeon_opt_set_sh_reg2(sctx, sh_base_reg + SI_SGPR_BASE_VERTEX * 4, + tracked_base_vertex_reg, base_vertex, drawid_base); } else { - radeon_set_sh_reg(sh_base_reg + SI_SGPR_BASE_VERTEX * 4, base_vertex); + radeon_opt_set_sh_reg(sctx, sh_base_reg + SI_SGPR_BASE_VERTEX * 4, + tracked_base_vertex_reg, base_vertex); } - - sctx->last_base_vertex = base_vertex; - sctx->last_sh_base_reg = sh_base_reg; } } @@ -1837,7 +1820,8 @@ static void si_emit_draw_packets(struct si_context *sctx, const struct pipe_draw */ if (is_blit) { /* Re-emit draw constants after we leave u_blitter. */ - si_invalidate_draw_sh_constants(sctx); + sctx->tracked_regs.other_reg_saved_mask &= + ~(BASEVERTEX_DRAWID_STARTINSTANCE_MASK << tracked_base_vertex_reg); /* Blit VS doesn't use BASE_VERTEX, START_INSTANCE, and DRAWID. 
*/ radeon_set_sh_reg_seq(sh_base_reg + SI_SGPR_VS_BLIT_DATA * 4, sctx->num_vs_blit_sgprs); @@ -1886,8 +1870,8 @@ static void si_emit_draw_packets(struct si_context *sctx, const struct pipe_draw radeon_emit(V_0287F0_DI_SRC_SEL_DMA); /* NOT_EOP disabled */ } if (num_draws > 1) { - sctx->last_base_vertex = draws[num_draws - 1].index_bias; - sctx->last_drawid = drawid_base + num_draws - 1; + sctx->tracked_regs.other_reg_saved_mask &= + ~(BASEVERTEX_DRAWID_MASK << tracked_base_vertex_reg); } } else { /* Only DrawID varies. */ @@ -1904,8 +1888,10 @@ static void si_emit_draw_packets(struct si_context *sctx, const struct pipe_draw radeon_emit(draws[i].count); radeon_emit(V_0287F0_DI_SRC_SEL_DMA); /* NOT_EOP disabled */ } - if (num_draws > 1) - sctx->last_drawid = drawid_base + num_draws - 1; + if (num_draws > 1) { + sctx->tracked_regs.other_reg_saved_mask &= + ~(DRAWID_MASK << tracked_base_vertex_reg); + } } } else { if (index_bias_varies) { @@ -1923,8 +1909,10 @@ static void si_emit_draw_packets(struct si_context *sctx, const struct pipe_draw radeon_emit(draws[i].count); radeon_emit(V_0287F0_DI_SRC_SEL_DMA); /* NOT_EOP disabled */ } - if (num_draws > 1) - sctx->last_base_vertex = draws[num_draws - 1].index_bias; + if (num_draws > 1) { + sctx->tracked_regs.other_reg_saved_mask &= + ~(BASEVERTEX_MASK << tracked_base_vertex_reg); + } } else { /* DrawID and BaseVertex are constant. 
*/ if (GFX_VERSION == GFX10) { @@ -1966,8 +1954,8 @@ static void si_emit_draw_packets(struct si_context *sctx, const struct pipe_draw radeon_emit(V_0287F0_DI_SRC_SEL_AUTO_INDEX | use_opaque); } if (num_draws > 1 && (IS_DRAW_VERTEX_STATE || !sctx->num_vs_blit_sgprs)) { - sctx->last_base_vertex = draws[num_draws - 1].start; - sctx->last_drawid = drawid_base + num_draws - 1; + sctx->tracked_regs.other_reg_saved_mask &= + ~(BASEVERTEX_DRAWID_MASK << tracked_base_vertex_reg); } } else { for (unsigned i = 0; i < num_draws; i++) { @@ -1978,8 +1966,10 @@ static void si_emit_draw_packets(struct si_context *sctx, const struct pipe_draw radeon_emit(draws[i].count); radeon_emit(V_0287F0_DI_SRC_SEL_AUTO_INDEX | use_opaque); } - if (num_draws > 1 && (IS_DRAW_VERTEX_STATE || !sctx->num_vs_blit_sgprs)) - sctx->last_base_vertex = draws[num_draws - 1].start; + if (num_draws > 1 && (IS_DRAW_VERTEX_STATE || !sctx->num_vs_blit_sgprs)) { + sctx->tracked_regs.other_reg_saved_mask &= + ~(BASEVERTEX_MASK << tracked_base_vertex_reg); + } } } } -- 2.34.1