radeonsi: eliminate redundant SPI_SHADER_PGM_RSRC3/4_GS register writes
author: Marek Olšák <marek.olsak@amd.com>
Wed, 18 Aug 2021 17:05:16 +0000 (13:05 -0400)
committer: Marge Bot <eric+marge@anholt.net>
Tue, 14 Sep 2021 15:24:11 +0000 (15:24 +0000)
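These registers don't change much, so track the last value written to each of them and skip the write when the new value is identical.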

Reviewed-by: Pierre-Eric Pelloux-Prayer <pierre-eric.pelloux-prayer@amd.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/12343>

src/gallium/drivers/radeonsi/si_build_pm4.h
src/gallium/drivers/radeonsi/si_gfx_cs.c
src/gallium/drivers/radeonsi/si_shader.h
src/gallium/drivers/radeonsi/si_state.h
src/gallium/drivers/radeonsi/si_state_shaders.c

index b96c920..40e430b 100644 (file)
    } \
 } while (0)
 
+#define radeon_opt_set_sh_reg(sctx, offset, reg, val) do { /* emit SH register `offset` only when its tracked slot `reg` is unset or holds a different value */ \
+   unsigned __value = val; /* evaluate `val` exactly once */ \
+   if (((sctx->tracked_regs.reg_saved >> (reg)) & 0x1) != 0x1 || /* no value recorded for `reg` yet */ \
+       sctx->tracked_regs.reg_value[reg] != __value) { /* or the recorded value differs */ \
+      radeon_set_sh_reg(cs, offset, __value); /* NOTE(review): `cs` is not a macro parameter; presumably supplied by the enclosing radeon_begin/radeon_begin_again block - confirm */ \
+      sctx->tracked_regs.reg_saved |= BITFIELD64_BIT(reg); /* mark the slot as holding a valid value */ \
+      sctx->tracked_regs.reg_value[reg] = __value; /* remember what was just written */ \
+   } \
+} while (0)
+
 #define radeon_set_privileged_config_reg(cs, reg, value) do { \
    assert((reg) < CIK_UCONFIG_REG_OFFSET); \
    radeon_emit(cs, PKT3(PKT3_COPY_DATA, 4, 0)); \
index 1081b23..80e9f76 100644 (file)
@@ -294,7 +294,7 @@ void si_set_tracked_regs_to_clear_state(struct si_context *ctx)
    ctx->tracked_regs.reg_value[SI_TRACKED_VGT_VERTEX_REUSE_BLOCK_CNTL]  = 0x0000001e; /* From GFX8 */
 
    /* Set all cleared context registers to saved. */
-   ctx->tracked_regs.reg_saved = ~(1ull << SI_TRACKED_GE_PC_ALLOC); /* uconfig reg */
+   ctx->tracked_regs.reg_saved = BITFIELD64_MASK(SI_TRACKED_GE_PC_ALLOC);
    ctx->last_gs_out_prim = 0; /* cleared by CLEAR_STATE */
 }
 
index 0bbd201..fa32c8e 100644 (file)
@@ -839,6 +839,8 @@ struct si_shader {
          unsigned vgt_gs_onchip_cntl;
          unsigned vgt_gs_max_prims_per_subgroup;
          unsigned vgt_esgs_ring_itemsize;
+         unsigned spi_shader_pgm_rsrc3_gs;
+         unsigned spi_shader_pgm_rsrc4_gs;
       } gs;
 
       struct {
@@ -855,6 +857,8 @@ struct si_shader {
          unsigned pa_cl_ngg_cntl;
          unsigned vgt_gs_max_vert_out; /* for API GS */
          unsigned ge_pc_alloc;         /* uconfig register */
+         unsigned spi_shader_pgm_rsrc3_gs;
+         unsigned spi_shader_pgm_rsrc4_gs;
          union si_vgt_stages_key vgt_stages;
       } ngg;
 
index 34fbf43..cd1bd63 100644 (file)
@@ -342,7 +342,10 @@ enum si_tracked_reg
    SI_TRACKED_VGT_TF_PARAM,
    SI_TRACKED_VGT_VERTEX_REUSE_BLOCK_CNTL,
 
+   /* Non-context registers: */
    SI_TRACKED_GE_PC_ALLOC,
+   SI_TRACKED_SPI_SHADER_PGM_RSRC3_GS,
+   SI_TRACKED_SPI_SHADER_PGM_RSRC4_GS,
 
    SI_NUM_TRACKED_REGS,
 };
index 636c318..66a6871 100644 (file)
@@ -819,6 +819,20 @@ static void si_emit_shader_gs(struct si_context *sctx)
                                     shader->vgt_vertex_reuse_block_cntl);
    }
    radeon_end_update_context_roll(sctx);
+
+   /* These don't cause any context rolls. */
+   radeon_begin_again(&sctx->gfx_cs);
+   if (sctx->chip_class >= GFX7) {
+      radeon_opt_set_sh_reg(sctx, R_00B21C_SPI_SHADER_PGM_RSRC3_GS,
+                            SI_TRACKED_SPI_SHADER_PGM_RSRC3_GS,
+                            shader->ctx_reg.gs.spi_shader_pgm_rsrc3_gs);
+   }
+   if (sctx->chip_class >= GFX10) {
+      radeon_opt_set_sh_reg(sctx, R_00B204_SPI_SHADER_PGM_RSRC4_GS,
+                            SI_TRACKED_SPI_SHADER_PGM_RSRC4_GS,
+                            shader->ctx_reg.gs.spi_shader_pgm_rsrc4_gs);
+   }
+   radeon_end();
 }
 
 static void si_shader_gs(struct si_screen *sscreen, struct si_shader *shader)
@@ -923,13 +937,11 @@ static void si_shader_gs(struct si_screen *sscreen, struct si_shader *shader)
 
       si_pm4_set_reg(pm4, R_00B228_SPI_SHADER_PGM_RSRC1_GS, rsrc1);
       si_pm4_set_reg(pm4, R_00B22C_SPI_SHADER_PGM_RSRC2_GS, rsrc2);
-      si_pm4_set_reg(pm4, R_00B21C_SPI_SHADER_PGM_RSRC3_GS,
-                     S_00B21C_CU_EN(0xffff) | S_00B21C_WAVE_LIMIT(0x3F));
 
-      if (sscreen->info.chip_class >= GFX10) {
-         si_pm4_set_reg(pm4, R_00B204_SPI_SHADER_PGM_RSRC4_GS,
-                        S_00B204_CU_EN(0xffff) | S_00B204_SPI_SHADER_LATE_ALLOC_GS_GFX10(0));
-      }
+      shader->ctx_reg.gs.spi_shader_pgm_rsrc3_gs = S_00B21C_CU_EN(0xffff) |
+                                                   S_00B21C_WAVE_LIMIT(0x3F);
+      shader->ctx_reg.gs.spi_shader_pgm_rsrc4_gs =
+         S_00B204_CU_EN(0xffff) | S_00B204_SPI_SHADER_LATE_ALLOC_GS_GFX10(0);
 
       shader->ctx_reg.gs.vgt_gs_onchip_cntl =
          S_028A44_ES_VERTS_PER_SUBGRP(shader->gs_info.es_verts_per_subgroup) |
@@ -944,10 +956,9 @@ static void si_shader_gs(struct si_screen *sscreen, struct si_shader *shader)
 
       polaris_set_vgt_vertex_reuse(sscreen, shader->key.part.gs.es, shader);
    } else {
-      if (sscreen->info.chip_class >= GFX7) {
-         si_pm4_set_reg(pm4, R_00B21C_SPI_SHADER_PGM_RSRC3_GS,
-                        S_00B21C_CU_EN(0xffff) | S_00B21C_WAVE_LIMIT(0x3F));
-      }
+      shader->ctx_reg.gs.spi_shader_pgm_rsrc3_gs = S_00B21C_CU_EN(0xffff) |
+                                                   S_00B21C_WAVE_LIMIT(0x3F);
+
       si_pm4_set_reg(pm4, R_00B220_SPI_SHADER_PGM_LO_GS, va >> 8);
       si_pm4_set_reg(pm4, R_00B224_SPI_SHADER_PGM_HI_GS,
                      S_00B224_MEM_BASE(sscreen->info.address32_hi >> 8));
@@ -1029,6 +1040,15 @@ static void gfx10_emit_shader_ngg_tail(struct si_context *sctx, struct si_shader
 
    /* GE_PC_ALLOC is not a context register, so it doesn't cause a context roll. */
    gfx10_emit_ge_pc_alloc(sctx, shader->ctx_reg.ngg.ge_pc_alloc);
+
+   radeon_begin_again(&sctx->gfx_cs);
+   radeon_opt_set_sh_reg(sctx, R_00B21C_SPI_SHADER_PGM_RSRC3_GS,
+                         SI_TRACKED_SPI_SHADER_PGM_RSRC3_GS,
+                         shader->ctx_reg.ngg.spi_shader_pgm_rsrc3_gs);
+   radeon_opt_set_sh_reg(sctx, R_00B204_SPI_SHADER_PGM_RSRC4_GS,
+                         SI_TRACKED_SPI_SHADER_PGM_RSRC4_GS,
+                         shader->ctx_reg.ngg.spi_shader_pgm_rsrc4_gs);
+   radeon_end();
 }
 
 static void gfx10_emit_shader_ngg_notess_nogs(struct si_context *sctx)
@@ -1218,12 +1238,11 @@ static void gfx10_shader_ngg(struct si_screen *sscreen, struct si_shader *shader
                      S_00B22C_USER_SGPR_MSB_GFX10(num_user_sgprs >> 5) |
                      S_00B22C_OC_LDS_EN(es_stage == MESA_SHADER_TESS_EVAL) |
                      S_00B22C_LDS_SIZE(shader->config.lds_size));
-   si_pm4_set_reg(pm4, R_00B21C_SPI_SHADER_PGM_RSRC3_GS,
-                  S_00B21C_CU_EN(cu_mask) | S_00B21C_WAVE_LIMIT(0x3F));
 
-   si_pm4_set_reg(
-      pm4, R_00B204_SPI_SHADER_PGM_RSRC4_GS,
-      S_00B204_CU_EN(0xffff) | S_00B204_SPI_SHADER_LATE_ALLOC_GS_GFX10(late_alloc_wave64));
+   shader->ctx_reg.ngg.spi_shader_pgm_rsrc3_gs = S_00B21C_CU_EN(cu_mask) |
+                                                 S_00B21C_WAVE_LIMIT(0x3F);
+   shader->ctx_reg.ngg.spi_shader_pgm_rsrc4_gs =
+      S_00B204_CU_EN(0xffff) | S_00B204_SPI_SHADER_LATE_ALLOC_GS_GFX10(late_alloc_wave64);
 
    nparams = MAX2(shader->info.nr_param_exports, 1);
    shader->ctx_reg.ngg.spi_vs_out_config =