From c8e2c6faf6448697d949b962179a543ac9c2afee Mon Sep 17 00:00:00 2001 From: =?utf8?q?Marek=20Ol=C5=A1=C3=A1k?= Date: Tue, 22 Feb 2022 03:05:35 -0500 Subject: [PATCH] radeonsi: use SET_SH_REG_INDEX with index=3 for registers containing CU_EN This matches PAL and RADV behavior. It's for preemption. Acked-by: Pierre-Eric Pelloux-Prayer Part-of: --- src/gallium/drivers/radeonsi/si_build_pm4.h | 33 ++++++++++++++++++ src/gallium/drivers/radeonsi/si_pm4.c | 41 +++++++++++++++-------- src/gallium/drivers/radeonsi/si_pm4.h | 1 + src/gallium/drivers/radeonsi/si_state.c | 24 ++++++++----- src/gallium/drivers/radeonsi/si_state_shaders.cpp | 39 +++++++++++---------- 5 files changed, 99 insertions(+), 39 deletions(-) diff --git a/src/gallium/drivers/radeonsi/si_build_pm4.h b/src/gallium/drivers/radeonsi/si_build_pm4.h index 6b461c9..a37ab1b 100644 --- a/src/gallium/drivers/radeonsi/si_build_pm4.h +++ b/src/gallium/drivers/radeonsi/si_build_pm4.h @@ -117,11 +117,23 @@ radeon_emit(((reg) - SI_SH_REG_OFFSET) >> 2); \ } while (0) +#define radeon_set_sh_reg_idx3_seq(reg, num) do { \ + SI_CHECK_SHADOWED_REGS(reg, num); \ + assert((reg) >= SI_SH_REG_OFFSET && (reg) < SI_SH_REG_END); \ + radeon_emit(PKT3(PKT3_SET_SH_REG_INDEX, num, 0)); \ + radeon_emit((((reg) - SI_SH_REG_OFFSET) >> 2) | (3 << 28)); \ +} while (0) + #define radeon_set_sh_reg(reg, value) do { \ radeon_set_sh_reg_seq(reg, 1); \ radeon_emit(value); \ } while (0) +#define radeon_set_sh_reg_idx3(reg, value) do { \ + radeon_set_sh_reg_idx3_seq(reg, 1); \ + radeon_emit(value); \ +} while (0) + #define radeon_set_uconfig_reg_seq(reg, num, perfctr) do { \ SI_CHECK_SHADOWED_REGS(reg, num); \ assert((reg) >= CIK_UCONFIG_REG_OFFSET && (reg) < CIK_UCONFIG_REG_END); \ @@ -247,6 +259,19 @@ } \ } while (0) +#define radeon_opt_set_sh_reg_idx3(sctx, offset, reg, val) do { \ + unsigned __value = val; \ + if (((sctx->tracked_regs.reg_saved >> (reg)) & 0x1) != 0x1 || \ + sctx->tracked_regs.reg_value[reg] != __value) { \ + if (sctx->chip_class >= GFX10) \ + radeon_set_sh_reg_idx3(offset, __value); \ + else \ + radeon_set_sh_reg(offset, __value); \ + sctx->tracked_regs.reg_saved |= BITFIELD64_BIT(reg); \ + sctx->tracked_regs.reg_value[reg] = __value; \ + } \ +} while (0) + #define radeon_opt_set_uconfig_reg(sctx, offset, reg, val) do { \ unsigned __value = val; \ if (((sctx->tracked_regs.reg_saved >> (reg)) & 0x1) != 0x1 || \ @@ -288,6 +313,14 @@ static inline void radeon_set_sh_reg_func(struct radeon_cmdbuf *cs, unsigned reg radeon_end(); } +static inline void radeon_set_sh_reg_idx3_func(struct radeon_cmdbuf *cs, unsigned reg_offset, + uint32_t value) +{ + radeon_begin(cs); + radeon_set_sh_reg_idx3(reg_offset, value); + radeon_end(); +} + /* This should be evaluated at compile time if all parameters are constants. */ static ALWAYS_INLINE unsigned si_get_user_data_base(enum chip_class chip_class, enum si_has_tess has_tess, diff --git a/src/gallium/drivers/radeonsi/si_pm4.c b/src/gallium/drivers/radeonsi/si_pm4.c index de12bdf..5c200b3 100644 --- a/src/gallium/drivers/radeonsi/si_pm4.c +++ b/src/gallium/drivers/radeonsi/si_pm4.c @@ -53,6 +53,27 @@ static void si_pm4_cmd_end(struct si_pm4_state *state, bool predicate) state->pm4[state->last_pm4] = PKT3(state->last_opcode, count, predicate); } +static void si_pm4_set_reg_custom(struct si_pm4_state *state, unsigned reg, uint32_t val, + unsigned opcode, unsigned idx) +{ + reg >>= 2; + + if (!state->max_dw) + state->max_dw = ARRAY_SIZE(state->pm4); + + assert(state->ndw + 2 <= state->max_dw); + + if (opcode != state->last_opcode || reg != (state->last_reg + 1)) { + si_pm4_cmd_begin(state, opcode); + state->pm4[state->ndw++] = reg | (idx << 28); + } + + assert(reg <= UINT16_MAX); + state->last_reg = reg; + state->pm4[state->ndw++] = val; + si_pm4_cmd_end(state, false); +} + void si_pm4_set_reg(struct si_pm4_state *state, unsigned reg, uint32_t val) { unsigned opcode; @@ -80,22 +101,14 @@ void si_pm4_set_reg(struct si_pm4_state *state, unsigned reg, uint32_t val) return; } - reg >>= 2; - - if (!state->max_dw) - state->max_dw = ARRAY_SIZE(state->pm4); - - assert(state->ndw + 2 <= state->max_dw); + si_pm4_set_reg_custom(state, reg, val, opcode, 0); +} - if (opcode != state->last_opcode || reg != (state->last_reg + 1)) { - si_pm4_cmd_begin(state, opcode); - state->pm4[state->ndw++] = reg; - } +void si_pm4_set_reg_idx3(struct si_pm4_state *state, unsigned reg, uint32_t val) +{ + SI_CHECK_SHADOWED_REGS(reg, 1); - assert(reg <= UINT16_MAX); - state->last_reg = reg; - state->pm4[state->ndw++] = val; - si_pm4_cmd_end(state, false); + si_pm4_set_reg_custom(state, reg - SI_SH_REG_OFFSET, val, PKT3_SET_SH_REG_INDEX, 3); } void si_pm4_clear_state(struct si_pm4_state *state) diff --git a/src/gallium/drivers/radeonsi/si_pm4.h b/src/gallium/drivers/radeonsi/si_pm4.h index 4ebc97c..636b622 100644 --- a/src/gallium/drivers/radeonsi/si_pm4.h +++ b/src/gallium/drivers/radeonsi/si_pm4.h @@ -61,6 +61,7 @@ struct si_pm4_state { void si_pm4_cmd_add(struct si_pm4_state *state, uint32_t dw); void si_pm4_set_reg(struct si_pm4_state *state, unsigned reg, uint32_t val); +void si_pm4_set_reg_idx3(struct si_pm4_state *state, unsigned reg, uint32_t val); void si_pm4_clear_state(struct si_pm4_state *state); void si_pm4_free_state(struct si_context *sctx, struct si_pm4_state *state, unsigned idx); diff --git a/src/gallium/drivers/radeonsi/si_state.c b/src/gallium/drivers/radeonsi/si_state.c index 1e2755a..f4c3454 100644 --- a/src/gallium/drivers/radeonsi/si_state.c +++ b/src/gallium/drivers/radeonsi/si_state.c @@ -5476,8 +5476,10 @@ void si_init_cs_preamble_state(struct si_context *sctx, bool uses_reg_shadowing) if (sctx->chip_class >= GFX7) { ac_set_reg_cu_en(pm4, R_00B01C_SPI_SHADER_PGM_RSRC3_PS, - S_00B01C_CU_EN(cu_mask_ps) | S_00B01C_WAVE_LIMIT(0x3F), - C_00B01C_CU_EN, 0, &sscreen->info, (void*)si_pm4_set_reg); + S_00B01C_CU_EN(cu_mask_ps) | + S_00B01C_WAVE_LIMIT(0x3F), + C_00B01C_CU_EN, 0, &sscreen->info, + (void*)(sctx->chip_class >= GFX10 ? si_pm4_set_reg_idx3 : si_pm4_set_reg)); } if (sctx->chip_class <= GFX8) { @@ -5514,11 +5516,13 @@ void si_init_cs_preamble_state(struct si_context *sctx, bool uses_reg_shadowing) if (sctx->chip_class >= GFX7 && sctx->chip_class <= GFX8) { ac_set_reg_cu_en(pm4, R_00B51C_SPI_SHADER_PGM_RSRC3_LS, S_00B51C_CU_EN(0xffff) | S_00B51C_WAVE_LIMIT(0x3F), - C_00B51C_CU_EN, 0, &sscreen->info, (void*)si_pm4_set_reg); + C_00B51C_CU_EN, 0, &sscreen->info, + (void*)(sctx->chip_class >= GFX10 ? si_pm4_set_reg_idx3 : si_pm4_set_reg)); si_pm4_set_reg(pm4, R_00B41C_SPI_SHADER_PGM_RSRC3_HS, S_00B41C_WAVE_LIMIT(0x3F)); ac_set_reg_cu_en(pm4, R_00B31C_SPI_SHADER_PGM_RSRC3_ES, S_00B31C_CU_EN(0xffff) | S_00B31C_WAVE_LIMIT(0x3F), - C_00B31C_CU_EN, 0, &sscreen->info, (void*)si_pm4_set_reg); + C_00B31C_CU_EN, 0, &sscreen->info, + (void*)(sctx->chip_class >= GFX10 ? si_pm4_set_reg_idx3 : si_pm4_set_reg)); /* If this is 0, Bonaire can hang even if GS isn't being used. * Other chips are unaffected. These are suboptimal values, @@ -5560,7 +5564,8 @@ void si_init_cs_preamble_state(struct si_context *sctx, bool uses_reg_shadowing) if (sctx->chip_class >= GFX9) { ac_set_reg_cu_en(pm4, R_00B41C_SPI_SHADER_PGM_RSRC3_HS, S_00B41C_CU_EN(0xffff) | S_00B41C_WAVE_LIMIT(0x3F), C_00B41C_CU_EN, - 0, &sscreen->info, (void*)si_pm4_set_reg); + 0, &sscreen->info, + (void*)(sctx->chip_class >= GFX10 ? si_pm4_set_reg_idx3 : si_pm4_set_reg)); si_pm4_set_reg(pm4, R_028B50_VGT_TESS_DISTRIBUTION, S_028B50_ACCUM_ISOLINE(12) | S_028B50_ACCUM_TRI(30) | S_028B50_ACCUM_QUAD(24) | @@ -5579,11 +5584,14 @@ void si_init_cs_preamble_state(struct si_context *sctx, bool uses_reg_shadowing) if (sctx->chip_class >= GFX10) { /* Logical CUs 16 - 31 */ ac_set_reg_cu_en(pm4, R_00B004_SPI_SHADER_PGM_RSRC4_PS, S_00B004_CU_EN(cu_mask_ps >> 16), - C_00B004_CU_EN, 16, &sscreen->info, (void*)si_pm4_set_reg); + C_00B004_CU_EN, 16, &sscreen->info, + (void*)(sctx->chip_class >= GFX10 ? si_pm4_set_reg_idx3 : si_pm4_set_reg)); ac_set_reg_cu_en(pm4, R_00B104_SPI_SHADER_PGM_RSRC4_VS, S_00B104_CU_EN(0xffff), - C_00B104_CU_EN, 16, &sscreen->info, (void*)si_pm4_set_reg); + C_00B104_CU_EN, 16, &sscreen->info, + (void*)(sctx->chip_class >= GFX10 ? si_pm4_set_reg_idx3 : si_pm4_set_reg)); ac_set_reg_cu_en(pm4, R_00B404_SPI_SHADER_PGM_RSRC4_HS, S_00B404_CU_EN(0xffff), - C_00B404_CU_EN, 16, &sscreen->info, (void*)si_pm4_set_reg); + C_00B404_CU_EN, 16, &sscreen->info, + (void*)(sctx->chip_class >= GFX10 ? si_pm4_set_reg_idx3 : si_pm4_set_reg)); si_pm4_set_reg(pm4, R_00B0C8_SPI_SHADER_USER_ACCUM_PS_0, 0); si_pm4_set_reg(pm4, R_00B0CC_SPI_SHADER_USER_ACCUM_PS_1, 0); diff --git a/src/gallium/drivers/radeonsi/si_state_shaders.cpp b/src/gallium/drivers/radeonsi/si_state_shaders.cpp index 3a26ace..f8105e2 100644 --- a/src/gallium/drivers/radeonsi/si_state_shaders.cpp +++ b/src/gallium/drivers/radeonsi/si_state_shaders.cpp @@ -965,27 +965,29 @@ static void si_emit_shader_gs(struct si_context *sctx) ac_set_reg_cu_en(&sctx->gfx_cs, R_00B21C_SPI_SHADER_PGM_RSRC3_GS, shader->ctx_reg.gs.spi_shader_pgm_rsrc3_gs, C_00B21C_CU_EN, 0, &sctx->screen->info, - (void (*)(void*, unsigned, uint32_t))radeon_set_sh_reg_func); + (void (*)(void*, unsigned, uint32_t)) + (sctx->chip_class >= GFX10 ? radeon_set_sh_reg_idx3_func : radeon_set_sh_reg_func)); sctx->tracked_regs.reg_saved &= ~BITFIELD64_BIT(SI_TRACKED_SPI_SHADER_PGM_RSRC3_GS); } if (sctx->chip_class >= GFX10) { ac_set_reg_cu_en(&sctx->gfx_cs, R_00B204_SPI_SHADER_PGM_RSRC4_GS, shader->ctx_reg.gs.spi_shader_pgm_rsrc4_gs, C_00B204_CU_EN, 16, &sctx->screen->info, - (void (*)(void*, unsigned, uint32_t))radeon_set_sh_reg_func); + (void (*)(void*, unsigned, uint32_t)) + (sctx->chip_class >= GFX10 ? radeon_set_sh_reg_idx3_func : radeon_set_sh_reg_func)); sctx->tracked_regs.reg_saved &= ~BITFIELD64_BIT(SI_TRACKED_SPI_SHADER_PGM_RSRC4_GS); } } else { radeon_begin_again(&sctx->gfx_cs); if (sctx->chip_class >= GFX7) { - radeon_opt_set_sh_reg(sctx, R_00B21C_SPI_SHADER_PGM_RSRC3_GS, - SI_TRACKED_SPI_SHADER_PGM_RSRC3_GS, - shader->ctx_reg.gs.spi_shader_pgm_rsrc3_gs); + radeon_opt_set_sh_reg_idx3(sctx, R_00B21C_SPI_SHADER_PGM_RSRC3_GS, + SI_TRACKED_SPI_SHADER_PGM_RSRC3_GS, + shader->ctx_reg.gs.spi_shader_pgm_rsrc3_gs); } if (sctx->chip_class >= GFX10) { - radeon_opt_set_sh_reg(sctx, R_00B204_SPI_SHADER_PGM_RSRC4_GS, - SI_TRACKED_SPI_SHADER_PGM_RSRC4_GS, - shader->ctx_reg.gs.spi_shader_pgm_rsrc4_gs); + radeon_opt_set_sh_reg_idx3(sctx, R_00B204_SPI_SHADER_PGM_RSRC4_GS, + SI_TRACKED_SPI_SHADER_PGM_RSRC4_GS, + shader->ctx_reg.gs.spi_shader_pgm_rsrc4_gs); } radeon_end(); } @@ -1192,20 +1194,22 @@ static void gfx10_emit_shader_ngg_tail(struct si_context *sctx, struct si_shader ac_set_reg_cu_en(&sctx->gfx_cs, R_00B21C_SPI_SHADER_PGM_RSRC3_GS, shader->ctx_reg.ngg.spi_shader_pgm_rsrc3_gs, C_00B21C_CU_EN, 0, &sctx->screen->info, - (void (*)(void*, unsigned, uint32_t))radeon_set_sh_reg_func); + (void (*)(void*, unsigned, uint32_t)) + (sctx->chip_class >= GFX10 ? radeon_set_sh_reg_idx3_func : radeon_set_sh_reg_func)); ac_set_reg_cu_en(&sctx->gfx_cs, R_00B204_SPI_SHADER_PGM_RSRC4_GS, shader->ctx_reg.ngg.spi_shader_pgm_rsrc4_gs, C_00B204_CU_EN, 16, &sctx->screen->info, - (void (*)(void*, unsigned, uint32_t))radeon_set_sh_reg_func); + (void (*)(void*, unsigned, uint32_t)) + (sctx->chip_class >= GFX10 ? radeon_set_sh_reg_idx3_func : radeon_set_sh_reg_func)); sctx->tracked_regs.reg_saved &= ~BITFIELD64_BIT(SI_TRACKED_SPI_SHADER_PGM_RSRC4_GS) & ~BITFIELD64_BIT(SI_TRACKED_SPI_SHADER_PGM_RSRC3_GS); } else { - radeon_opt_set_sh_reg(sctx, R_00B21C_SPI_SHADER_PGM_RSRC3_GS, - SI_TRACKED_SPI_SHADER_PGM_RSRC3_GS, - shader->ctx_reg.ngg.spi_shader_pgm_rsrc3_gs); - radeon_opt_set_sh_reg(sctx, R_00B204_SPI_SHADER_PGM_RSRC4_GS, - SI_TRACKED_SPI_SHADER_PGM_RSRC4_GS, - shader->ctx_reg.ngg.spi_shader_pgm_rsrc4_gs); + radeon_opt_set_sh_reg_idx3(sctx, R_00B21C_SPI_SHADER_PGM_RSRC3_GS, + SI_TRACKED_SPI_SHADER_PGM_RSRC3_GS, + shader->ctx_reg.ngg.spi_shader_pgm_rsrc3_gs); + radeon_opt_set_sh_reg_idx3(sctx, R_00B204_SPI_SHADER_PGM_RSRC4_GS, + SI_TRACKED_SPI_SHADER_PGM_RSRC4_GS, + shader->ctx_reg.ngg.spi_shader_pgm_rsrc4_gs); radeon_end(); } } @@ -1674,7 +1678,8 @@ static void si_shader_vs(struct si_screen *sscreen, struct si_shader *shader, ac_set_reg_cu_en(pm4, R_00B118_SPI_SHADER_PGM_RSRC3_VS, S_00B118_CU_EN(cu_mask) | S_00B118_WAVE_LIMIT(0x3F), C_00B118_CU_EN, 0, &sscreen->info, - (void (*)(void*, unsigned, uint32_t))si_pm4_set_reg); + (void (*)(void*, unsigned, uint32_t)) + (sscreen->info.chip_class >= GFX10 ? si_pm4_set_reg_idx3 : si_pm4_set_reg)); si_pm4_set_reg(pm4, R_00B11C_SPI_SHADER_LATE_ALLOC_VS, S_00B11C_LIMIT(late_alloc_wave64)); } -- 2.7.4