From ccaaf8fe04c956d9f16f98b7f7fa69a2526283bc Mon Sep 17 00:00:00 2001 From: =?utf8?q?Marek=20Ol=C5=A1=C3=A1k?= Date: Tue, 21 Feb 2023 12:22:38 -0500 Subject: [PATCH] amd: massively simplify how info->spi_cu_en is applied Instead of having ac_set_reg_cu_en that sets the register, replace it with ac_apply_cu_en that only returns the modified register value, which allows a large simplification in both drivers because a lot of code becomes duplicated after it's switched to ac_apply_cu_en. RADV also didn't apply it to a few registers. Fixed. This removes 82 lines of code in total. Reviewed-by: Samuel Pitoiset Reviewed-by: Pierre-Eric Pelloux-Prayer Part-of: --- src/amd/common/ac_shader_util.c | 11 +- src/amd/common/ac_shader_util.h | 5 +- src/amd/vulkan/radv_cs.h | 11 -- src/amd/vulkan/radv_pipeline.c | 73 +++++------- src/amd/vulkan/si_cmd_buffer.c | 52 ++++----- src/gallium/drivers/radeonsi/si_build_pm4.h | 30 +---- src/gallium/drivers/radeonsi/si_pm4.c | 6 +- src/gallium/drivers/radeonsi/si_pm4.h | 6 +- src/gallium/drivers/radeonsi/si_state.c | 45 ++++---- src/gallium/drivers/radeonsi/si_state_shaders.cpp | 133 +++++++++------------- 10 files changed, 145 insertions(+), 227 deletions(-) diff --git a/src/amd/common/ac_shader_util.c b/src/amd/common/ac_shader_util.c index 9ffb90c..340d51f 100644 --- a/src/amd/common/ac_shader_util.c +++ b/src/amd/common/ac_shader_util.c @@ -958,9 +958,8 @@ unsigned ac_compute_ngg_workgroup_size(unsigned es_verts, unsigned gs_inst_prims return CLAMP(workgroup_size, 1, 256); } -void ac_set_reg_cu_en(void *cs, unsigned reg_offset, uint32_t value, uint32_t clear_mask, - unsigned value_shift, const struct radeon_info *info, - void set_sh_reg(void*, unsigned, uint32_t)) +uint32_t ac_apply_cu_en(uint32_t value, uint32_t clear_mask, unsigned value_shift, + const struct radeon_info *info) { /* Register field position and mask. */ uint32_t cu_en_mask = ~clear_mask; @@ -970,10 +969,8 @@ void ac_set_reg_cu_en(void *cs, unsigned reg_offset, uint32_t value, uint32_t cl /* AND the field by spi_cu_en. */ uint32_t spi_cu_en = info->spi_cu_en >> value_shift; - uint32_t new_value = (value & ~cu_en_mask) | - (((cu_en & spi_cu_en) << cu_en_shift) & cu_en_mask); - - set_sh_reg(cs, reg_offset, new_value); + return (value & ~cu_en_mask) | + (((cu_en & spi_cu_en) << cu_en_shift) & cu_en_mask); } /* Return the register value and tune bytes_per_wave to increase scratch performance. */ diff --git a/src/amd/common/ac_shader_util.h b/src/amd/common/ac_shader_util.h index 3b28ae2..1c0d2fd 100644 --- a/src/amd/common/ac_shader_util.h +++ b/src/amd/common/ac_shader_util.h @@ -166,9 +166,8 @@ unsigned ac_compute_esgs_workgroup_size(enum amd_gfx_level gfx_level, unsigned w unsigned ac_compute_ngg_workgroup_size(unsigned es_verts, unsigned gs_inst_prims, unsigned max_vtx_out, unsigned prim_amp_factor); -void ac_set_reg_cu_en(void *cs, unsigned reg_offset, uint32_t value, uint32_t clear_mask, - unsigned value_shift, const struct radeon_info *info, - void set_sh_reg(void*, unsigned, uint32_t)); +uint32_t ac_apply_cu_en(uint32_t value, uint32_t clear_mask, unsigned value_shift, + const struct radeon_info *info); void ac_get_scratch_tmpring_size(const struct radeon_info *info, unsigned bytes_per_wave, unsigned *max_seen_bytes_per_wave, diff --git a/src/amd/vulkan/radv_cs.h b/src/amd/vulkan/radv_cs.h index ac3d107..351d8e2 100644 --- a/src/amd/vulkan/radv_cs.h +++ b/src/amd/vulkan/radv_cs.h @@ -118,17 +118,6 @@ radeon_set_sh_reg_idx(const struct radv_physical_device *pdevice, struct radeon_ } static inline void -gfx10_set_sh_reg_idx3(struct radeon_cmdbuf *cs, unsigned reg, unsigned value) -{ - assert(reg >= SI_SH_REG_OFFSET && reg < SI_SH_REG_END); - assert(cs->cdw + 3 <= cs->max_dw); - - radeon_emit(cs, PKT3(PKT3_SET_SH_REG_INDEX, 1, 0)); - radeon_emit(cs, (reg - SI_SH_REG_OFFSET) >> 2 | (3 << 28)); - radeon_emit(cs, value); -} - -static inline void radeon_set_uconfig_reg_seq(struct radeon_cmdbuf *cs, unsigned reg, unsigned num) { assert(reg >= CIK_UCONFIG_REG_OFFSET && reg < CIK_UCONFIG_REG_END); diff --git a/src/amd/vulkan/radv_pipeline.c b/src/amd/vulkan/radv_pipeline.c index 0f13744..3382e41 100644 --- a/src/amd/vulkan/radv_pipeline.c +++ b/src/amd/vulkan/radv_pipeline.c @@ -3853,15 +3853,10 @@ radv_pipeline_emit_hw_vs(struct radeon_cmdbuf *ctx_cs, struct radeon_cmdbuf *cs, &late_alloc_wave64, &cu_mask); if (pdevice->rad_info.gfx_level >= GFX7) { - if (pdevice->rad_info.gfx_level >= GFX10) { - ac_set_reg_cu_en(cs, R_00B118_SPI_SHADER_PGM_RSRC3_VS, - S_00B118_CU_EN(cu_mask) | S_00B118_WAVE_LIMIT(0x3F), - C_00B118_CU_EN, 0, &pdevice->rad_info, - (void*)gfx10_set_sh_reg_idx3); - } else { - radeon_set_sh_reg_idx(pdevice, cs, R_00B118_SPI_SHADER_PGM_RSRC3_VS, 3, - S_00B118_CU_EN(cu_mask) | S_00B118_WAVE_LIMIT(0x3F)); - } + radeon_set_sh_reg_idx(pdevice, cs, R_00B118_SPI_SHADER_PGM_RSRC3_VS, 3, + ac_apply_cu_en(S_00B118_CU_EN(cu_mask) | + S_00B118_WAVE_LIMIT(0x3F), + C_00B118_CU_EN, 0, &pdevice->rad_info)); radeon_set_sh_reg(cs, R_00B11C_SPI_SHADER_LATE_ALLOC_VS, S_00B11C_LIMIT(late_alloc_wave64)); } if (pdevice->rad_info.gfx_level >= GFX10) { @@ -4032,28 +4027,21 @@ radv_pipeline_emit_hw_ngg(struct radeon_cmdbuf *ctx_cs, struct radeon_cmdbuf *cs ac_compute_late_alloc(&pdevice->rad_info, true, shader->info.has_ngg_culling, shader->config.scratch_bytes_per_wave > 0, &late_alloc_wave64, &cu_mask); + radeon_set_sh_reg_idx(pdevice, cs, R_00B21C_SPI_SHADER_PGM_RSRC3_GS, 3, + ac_apply_cu_en(S_00B21C_CU_EN(cu_mask) | + S_00B21C_WAVE_LIMIT(0x3F), + C_00B21C_CU_EN, 0, &pdevice->rad_info)); + if (pdevice->rad_info.gfx_level >= GFX11) { - /* TODO: figure out how S_00B204_CU_EN_GFX11 interacts with ac_set_reg_cu_en */ - gfx10_set_sh_reg_idx3(cs, R_00B21C_SPI_SHADER_PGM_RSRC3_GS, - S_00B21C_CU_EN(cu_mask) | S_00B21C_WAVE_LIMIT(0x3F)); - gfx10_set_sh_reg_idx3( - cs, R_00B204_SPI_SHADER_PGM_RSRC4_GS, - S_00B204_CU_EN_GFX11(0x1) | S_00B204_SPI_SHADER_LATE_ALLOC_GS_GFX10(late_alloc_wave64)); - } else if (pdevice->rad_info.gfx_level >= GFX10) { - ac_set_reg_cu_en(cs, R_00B21C_SPI_SHADER_PGM_RSRC3_GS, - S_00B21C_CU_EN(cu_mask) | S_00B21C_WAVE_LIMIT(0x3F), - C_00B21C_CU_EN, 0, &pdevice->rad_info, (void*)gfx10_set_sh_reg_idx3); - ac_set_reg_cu_en(cs, R_00B204_SPI_SHADER_PGM_RSRC4_GS, - S_00B204_CU_EN_GFX10(0xffff) | S_00B204_SPI_SHADER_LATE_ALLOC_GS_GFX10(late_alloc_wave64), - C_00B204_CU_EN_GFX10, 16, &pdevice->rad_info, - (void*)gfx10_set_sh_reg_idx3); + radeon_set_sh_reg_idx(pdevice, cs, R_00B204_SPI_SHADER_PGM_RSRC4_GS, 3, + ac_apply_cu_en(S_00B204_CU_EN_GFX11(0x1) | + S_00B204_SPI_SHADER_LATE_ALLOC_GS_GFX10(late_alloc_wave64), + C_00B204_CU_EN_GFX11, 16, &pdevice->rad_info)); } else { - radeon_set_sh_reg_idx( - pdevice, cs, R_00B21C_SPI_SHADER_PGM_RSRC3_GS, 3, - S_00B21C_CU_EN(cu_mask) | S_00B21C_WAVE_LIMIT(0x3F)); - radeon_set_sh_reg_idx( - pdevice, cs, R_00B204_SPI_SHADER_PGM_RSRC4_GS, 3, - S_00B204_CU_EN_GFX10(0xffff) | S_00B204_SPI_SHADER_LATE_ALLOC_GS_GFX10(late_alloc_wave64)); + radeon_set_sh_reg_idx(pdevice, cs, R_00B204_SPI_SHADER_PGM_RSRC4_GS, 3, + ac_apply_cu_en(S_00B204_CU_EN_GFX10(0xffff) | + S_00B204_SPI_SHADER_LATE_ALLOC_GS_GFX10(late_alloc_wave64), + C_00B204_CU_EN_GFX10, 16, &pdevice->rad_info)); } uint32_t oversub_pc_lines = late_alloc_wave64 ? pdevice->rad_info.pc_lines / 4 : 0; @@ -4213,25 +4201,16 @@ radv_pipeline_emit_hw_gs(struct radeon_cmdbuf *ctx_cs, struct radeon_cmdbuf *cs, radeon_emit(cs, gs->config.rsrc2); } - if (pdevice->rad_info.gfx_level >= GFX10) { - ac_set_reg_cu_en(cs, R_00B21C_SPI_SHADER_PGM_RSRC3_GS, - S_00B21C_CU_EN(0xffff) | S_00B21C_WAVE_LIMIT(0x3F), - C_00B21C_CU_EN, 0, &pdevice->rad_info, - (void*)gfx10_set_sh_reg_idx3); - ac_set_reg_cu_en(cs, R_00B204_SPI_SHADER_PGM_RSRC4_GS, - S_00B204_CU_EN_GFX10(0xffff) | S_00B204_SPI_SHADER_LATE_ALLOC_GS_GFX10(0), - C_00B204_CU_EN_GFX10, 16, &pdevice->rad_info, - (void*)gfx10_set_sh_reg_idx3); - } else if (pdevice->rad_info.gfx_level >= GFX7) { - radeon_set_sh_reg_idx( - pdevice, cs, R_00B21C_SPI_SHADER_PGM_RSRC3_GS, 3, - S_00B21C_CU_EN(0xffff) | S_00B21C_WAVE_LIMIT(0x3F)); + radeon_set_sh_reg_idx(pdevice, cs, R_00B21C_SPI_SHADER_PGM_RSRC3_GS, 3, + ac_apply_cu_en(S_00B21C_CU_EN(0xffff) | + S_00B21C_WAVE_LIMIT(0x3F), + C_00B21C_CU_EN, 0, &pdevice->rad_info)); - if (pdevice->rad_info.gfx_level >= GFX10) { - radeon_set_sh_reg_idx( - pdevice, cs, R_00B204_SPI_SHADER_PGM_RSRC4_GS, 3, - S_00B204_CU_EN_GFX10(0xffff) | S_00B204_SPI_SHADER_LATE_ALLOC_GS_GFX10(0)); - } + if (pdevice->rad_info.gfx_level >= GFX10) { + radeon_set_sh_reg_idx(pdevice, cs, R_00B204_SPI_SHADER_PGM_RSRC4_GS, 3, + ac_apply_cu_en(S_00B204_CU_EN_GFX10(0xffff) | + S_00B204_SPI_SHADER_LATE_ALLOC_GS_GFX10(0), + C_00B204_CU_EN_GFX10, 16, &pdevice->rad_info)); } radv_pipeline_emit_hw_vs(ctx_cs, cs, pipeline, pipeline->base.gs_copy_shader); diff --git a/src/amd/vulkan/si_cmd_buffer.c b/src/amd/vulkan/si_cmd_buffer.c index 6641cc8..e399282 100644 --- a/src/amd/vulkan/si_cmd_buffer.c +++ b/src/amd/vulkan/si_cmd_buffer.c @@ -353,34 +353,35 @@ si_emit_graphics(struct radv_device *device, struct radeon_cmdbuf *cs) if (physical_device->rad_info.gfx_level >= GFX10 && physical_device->rad_info.gfx_level < GFX11) { /* Logical CUs 16 - 31 */ - ac_set_reg_cu_en(cs, R_00B104_SPI_SHADER_PGM_RSRC4_VS, S_00B104_CU_EN(0xffff), - C_00B104_CU_EN, 16, &physical_device->rad_info, - (void*)gfx10_set_sh_reg_idx3); + radeon_set_sh_reg_idx(physical_device, cs, R_00B104_SPI_SHADER_PGM_RSRC4_VS, 3, + ac_apply_cu_en(S_00B104_CU_EN(0xffff), + C_00B104_CU_EN, 16, &physical_device->rad_info)); } if (physical_device->rad_info.gfx_level >= GFX10) { - ac_set_reg_cu_en(cs, R_00B404_SPI_SHADER_PGM_RSRC4_HS, S_00B404_CU_EN(0xffff), - C_00B404_CU_EN, 16, &physical_device->rad_info, - (void*)gfx10_set_sh_reg_idx3); - ac_set_reg_cu_en(cs, R_00B004_SPI_SHADER_PGM_RSRC4_PS, S_00B004_CU_EN(cu_mask_ps >> 16), - C_00B004_CU_EN, 16, &physical_device->rad_info, - (void*)gfx10_set_sh_reg_idx3); + radeon_set_sh_reg_idx(physical_device, cs, R_00B404_SPI_SHADER_PGM_RSRC4_HS, 3, + ac_apply_cu_en(S_00B404_CU_EN(0xffff), + C_00B404_CU_EN, 16, &physical_device->rad_info)); + radeon_set_sh_reg_idx(physical_device, cs, R_00B004_SPI_SHADER_PGM_RSRC4_PS, 3, + ac_apply_cu_en(S_00B004_CU_EN(cu_mask_ps >> 16), + C_00B004_CU_EN, 16, &physical_device->rad_info)); } - if (physical_device->rad_info.gfx_level >= GFX10) { - ac_set_reg_cu_en(cs, R_00B41C_SPI_SHADER_PGM_RSRC3_HS, - S_00B41C_CU_EN(0xffff) | S_00B41C_WAVE_LIMIT(0x3F), - C_00B41C_CU_EN, 0, &physical_device->rad_info, - (void*)gfx10_set_sh_reg_idx3); - } else if (physical_device->rad_info.gfx_level == GFX9) { + if (physical_device->rad_info.gfx_level >= GFX9) { radeon_set_sh_reg_idx(physical_device, cs, R_00B41C_SPI_SHADER_PGM_RSRC3_HS, 3, - S_00B41C_CU_EN(0xffff) | S_00B41C_WAVE_LIMIT(0x3F)); + ac_apply_cu_en(S_00B41C_CU_EN(0xffff) | + S_00B41C_WAVE_LIMIT(0x3F), + C_00B41C_CU_EN, 0, &physical_device->rad_info)); } else { radeon_set_sh_reg(cs, R_00B51C_SPI_SHADER_PGM_RSRC3_LS, - S_00B51C_CU_EN(0xffff) | S_00B51C_WAVE_LIMIT(0x3F)); + ac_apply_cu_en(S_00B51C_CU_EN(0xffff) | + S_00B51C_WAVE_LIMIT(0x3F), + C_00B51C_CU_EN, 0, &physical_device->rad_info)); radeon_set_sh_reg(cs, R_00B41C_SPI_SHADER_PGM_RSRC3_HS, S_00B41C_WAVE_LIMIT(0x3F)); radeon_set_sh_reg(cs, R_00B31C_SPI_SHADER_PGM_RSRC3_ES, - S_00B31C_CU_EN(0xffff) | S_00B31C_WAVE_LIMIT(0x3F)); + ac_apply_cu_en(S_00B31C_CU_EN(0xffff) | + S_00B31C_WAVE_LIMIT(0x3F), + C_00B31C_CU_EN, 0, &physical_device->rad_info)); /* If this is 0, Bonaire can hang even if GS isn't being used. * Other chips are unaffected. These are suboptimal values, * but we don't use on-chip GS. @@ -389,16 +390,11 @@ si_emit_graphics(struct radv_device *device, struct radeon_cmdbuf *cs) S_028A44_ES_VERTS_PER_SUBGRP(64) | S_028A44_GS_PRIMS_PER_SUBGRP(4)); } - if (physical_device->rad_info.gfx_level >= GFX10) { - ac_set_reg_cu_en(cs, R_00B01C_SPI_SHADER_PGM_RSRC3_PS, - S_00B01C_CU_EN(cu_mask_ps) | S_00B01C_WAVE_LIMIT(0x3F) | - S_00B01C_LDS_GROUP_SIZE(physical_device->rad_info.gfx_level >= GFX11), - C_00B01C_CU_EN, 0, &physical_device->rad_info, - (void*)gfx10_set_sh_reg_idx3); - } else { - radeon_set_sh_reg_idx(physical_device, cs, R_00B01C_SPI_SHADER_PGM_RSRC3_PS, 3, - S_00B01C_CU_EN(cu_mask_ps) | S_00B01C_WAVE_LIMIT(0x3F)); - } + radeon_set_sh_reg_idx(physical_device, cs, R_00B01C_SPI_SHADER_PGM_RSRC3_PS, 3, + ac_apply_cu_en(S_00B01C_CU_EN(cu_mask_ps) | + S_00B01C_WAVE_LIMIT(0x3F) | + S_00B01C_LDS_GROUP_SIZE(physical_device->rad_info.gfx_level >= GFX11), + C_00B01C_CU_EN, 0, &physical_device->rad_info)); } if (physical_device->rad_info.gfx_level >= GFX10) { diff --git a/src/gallium/drivers/radeonsi/si_build_pm4.h b/src/gallium/drivers/radeonsi/si_build_pm4.h index 3578270..eb96eff 100644 --- a/src/gallium/drivers/radeonsi/si_build_pm4.h +++ b/src/gallium/drivers/radeonsi/si_build_pm4.h @@ -125,11 +125,11 @@ radeon_emit(((reg) - SI_SH_REG_OFFSET) >> 2); \ } while (0) -#define radeon_set_sh_reg_idx3_seq(reg, num) do { \ +#define radeon_set_sh_reg_idx3_seq(sctx, reg, num) do { \ SI_CHECK_SHADOWED_REGS(reg, num); \ assert((reg) >= SI_SH_REG_OFFSET && (reg) < SI_SH_REG_END); \ radeon_emit(PKT3(PKT3_SET_SH_REG_INDEX, num, 0)); \ - radeon_emit((((reg) - SI_SH_REG_OFFSET) >> 2) | (3 << 28)); \ + radeon_emit((((reg) - SI_SH_REG_OFFSET) >> 2) | ((sctx)->gfx_level >= GFX10 ? 3 << 28 : 0)); \ } while (0) #define radeon_set_sh_reg(reg, value) do { \ @@ -137,8 +137,8 @@ radeon_emit(value); \ } while (0) -#define radeon_set_sh_reg_idx3(reg, value) do { \ - radeon_set_sh_reg_idx3_seq(reg, 1); \ +#define radeon_set_sh_reg_idx3(sctx, reg, value) do { \ + radeon_set_sh_reg_idx3_seq(sctx, reg, 1); \ radeon_emit(value); \ } while (0) @@ -297,10 +297,7 @@ unsigned __value = val; \ if (((sctx->tracked_regs.reg_saved >> (reg)) & 0x1) != 0x1 || \ sctx->tracked_regs.reg_value[reg] != __value) { \ - if (sctx->gfx_level >= GFX10) \ - radeon_set_sh_reg_idx3(offset, __value); \ - else \ - radeon_set_sh_reg(offset, __value); \ + radeon_set_sh_reg_idx3(sctx, offset, __value); \ sctx->tracked_regs.reg_saved |= BITFIELD64_BIT(reg); \ sctx->tracked_regs.reg_value[reg] = __value; \ } \ @@ -338,23 +335,6 @@ radeon_emit_32bit_pointer(sctx->screen, (desc)->gpu_address); \ } while (0) -/* Wrappers that are only used when they are passed as function pointers. */ -static inline void radeon_set_sh_reg_func(struct radeon_cmdbuf *cs, unsigned reg_offset, - uint32_t value) -{ - radeon_begin(cs); - radeon_set_sh_reg(reg_offset, value); - radeon_end(); -} - -static inline void radeon_set_sh_reg_idx3_func(struct radeon_cmdbuf *cs, unsigned reg_offset, - uint32_t value) -{ - radeon_begin(cs); - radeon_set_sh_reg_idx3(reg_offset, value); - radeon_end(); -} - /* This should be evaluated at compile time if all parameters are constants. */ static ALWAYS_INLINE unsigned si_get_user_data_base(enum amd_gfx_level gfx_level, enum si_has_tess has_tess, diff --git a/src/gallium/drivers/radeonsi/si_pm4.c b/src/gallium/drivers/radeonsi/si_pm4.c index 280125b..d79cd01 100644 --- a/src/gallium/drivers/radeonsi/si_pm4.c +++ b/src/gallium/drivers/radeonsi/si_pm4.c @@ -104,11 +104,13 @@ void si_pm4_set_reg(struct si_pm4_state *state, unsigned reg, uint32_t val) si_pm4_set_reg_custom(state, reg, val, opcode, 0); } -void si_pm4_set_reg_idx3(struct si_pm4_state *state, unsigned reg, uint32_t val) +void si_pm4_set_reg_idx3(struct si_screen *sscreen, struct si_pm4_state *state, + unsigned reg, uint32_t val) { SI_CHECK_SHADOWED_REGS(reg, 1); - si_pm4_set_reg_custom(state, reg - SI_SH_REG_OFFSET, val, PKT3_SET_SH_REG_INDEX, 3); + si_pm4_set_reg_custom(state, reg - SI_SH_REG_OFFSET, val, PKT3_SET_SH_REG_INDEX, + sscreen->info.gfx_level >= GFX10 ? 3 : 0); } void si_pm4_clear_state(struct si_pm4_state *state) diff --git a/src/gallium/drivers/radeonsi/si_pm4.h b/src/gallium/drivers/radeonsi/si_pm4.h index 486b627..0add231 100644 --- a/src/gallium/drivers/radeonsi/si_pm4.h +++ b/src/gallium/drivers/radeonsi/si_pm4.h @@ -31,7 +31,8 @@ extern "C" { #endif -// forward defines +/* forward definitions */ +struct si_screen; struct si_context; /* State atoms are callbacks which write a sequence of packets into a GPU @@ -64,7 +65,8 @@ struct si_pm4_state { void si_pm4_cmd_add(struct si_pm4_state *state, uint32_t dw); void si_pm4_set_reg(struct si_pm4_state *state, unsigned reg, uint32_t val); -void si_pm4_set_reg_idx3(struct si_pm4_state *state, unsigned reg, uint32_t val); +void si_pm4_set_reg_idx3(struct si_screen *sscreen, struct si_pm4_state *state, + unsigned reg, uint32_t val); void si_pm4_clear_state(struct si_pm4_state *state); void si_pm4_free_state(struct si_context *sctx, struct si_pm4_state *state, unsigned idx); diff --git a/src/gallium/drivers/radeonsi/si_state.c b/src/gallium/drivers/radeonsi/si_state.c index 72e6451..fcd8900 100644 --- a/src/gallium/drivers/radeonsi/si_state.c +++ b/src/gallium/drivers/radeonsi/si_state.c @@ -5671,12 +5671,11 @@ void si_init_cs_preamble_state(struct si_context *sctx, bool uses_reg_shadowing) cu_mask_ps = gfx103_get_cu_mask_ps(sscreen); if (sctx->gfx_level >= GFX7) { - ac_set_reg_cu_en(pm4, R_00B01C_SPI_SHADER_PGM_RSRC3_PS, - S_00B01C_CU_EN(cu_mask_ps) | - S_00B01C_WAVE_LIMIT(0x3F) | - S_00B01C_LDS_GROUP_SIZE(sctx->gfx_level >= GFX11), - C_00B01C_CU_EN, 0, &sscreen->info, - (void*)(sctx->gfx_level >= GFX10 ? si_pm4_set_reg_idx3 : si_pm4_set_reg)); + si_pm4_set_reg_idx3(sscreen, pm4, R_00B01C_SPI_SHADER_PGM_RSRC3_PS, + ac_apply_cu_en(S_00B01C_CU_EN(cu_mask_ps) | + S_00B01C_WAVE_LIMIT(0x3F) | + S_00B01C_LDS_GROUP_SIZE(sctx->gfx_level >= GFX11), + C_00B01C_CU_EN, 0, &sscreen->info)); } if (sctx->gfx_level <= GFX8) { @@ -5711,13 +5710,13 @@ void si_init_cs_preamble_state(struct si_context *sctx, bool uses_reg_shadowing) } if (sctx->gfx_level >= GFX7 && sctx->gfx_level <= GFX8) { - ac_set_reg_cu_en(pm4, R_00B51C_SPI_SHADER_PGM_RSRC3_LS, - S_00B51C_CU_EN(0xffff) | S_00B51C_WAVE_LIMIT(0x3F), - C_00B51C_CU_EN, 0, &sscreen->info, (void*)si_pm4_set_reg); + si_pm4_set_reg(pm4, R_00B51C_SPI_SHADER_PGM_RSRC3_LS, + ac_apply_cu_en(S_00B51C_CU_EN(0xffff) | S_00B51C_WAVE_LIMIT(0x3F), + C_00B51C_CU_EN, 0, &sscreen->info)); si_pm4_set_reg(pm4, R_00B41C_SPI_SHADER_PGM_RSRC3_HS, S_00B41C_WAVE_LIMIT(0x3F)); - ac_set_reg_cu_en(pm4, R_00B31C_SPI_SHADER_PGM_RSRC3_ES, - S_00B31C_CU_EN(0xffff) | S_00B31C_WAVE_LIMIT(0x3F), - C_00B31C_CU_EN, 0, &sscreen->info, (void*)si_pm4_set_reg); + si_pm4_set_reg(pm4, R_00B31C_SPI_SHADER_PGM_RSRC3_ES, + ac_apply_cu_en(S_00B31C_CU_EN(0xffff) | S_00B31C_WAVE_LIMIT(0x3F), + C_00B31C_CU_EN, 0, &sscreen->info)); /* If this is 0, Bonaire can hang even if GS isn't being used. * Other chips are unaffected. These are suboptimal values, @@ -5774,10 +5773,9 @@ void si_init_cs_preamble_state(struct si_context *sctx, bool uses_reg_shadowing) } if (sctx->gfx_level >= GFX9) { - ac_set_reg_cu_en(pm4, R_00B41C_SPI_SHADER_PGM_RSRC3_HS, - S_00B41C_CU_EN(0xffff) | S_00B41C_WAVE_LIMIT(0x3F), C_00B41C_CU_EN, - 0, &sscreen->info, - (void*)(sctx->gfx_level >= GFX10 ? si_pm4_set_reg_idx3 : si_pm4_set_reg)); + si_pm4_set_reg_idx3(sscreen, pm4, R_00B41C_SPI_SHADER_PGM_RSRC3_HS, + ac_apply_cu_en(S_00B41C_CU_EN(0xffff) | S_00B41C_WAVE_LIMIT(0x3F), + C_00B41C_CU_EN, 0, &sscreen->info)); si_pm4_set_reg(pm4, R_028C48_PA_SC_BINNER_CNTL_1, S_028C48_MAX_ALLOC_COUNT(sscreen->info.pbb_max_alloc_count - 1) | @@ -5885,12 +5883,15 @@ void si_init_cs_preamble_state(struct si_context *sctx, bool uses_reg_shadowing) if (sctx->gfx_level >= GFX10 && sctx->gfx_level <= GFX10_3) { /* Logical CUs 16 - 31 */ - ac_set_reg_cu_en(pm4, R_00B004_SPI_SHADER_PGM_RSRC4_PS, S_00B004_CU_EN(cu_mask_ps >> 16), - C_00B004_CU_EN, 16, &sscreen->info, (void*)si_pm4_set_reg_idx3); - ac_set_reg_cu_en(pm4, R_00B104_SPI_SHADER_PGM_RSRC4_VS, S_00B104_CU_EN(0xffff), - C_00B104_CU_EN, 16, &sscreen->info, (void*)si_pm4_set_reg_idx3); - ac_set_reg_cu_en(pm4, R_00B404_SPI_SHADER_PGM_RSRC4_HS, S_00B404_CU_EN(0xffff), - C_00B404_CU_EN, 16, &sscreen->info, (void*)si_pm4_set_reg_idx3); + si_pm4_set_reg_idx3(sscreen, pm4, R_00B004_SPI_SHADER_PGM_RSRC4_PS, + ac_apply_cu_en(S_00B004_CU_EN(cu_mask_ps >> 16), + C_00B004_CU_EN, 16, &sscreen->info)); + si_pm4_set_reg_idx3(sscreen, pm4, R_00B104_SPI_SHADER_PGM_RSRC4_VS, + ac_apply_cu_en(S_00B104_CU_EN(0xffff), + C_00B104_CU_EN, 16, &sscreen->info)); + si_pm4_set_reg_idx3(sscreen, pm4, R_00B404_SPI_SHADER_PGM_RSRC4_HS, + ac_apply_cu_en(S_00B404_CU_EN(0xffff), + C_00B404_CU_EN, 16, &sscreen->info)); si_pm4_set_reg(pm4, R_00B1C0_SPI_SHADER_REQ_CTRL_VS, 0); si_pm4_set_reg(pm4, R_00B1C8_SPI_SHADER_USER_ACCUM_VS_0, 0); diff --git a/src/gallium/drivers/radeonsi/si_state_shaders.cpp b/src/gallium/drivers/radeonsi/si_state_shaders.cpp index 9d5b3a3..09dd033 100644 --- a/src/gallium/drivers/radeonsi/si_state_shaders.cpp +++ b/src/gallium/drivers/radeonsi/si_state_shaders.cpp @@ -713,11 +713,10 @@ static void si_shader_hs(struct si_screen *sscreen, struct si_shader *shader) if (sscreen->info.gfx_level >= GFX9) { if (sscreen->info.gfx_level >= GFX11) { - ac_set_reg_cu_en(pm4, R_00B404_SPI_SHADER_PGM_RSRC4_HS, - S_00B404_INST_PREF_SIZE(si_get_shader_prefetch_size(shader)) | - S_00B404_CU_EN(0xffff), - C_00B404_CU_EN, 16, &sscreen->info, - (void (*)(void*, unsigned, uint32_t))si_pm4_set_reg_idx3); + si_pm4_set_reg_idx3(sscreen, pm4, R_00B404_SPI_SHADER_PGM_RSRC4_HS, + ac_apply_cu_en(S_00B404_INST_PREF_SIZE(si_get_shader_prefetch_size(shader)) | + S_00B404_CU_EN(0xffff), + C_00B404_CU_EN, 16, &sscreen->info)); } if (sscreen->info.gfx_level >= GFX10) { si_pm4_set_reg(pm4, R_00B520_SPI_SHADER_PGM_LO_LS, va >> 8); @@ -983,37 +982,18 @@ static void si_emit_shader_gs(struct si_context *sctx) radeon_end_update_context_roll(sctx); /* These don't cause any context rolls. */ - if (sctx->screen->info.spi_cu_en_has_effect) { - if (sctx->gfx_level >= GFX7) { - ac_set_reg_cu_en(&sctx->gfx_cs, R_00B21C_SPI_SHADER_PGM_RSRC3_GS, - shader->gs.spi_shader_pgm_rsrc3_gs, - C_00B21C_CU_EN, 0, &sctx->screen->info, - (void (*)(void*, unsigned, uint32_t)) - (sctx->gfx_level >= GFX10 ? radeon_set_sh_reg_idx3_func : radeon_set_sh_reg_func)); - sctx->tracked_regs.reg_saved &= ~BITFIELD64_BIT(SI_TRACKED_SPI_SHADER_PGM_RSRC3_GS); - } - if (sctx->gfx_level >= GFX10) { - ac_set_reg_cu_en(&sctx->gfx_cs, R_00B204_SPI_SHADER_PGM_RSRC4_GS, - shader->gs.spi_shader_pgm_rsrc4_gs, - C_00B204_CU_EN_GFX10, 16, &sctx->screen->info, - (void (*)(void*, unsigned, uint32_t)) - (sctx->gfx_level >= GFX10 ? radeon_set_sh_reg_idx3_func : radeon_set_sh_reg_func)); - sctx->tracked_regs.reg_saved &= ~BITFIELD64_BIT(SI_TRACKED_SPI_SHADER_PGM_RSRC4_GS); - } - } else { - radeon_begin_again(&sctx->gfx_cs); - if (sctx->gfx_level >= GFX7) { - radeon_opt_set_sh_reg_idx3(sctx, R_00B21C_SPI_SHADER_PGM_RSRC3_GS, - SI_TRACKED_SPI_SHADER_PGM_RSRC3_GS, - shader->gs.spi_shader_pgm_rsrc3_gs); - } - if (sctx->gfx_level >= GFX10) { - radeon_opt_set_sh_reg_idx3(sctx, R_00B204_SPI_SHADER_PGM_RSRC4_GS, - SI_TRACKED_SPI_SHADER_PGM_RSRC4_GS, - shader->gs.spi_shader_pgm_rsrc4_gs); - } - radeon_end(); + radeon_begin_again(&sctx->gfx_cs); + if (sctx->gfx_level >= GFX7) { + radeon_opt_set_sh_reg_idx3(sctx, R_00B21C_SPI_SHADER_PGM_RSRC3_GS, + SI_TRACKED_SPI_SHADER_PGM_RSRC3_GS, + shader->gs.spi_shader_pgm_rsrc3_gs); } + if (sctx->gfx_level >= GFX10) { + radeon_opt_set_sh_reg_idx3(sctx, R_00B204_SPI_SHADER_PGM_RSRC4_GS, + SI_TRACKED_SPI_SHADER_PGM_RSRC4_GS, + shader->gs.spi_shader_pgm_rsrc4_gs); + } + radeon_end(); } static void si_shader_gs(struct si_screen *sscreen, struct si_shader *shader) @@ -1125,10 +1105,14 @@ static void si_shader_gs(struct si_screen *sscreen, struct si_shader *shader) si_pm4_set_reg(pm4, R_00B228_SPI_SHADER_PGM_RSRC1_GS, rsrc1); si_pm4_set_reg(pm4, R_00B22C_SPI_SHADER_PGM_RSRC2_GS, rsrc2); - shader->gs.spi_shader_pgm_rsrc3_gs = S_00B21C_CU_EN(0xffff) | - S_00B21C_WAVE_LIMIT(0x3F); - shader->gs.spi_shader_pgm_rsrc4_gs = S_00B204_CU_EN_GFX10(0xffff) | - S_00B204_SPI_SHADER_LATE_ALLOC_GS_GFX10(0); + shader->gs.spi_shader_pgm_rsrc3_gs = + ac_apply_cu_en(S_00B21C_CU_EN(0xffff) | + S_00B21C_WAVE_LIMIT(0x3F), + C_00B21C_CU_EN, 0, &sscreen->info); + shader->gs.spi_shader_pgm_rsrc4_gs = + ac_apply_cu_en(S_00B204_CU_EN_GFX10(0xffff) | + S_00B204_SPI_SHADER_LATE_ALLOC_GS_GFX10(0), + C_00B204_CU_EN_GFX10, 16, &sscreen->info); shader->gs.vgt_gs_onchip_cntl = S_028A44_ES_VERTS_PER_SUBGRP(shader->gs_info.es_verts_per_subgroup) | @@ -1143,8 +1127,10 @@ static void si_shader_gs(struct si_screen *sscreen, struct si_shader *shader) polaris_set_vgt_vertex_reuse(sscreen, shader->key.ge.part.gs.es, shader); } else { - shader->gs.spi_shader_pgm_rsrc3_gs = S_00B21C_CU_EN(0xffff) | - S_00B21C_WAVE_LIMIT(0x3F); + shader->gs.spi_shader_pgm_rsrc3_gs = + ac_apply_cu_en(S_00B21C_CU_EN(0xffff) | + S_00B21C_WAVE_LIMIT(0x3F), + C_00B21C_CU_EN, 0, &sscreen->info); si_pm4_set_reg(pm4, R_00B220_SPI_SHADER_PGM_LO_GS, va >> 8); pm4->reg_va_low_idx = pm4->ndw - 1; @@ -1218,30 +1204,13 @@ static void gfx10_emit_shader_ngg_tail(struct si_context *sctx, struct si_shader radeon_begin_again(&sctx->gfx_cs); radeon_opt_set_uconfig_reg(sctx, R_030980_GE_PC_ALLOC, SI_TRACKED_GE_PC_ALLOC, shader->ngg.ge_pc_alloc); - if (sctx->screen->info.spi_cu_en_has_effect) { - radeon_end(); - ac_set_reg_cu_en(&sctx->gfx_cs, R_00B21C_SPI_SHADER_PGM_RSRC3_GS, - shader->ngg.spi_shader_pgm_rsrc3_gs, - C_00B21C_CU_EN, 0, &sctx->screen->info, - (void (*)(void*, unsigned, uint32_t)) - (sctx->gfx_level >= GFX10 ? radeon_set_sh_reg_idx3_func : radeon_set_sh_reg_func)); - ac_set_reg_cu_en(&sctx->gfx_cs, R_00B204_SPI_SHADER_PGM_RSRC4_GS, - shader->ngg.spi_shader_pgm_rsrc4_gs, - sctx->gfx_level >= GFX11 ? C_00B204_CU_EN_GFX11 : C_00B204_CU_EN_GFX10, 16, - &sctx->screen->info, - (void (*)(void*, unsigned, uint32_t)) - (sctx->gfx_level >= GFX10 ? radeon_set_sh_reg_idx3_func : radeon_set_sh_reg_func)); - sctx->tracked_regs.reg_saved &= ~BITFIELD64_BIT(SI_TRACKED_SPI_SHADER_PGM_RSRC4_GS) & - ~BITFIELD64_BIT(SI_TRACKED_SPI_SHADER_PGM_RSRC3_GS); - } else { - radeon_opt_set_sh_reg_idx3(sctx, R_00B21C_SPI_SHADER_PGM_RSRC3_GS, - SI_TRACKED_SPI_SHADER_PGM_RSRC3_GS, - shader->ngg.spi_shader_pgm_rsrc3_gs); - radeon_opt_set_sh_reg_idx3(sctx, R_00B204_SPI_SHADER_PGM_RSRC4_GS, - SI_TRACKED_SPI_SHADER_PGM_RSRC4_GS, - shader->ngg.spi_shader_pgm_rsrc4_gs); - radeon_end(); - } + radeon_opt_set_sh_reg_idx3(sctx, R_00B21C_SPI_SHADER_PGM_RSRC3_GS, + SI_TRACKED_SPI_SHADER_PGM_RSRC3_GS, + shader->ngg.spi_shader_pgm_rsrc3_gs); + radeon_opt_set_sh_reg_idx3(sctx, R_00B204_SPI_SHADER_PGM_RSRC4_GS, + SI_TRACKED_SPI_SHADER_PGM_RSRC4_GS, + shader->ngg.spi_shader_pgm_rsrc4_gs); + radeon_end(); } static void gfx10_emit_shader_ngg_notess_nogs(struct si_context *sctx) @@ -1439,15 +1408,21 @@ static void gfx10_shader_ngg(struct si_screen *sscreen, struct si_shader *shader S_00B22C_OC_LDS_EN(es_stage == MESA_SHADER_TESS_EVAL) | S_00B22C_LDS_SIZE(shader->config.lds_size)); - shader->ngg.spi_shader_pgm_rsrc3_gs = S_00B21C_CU_EN(cu_mask) | - S_00B21C_WAVE_LIMIT(0x3F); + shader->ngg.spi_shader_pgm_rsrc3_gs = + ac_apply_cu_en(S_00B21C_CU_EN(cu_mask) | + S_00B21C_WAVE_LIMIT(0x3F), + C_00B21C_CU_EN, 0, &sscreen->info); if (sscreen->info.gfx_level >= GFX11) { shader->ngg.spi_shader_pgm_rsrc4_gs = - S_00B204_CU_EN_GFX11(0x1) | S_00B204_SPI_SHADER_LATE_ALLOC_GS_GFX10(late_alloc_wave64) | - S_00B204_INST_PREF_SIZE(si_get_shader_prefetch_size(shader)); + ac_apply_cu_en(S_00B204_CU_EN_GFX11(0x1) | + S_00B204_SPI_SHADER_LATE_ALLOC_GS_GFX10(late_alloc_wave64) | + S_00B204_INST_PREF_SIZE(si_get_shader_prefetch_size(shader)), + C_00B204_CU_EN_GFX11, 16, &sscreen->info); } else { shader->ngg.spi_shader_pgm_rsrc4_gs = - S_00B204_CU_EN_GFX10(0xffff) | S_00B204_SPI_SHADER_LATE_ALLOC_GS_GFX10(late_alloc_wave64); + ac_apply_cu_en(S_00B204_CU_EN_GFX10(0xffff) | + S_00B204_SPI_SHADER_LATE_ALLOC_GS_GFX10(late_alloc_wave64), + C_00B204_CU_EN_GFX10, 16, &sscreen->info); } nparams = MAX2(shader->info.nr_param_exports, 1); @@ -1727,11 +1702,10 @@ static void si_shader_vs(struct si_screen *sscreen, struct si_shader *shader, oc_lds_en = shader->selector->stage == MESA_SHADER_TESS_EVAL ? 1 : 0; if (sscreen->info.gfx_level >= GFX7) { - ac_set_reg_cu_en(pm4, R_00B118_SPI_SHADER_PGM_RSRC3_VS, - S_00B118_CU_EN(cu_mask) | S_00B118_WAVE_LIMIT(0x3F), - C_00B118_CU_EN, 0, &sscreen->info, - (void (*)(void*, unsigned, uint32_t)) - (sscreen->info.gfx_level >= GFX10 ? si_pm4_set_reg_idx3 : si_pm4_set_reg)); + si_pm4_set_reg_idx3(sscreen, pm4, R_00B118_SPI_SHADER_PGM_RSRC3_VS, + ac_apply_cu_en(S_00B118_CU_EN(cu_mask) | + S_00B118_WAVE_LIMIT(0x3F), + C_00B118_CU_EN, 0, &sscreen->info)); si_pm4_set_reg(pm4, R_00B11C_SPI_SHADER_LATE_ALLOC_VS, S_00B11C_LIMIT(late_alloc_wave64)); } @@ -2049,11 +2023,10 @@ static void si_shader_ps(struct si_screen *sscreen, struct si_shader *shader) if (sscreen->info.gfx_level >= GFX11) { unsigned cu_mask_ps = gfx103_get_cu_mask_ps(sscreen); - ac_set_reg_cu_en(pm4, R_00B004_SPI_SHADER_PGM_RSRC4_PS, - S_00B004_INST_PREF_SIZE(si_get_shader_prefetch_size(shader)) | - S_00B004_CU_EN(cu_mask_ps >> 16), - C_00B004_CU_EN, 16, &sscreen->info, - (void (*)(void*, unsigned, uint32_t))si_pm4_set_reg_idx3); + si_pm4_set_reg_idx3(sscreen, pm4, R_00B004_SPI_SHADER_PGM_RSRC4_PS, + ac_apply_cu_en(S_00B004_CU_EN(cu_mask_ps >> 16) | + S_00B004_INST_PREF_SIZE(si_get_shader_prefetch_size(shader)), + C_00B004_CU_EN, 16, &sscreen->info)); } } -- 2.7.4