From 21f169b2fb399c3a62d1b4f933596c371bda1b55 Mon Sep 17 00:00:00 2001 From: =?utf8?q?Marek=20Ol=C5=A1=C3=A1k?= Date: Wed, 26 Jan 2022 19:38:26 -0500 Subject: [PATCH] ac,radeonsi: rework and optimize how TMPRING_SIZE is set Reviewed-by: Pierre-Eric Pelloux-Prayer Part-of: --- src/amd/common/ac_shader_util.c | 38 +++++++++++++++++++++++ src/amd/common/ac_shader_util.h | 4 +++ src/gallium/drivers/radeonsi/si_compute.c | 19 ++++++------ src/gallium/drivers/radeonsi/si_pipe.c | 4 --- src/gallium/drivers/radeonsi/si_state_shaders.cpp | 24 ++------------ 5 files changed, 54 insertions(+), 35 deletions(-) diff --git a/src/amd/common/ac_shader_util.c b/src/amd/common/ac_shader_util.c index 0660914..7ecea15 100644 --- a/src/amd/common/ac_shader_util.c +++ b/src/amd/common/ac_shader_util.c @@ -605,3 +605,41 @@ void ac_set_reg_cu_en(void *cs, unsigned reg_offset, uint32_t value, uint32_t cl set_sh_reg(cs, reg_offset, new_value); } + +/* Return the register value and tune bytes_per_wave to increase scratch performance. */ +void ac_get_scratch_tmpring_size(const struct radeon_info *info, unsigned max_scratch_waves, + unsigned bytes_per_wave, unsigned *max_seen_bytes_per_wave, + uint32_t *tmpring_size) +{ + /* SPI_TMPRING_SIZE and COMPUTE_TMPRING_SIZE are essentially scratch buffer descriptors. + * WAVES means NUM_RECORDS. WAVESIZE is the size of each element, meaning STRIDE. + * Thus, WAVESIZE must be constant while the scratch buffer is being used by the GPU. + * + * If you want to increase WAVESIZE without waiting for idle, you need to allocate a new + * scratch buffer and use it instead. This will result in multiple scratch buffers being + * used at the same time, each with a different WAVESIZE. + * + * If you want to decrease WAVESIZE, you don't have to. There is no advantage in decreasing + * WAVESIZE after it's been increased. + * + * Shaders with SCRATCH_EN=0 don't allocate scratch space. + */ + const unsigned size_shift = 10; + const unsigned min_size_per_wave = BITFIELD_BIT(size_shift); + + /* The LLVM shader backend should be reporting aligned scratch_sizes. */ + assert((bytes_per_wave & BITFIELD_MASK(size_shift)) == 0 && + "scratch size per wave should be aligned"); + + /* Add 1 scratch item to make the number of items odd. This should improve scratch + * performance by more randomly distributing scratch waves among memory channels. + */ + if (bytes_per_wave) + bytes_per_wave |= min_size_per_wave; + + *max_seen_bytes_per_wave = MAX2(*max_seen_bytes_per_wave, bytes_per_wave); + + /* TODO: We could decrease WAVES to make the whole buffer fit into the infinity cache. */ + *tmpring_size = S_0286E8_WAVES(max_scratch_waves) | + S_0286E8_WAVESIZE(*max_seen_bytes_per_wave >> size_shift); +} diff --git a/src/amd/common/ac_shader_util.h b/src/amd/common/ac_shader_util.h index 49f414d..5d7ee8f 100644 --- a/src/amd/common/ac_shader_util.h +++ b/src/amd/common/ac_shader_util.h @@ -123,6 +123,10 @@ void ac_set_reg_cu_en(void *cs, unsigned reg_offset, uint32_t value, uint32_t cl unsigned value_shift, const struct radeon_info *info, void set_sh_reg(void*, unsigned, uint32_t)); +void ac_get_scratch_tmpring_size(const struct radeon_info *info, unsigned max_scratch_waves, + unsigned bytes_per_wave, unsigned *max_seen_bytes_per_wave, + uint32_t *tmpring_size); + #ifdef __cplusplus } #endif diff --git a/src/gallium/drivers/radeonsi/si_compute.c b/src/gallium/drivers/radeonsi/si_compute.c index 9354cf2..6a157c5 100644 --- a/src/gallium/drivers/radeonsi/si_compute.c +++ b/src/gallium/drivers/radeonsi/si_compute.c @@ -449,12 +449,11 @@ void si_emit_initial_compute_regs(struct si_context *sctx, struct radeon_cmdbuf radeon_end(); } -static bool si_setup_compute_scratch_buffer(struct si_context *sctx, struct si_shader *shader, - struct ac_shader_config *config) +static bool si_setup_compute_scratch_buffer(struct si_context *sctx, struct si_shader *shader) { uint64_t scratch_bo_size, scratch_needed; scratch_bo_size = 0; - scratch_needed = config->scratch_bytes_per_wave * sctx->scratch_waves; + scratch_needed = sctx->max_seen_compute_scratch_bytes_per_wave * sctx->scratch_waves; if (sctx->compute_scratch_buffer) scratch_bo_size = sctx->compute_scratch_buffer->b.b.width0; @@ -524,7 +523,12 @@ static bool si_switch_compute_shader(struct si_context *sctx, struct si_compute config->rsrc2 |= S_00B84C_LDS_SIZE(lds_blocks); } - if (!si_setup_compute_scratch_buffer(sctx, shader, config)) + unsigned tmpring_size; + ac_get_scratch_tmpring_size(&sctx->screen->info, sctx->scratch_waves, + config->scratch_bytes_per_wave, + &sctx->max_seen_compute_scratch_bytes_per_wave, &tmpring_size); + + if (!si_setup_compute_scratch_buffer(sctx, shader)) return false; if (shader->scratch_bo) { @@ -560,12 +564,7 @@ static bool si_switch_compute_shader(struct si_context *sctx, struct si_compute "COMPUTE_PGM_RSRC2: 0x%08x\n", config->rsrc1, config->rsrc2); - sctx->max_seen_compute_scratch_bytes_per_wave = - MAX2(sctx->max_seen_compute_scratch_bytes_per_wave, config->scratch_bytes_per_wave); - - radeon_set_sh_reg(R_00B860_COMPUTE_TMPRING_SIZE, - S_00B860_WAVES(sctx->scratch_waves) | - S_00B860_WAVESIZE(sctx->max_seen_compute_scratch_bytes_per_wave >> 10)); + radeon_set_sh_reg(R_00B860_COMPUTE_TMPRING_SIZE, tmpring_size); radeon_end(); sctx->cs_shader_state.emitted_program = program; diff --git a/src/gallium/drivers/radeonsi/si_pipe.c b/src/gallium/drivers/radeonsi/si_pipe.c index 7bd310c..c0a90a3 100644 --- a/src/gallium/drivers/radeonsi/si_pipe.c +++ b/src/gallium/drivers/radeonsi/si_pipe.c @@ -693,10 +693,6 @@ static struct pipe_context *si_create_context(struct pipe_screen *screen, unsign * sctx->scratch_waves must be >= the maximum possible size of * 1 threadgroup, so that the hw doesn't hang from being unable * to start any. - * - * The recommended value is 4 per CU at most. Higher numbers don't - * bring much benefit, but they still occupy chip resources (think - * async compute). I've seen ~2% performance difference between 4 and 32. */ sctx->scratch_waves = MAX2(32 * sscreen->info.num_good_compute_units, max_threads_per_block / 64); diff --git a/src/gallium/drivers/radeonsi/si_state_shaders.cpp b/src/gallium/drivers/radeonsi/si_state_shaders.cpp index 074337e..251107a 100644 --- a/src/gallium/drivers/radeonsi/si_state_shaders.cpp +++ b/src/gallium/drivers/radeonsi/si_state_shaders.cpp @@ -4054,23 +4054,11 @@ static bool si_update_scratch_relocs(struct si_context *sctx) bool si_update_spi_tmpring_size(struct si_context *sctx, unsigned bytes) { - /* SPI_TMPRING_SIZE.WAVESIZE must be constant for each scratch buffer. - * There are 2 cases to handle: - * - * - If the current needed size is less than the maximum seen size, - * use the maximum seen size, so that WAVESIZE remains the same. - * - * - If the current needed size is greater than the maximum seen size, - * the scratch buffer is reallocated, so we can increase WAVESIZE. - * - * Shaders that set SCRATCH_EN=0 don't allocate scratch space. - * Otherwise, the number of waves that can use scratch is - * SPI_TMPRING_SIZE.WAVES. - */ - sctx->max_seen_scratch_bytes_per_wave = MAX2(sctx->max_seen_scratch_bytes_per_wave, bytes); + unsigned spi_tmpring_size; + ac_get_scratch_tmpring_size(&sctx->screen->info, sctx->scratch_waves, bytes, + &sctx->max_seen_scratch_bytes_per_wave, &spi_tmpring_size); unsigned scratch_needed_size = sctx->max_seen_scratch_bytes_per_wave * sctx->scratch_waves; - unsigned spi_tmpring_size; if (scratch_needed_size > 0) { if (!sctx->scratch_buffer || scratch_needed_size > sctx->scratch_buffer->b.b.width0) { @@ -4092,12 +4080,6 @@ bool si_update_spi_tmpring_size(struct si_context *sctx, unsigned bytes) return false; } - /* The LLVM shader backend should be reporting aligned scratch_sizes. */ - assert((scratch_needed_size & ~0x3FF) == scratch_needed_size && - "scratch size should already be aligned correctly."); - - spi_tmpring_size = S_0286E8_WAVES(sctx->scratch_waves) | - S_0286E8_WAVESIZE(sctx->max_seen_scratch_bytes_per_wave >> 10); if (spi_tmpring_size != sctx->spi_tmpring_size) { sctx->spi_tmpring_size = spi_tmpring_size; si_mark_atom_dirty(sctx, &sctx->atoms.s.scratch_state); -- 2.7.4