set_sh_reg(cs, reg_offset, new_value);
}
+
+/* Return the register value and tune bytes_per_wave to increase scratch performance. */
+void ac_get_scratch_tmpring_size(const struct radeon_info *info, unsigned max_scratch_waves,
+ unsigned bytes_per_wave, unsigned *max_seen_bytes_per_wave,
+ uint32_t *tmpring_size)
+{
+ /* SPI_TMPRING_SIZE and COMPUTE_TMPRING_SIZE are essentially scratch buffer descriptors.
+ * WAVES means NUM_RECORDS. WAVESIZE is the size of each element, meaning STRIDE.
+ * Thus, WAVESIZE must be constant while the scratch buffer is being used by the GPU.
+ *
+ * If you want to increase WAVESIZE without waiting for idle, you need to allocate a new
+ * scratch buffer and use it instead. This will result in multiple scratch buffers being
+ * used at the same time, each with a different WAVESIZE.
+ *
+ * If you want to decrease WAVESIZE, you don't have to. There is no advantage in decreasing
+ * WAVESIZE after it's been increased.
+ *
+ * Shaders with SCRATCH_EN=0 don't allocate scratch space.
+ */
+ const unsigned size_shift = 10;
+ const unsigned min_size_per_wave = BITFIELD_BIT(size_shift);
+
+ /* The LLVM shader backend should be reporting aligned scratch_sizes. */
+ assert((bytes_per_wave & BITFIELD_MASK(size_shift)) == 0 &&
+ "scratch size per wave should be aligned");
+
+ /* Add 1 scratch item to make the number of items odd. This should improve scratch
+ * performance by more randomly distributing scratch waves among memory channels.
+ */
+ if (bytes_per_wave)
+ bytes_per_wave |= min_size_per_wave;
+
+ *max_seen_bytes_per_wave = MAX2(*max_seen_bytes_per_wave, bytes_per_wave);
+
+ /* TODO: We could decrease WAVES to make the whole buffer fit into the infinity cache. */
+ *tmpring_size = S_0286E8_WAVES(max_scratch_waves) |
+ S_0286E8_WAVESIZE(*max_seen_bytes_per_wave >> size_shift);
+}
unsigned value_shift, const struct radeon_info *info,
void set_sh_reg(void*, unsigned, uint32_t));
+void ac_get_scratch_tmpring_size(const struct radeon_info *info, unsigned max_scratch_waves,
+ unsigned bytes_per_wave, unsigned *max_seen_bytes_per_wave,
+ uint32_t *tmpring_size);
+
#ifdef __cplusplus
}
#endif
radeon_end();
}
-static bool si_setup_compute_scratch_buffer(struct si_context *sctx, struct si_shader *shader,
- struct ac_shader_config *config)
+static bool si_setup_compute_scratch_buffer(struct si_context *sctx, struct si_shader *shader)
{
uint64_t scratch_bo_size, scratch_needed;
scratch_bo_size = 0;
- scratch_needed = config->scratch_bytes_per_wave * sctx->scratch_waves;
+ scratch_needed = sctx->max_seen_compute_scratch_bytes_per_wave * sctx->scratch_waves;
if (sctx->compute_scratch_buffer)
scratch_bo_size = sctx->compute_scratch_buffer->b.b.width0;
config->rsrc2 |= S_00B84C_LDS_SIZE(lds_blocks);
}
- if (!si_setup_compute_scratch_buffer(sctx, shader, config))
+ unsigned tmpring_size;
+ ac_get_scratch_tmpring_size(&sctx->screen->info, sctx->scratch_waves,
+ config->scratch_bytes_per_wave,
+ &sctx->max_seen_compute_scratch_bytes_per_wave, &tmpring_size);
+
+ if (!si_setup_compute_scratch_buffer(sctx, shader))
return false;
if (shader->scratch_bo) {
"COMPUTE_PGM_RSRC2: 0x%08x\n",
config->rsrc1, config->rsrc2);
- sctx->max_seen_compute_scratch_bytes_per_wave =
- MAX2(sctx->max_seen_compute_scratch_bytes_per_wave, config->scratch_bytes_per_wave);
-
- radeon_set_sh_reg(R_00B860_COMPUTE_TMPRING_SIZE,
- S_00B860_WAVES(sctx->scratch_waves) |
- S_00B860_WAVESIZE(sctx->max_seen_compute_scratch_bytes_per_wave >> 10));
+ radeon_set_sh_reg(R_00B860_COMPUTE_TMPRING_SIZE, tmpring_size);
radeon_end();
sctx->cs_shader_state.emitted_program = program;
* sctx->scratch_waves must be >= the maximum possible size of
* 1 threadgroup, so that the hw doesn't hang from being unable
* to start any.
- *
- * The recommended value is 4 per CU at most. Higher numbers don't
- * bring much benefit, but they still occupy chip resources (think
- * async compute). I've seen ~2% performance difference between 4 and 32.
*/
sctx->scratch_waves =
MAX2(32 * sscreen->info.num_good_compute_units, max_threads_per_block / 64);
bool si_update_spi_tmpring_size(struct si_context *sctx, unsigned bytes)
{
- /* SPI_TMPRING_SIZE.WAVESIZE must be constant for each scratch buffer.
- * There are 2 cases to handle:
- *
- * - If the current needed size is less than the maximum seen size,
- * use the maximum seen size, so that WAVESIZE remains the same.
- *
- * - If the current needed size is greater than the maximum seen size,
- * the scratch buffer is reallocated, so we can increase WAVESIZE.
- *
- * Shaders that set SCRATCH_EN=0 don't allocate scratch space.
- * Otherwise, the number of waves that can use scratch is
- * SPI_TMPRING_SIZE.WAVES.
- */
- sctx->max_seen_scratch_bytes_per_wave = MAX2(sctx->max_seen_scratch_bytes_per_wave, bytes);
+ unsigned spi_tmpring_size;
+ ac_get_scratch_tmpring_size(&sctx->screen->info, sctx->scratch_waves, bytes,
+ &sctx->max_seen_scratch_bytes_per_wave, &spi_tmpring_size);
unsigned scratch_needed_size = sctx->max_seen_scratch_bytes_per_wave * sctx->scratch_waves;
- unsigned spi_tmpring_size;
if (scratch_needed_size > 0) {
if (!sctx->scratch_buffer || scratch_needed_size > sctx->scratch_buffer->b.b.width0) {
return false;
}
- /* The LLVM shader backend should be reporting aligned scratch_sizes. */
- assert((scratch_needed_size & ~0x3FF) == scratch_needed_size &&
- "scratch size should already be aligned correctly.");
-
- spi_tmpring_size = S_0286E8_WAVES(sctx->scratch_waves) |
- S_0286E8_WAVESIZE(sctx->max_seen_scratch_bytes_per_wave >> 10);
if (spi_tmpring_size != sctx->spi_tmpring_size) {
sctx->spi_tmpring_size = spi_tmpring_size;
si_mark_atom_dirty(sctx, &sctx->atoms.s.scratch_state);