ac,radeonsi: rework and optimize how TMPRING_SIZE is set

author Marek Olšák <marek.olsak@amd.com>

Thu, 27 Jan 2022 00:38:26 +0000 (19:38 -0500)

committer Marge Bot <emma+marge@anholt.net>

Tue, 22 Feb 2022 11:41:04 +0000 (11:41 +0000)
author Marek Olšák <marek.olsak@amd.com>
Thu, 27 Jan 2022 00:38:26 +0000 (19:38 -0500)
committer Marge Bot <emma+marge@anholt.net>
Tue, 22 Feb 2022 11:41:04 +0000 (11:41 +0000)
diff --git a/src/amd/common/ac_shader_util.c b/src/amd/common/ac_shader_util.c

index 0660914..7ecea15 100644 (file)
--- a/src/amd/common/ac_shader_util.c
+++ b/src/amd/common/ac_shader_util.c
@@ -605,3 +605,41 @@ void ac_set_reg_cu_en(void *cs, unsigned reg_offset, uint32_t value, uint32_t cl
  
     set_sh_reg(cs, reg_offset, new_value);
  }
+
+/* Return the register value and tune bytes_per_wave to increase scratch performance. */
+void ac_get_scratch_tmpring_size(const struct radeon_info *info, unsigned max_scratch_waves,
+                                 unsigned bytes_per_wave, unsigned *max_seen_bytes_per_wave,
+                                 uint32_t *tmpring_size)
+{
+   /* SPI_TMPRING_SIZE and COMPUTE_TMPRING_SIZE are essentially scratch buffer descriptors.
+    * WAVES means NUM_RECORDS. WAVESIZE is the size of each element, meaning STRIDE.
+    * Thus, WAVESIZE must be constant while the scratch buffer is being used by the GPU.
+    *
+    * If you want to increase WAVESIZE without waiting for idle, you need to allocate a new
+    * scratch buffer and use it instead. This will result in multiple scratch buffers being
+    * used at the same time, each with a different WAVESIZE.
+    *
+    * If you want to decrease WAVESIZE, you don't have to. There is no advantage in decreasing
+    * WAVESIZE after it's been increased.
+    *
+    * Shaders with SCRATCH_EN=0 don't allocate scratch space.
+    */
+   const unsigned size_shift = 10;
+   const unsigned min_size_per_wave = BITFIELD_BIT(size_shift);
+
+   /* The LLVM shader backend should be reporting aligned scratch_sizes. */
+   assert((bytes_per_wave & BITFIELD_MASK(size_shift)) == 0 &&
+          "scratch size per wave should be aligned");
+
+   /* Add 1 scratch item to make the number of items odd. This should improve scratch
+    * performance by more randomly distributing scratch waves among memory channels.
+    */
+   if (bytes_per_wave)
+      bytes_per_wave |= min_size_per_wave;
+
+   *max_seen_bytes_per_wave = MAX2(*max_seen_bytes_per_wave, bytes_per_wave);
+
+   /* TODO: We could decrease WAVES to make the whole buffer fit into the infinity cache. */
+   *tmpring_size = S_0286E8_WAVES(max_scratch_waves) |
+                   S_0286E8_WAVESIZE(*max_seen_bytes_per_wave >> size_shift);
+}
diff --git a/src/amd/common/ac_shader_util.h b/src/amd/common/ac_shader_util.h

index 49f414d..5d7ee8f 100644 (file)
--- a/src/amd/common/ac_shader_util.h
+++ b/src/amd/common/ac_shader_util.h
@@ -123,6 +123,10 @@ void ac_set_reg_cu_en(void *cs, unsigned reg_offset, uint32_t value, uint32_t cl
                        unsigned value_shift, const struct radeon_info *info,
                        void set_sh_reg(void*, unsigned, uint32_t));
  
+void ac_get_scratch_tmpring_size(const struct radeon_info *info, unsigned max_scratch_waves,
+                                 unsigned bytes_per_wave, unsigned *max_seen_bytes_per_wave,
+                                 uint32_t *tmpring_size);
+
  #ifdef __cplusplus
  }
  #endif
diff --git a/src/gallium/drivers/radeonsi/si_compute.c b/src/gallium/drivers/radeonsi/si_compute.c

index 9354cf2..6a157c5 100644 (file)
--- a/src/gallium/drivers/radeonsi/si_compute.c
+++ b/src/gallium/drivers/radeonsi/si_compute.c
@@ -449,12 +449,11 @@ void si_emit_initial_compute_regs(struct si_context *sctx, struct radeon_cmdbuf
     radeon_end();
  }
  
-static bool si_setup_compute_scratch_buffer(struct si_context *sctx, struct si_shader *shader,
-                                            struct ac_shader_config *config)
+static bool si_setup_compute_scratch_buffer(struct si_context *sctx, struct si_shader *shader)
  {
     uint64_t scratch_bo_size, scratch_needed;
     scratch_bo_size = 0;
-   scratch_needed = config->scratch_bytes_per_wave * sctx->scratch_waves;
+   scratch_needed = sctx->max_seen_compute_scratch_bytes_per_wave * sctx->scratch_waves;
     if (sctx->compute_scratch_buffer)
        scratch_bo_size = sctx->compute_scratch_buffer->b.b.width0;
  
@@ -524,7 +523,12 @@ static bool si_switch_compute_shader(struct si_context *sctx, struct si_compute
        config->rsrc2 |= S_00B84C_LDS_SIZE(lds_blocks);
     }
  
-   if (!si_setup_compute_scratch_buffer(sctx, shader, config))
+   unsigned tmpring_size;
+   ac_get_scratch_tmpring_size(&sctx->screen->info, sctx->scratch_waves,
+                               config->scratch_bytes_per_wave,
+                               &sctx->max_seen_compute_scratch_bytes_per_wave, &tmpring_size);
+
+   if (!si_setup_compute_scratch_buffer(sctx, shader))
        return false;
  
     if (shader->scratch_bo) {
@@ -560,12 +564,7 @@ static bool si_switch_compute_shader(struct si_context *sctx, struct si_compute
                 "COMPUTE_PGM_RSRC2: 0x%08x\n",
                 config->rsrc1, config->rsrc2);
  
-   sctx->max_seen_compute_scratch_bytes_per_wave =
-      MAX2(sctx->max_seen_compute_scratch_bytes_per_wave, config->scratch_bytes_per_wave);
-
-   radeon_set_sh_reg(R_00B860_COMPUTE_TMPRING_SIZE,
-                     S_00B860_WAVES(sctx->scratch_waves) |
-                        S_00B860_WAVESIZE(sctx->max_seen_compute_scratch_bytes_per_wave >> 10));
+   radeon_set_sh_reg(R_00B860_COMPUTE_TMPRING_SIZE, tmpring_size);
     radeon_end();
  
     sctx->cs_shader_state.emitted_program = program;
diff --git a/src/gallium/drivers/radeonsi/si_pipe.c b/src/gallium/drivers/radeonsi/si_pipe.c

index 7bd310c..c0a90a3 100644 (file)
--- a/src/gallium/drivers/radeonsi/si_pipe.c
+++ b/src/gallium/drivers/radeonsi/si_pipe.c
@@ -693,10 +693,6 @@ static struct pipe_context *si_create_context(struct pipe_screen *screen, unsign
      * sctx->scratch_waves must be >= the maximum possible size of
      * 1 threadgroup, so that the hw doesn't hang from being unable
      * to start any.
-    *
-    * The recommended value is 4 per CU at most. Higher numbers don't
-    * bring much benefit, but they still occupy chip resources (think
-    * async compute). I've seen ~2% performance difference between 4 and 32.
      */
     sctx->scratch_waves =
        MAX2(32 * sscreen->info.num_good_compute_units, max_threads_per_block / 64);
diff --git a/src/gallium/drivers/radeonsi/si_state_shaders.cpp b/src/gallium/drivers/radeonsi/si_state_shaders.cpp

index 074337e..251107a 100644 (file)
--- a/src/gallium/drivers/radeonsi/si_state_shaders.cpp
+++ b/src/gallium/drivers/radeonsi/si_state_shaders.cpp
@@ -4054,23 +4054,11 @@ static bool si_update_scratch_relocs(struct si_context *sctx)
  
  bool si_update_spi_tmpring_size(struct si_context *sctx, unsigned bytes)
  {
-   /* SPI_TMPRING_SIZE.WAVESIZE must be constant for each scratch buffer.
-    * There are 2 cases to handle:
-    *
-    * - If the current needed size is less than the maximum seen size,
-    *   use the maximum seen size, so that WAVESIZE remains the same.
-    *
-    * - If the current needed size is greater than the maximum seen size,
-    *   the scratch buffer is reallocated, so we can increase WAVESIZE.
-    *
-    * Shaders that set SCRATCH_EN=0 don't allocate scratch space.
-    * Otherwise, the number of waves that can use scratch is
-    * SPI_TMPRING_SIZE.WAVES.
-    */
-   sctx->max_seen_scratch_bytes_per_wave = MAX2(sctx->max_seen_scratch_bytes_per_wave, bytes);
+   unsigned spi_tmpring_size;
+   ac_get_scratch_tmpring_size(&sctx->screen->info, sctx->scratch_waves, bytes,
+                               &sctx->max_seen_scratch_bytes_per_wave, &spi_tmpring_size);
  
     unsigned scratch_needed_size = sctx->max_seen_scratch_bytes_per_wave * sctx->scratch_waves;
-   unsigned spi_tmpring_size;
  
     if (scratch_needed_size > 0) {
        if (!sctx->scratch_buffer || scratch_needed_size > sctx->scratch_buffer->b.b.width0) {
@@ -4092,12 +4080,6 @@ bool si_update_spi_tmpring_size(struct si_context *sctx, unsigned bytes)
           return false;
     }
  
-   /* The LLVM shader backend should be reporting aligned scratch_sizes. */
-   assert((scratch_needed_size & ~0x3FF) == scratch_needed_size &&
-          "scratch size should already be aligned correctly.");
-
-   spi_tmpring_size = S_0286E8_WAVES(sctx->scratch_waves) |
-                      S_0286E8_WAVESIZE(sctx->max_seen_scratch_bytes_per_wave >> 10);
     if (spi_tmpring_size != sctx->spi_tmpring_size) {
        sctx->spi_tmpring_size = spi_tmpring_size;
        si_mark_atom_dirty(sctx, &sctx->atoms.s.scratch_state);
author	Marek Olšák <marek.olsak@amd.com>
	Thu, 27 Jan 2022 00:38:26 +0000 (19:38 -0500)
committer	Marge Bot <emma+marge@anholt.net>
	Tue, 22 Feb 2022 11:41:04 +0000 (11:41 +0000)
src/amd/common/ac_shader_util.c		patch \| blob \| history
src/amd/common/ac_shader_util.h		patch \| blob \| history
src/gallium/drivers/radeonsi/si_compute.c		patch \| blob \| history
src/gallium/drivers/radeonsi/si_pipe.c		patch \| blob \| history
src/gallium/drivers/radeonsi/si_state_shaders.cpp		patch \| blob \| history