radeonsi/gfx11: fix compute scratch buffer - WAVES is always per SE
author: Marek Olšák <marek.olsak@amd.com>
Wed, 2 Nov 2022 18:34:58 +0000 (14:34 -0400)
committer: Eric Engestrom <eric@engestrom.ch>
Wed, 9 Nov 2022 21:22:06 +0000 (21:22 +0000)
Fixes: ba02ed91a60 - ac/gfx11: fix the scratch buffer

Reviewed-by: Pierre-Eric Pelloux-Prayer <pierre-eric.pelloux-prayer@amd.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/19477>
(cherry picked from commit bdfacd0a24e023515fb7b7fae4a279cff0fbac4e)

.pick_status.json
src/amd/common/ac_shader_util.c
src/amd/common/ac_shader_util.h
src/gallium/drivers/radeonsi/si_compute.c
src/gallium/drivers/radeonsi/si_state_shaders.cpp

index c379501..0201dac 100644 (file)
         "description": "radeonsi/gfx11: fix compute scratch buffer - WAVES is always per SE",
         "nominated": true,
         "nomination_type": 1,
-        "resolution": 0,
+        "resolution": 1,
         "main_sha": null,
         "because_sha": "ba02ed91a60839f2a6dc6a89fd9de1144b0788aa"
     },
index cba42f1..fc95f4a 100644 (file)
@@ -916,7 +916,7 @@ void ac_set_reg_cu_en(void *cs, unsigned reg_offset, uint32_t value, uint32_t cl
 }
 
 /* Return the register value and tune bytes_per_wave to increase scratch performance. */
-void ac_get_scratch_tmpring_size(const struct radeon_info *info, bool compute,
+void ac_get_scratch_tmpring_size(const struct radeon_info *info,
                                  unsigned bytes_per_wave, unsigned *max_seen_bytes_per_wave,
                                  uint32_t *tmpring_size)
 {
@@ -949,8 +949,8 @@ void ac_get_scratch_tmpring_size(const struct radeon_info *info, bool compute,
    *max_seen_bytes_per_wave = MAX2(*max_seen_bytes_per_wave, bytes_per_wave);
 
    unsigned max_scratch_waves = info->max_scratch_waves;
-   if (info->gfx_level >= GFX11 && !compute)
-      max_scratch_waves /= info->num_se; /* WAVES is per SE for SPI_TMPRING_SIZE. */
+   if (info->gfx_level >= GFX11)
+      max_scratch_waves /= info->num_se; /* WAVES is per SE */
 
    /* TODO: We could decrease WAVES to make the whole buffer fit into the infinity cache. */
    *tmpring_size = S_0286E8_WAVES(max_scratch_waves) |
index 8799665..6552bb8 100644 (file)
@@ -166,7 +166,7 @@ void ac_set_reg_cu_en(void *cs, unsigned reg_offset, uint32_t value, uint32_t cl
                       unsigned value_shift, const struct radeon_info *info,
                       void set_sh_reg(void*, unsigned, uint32_t));
 
-void ac_get_scratch_tmpring_size(const struct radeon_info *info, bool compute,
+void ac_get_scratch_tmpring_size(const struct radeon_info *info,
                                  unsigned bytes_per_wave, unsigned *max_seen_bytes_per_wave,
                                  uint32_t *tmpring_size);
 
index f59cf3a..a4f75e8 100644 (file)
@@ -547,7 +547,7 @@ static bool si_switch_compute_shader(struct si_context *sctx, struct si_compute
    }
 
    unsigned tmpring_size;
-   ac_get_scratch_tmpring_size(&sctx->screen->info, true,
+   ac_get_scratch_tmpring_size(&sctx->screen->info,
                                config->scratch_bytes_per_wave,
                                &sctx->max_seen_compute_scratch_bytes_per_wave, &tmpring_size);
 
index c2c0918..1601234 100644 (file)
@@ -4054,7 +4054,7 @@ static bool si_update_scratch_relocs(struct si_context *sctx)
 bool si_update_spi_tmpring_size(struct si_context *sctx, unsigned bytes)
 {
    unsigned spi_tmpring_size;
-   ac_get_scratch_tmpring_size(&sctx->screen->info, false, bytes,
+   ac_get_scratch_tmpring_size(&sctx->screen->info, bytes,
                                &sctx->max_seen_scratch_bytes_per_wave, &spi_tmpring_size);
 
    unsigned scratch_needed_size = sctx->max_seen_scratch_bytes_per_wave *