From 81641b01555faa4dd1dfc7de2513ad8d63e77ab7 Mon Sep 17 00:00:00 2001
From: Rhys Perry
Date: Wed, 26 Jul 2023 16:15:35 +0100
Subject: [PATCH] radv: vectorize RT stack access

fossil-db (gfx1100):
Totals from 10 (0.01% of 133461) affected shaders:
MaxWaves: 176 -> 174 (-1.14%)
Instrs: 39260 -> 38710 (-1.40%)
CodeSize: 202272 -> 197288 (-2.46%)
VGPRs: 888 -> 900 (+1.35%)
Latency: 82306 -> 81762 (-0.66%); split: -0.68%, +0.02%
InvThroughput: 11182 -> 11158 (-0.21%); split: -0.52%, +0.30%
VClause: 721 -> 700 (-2.91%)
SClause: 1147 -> 1148 (+0.09%); split: -0.17%, +0.26%
Copies: 3625 -> 3891 (+7.34%)
PreVGPRs: 819 -> 845 (+3.17%); split: -0.37%, +3.54%

Signed-off-by: Rhys Perry
Reviewed-by: Samuel Pitoiset
Part-of:
---
 src/amd/vulkan/radv_pipeline.c    | 20 +++++++++++++++++---
 src/amd/vulkan/radv_pipeline_rt.c |  1 +
 2 files changed, 18 insertions(+), 3 deletions(-)

diff --git a/src/amd/vulkan/radv_pipeline.c b/src/amd/vulkan/radv_pipeline.c
index e89405e..bab0694 100644
--- a/src/amd/vulkan/radv_pipeline.c
+++ b/src/amd/vulkan/radv_pipeline.c
@@ -327,8 +327,19 @@ radv_mem_vectorize_callback(unsigned align_mul, unsigned align_offset, unsigned
    if (num_components > 4)
       return false;
 
-   /* >128 bit loads are split except with SMEM */
-   if (bit_size * num_components > 128)
+   bool is_scratch = false;
+   switch (low->intrinsic) {
+   case nir_intrinsic_load_stack:
+   case nir_intrinsic_store_stack:
+      is_scratch = true;
+      break;
+   default:
+      break;
+   }
+
+   /* >128 bit loads are split except with SMEM. On GFX6-8, >32 bit scratch loads are split. */
+   enum amd_gfx_level gfx_level = *(enum amd_gfx_level *)data;
+   if (bit_size * num_components > (is_scratch && gfx_level <= GFX8 ? 32 : 128))
       return false;
 
    uint32_t align;
@@ -343,7 +354,9 @@ radv_mem_vectorize_callback(unsigned align_mul, unsigned align_offset, unsigned
    case nir_intrinsic_store_ssbo:
    case nir_intrinsic_load_ssbo:
    case nir_intrinsic_load_ubo:
-   case nir_intrinsic_load_push_constant: {
+   case nir_intrinsic_load_push_constant:
+   case nir_intrinsic_load_stack:
+   case nir_intrinsic_store_stack: {
       unsigned max_components;
       if (align % 4 == 0)
          max_components = NIR_MAX_VEC_COMPONENTS;
@@ -554,6 +567,7 @@ radv_postprocess_nir(struct radv_device *device, const struct radv_pipeline_key
    nir_load_store_vectorize_options vectorize_opts = {
       .modes = nir_var_mem_ssbo | nir_var_mem_ubo | nir_var_mem_push_const | nir_var_mem_shared | nir_var_mem_global,
       .callback = radv_mem_vectorize_callback,
+      .cb_data = &gfx_level,
       .robust_modes = 0,
       /* On GFX6, read2/write2 is out-of-bounds if the offset register is negative, even if
        * the final offset is not.
diff --git a/src/amd/vulkan/radv_pipeline_rt.c b/src/amd/vulkan/radv_pipeline_rt.c
index e03e87a..fb9b35e 100644
--- a/src/amd/vulkan/radv_pipeline_rt.c
+++ b/src/amd/vulkan/radv_pipeline_rt.c
@@ -376,6 +376,7 @@ radv_rt_nir_to_asm(struct radv_device *device, struct vk_pipeline_cache *cache,
       .stack_alignment = 16,
       .localized_loads = true,
       .vectorizer_callback = radv_mem_vectorize_callback,
+      .vectorizer_data = &device->physical_device->rad_info.gfx_level,
    };
    uint32_t num_resume_shaders = 0;
    nir_shader **resume_shaders = NULL;
-- 
2.7.4