From b3b2f97f2e25b2c4c72e5f04c945ce9c48ce6cd6 Mon Sep 17 00:00:00 2001 From: =?utf8?q?Marek=20Ol=C5=A1=C3=A1k?= Date: Fri, 19 Nov 2021 18:36:03 -0500 Subject: [PATCH] radeonsi: add Wave32 heuristics and shader profiles This generally works well. There are new cases that select Wave32, and there are shader profiles which adjust that. Reviewed-by: Pierre-Eric Pelloux-Prayer Part-of: --- src/gallium/drivers/radeonsi/si_pipe.c | 1 + src/gallium/drivers/radeonsi/si_pipe.h | 1 + src/gallium/drivers/radeonsi/si_shader.h | 6 ++ src/gallium/drivers/radeonsi/si_shader_nir.c | 36 ++++++++++ src/gallium/drivers/radeonsi/si_state_shaders.cpp | 84 +++++++++++++++++++++-- 5 files changed, 123 insertions(+), 5 deletions(-) diff --git a/src/gallium/drivers/radeonsi/si_pipe.c b/src/gallium/drivers/radeonsi/si_pipe.c index bd7d3f2..1717686 100644 --- a/src/gallium/drivers/radeonsi/si_pipe.c +++ b/src/gallium/drivers/radeonsi/si_pipe.c @@ -65,6 +65,7 @@ static const struct debug_named_value radeonsi_debug_options[] = { {"gisel", DBG(GISEL), "Enable LLVM global instruction selector."}, {"w32ge", DBG(W32_GE), "Use Wave32 for vertex, tessellation, and geometry shaders."}, {"w32ps", DBG(W32_PS), "Use Wave32 for pixel shaders."}, + {"w32psdiscard", DBG(W32_PS_DISCARD), "Use Wave32 for pixel shaders even if they contain discard and LLVM is buggy."}, {"w32cs", DBG(W32_CS), "Use Wave32 for computes shaders."}, {"w64ge", DBG(W64_GE), "Use Wave64 for vertex, tessellation, and geometry shaders."}, {"w64ps", DBG(W64_PS), "Use Wave64 for pixel shaders."}, diff --git a/src/gallium/drivers/radeonsi/si_pipe.h b/src/gallium/drivers/radeonsi/si_pipe.h index f01abc3..b0db575 100644 --- a/src/gallium/drivers/radeonsi/si_pipe.h +++ b/src/gallium/drivers/radeonsi/si_pipe.h @@ -197,6 +197,7 @@ enum DBG_GISEL, DBG_W32_GE, DBG_W32_PS, + DBG_W32_PS_DISCARD, DBG_W32_CS, DBG_W64_GE, DBG_W64_PS, diff --git a/src/gallium/drivers/radeonsi/si_shader.h b/src/gallium/drivers/radeonsi/si_shader.h index d410b7a..b740e2b 100644 --- a/src/gallium/drivers/radeonsi/si_shader.h +++ b/src/gallium/drivers/radeonsi/si_shader.h @@ -288,6 +288,10 @@ enum #define SI_NGG_CULL_CLIP_PLANE_ENABLE(enable) (((enable) & 0xff) << 5) #define SI_NGG_CULL_GET_CLIP_PLANE_ENABLE(x) (((x) >> 5) & 0xff) +#define SI_PROFILE_WAVE32 (1 << 0) +#define SI_PROFILE_WAVE64 (1 << 1) +#define SI_PROFILE_IGNORE_LLVM_DISCARD_BUG (1 << 2) + /** * For VS shader keys, describe any fixups required for vertex fetch. * @@ -344,6 +348,7 @@ struct si_shader_info { shader_info base; gl_shader_stage stage; + uint32_t options; /* bitmask of SI_PROFILE_* */ ubyte num_inputs; ubyte num_outputs; @@ -404,6 +409,7 @@ struct si_shader_info { bool uses_bindless_samplers; bool uses_bindless_images; bool uses_indirect_descriptor; + bool has_divergent_loop; bool uses_vmem_return_type_sampler_or_bvh; bool uses_vmem_return_type_other; /* all other VMEM loads and atomics with return */ diff --git a/src/gallium/drivers/radeonsi/si_shader_nir.c b/src/gallium/drivers/radeonsi/si_shader_nir.c index 77fa077..b43b535 100644 --- a/src/gallium/drivers/radeonsi/si_shader_nir.c +++ b/src/gallium/drivers/radeonsi/si_shader_nir.c @@ -31,6 +31,29 @@ #include "si_pipe.h" #include "si_shader_internal.h" #include "tgsi/tgsi_from_mesa.h" +#include "util/mesa-sha1.h" + + +struct si_shader_profile { + uint32_t sha1[SHA1_DIGEST_LENGTH32]; + uint32_t options; +}; + +static struct si_shader_profile profiles[] = +{ + { + /* Viewperf/Energy isn't affected by the discard bug. */ + {0x17118671, 0xd0102e0c, 0x947f3592, 0xb2057e7b, 0x4da5d9b0}, + SI_PROFILE_IGNORE_LLVM_DISCARD_BUG, + }, + { + /* Viewperf/Medical, a shader with a divergent loop doesn't benefit from Wave32, + * probably due to interpolation performance. + */ + {0x29f0f4a0, 0x0672258d, 0x47ccdcfd, 0x31e67dcc, 0xdcb1fda8}, + SI_PROFILE_WAVE64, + }, +}; static const nir_src *get_texture_src(nir_tex_instr *instr, nir_tex_src_type type) { @@ -397,6 +420,14 @@ void si_nir_scan_shader(const struct nir_shader *nir, struct si_shader_info *inf info->base = nir->info; info->stage = nir->info.stage; + /* Get options from shader profiles. */ + for (unsigned i = 0; i < ARRAY_SIZE(profiles); i++) { + if (_mesa_printed_sha1_equal(info->base.source_sha1, profiles[i].sha1)) { + info->options = profiles[i].options; + break; + } + } + if (nir->info.stage == MESA_SHADER_TESS_EVAL) { if (info->base.tess.primitive_mode == GL_ISOLINES) info->base.tess.primitive_mode = GL_LINES; @@ -531,6 +562,8 @@ void si_nir_scan_shader(const struct nir_shader *nir, struct si_shader_info *inf /* Trim output read masks based on write masks. */ for (unsigned i = 0; i < info->num_outputs; i++) info->output_readmask[i] &= info->output_usagemask[i]; + + info->has_divergent_loop = nir_has_divergent_loop((nir_shader*)nir); } static bool si_alu_to_scalar_filter(const nir_instr *instr, const void *data) @@ -932,5 +965,8 @@ char *si_finalize_nir(struct pipe_screen *screen, void *nirptr) if (sscreen->options.inline_uniforms) nir_find_inlinable_uniforms(nir); + NIR_PASS_V(nir, nir_convert_to_lcssa, true, true); /* required by divergence analysis */ + NIR_PASS_V(nir, nir_divergence_analysis); /* to find divergent loops */ + return NULL; } diff --git a/src/gallium/drivers/radeonsi/si_state_shaders.cpp b/src/gallium/drivers/radeonsi/si_state_shaders.cpp index eaf3cf2..6bd67a8 100644 --- a/src/gallium/drivers/radeonsi/si_state_shaders.cpp +++ b/src/gallium/drivers/radeonsi/si_state_shaders.cpp @@ -52,13 +52,87 @@ unsigned si_determine_wave_size(struct si_screen *sscreen, struct si_shader *sha (stage == MESA_SHADER_GEOMETRY && !shader->key.ge.as_ngg)) return 64; - if (stage == MESA_SHADER_COMPUTE) - return sscreen->debug_flags & DBG(W32_CS) ? 32 : 64; + /* Small workgroups use Wave32 unconditionally. */ + if (stage == MESA_SHADER_COMPUTE && info && + !info->base.workgroup_size_variable && + info->base.workgroup_size[0] * + info->base.workgroup_size[1] * + info->base.workgroup_size[2] <= 32) + return 32; + + /* Debug flags. */ + unsigned dbg_wave_size = 0; + if (sscreen->debug_flags & + (stage == MESA_SHADER_COMPUTE ? DBG(W32_CS) : + stage == MESA_SHADER_FRAGMENT ? DBG(W32_PS) | DBG(W32_PS_DISCARD) : DBG(W32_GE))) + dbg_wave_size = 32; + + if (sscreen->debug_flags & + (stage == MESA_SHADER_COMPUTE ? DBG(W64_CS) : + stage == MESA_SHADER_FRAGMENT ? DBG(W64_PS) : DBG(W64_GE))) { + assert(!dbg_wave_size); + dbg_wave_size = 64; + } + + /* Shader profiles. */ + unsigned profile_wave_size = 0; + if (info && info->options & SI_PROFILE_WAVE32) + profile_wave_size = 32; + + if (info && info->options & SI_PROFILE_WAVE64) { + assert(!profile_wave_size); + profile_wave_size = 64; + } + + if (profile_wave_size) { + /* Only debug flags override shader profiles. */ + if (dbg_wave_size) + return dbg_wave_size; + + return profile_wave_size; + } + + /* LLVM 13 and 14 have a bug that causes compile failures with discard in Wave32 + * in some cases. Alpha test in Wave32 is luckily unaffected. + */ + if (stage == MESA_SHADER_FRAGMENT && info->base.fs.uses_discard && + !(info && info->options & SI_PROFILE_IGNORE_LLVM_DISCARD_BUG) && + LLVM_VERSION_MAJOR >= 13 && !(sscreen->debug_flags & DBG(W32_PS_DISCARD))) + return 64; - if (stage == MESA_SHADER_FRAGMENT) - return sscreen->debug_flags & DBG(W32_PS) ? 32 : 64; + /* Debug flags except w32psdiscard don't override the discard bug workaround, + * but they override everything else. + */ + if (dbg_wave_size) + return dbg_wave_size; + + /* Pixel shaders without interp instructions don't suffer from reduced interpolation + * performance in Wave32, so use Wave32. This helps Piano and Voloplosion. + */ + if (stage == MESA_SHADER_FRAGMENT && !info->num_inputs) + return 32; + + /* There are a few very rare cases where VS is better with Wave32, and there are no known + * cases where Wave64 is better. + */ + if (stage <= MESA_SHADER_GEOMETRY) + return 32; + + /* TODO: Merged shaders must use the same wave size because the driver doesn't recompile + * individual shaders of merged shaders to match the wave size between them. + */ + bool merged_shader = shader && !shader->is_gs_copy_shader && + (shader->key.ge.as_ls || shader->key.ge.as_es || + stage == MESA_SHADER_TESS_CTRL || stage == MESA_SHADER_GEOMETRY); + + /* Divergent loops in Wave64 can end up having too many iterations in one half of the wave + * while the other half is idling but occupying VGPRs, preventing other waves from launching. + * Wave32 eliminates the idling half to allow the next wave to start. + */ + if (!merged_shader && info && info->has_divergent_loop) + return 32; - return sscreen->debug_flags & DBG(W32_GE) ? 32 : 64; + return 64; } /* SHADER_CACHE */ -- 2.7.4