From: Marek Olšák Date: Sat, 25 Feb 2023 22:52:24 +0000 (-0500) Subject: radeonsi: increase NGG workgroup size to 256 for VS/TES with streamout and GS X-Git-Tag: upstream/23.3.3~11955 X-Git-Url: http://review.tizen.org/git/?a=commitdiff_plain;h=461ccb00e15a234dcca8c035e1303260a42a1393;p=platform%2Fupstream%2Fmesa.git radeonsi: increase NGG workgroup size to 256 for VS/TES with streamout and GS NGG streamout performance is limited by the workgroup size, so make it as large as possible. Since this uses si_get_max_workgroup_size() to set the NGG workgroup size, the side effect is that all GS is also getting an increase to 256, which is OK. Reviewed-by: Pierre-Eric Pelloux-Prayer Part-of: --- diff --git a/src/gallium/drivers/radeonsi/gfx10_shader_ngg.c b/src/gallium/drivers/radeonsi/gfx10_shader_ngg.c index 6fe8b1d..2880f32 100644 --- a/src/gallium/drivers/radeonsi/gfx10_shader_ngg.c +++ b/src/gallium/drivers/radeonsi/gfx10_shader_ngg.c @@ -117,8 +117,9 @@ bool gfx10_ngg_calculate_subgroup_info(struct si_shader *shader) gs_sel->screen->info.gfx_level >= GFX11 ? 3 : /* gfx11 requires at least 1 primitive per TG */ gs_sel->screen->info.gfx_level >= GFX10_3 ? 29 : (24 - 1 + max_verts_per_prim); bool max_vert_out_per_gs_instance = false; - unsigned max_gsprims_base = gs_sel->screen->ngg_subgroup_size; /* default prim group size clamp */ - unsigned max_esverts_base = gs_sel->screen->ngg_subgroup_size; + unsigned max_gsprims_base, max_esverts_base; + + max_gsprims_base = max_esverts_base = si_get_max_workgroup_size(shader); if (gs_stage == MESA_SHADER_GEOMETRY) { bool force_multi_cycling = false; diff --git a/src/gallium/drivers/radeonsi/si_pipe.c b/src/gallium/drivers/radeonsi/si_pipe.c index 06b13ba..4dfbb0d 100644 --- a/src/gallium/drivers/radeonsi/si_pipe.c +++ b/src/gallium/drivers/radeonsi/si_pipe.c @@ -1413,8 +1413,6 @@ static struct pipe_screen *radeonsi_screen_create_impl(struct radeon_winsys *ws, } } - sscreen->ngg_subgroup_size = 128; - if (sscreen->info.gfx_level >= GFX11) { unsigned attr_ring_size = sscreen->info.attribute_ring_size_per_se * sscreen->info.max_se; sscreen->attribute_ring = si_aligned_buffer_create(&sscreen->b, diff --git a/src/gallium/drivers/radeonsi/si_pipe.h b/src/gallium/drivers/radeonsi/si_pipe.h index 368e3ab..3c3ad6a 100644 --- a/src/gallium/drivers/radeonsi/si_pipe.h +++ b/src/gallium/drivers/radeonsi/si_pipe.h @@ -694,8 +694,6 @@ struct si_screen { * We want to minimize the impact on multithreaded Mesa. */ struct ac_llvm_compiler compiler_lowp[10]; - unsigned ngg_subgroup_size; - struct util_idalloc_mt buffer_ids; struct util_vertex_state_cache vertex_state_cache; diff --git a/src/gallium/drivers/radeonsi/si_shader.c b/src/gallium/drivers/radeonsi/si_shader.c index 095ed12..ac97f71 100644 --- a/src/gallium/drivers/radeonsi/si_shader.c +++ b/src/gallium/drivers/radeonsi/si_shader.c @@ -206,7 +206,11 @@ unsigned si_get_max_workgroup_size(const struct si_shader *shader) switch (shader->selector->stage) { case MESA_SHADER_VERTEX: case MESA_SHADER_TESS_EVAL: - return shader->key.ge.as_ngg ? shader->selector->screen->ngg_subgroup_size : 0; + /* Use the largest workgroup size for streamout */ + if (shader->key.ge.as_ngg) + return si_shader_uses_streamout(shader) ? 256 : 128; + else + return 0; case MESA_SHADER_TESS_CTRL: /* Return this so that LLVM doesn't remove s_barrier @@ -214,7 +218,7 @@ unsigned si_get_max_workgroup_size(const struct si_shader *shader) return shader->selector->screen->info.gfx_level >= GFX7 ? 128 : 0; case MESA_SHADER_GEOMETRY: - /* ngg_subgroup_size is only the input size. GS can always generate up to 256 vertices. */ + /* GS can always generate up to 256 vertices. */ return shader->selector->screen->info.gfx_level >= GFX9 ? 256 : 0; case MESA_SHADER_COMPUTE: diff --git a/src/gallium/drivers/radeonsi/si_shader.h b/src/gallium/drivers/radeonsi/si_shader.h index c4f7cda..a01944d 100644 --- a/src/gallium/drivers/radeonsi/si_shader.h +++ b/src/gallium/drivers/radeonsi/si_shader.h @@ -1058,7 +1058,7 @@ static inline bool gfx10_ngg_writes_user_edgeflags(struct si_shader *shader) shader->selector->info.writes_edgeflag; } -static inline bool si_shader_uses_streamout(struct si_shader *shader) +static inline bool si_shader_uses_streamout(const struct si_shader *shader) { return shader->selector->stage <= MESA_SHADER_GEOMETRY && shader->selector->info.enabled_streamout_buffer_mask &&