From a0fcd377310995035d22cdaeb26474df9ff16e42 Mon Sep 17 00:00:00 2001
From: =?utf8?q?Marek=20Ol=C5=A1=C3=A1k?=
Date: Mon, 10 May 2021 20:31:19 -0400
Subject: [PATCH] radeonsi: remove a twice duplicated workaround for VERT_GRP_SIZE
MIME-Version: 1.0
Content-Type: text/plain; charset=utf8
Content-Transfer-Encoding: 8bit

This enables better lane occupancy.

Acked-by: Timur Kristóf
Acked-by: Pierre-Eric Pelloux-Prayer
Part-of:
---
 src/gallium/drivers/radeonsi/gfx10_shader_ngg.c | 23 +----------------------
 src/gallium/drivers/radeonsi/si_state_shaders.c | 20 ++++++++++++--------
 2 files changed, 13 insertions(+), 30 deletions(-)

diff --git a/src/gallium/drivers/radeonsi/gfx10_shader_ngg.c b/src/gallium/drivers/radeonsi/gfx10_shader_ngg.c
index 49823ac..e837a64 100644
--- a/src/gallium/drivers/radeonsi/gfx10_shader_ngg.c
+++ b/src/gallium/drivers/radeonsi/gfx10_shader_ngg.c
@@ -1964,16 +1964,6 @@ bool gfx10_ngg_calculate_subgroup_info(struct si_shader *shader)
       max_esverts_base = 128;
    }
 
-   /* Hardware has the following non-natural restrictions on the value
-    * of GE_CNTL.VERT_GRP_SIZE based on based on the primitive type of
-    * the draw:
-    *  - at most 252 for any line input primitive type
-    *  - at most 251 for any quad input primitive type
-    *  - at most 251 for triangle strips with adjacency (this happens to
-    *    be the natural limit for triangle *lists* with adjacency)
-    */
-   max_esverts_base = MIN2(max_esverts_base, 251 + max_verts_per_prim - 1);
-
    if (gs_stage == MESA_SHADER_GEOMETRY) {
       bool force_multi_cycling = false;
       unsigned max_out_verts_per_gsprim = gs_sel->info.base.gs.vertices_out * gs_num_invocations;
@@ -2125,18 +2115,7 @@ retry_select_mode:
       }
    }
 
-   /* On gfx10, the GE only checks against the maximum number of ES verts after
-    * allocating a full GS primitive. So we need to ensure that whenever
-    * this check passes, there is enough space for a full primitive without
-    * vertex reuse.
-    */
-   if (gs_sel->screen->info.chip_class == GFX10 &&
-       !(shader->key.opt.ngg_culling & (SI_NGG_CULL_GS_FAST_LAUNCH_TRI_LIST |
-                                        SI_NGG_CULL_GS_FAST_LAUNCH_TRI_STRIP)))
-      shader->ngg.hw_max_esverts = max_esverts - max_verts_per_prim + 1;
-   else
-      shader->ngg.hw_max_esverts = max_esverts;
-
+   shader->ngg.hw_max_esverts = max_esverts;
    shader->ngg.max_gsprims = max_gsprims;
    shader->ngg.max_out_verts = max_out_vertices;
    shader->ngg.prim_amp_factor = prim_amp_factor;
diff --git a/src/gallium/drivers/radeonsi/si_state_shaders.c b/src/gallium/drivers/radeonsi/si_state_shaders.c
index 29400f1..3481477 100644
--- a/src/gallium/drivers/radeonsi/si_state_shaders.c
+++ b/src/gallium/drivers/radeonsi/si_state_shaders.c
@@ -1314,19 +1314,23 @@ static void gfx10_shader_ngg(struct si_screen *sscreen, struct si_shader *shader
                      S_03096C_VERT_GRP_SIZE(shader->ngg.hw_max_esverts) |
                      S_03096C_BREAK_WAVE_AT_EOI(break_wave_at_eoi);
 
-   /* Bug workaround for a possible hang with non-tessellation cases.
-    * Tessellation always sets GE_CNTL.VERT_GRP_SIZE = 0
+   /* On gfx10, the GE only checks against the maximum number of ES verts after
+    * allocating a full GS primitive. So we need to ensure that whenever
+    * this check passes, there is enough space for a full primitive without
+    * vertex reuse. VERT_GRP_SIZE=256 doesn't need this. We should always get 256
+    * if we have enough LDS.
     *
-    * Requirement: GE_CNTL.VERT_GRP_SIZE = VGT_GS_ONCHIP_CNTL.ES_VERTS_PER_SUBGRP - 5
+    * Tessellation is unaffected because it always sets GE_CNTL.VERT_GRP_SIZE = 0.
     */
    if ((sscreen->info.chip_class == GFX10) &&
        (es_stage == MESA_SHADER_VERTEX || gs_stage == MESA_SHADER_VERTEX) && /* = no tess */
-       shader->ngg.hw_max_esverts != 256) {
+       shader->ngg.hw_max_esverts != 256 &&
+       shader->ngg.hw_max_esverts > 5) {
+      /* This could be based on the input primitive type. 5 is the worst case
+       * for primitive types with adjacency.
+       */
       shader->ge_cntl &= C_03096C_VERT_GRP_SIZE;
-
-      if (shader->ngg.hw_max_esverts > 5) {
-         shader->ge_cntl |= S_03096C_VERT_GRP_SIZE(shader->ngg.hw_max_esverts - 5);
-      }
+      shader->ge_cntl |= S_03096C_VERT_GRP_SIZE(shader->ngg.hw_max_esverts - 5);
    }
 }
 
-- 
2.7.4