From 30c3b2c0b602a7e9f9d1246c2de61cf1ef38f4ae Mon Sep 17 00:00:00 2001 From: =?utf8?q?Marek=20Ol=C5=A1=C3=A1k?= Date: Fri, 25 Sep 2020 16:45:22 -0400 Subject: [PATCH] radeonsi: simplify NGG culling enablement and add radeonsi_shader_culling option Add a vertex count threshold into si_shader_selector to simplify the draw_vbo code. The new option is supposed to be used in 00-mesa-defaults.conf and should be tweaked for best performance unlike the AMD_DEBUG experimental options. Acked-by: Pierre-Eric Pelloux-Prayer Part-of: --- src/gallium/drivers/radeonsi/si_debug_options.h | 1 + src/gallium/drivers/radeonsi/si_pipe.c | 4 ---- src/gallium/drivers/radeonsi/si_pipe.h | 2 -- src/gallium/drivers/radeonsi/si_shader.h | 3 ++- src/gallium/drivers/radeonsi/si_state_draw.c | 17 +++++++--------- src/gallium/drivers/radeonsi/si_state_shaders.c | 27 +++++++++++++++++++++---- 6 files changed, 33 insertions(+), 21 deletions(-) diff --git a/src/gallium/drivers/radeonsi/si_debug_options.h b/src/gallium/drivers/radeonsi/si_debug_options.h index 9173aa2..cf07c2f 100644 --- a/src/gallium/drivers/radeonsi/si_debug_options.h +++ b/src/gallium/drivers/radeonsi/si_debug_options.h @@ -10,5 +10,6 @@ OPT_BOOL(prim_restart_tri_strips_only, false, "Only enable primitive restart for OPT_BOOL(no_infinite_interp, false, "Kill PS with infinite interp coeff") OPT_BOOL(clamp_div_by_zero, false, "Clamp div by zero (x / 0 becomes FLT_MAX instead of NaN)") OPT_BOOL(no_trunc_coord, false, "Always set TRUNC_COORD=0") +OPT_BOOL(shader_culling, false, "Cull primitives in shaders when benefical (without tess and GS)") #undef OPT_BOOL diff --git a/src/gallium/drivers/radeonsi/si_pipe.c b/src/gallium/drivers/radeonsi/si_pipe.c index ab47601..9676894 100644 --- a/src/gallium/drivers/radeonsi/si_pipe.c +++ b/src/gallium/drivers/radeonsi/si_pipe.c @@ -1190,10 +1190,6 @@ static struct pipe_screen *radeonsi_screen_create_impl(struct radeon_winsys *ws, sscreen->info.family != CHIP_NAVI14 && 
sscreen->info.has_dedicated_vram; sscreen->use_ngg_culling = sscreen->use_ngg && !(sscreen->debug_flags & DBG(NO_NGG_CULLING)); - sscreen->always_use_ngg_culling_all = - sscreen->use_ngg_culling && sscreen->debug_flags & DBG(ALWAYS_NGG_CULLING_ALL); - sscreen->always_use_ngg_culling_tess = - sscreen->use_ngg_culling && sscreen->debug_flags & DBG(ALWAYS_NGG_CULLING_TESS); sscreen->use_ngg_streamout = false; /* Only enable primitive binning on APUs by default. */ diff --git a/src/gallium/drivers/radeonsi/si_pipe.h b/src/gallium/drivers/radeonsi/si_pipe.h index 6cadec8..8854af6 100644 --- a/src/gallium/drivers/radeonsi/si_pipe.h +++ b/src/gallium/drivers/radeonsi/si_pipe.h @@ -527,8 +527,6 @@ struct si_screen { bool llvm_has_working_vgpr_indexing; bool use_ngg; bool use_ngg_culling; - bool always_use_ngg_culling_all; - bool always_use_ngg_culling_tess; bool use_ngg_streamout; struct { diff --git a/src/gallium/drivers/radeonsi/si_shader.h b/src/gallium/drivers/radeonsi/si_shader.h index d267612..a8aba0b 100644 --- a/src/gallium/drivers/radeonsi/si_shader.h +++ b/src/gallium/drivers/radeonsi/si_shader.h @@ -422,7 +422,6 @@ struct si_shader_selector { ubyte sampler_and_images_descriptors_index; bool vs_needs_prolog; bool prim_discard_cs_allowed; - bool ngg_culling_allowed; ubyte cs_shaderbufs_sgpr_index; ubyte cs_num_shaderbufs_in_user_sgprs; ubyte cs_images_sgpr_index; @@ -431,6 +430,8 @@ struct si_shader_selector { ubyte num_vs_inputs; ubyte num_vbos_in_user_sgprs; unsigned pa_cl_vs_out_cntl; + unsigned ngg_cull_vert_threshold; /* 0 = disabled */ + unsigned ngg_cull_nonindexed_fast_launch_vert_threshold; /* 0 = disabled */ ubyte clipdist_mask; ubyte culldist_mask; ubyte rast_prim; diff --git a/src/gallium/drivers/radeonsi/si_state_draw.c b/src/gallium/drivers/radeonsi/si_state_draw.c index 7983f81..79dfe5a 100644 --- a/src/gallium/drivers/radeonsi/si_state_draw.c +++ b/src/gallium/drivers/radeonsi/si_state_draw.c @@ -1979,17 +1979,14 @@ static void si_draw_vbo(struct 
pipe_context *ctx, const struct pipe_draw_info *i } /* Update NGG culling settings. */ + struct si_shader_selector *hw_vs; if (sctx->ngg && !dispatch_prim_discard_cs && rast_prim == PIPE_PRIM_TRIANGLES && - !sctx->gs_shader.cso && /* GS doesn't support NGG culling. */ - (sctx->screen->always_use_ngg_culling_all || - (sctx->tes_shader.cso && sctx->screen->always_use_ngg_culling_tess) || - /* At least 1024 non-indexed vertices (8 subgroups) are needed - * per draw call (no TES/GS) to enable NGG culling. - */ - (!index_size && direct_count >= 1024 && - (prim == PIPE_PRIM_TRIANGLES || prim == PIPE_PRIM_TRIANGLE_STRIP) && - !sctx->tes_shader.cso)) && - si_get_vs(sctx)->cso->ngg_culling_allowed) { + (hw_vs = si_get_vs(sctx)->cso) && + (direct_count > hw_vs->ngg_cull_vert_threshold || + (!index_size && + direct_count > hw_vs->ngg_cull_nonindexed_fast_launch_vert_threshold && + prim & ((1 << PIPE_PRIM_TRIANGLES) | + (1 << PIPE_PRIM_TRIANGLE_STRIP))))) { unsigned ngg_culling = 0; if (rs->rasterizer_discard) { diff --git a/src/gallium/drivers/radeonsi/si_state_shaders.c b/src/gallium/drivers/radeonsi/si_state_shaders.c index ee252ac..967f6de 100644 --- a/src/gallium/drivers/radeonsi/si_state_shaders.c +++ b/src/gallium/drivers/radeonsi/si_state_shaders.c @@ -2771,14 +2771,12 @@ static void *si_create_shader_selector(struct pipe_context *ctx, default:; } - sel->ngg_culling_allowed = + bool ngg_culling_allowed = sscreen->info.chip_class >= GFX10 && sscreen->info.has_dedicated_vram && sscreen->use_ngg_culling && (sel->info.stage == MESA_SHADER_VERTEX || - (sel->info.stage == MESA_SHADER_TESS_EVAL && - (sscreen->always_use_ngg_culling_all || - sscreen->always_use_ngg_culling_tess))) && + sel->info.stage == MESA_SHADER_TESS_EVAL) && sel->info.writes_position && !sel->info.writes_viewport_index && /* cull only against viewport 0 */ !sel->info.base.writes_memory && !sel->so.num_outputs && @@ -2786,6 +2784,27 @@ static void *si_create_shader_selector(struct pipe_context *ctx, 
(!sel->info.base.vs.blit_sgprs_amd && !sel->info.base.vs.window_space_position)); + sel->ngg_cull_vert_threshold = UINT_MAX; /* disabled (changed below) */ + sel->ngg_cull_nonindexed_fast_launch_vert_threshold = UINT_MAX; + + if (ngg_culling_allowed) { + if (sel->info.stage == MESA_SHADER_VERTEX) { + /* 1000 non-indexed vertices (roughly 8 primgroups) are needed + * per draw call (no TES/GS) to enable NGG culling by default. + */ + sel->ngg_cull_nonindexed_fast_launch_vert_threshold = 1000; + + if (sscreen->debug_flags & DBG(ALWAYS_NGG_CULLING_ALL)) + sel->ngg_cull_vert_threshold = 0; /* always enabled */ + else if (sscreen->options.shader_culling) + sel->ngg_cull_vert_threshold = 1500; /* vertex count must be more than this */ + } else if (sel->info.stage == MESA_SHADER_TESS_EVAL) { + if (sscreen->debug_flags & DBG(ALWAYS_NGG_CULLING_ALL) || + sscreen->debug_flags & DBG(ALWAYS_NGG_CULLING_TESS)) + sel->ngg_cull_vert_threshold = 0; /* always enabled */ + } + } + /* PA_CL_VS_OUT_CNTL */ if (sctx->chip_class <= GFX9) sel->pa_cl_vs_out_cntl = si_get_vs_out_cntl(sel, NULL, false); -- 2.7.4