From ccbd551192ea08864da5ce88d51507572743747e Mon Sep 17 00:00:00 2001 From: =?utf8?q?Marek=20Ol=C5=A1=C3=A1k?= Date: Sun, 26 Sep 2021 08:45:19 -0400 Subject: [PATCH] radeonsi: disallow NGG fast launch on Navi1x because VGT_FLUSH makes it slower This improves viewperf performance on Navi1x. All Navi1x fast launch workarounds are removed and all fast launch codepaths are disabled. Acked-by: Pierre-Eric Pelloux-Prayer Part-of: --- src/gallium/drivers/radeonsi/si_gfx_cs.c | 8 +++----- src/gallium/drivers/radeonsi/si_state_draw.cpp | 25 ++++++++++++------------- 2 files changed, 15 insertions(+), 18 deletions(-) diff --git a/src/gallium/drivers/radeonsi/si_gfx_cs.c b/src/gallium/drivers/radeonsi/si_gfx_cs.c index 007a523..f44f8e1 100644 --- a/src/gallium/drivers/radeonsi/si_gfx_cs.c +++ b/src/gallium/drivers/radeonsi/si_gfx_cs.c @@ -358,12 +358,10 @@ void si_begin_new_gfx_cs(struct si_context *ctx, bool first_cs) SI_CONTEXT_INV_L2 | SI_CONTEXT_START_PIPELINE_STATS; ctx->pipeline_stats_enabled = -1; - /* We don't know if the last draw used NGG or NGG fast launch because it can be a different - * process. When switching NGG->legacy or NGG->FAST_LAUNCH, we need to flush VGT for certain - * hw generations. + /* We don't know if the last draw used NGG because it can be a different process. + * When switching NGG->legacy, we need to flush VGT for certain hw generations. */ - if ((ctx->screen->info.has_vgt_flush_ngg_legacy_bug && !ctx->ngg) || - (ctx->chip_class == GFX10 && ctx->ngg_culling & SI_NGG_CULL_GS_FAST_LAUNCH_ALL)) + if (ctx->screen->info.has_vgt_flush_ngg_legacy_bug && !ctx->ngg) ctx->flags |= SI_CONTEXT_VGT_FLUSH; if (ctx->border_color_buffer) { diff --git a/src/gallium/drivers/radeonsi/si_state_draw.cpp b/src/gallium/drivers/radeonsi/si_state_draw.cpp index 847515b..671966b 100644 --- a/src/gallium/drivers/radeonsi/si_state_draw.cpp +++ b/src/gallium/drivers/radeonsi/si_state_draw.cpp @@ -1601,7 +1601,8 @@ static void si_emit_draw_packets(struct si_context *sctx, const struct pipe_draw } } else { /* Set the index buffer for fast launch. The VS prolog will load the indices. */ - if (NGG && sctx->ngg_culling & SI_NGG_CULL_GS_FAST_LAUNCH_INDEX_SIZE_PACKED(~0)) { + if (GFX_VERSION >= GFX10_3 && NGG && + sctx->ngg_culling & SI_NGG_CULL_GS_FAST_LAUNCH_INDEX_SIZE_PACKED(~0)) { index_max_size = (indexbuf->width0 - index_offset) >> util_logbase2(original_index_size); radeon_add_to_buffer_list(sctx, &sctx->gfx_cs, si_resource(indexbuf), @@ -2173,11 +2174,14 @@ static void si_draw_vbo(struct pipe_context *ctx, * A draw must have at least 1 full primitive. * The fast launch doesn't work with tessellation. * + * Fast launch is disabled on Navi1x because enabling it requires VGT_FLUSH, + * which decreases performance by up to 10%. Only use fast launch on gfx10.3 and newer. + * * Since NGG fast launch is enabled by VGT_SHADER_STAGES_EN, which causes a context roll, * which decreases performance, decrease the frequency of switching it on/off using * a high vertex count threshold. */ - if (!HAS_TESS && total_direct_count >= 8000 && + if (GFX_VERSION >= GFX10_3 && !HAS_TESS && total_direct_count >= 8000 && !(sctx->screen->debug_flags & DBG(NO_FAST_LAUNCH))) { if (prim == PIPE_PRIM_TRIANGLES && !index_size) { ngg_culling |= SI_NGG_CULL_GS_FAST_LAUNCH_TRI_LIST; @@ -2208,19 +2212,14 @@ static void si_draw_vbo(struct pipe_context *ctx, return; } - /* Insert a VGT_FLUSH when enabling fast launch changes to prevent hangs. - * See issues #2418, #2426, #2434 - * - * This is the setting that is used by the draw. + /* si_update_shaders can clear the ngg_culling settings if the shader compilation hasn't + * finished. */ - if (GFX_VERSION >= GFX10) { + if (GFX_VERSION >= GFX10 && NGG) { uint8_t ngg_culling = si_get_vs_inline(sctx, HAS_TESS, HAS_GS)->current->key.opt.ngg_culling; - if (GFX_VERSION == GFX10 && - !(old_ngg_culling & SI_NGG_CULL_GS_FAST_LAUNCH_ALL) && - ngg_culling & SI_NGG_CULL_GS_FAST_LAUNCH_ALL) - sctx->flags |= SI_CONTEXT_VGT_FLUSH; - if (old_ngg_culling & SI_NGG_CULL_GS_FAST_LAUNCH_INDEX_SIZE_PACKED(~0) && + if (GFX_VERSION >= GFX10_3 && + old_ngg_culling & SI_NGG_CULL_GS_FAST_LAUNCH_INDEX_SIZE_PACKED(~0) && !(ngg_culling & SI_NGG_CULL_GS_FAST_LAUNCH_INDEX_SIZE_PACKED(~0))) { /* Need to re-set these, because we have bound an index buffer there. */ sctx->shader_pointers_dirty |= @@ -2235,7 +2234,7 @@ static void si_draw_vbo(struct pipe_context *ctx, } /* ngg_culling can be changed after si_update_shaders above, so determine index_size here. */ - if (GFX_VERSION >= GFX10 && NGG && + if (GFX_VERSION >= GFX10_3 && NGG && sctx->ngg_culling & SI_NGG_CULL_GS_FAST_LAUNCH_INDEX_SIZE_PACKED(~0)) index_size = 0; /* The index buffer will be emulated. */ -- 2.7.4