From 9db4b331f9f35f123e05a103d0e721aad873a006 Mon Sep 17 00:00:00 2001 From: =?utf8?q?Marek=20Ol=C5=A1=C3=A1k?= Date: Tue, 9 Feb 2021 18:58:54 -0500 Subject: [PATCH] radeonsi: improve comments in si_emit_derived_tess_state MIME-Version: 1.0 Content-Type: text/plain; charset=utf8 Content-Transfer-Encoding: 8bit Reviewed-by: Zoltán Böszörményi Reviewed-by: Pierre-Eric Pelloux-Prayer Part-of: --- src/gallium/drivers/radeonsi/si_state_draw.cpp | 19 ++++++++----------- 1 file changed, 8 insertions(+), 11 deletions(-) diff --git a/src/gallium/drivers/radeonsi/si_state_draw.cpp b/src/gallium/drivers/radeonsi/si_state_draw.cpp index 62f4f3e..0f5d620 100644 --- a/src/gallium/drivers/radeonsi/si_state_draw.cpp +++ b/src/gallium/drivers/radeonsi/si_state_draw.cpp @@ -264,9 +264,10 @@ static void si_emit_derived_tess_state(struct si_context *sctx, lds_per_patch = MAX2(input_patch_size, output_patch_size); } - /* Ensure that we only need one wave per SIMD so we don't need to check - * resource usage. Also ensures that the number of tcs in and out - * vertices per threadgroup are at most 256. + /* Ensure that we only need 4 waves per CU, so that we don't need to check + * resource usage (such as whether we have enough VGPRs to fit the whole + * threadgroup into the CU). It also ensures that the number of tcs in and out + * vertices per threadgroup are at most 256, which is the hw limit. */ unsigned max_verts_per_patch = MAX2(num_tcs_input_cp, num_tcs_output_cp); *num_patches = 256 / max_verts_per_patch; @@ -290,11 +291,8 @@ static void si_emit_derived_tess_state(struct si_context *sctx, /* Make sure that the data fits in LDS. This assumes the shaders only * use LDS for the inputs and outputs. * - * While GFX7 can use 64K per threadgroup, there is a hang on Stoney - * with 2 CUs if we use more than 32K. The closed Vulkan driver also - * uses 32K at most on all GCN chips. - * - * Use 16K so that we can fit 2 workgroups on the same CU. + * The maximum allowed LDS size is 32K. Higher numbers can hang. + * Use 16K as the maximum, so that we can fit 2 workgroups on the same CU. */ ASSERTED unsigned max_lds_size = 32 * 1024; /* hw limit */ unsigned target_lds_size = 16 * 1024; /* target at least 2 workgroups per CU, 16K each */ @@ -302,9 +300,8 @@ static void si_emit_derived_tess_state(struct si_context *sctx, *num_patches = MAX2(*num_patches, 1); assert(*num_patches * lds_per_patch <= max_lds_size); - /* Make sure that vector lanes are reasonably occupied. It probably - * doesn't matter much because this is LS-HS, and TES is likely to - * occupy significantly more CUs. + /* Make sure that vector lanes are fully occupied by cutting off the last wave + * if it's only partially filled. */ unsigned temp_verts_per_tg = *num_patches * max_verts_per_patch; unsigned wave_size = sctx->screen->ge_wave_size; -- 2.7.4