From 76664c1677356d3d568b0c4b5db2700ef926b015 Mon Sep 17 00:00:00 2001 From: =?utf8?q?Marek=20Ol=C5=A1=C3=A1k?= Date: Wed, 7 Jun 2023 23:27:18 -0400 Subject: [PATCH] radeonsi: shrink the last field of tcs_offchip_layout due to LDS limit Reviewed-by: Pierre-Eric Pelloux-Prayer Part-of: --- src/gallium/drivers/radeonsi/si_nir_lower_abi.c | 2 +- src/gallium/drivers/radeonsi/si_shader_internal.h | 13 +++++++++---- src/gallium/drivers/radeonsi/si_shader_llvm_tess.c | 2 +- src/gallium/drivers/radeonsi/si_state_draw.cpp | 4 ++-- 4 files changed, 13 insertions(+), 8 deletions(-) diff --git a/src/gallium/drivers/radeonsi/si_nir_lower_abi.c b/src/gallium/drivers/radeonsi/si_nir_lower_abi.c index 3f26764..4841a37 100644 --- a/src/gallium/drivers/radeonsi/si_nir_lower_abi.c +++ b/src/gallium/drivers/radeonsi/si_nir_lower_abi.c @@ -399,7 +399,7 @@ static bool lower_intrinsic(nir_builder *b, nir_instr *instr, struct lower_abi_s break; } case nir_intrinsic_load_hs_out_patch_data_offset_amd: - replacement = ac_nir_unpack_arg(b, &args->ac, args->tcs_offchip_layout, 11, 21); + replacement = ac_nir_unpack_arg(b, &args->ac, args->tcs_offchip_layout, 16, 16); break; case nir_intrinsic_load_ring_tess_offchip_offset_amd: replacement = ac_nir_load_arg(b, &args->ac, args->ac.tess_offchip_offset); diff --git a/src/gallium/drivers/radeonsi/si_shader_internal.h b/src/gallium/drivers/radeonsi/si_shader_internal.h index 3914c89..981e713 100644 --- a/src/gallium/drivers/radeonsi/si_shader_internal.h +++ b/src/gallium/drivers/radeonsi/si_shader_internal.h @@ -42,15 +42,20 @@ struct si_shader_args { * [0:5] = the number of patches per threadgroup - 1, max = 63 * # 5 bits * [6:10] = the number of output vertices per patch - 1, max = 31 - * # 21 bits - * [11:31] = the offset of per patch attributes in the buffer in bytes. - * max = NUM_PATCHES*32*32*16 = 1M + * # 16 bits + * [16:31] = the offset of per patch attributes in the buffer in bytes. + * 64 outputs are implied by SI_UNIQUE_SLOT_* values. + * max = 32(CPs) * 64(outputs) * 16(vec4) * 64(num_patches) = 2M, + * clamped to 32K(LDS limit) = 32K */ struct ac_arg tcs_offchip_layout; /* API TCS */ /* Offsets where TCS outputs and TCS patch outputs live in LDS (<= 16K): - * [16:31] = TCS output patch0 offset for per-patch / 4, max = 16K / 4 = 4K + * [16:31] = TCS output patch0 offset for per-patch / 4, + * 64 outputs are implied by SI_UNIQUE_SLOT_* values. + * max = 32(CPs) * 64(outputs) * 16(vec4) * 64(num_patches) * 2(inputs + outputs) / 4 + * = 1M, clamped to 32K(LDS limit) / 4 = 8K */ struct ac_arg tcs_out_lds_offsets; /* Layout of TCS outputs / TES inputs: diff --git a/src/gallium/drivers/radeonsi/si_shader_llvm_tess.c b/src/gallium/drivers/radeonsi/si_shader_llvm_tess.c index 2a26041..5be9647 100644 --- a/src/gallium/drivers/radeonsi/si_shader_llvm_tess.c +++ b/src/gallium/drivers/radeonsi/si_shader_llvm_tess.c @@ -134,7 +134,7 @@ static LLVMValueRef get_tcs_tes_buffer_address(struct si_shader_context *ctx, base_addr = LLVMBuildMul(ctx->ac.builder, base_addr, constant16, ""); if (!vertex_index) { - LLVMValueRef patch_data_offset = si_unpack_param(ctx, ctx->args->tcs_offchip_layout, 11, 21); + LLVMValueRef patch_data_offset = si_unpack_param(ctx, ctx->args->tcs_offchip_layout, 16, 16); base_addr = LLVMBuildAdd(ctx->ac.builder, base_addr, patch_data_offset, ""); } diff --git a/src/gallium/drivers/radeonsi/si_state_draw.cpp b/src/gallium/drivers/radeonsi/si_state_draw.cpp index 7660f3b..7f14cf4 100644 --- a/src/gallium/drivers/radeonsi/si_state_draw.cpp +++ b/src/gallium/drivers/radeonsi/si_state_draw.cpp @@ -790,7 +790,7 @@ void si_update_tess_io_layout_state(struct si_context *sctx) assert(num_tcs_input_cp <= 32); assert(num_tcs_output_cp <= 32); assert(num_patches <= 64); - assert(((pervertex_output_patch_size * num_patches) & ~0x1fffff) == 0); + assert(((pervertex_output_patch_size * num_patches) & ~0xffff) == 0); uint64_t ring_va = (unlikely(sctx->ws->cs_is_secure(&sctx->gfx_cs)) ? si_resource(sctx->tess_rings_tmz) : si_resource(sctx->tess_rings))->gpu_address; @@ -801,7 +801,7 @@ void si_update_tess_io_layout_state(struct si_context *sctx) sctx->tcs_out_offsets = ((perpatch_output_offset / 4) << 16); sctx->tcs_offchip_layout = (num_patches - 1) | ((num_tcs_output_cp - 1) << 6) | - ((pervertex_output_patch_size * num_patches) << 11); + ((pervertex_output_patch_size * num_patches) << 16); /* Compute the LDS size. */ unsigned lds_size = lds_per_patch * num_patches; -- 2.7.4