From 2930317cea53843b4f3f2b25f11fba5ba82fda16 Mon Sep 17 00:00:00 2001 From: Rhys Perry Date: Thu, 21 Jul 2022 15:54:26 +0100 Subject: [PATCH] aco/gfx11: deallocate VGPRs at the end of the shader MIME-Version: 1.0 Content-Type: text/plain; charset=utf8 Content-Transfer-Encoding: 8bit fossil-db (gfx1100): Totals from 65987 (40.81% of 161689) affected shaders: Instrs: 57123207 -> 57199947 (+0.13%) CodeSize: 308402500 -> 308709460 (+0.10%) Latency: 680527139 -> 680527160 (+0.00%) InvThroughput: 131620026 -> 131620045 (+0.00%) Signed-off-by: Rhys Perry Reviewed-by: Daniel Schürmann Part-of: --- src/amd/compiler/aco_ir.cpp | 24 ++++++++++++++++++++++++ src/amd/compiler/aco_ir.h | 1 + src/amd/compiler/aco_lower_to_hw_instr.cpp | 4 ++++ 3 files changed, 29 insertions(+) diff --git a/src/amd/compiler/aco_ir.cpp b/src/amd/compiler/aco_ir.cpp index 08c7372..0b0b92a 100644 --- a/src/amd/compiler/aco_ir.cpp +++ b/src/amd/compiler/aco_ir.cpp @@ -943,4 +943,28 @@ should_form_clause(const Instruction* a, const Instruction* b) return false; } /* dealloc_vgprs: on GFX11 and newer, when the VGPR demand check below passes, emits s_sendmsg with sendmsg_dealloc_vgprs at the s_endpgm that terminates the last block of the program. Returns false if program->gfx_level is below GFX11, or if max_reg_demand.vgpr does not exceed get_addr_vgpr_from_waves() for the wave count produced by max_suitable_waves(). Returns true otherwise, including the case where the last block is empty or does not end in s_endpgm and nothing was emitted. */ +bool +dealloc_vgprs(Program* program) +{ + if (program->gfx_level < GFX11) + return false; + + /* skip if deallocating VGPRs won't increase occupancy */ + uint16_t max_waves = program->dev.max_wave64_per_simd * (64 / program->wave_size); /* device wave64 limit scaled by 64 / wave_size; refined by max_suitable_waves() on the next line */ + max_waves = max_suitable_waves(program, max_waves); + if (program->max_reg_demand.vgpr <= get_addr_vgpr_from_waves(program, max_waves)) + return false; + + Block& block = program->blocks.back(); /* only the last block of the program is examined */ + + /* don't bother checking if there is a pending VMEM store or export: there almost always is */ + Builder bld(program); + if (!block.instructions.empty() && block.instructions.back()->opcode == aco_opcode::s_endpgm) { + bld.reset(&block.instructions, block.instructions.begin() + (block.instructions.size() - 1)); /* begin() + (size() - 1) is the final instruction, i.e. the s_endpgm just matched; presumably the builder then inserts in front of it - confirm against Builder */ + bld.sopp(aco_opcode::s_sendmsg, -1, sendmsg_dealloc_vgprs); + } /* reached on every path that passed both early-outs, whether or not the s_sendmsg was inserted */ + + return true; +} + } // namespace aco diff --git a/src/amd/compiler/aco_ir.h b/src/amd/compiler/aco_ir.h index fbdea65..6483c2d 100644
--- a/src/amd/compiler/aco_ir.h +++ b/src/amd/compiler/aco_ir.h @@ -2289,6 +2289,7 @@ void lower_to_hw_instr(Program* program); void schedule_program(Program* program, live& live_vars); void spill(Program* program, live& live_vars); void insert_wait_states(Program* program); +bool dealloc_vgprs(Program* program); /* new declaration; the definition is the function this patch adds to aco_ir.cpp */ void insert_NOPs(Program* program); void form_hard_clauses(Program* program); unsigned emit_program(Program* program, std::vector& code); /* NOTE(review): std::vector carries no template argument here; it looks like an angle-bracketed part was lost when this patch text was extracted - confirm against the real header */ diff --git a/src/amd/compiler/aco_lower_to_hw_instr.cpp b/src/amd/compiler/aco_lower_to_hw_instr.cpp index 409f9a1..995be14 100644 --- a/src/amd/compiler/aco_lower_to_hw_instr.cpp +++ b/src/amd/compiler/aco_lower_to_hw_instr.cpp @@ -2004,6 +2004,8 @@ lower_to_hw_instr(Program* program) { Block* discard_block = NULL; + bool should_dealloc_vgprs = dealloc_vgprs(program); /* evaluated once, ahead of the reverse walk over program->blocks; the result is reused for the discard block further down */ + for (int block_idx = program->blocks.size() - 1; block_idx >= 0; block_idx--) { Block* block = &program->blocks[block_idx]; lower_context ctx; @@ -2126,6 +2128,8 @@ lower_to_hw_instr(Program* program) block = &program->blocks[block_idx]; bld.reset(discard_block); + if (should_dealloc_vgprs) + bld.sopp(aco_opcode::s_sendmsg, -1, sendmsg_dealloc_vgprs); /* same s_sendmsg as in dealloc_vgprs, issued through the builder that was just reset to discard_block and ahead of the bld.exp call that follows */ bld.exp(aco_opcode::exp, Operand(v1), Operand(v1), Operand(v1), Operand(v1), 0, program->gfx_level >= GFX11 ? V_008DFC_SQ_EXP_MRT : V_008DFC_SQ_EXP_NULL, false, true, true); -- 2.7.4