aco/gfx11: deallocate VGPRs at the end of the shader
author Rhys Perry <pendingchaos02@gmail.com>
Thu, 21 Jul 2022 14:54:26 +0000 (15:54 +0100)
committer Marge Bot <emma+marge@anholt.net>
Fri, 30 Sep 2022 20:57:02 +0000 (20:57 +0000)
fossil-db (gfx1100):
Totals from 65987 (40.81% of 161689) affected shaders:
Instrs: 57123207 -> 57199947 (+0.13%)
CodeSize: 308402500 -> 308709460 (+0.10%)
Latency: 680527139 -> 680527160 (+0.00%)
InvThroughput: 131620026 -> 131620045 (+0.00%)

Signed-off-by: Rhys Perry <pendingchaos02@gmail.com>
Reviewed-by: Daniel Schürmann <daniel@schuermann.dev>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/17710>

src/amd/compiler/aco_ir.cpp
src/amd/compiler/aco_ir.h
src/amd/compiler/aco_lower_to_hw_instr.cpp

index 08c7372..0b0b92a 100644 (file)
@@ -943,4 +943,28 @@ should_form_clause(const Instruction* a, const Instruction* b)
    return false;
 }
 
+bool
+dealloc_vgprs(Program* program)
+{
+   /* Returns whether VGPR deallocation is wanted, and emits it before a final s_endpgm. */
+   if (program->gfx_level < GFX11)
+      return false;
+
+   /* No point deallocating VGPRs when doing so can't raise occupancy. */
+   uint16_t wave_limit = program->dev.max_wave64_per_simd * (64 / program->wave_size);
+   wave_limit = max_suitable_waves(program, wave_limit);
+   if (program->max_reg_demand.vgpr <= get_addr_vgpr_from_waves(program, wave_limit))
+      return false;
+
+   /* A VMEM store or export is almost always pending, so don't bother checking for one. */
+   auto& instrs = program->blocks.back().instructions;
+   Builder bld(program);
+   if (!instrs.empty() && instrs.back()->opcode == aco_opcode::s_endpgm) {
+      bld.reset(&instrs, instrs.end() - 1); /* position of the trailing s_endpgm */
+      bld.sopp(aco_opcode::s_sendmsg, -1, sendmsg_dealloc_vgprs);
+   }
+
+   return true;
+}
+
 } // namespace aco
index fbdea65..6483c2d 100644 (file)
@@ -2289,6 +2289,7 @@ void lower_to_hw_instr(Program* program);
 void schedule_program(Program* program, live& live_vars);
 void spill(Program* program, live& live_vars);
 void insert_wait_states(Program* program);
+bool dealloc_vgprs(Program* program);
 void insert_NOPs(Program* program);
 void form_hard_clauses(Program* program);
 unsigned emit_program(Program* program, std::vector<uint32_t>& code);
index 409f9a1..995be14 100644 (file)
@@ -2004,6 +2004,8 @@ lower_to_hw_instr(Program* program)
 {
    Block* discard_block = NULL;
 
+   bool should_dealloc_vgprs = dealloc_vgprs(program);
+
    for (int block_idx = program->blocks.size() - 1; block_idx >= 0; block_idx--) {
       Block* block = &program->blocks[block_idx];
       lower_context ctx;
@@ -2126,6 +2128,8 @@ lower_to_hw_instr(Program* program)
                   block = &program->blocks[block_idx];
 
                   bld.reset(discard_block);
+                  if (should_dealloc_vgprs)
+                     bld.sopp(aco_opcode::s_sendmsg, -1, sendmsg_dealloc_vgprs);
                   bld.exp(aco_opcode::exp, Operand(v1), Operand(v1), Operand(v1), Operand(v1), 0,
                           program->gfx_level >= GFX11 ? V_008DFC_SQ_EXP_MRT : V_008DFC_SQ_EXP_NULL,
                           false, true, true);