aco/gfx11: deallocate VGPRs at the end of the shader

author Rhys Perry <pendingchaos02@gmail.com>

Thu, 21 Jul 2022 14:54:26 +0000 (15:54 +0100)

committer Marge Bot <emma+marge@anholt.net>

Fri, 30 Sep 2022 20:57:02 +0000 (20:57 +0000)
author Rhys Perry <pendingchaos02@gmail.com>
Thu, 21 Jul 2022 14:54:26 +0000 (15:54 +0100)
committer Marge Bot <emma+marge@anholt.net>
Fri, 30 Sep 2022 20:57:02 +0000 (20:57 +0000)
diff --git a/src/amd/compiler/aco_ir.cpp b/src/amd/compiler/aco_ir.cpp

index 08c7372..0b0b92a 100644 (file)
--- a/src/amd/compiler/aco_ir.cpp
+++ b/src/amd/compiler/aco_ir.cpp
@@ -943,4 +943,28 @@ should_form_clause(const Instruction* a, const Instruction* b)
     return false;
  }
  
+bool
+dealloc_vgprs(Program* program)
+{
+   if (program->gfx_level < GFX11)
+      return false;
+   /* A false return tells the caller not to deallocate; nothing has been inserted yet. */
+   /* skip if deallocating VGPRs won't increase occupancy */
+   uint16_t max_waves = program->dev.max_wave64_per_simd * (64 / program->wave_size);
+   max_waves = max_suitable_waves(program, max_waves);
+   if (program->max_reg_demand.vgpr <= get_addr_vgpr_from_waves(program, max_waves))
+      return false;
+   /* Only the last block of the program is patched here. */
+   Block& block = program->blocks.back();
+   /* NOTE(review): begin() + (size - 1) is the s_endpgm slot; presumably sopp() inserts before it. */
+   /* don't bother checking if there is a pending VMEM store or export: there almost always is */
+   Builder bld(program);
+   if (!block.instructions.empty() && block.instructions.back()->opcode == aco_opcode::s_endpgm) {
+      bld.reset(&block.instructions, block.instructions.begin() + (block.instructions.size() - 1));
+      bld.sopp(aco_opcode::s_sendmsg, -1, sendmsg_dealloc_vgprs);
+   }
+   /* True is returned even when the last block has no trailing s_endpgm and nothing was added. */
+   return true;
+}
+
  } // namespace aco
diff --git a/src/amd/compiler/aco_ir.h b/src/amd/compiler/aco_ir.h

index fbdea65..6483c2d 100644 (file)
--- a/src/amd/compiler/aco_ir.h
+++ b/src/amd/compiler/aco_ir.h
@@ -2289,6 +2289,7 @@ void lower_to_hw_instr(Program* program);
  void schedule_program(Program* program, live& live_vars);
  void spill(Program* program, live& live_vars);
  void insert_wait_states(Program* program);
+bool dealloc_vgprs(Program* program);
  void insert_NOPs(Program* program);
  void form_hard_clauses(Program* program);
  unsigned emit_program(Program* program, std::vector<uint32_t>& code);
diff --git a/src/amd/compiler/aco_lower_to_hw_instr.cpp b/src/amd/compiler/aco_lower_to_hw_instr.cpp

index 409f9a1..995be14 100644 (file)
--- a/src/amd/compiler/aco_lower_to_hw_instr.cpp
+++ b/src/amd/compiler/aco_lower_to_hw_instr.cpp
@@ -2004,6 +2004,8 @@ lower_to_hw_instr(Program* program)
  {
     Block* discard_block = NULL;
  
+   bool should_dealloc_vgprs = dealloc_vgprs(program);
+
     for (int block_idx = program->blocks.size() - 1; block_idx >= 0; block_idx--) {
        Block* block = &program->blocks[block_idx];
        lower_context ctx;
@@ -2126,6 +2128,8 @@ lower_to_hw_instr(Program* program)
                    block = &program->blocks[block_idx];
  
                    bld.reset(discard_block);
+                  if (should_dealloc_vgprs)
+                     bld.sopp(aco_opcode::s_sendmsg, -1, sendmsg_dealloc_vgprs);
                    bld.exp(aco_opcode::exp, Operand(v1), Operand(v1), Operand(v1), Operand(v1), 0,
                            program->gfx_level >= GFX11 ? V_008DFC_SQ_EXP_MRT : V_008DFC_SQ_EXP_NULL,
                            false, true, true);
author	Rhys Perry <pendingchaos02@gmail.com>
	Thu, 21 Jul 2022 14:54:26 +0000 (15:54 +0100)
committer	Marge Bot <emma+marge@anholt.net>
	Fri, 30 Sep 2022 20:57:02 +0000 (20:57 +0000)
src/amd/compiler/aco_ir.cpp		patch | blob | history
src/amd/compiler/aco_ir.h		patch | blob | history
src/amd/compiler/aco_lower_to_hw_instr.cpp		patch | blob | history