aco/gfx11: workaround LdsDirectVALUHazard
authorRhys Perry <pendingchaos02@gmail.com>
Thu, 25 Aug 2022 11:16:39 +0000 (12:16 +0100)
committerMarge Bot <emma+marge@anholt.net>
Wed, 19 Oct 2022 02:46:03 +0000 (02:46 +0000)
fossil-db (gfx1100):
Totals from 57858 (42.85% of 135032) affected shaders:

Signed-off-by: Rhys Perry <pendingchaos02@gmail.com>
Reviewed-by: Daniel Schürmann <daniel@schuermann.dev>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/18273>

src/amd/compiler/README-ISA.md
src/amd/compiler/aco_insert_NOPs.cpp
src/amd/compiler/aco_opcodes.py
src/amd/compiler/tests/test_assembler.cpp
src/amd/compiler/tests/test_insert_nops.cpp

index 040d281..48f0924 100644 (file)
@@ -294,3 +294,11 @@ stability issues: https://reviews.llvm.org/D103348
 ### VcmpxPermlaneHazard
 
 Same as GFX10.
+
+### LdsDirectVALUHazard
+
+Triggered by:
+LDSDIR instruction writing a VGPR soon after it's used by a VALU instruction.
+
+Mitigated by:
+A vdst wait, preferably using the LDSDIR instruction's own wait_vdst field.
index 0230805..66a3590 100644 (file)
@@ -27,6 +27,7 @@
 
 #include <algorithm>
 #include <bitset>
+#include <set>
 #include <stack>
 #include <vector>
 
@@ -865,6 +866,96 @@ handle_instruction_gfx10(State& state, NOP_ctx_gfx10& ctx, aco_ptr<Instruction>&
    }
 }
 
+/* GFX11 */
+unsigned /* Returns the vdst wait count attributed to instr. */
+parse_vdst_wait(aco_ptr<Instruction>& instr)
+{
+   if (instr->isVMEM() || instr->isFlatLike() || instr->isDS() || instr->isEXP())
+      return 0; /* these instruction kinds are reported as a vdst wait of 0 */
+   else if (instr->isLDSDIR())
+      return instr->ldsdir().wait_vdst; /* the LDSDIR instruction's own wait_vdst field */
+   else if (instr->opcode == aco_opcode::s_waitcnt_depctr)
+      return (instr->sopp().imm >> 12) & 0xf; /* 4-bit count held in bits 15:12 of the immediate */
+   else
+      return 15; /* anything else: same value as the default wait_vdst */
+}
+
+struct LdsDirectVALUHazardGlobalState { /* state shared by the whole backwards search */
+   unsigned wait_vdst = 15; /* lowest wait found so far; returned as the search result */
+   PhysReg vgpr; /* VGPR written by the LDSDIR instruction being checked */
+   std::set<unsigned> loop_headers_visited; /* indices of loop header blocks already entered */
+};
+
+struct LdsDirectVALUHazardBlockState { /* second state object handed to search_backwards */
+   unsigned num_valu = 0; /* VALU/VINTERP instructions counted so far that do not touch the VGPR */
+   bool has_trans = false; /* set once a transcendental VALU has been seen */
+};
+
+bool /* Instruction callback passed to search_backwards for LdsDirectVALUHazard. */
+handle_lds_direct_valu_hazard_instr(LdsDirectVALUHazardGlobalState& global_state,
+                                    LdsDirectVALUHazardBlockState& block_state,
+                                    aco_ptr<Instruction>& instr)
+{
+   if (instr->isVALU() || instr->isVINTERP_INREG()) {
+      instr_class cls = instr_info.classes[(int)instr->opcode]; /* used to spot transcendentals */
+      block_state.has_trans |= cls == instr_class::valu_transcendental32 ||
+                               cls == instr_class::valu_double_transcendental;
+
+      bool uses_vgpr = false; /* does instr read or write the LDSDIR destination VGPR? */
+      for (Definition& def : instr->definitions)
+         uses_vgpr |= regs_intersect(def.physReg(), def.size(), global_state.vgpr, 1);
+      for (Operand& op : instr->operands) {
+         uses_vgpr |=
+            !op.isConstant() && regs_intersect(op.physReg(), op.size(), global_state.vgpr, 1);
+      }
+      if (uses_vgpr) {
+         /* Transcendentals execute in parallel to other VALU and va_vdst count becomes unusable */
+         global_state.wait_vdst =
+            MIN2(global_state.wait_vdst, block_state.has_trans ? 0 : block_state.num_valu);
+         return true; /* hazard found; wait_vdst has been lowered if needed */
+      }
+
+      block_state.num_valu++; /* only reached by VALU that do not touch the VGPR */
+   }
+
+   if (parse_vdst_wait(instr) == 0)
+      return true; /* instr itself carries a vdst wait of 0 */
+
+   return block_state.num_valu >= global_state.wait_vdst; /* true once the count reaches wait_vdst */
+}
+
+bool /* Block callback passed to search_backwards: false for a repeated loop header, else true. */
+handle_lds_direct_valu_hazard_block(LdsDirectVALUHazardGlobalState& global_state,
+                                    LdsDirectVALUHazardBlockState& block_state, Block* block)
+{
+   if (block->kind & block_kind_loop_header) {
+      if (global_state.loop_headers_visited.count(block->index))
+         return false; /* seen before; presumably keeps the search from cycling in a loop - confirm */
+      global_state.loop_headers_visited.insert(block->index); /* record the first visit */
+   }
+
+   return true;
+}
+
+unsigned /* Returns a wait_vdst for the LDSDIR instr, never larger than its current one. */
+handle_lds_direct_valu_hazard(State& state, aco_ptr<Instruction>& instr)
+{
+   /* LdsDirectVALUHazard
+    * Handle LDSDIR writing a VGPR after it's used by a VALU instruction.
+    */
+   if (instr->ldsdir().wait_vdst == 0)
+      return 0; /* early exit */
+
+   LdsDirectVALUHazardGlobalState global_state;
+   global_state.wait_vdst = instr->ldsdir().wait_vdst; /* starting bound; only MIN2 is applied to it */
+   global_state.vgpr = instr->definitions[0].physReg(); /* VGPR the LDSDIR writes */
+   LdsDirectVALUHazardBlockState block_state;
+   search_backwards<LdsDirectVALUHazardGlobalState, LdsDirectVALUHazardBlockState,
+                    &handle_lds_direct_valu_hazard_block, &handle_lds_direct_valu_hazard_instr>(
+      state, global_state, block_state);
+   return global_state.wait_vdst;
+}
+
 void
 handle_instruction_gfx11(State& state, NOP_ctx_gfx11& ctx, aco_ptr<Instruction>& instr,
                          std::vector<aco_ptr<Instruction>>& new_instructions)
@@ -886,6 +977,12 @@ handle_instruction_gfx11(State& state, NOP_ctx_gfx11& ctx, aco_ptr<Instruction>&
    } else if (instr->isVALU() && instr->opcode != aco_opcode::v_nop) {
       ctx.has_Vcmpx = false;
    }
+
+   if (instr->isLDSDIR()) {
+      unsigned count = handle_lds_direct_valu_hazard(state, instr);
+      LDSDIR_instruction* ldsdir = &instr->ldsdir();
+      ldsdir->wait_vdst = MIN2(ldsdir->wait_vdst, count);
+   }
 }
 
 template <typename Ctx>
index 098733c..52a52af 100644 (file)
@@ -99,7 +99,7 @@ class Format(Enum):
          return [('uint8_t', 'attr', 0),
                  ('uint8_t', 'attr_chan', 0),
                  ('memory_sync_info', 'sync', 'memory_sync_info()'),
-                 ('uint8_t', 'wait_vdst', 0)]
+                 ('uint8_t', 'wait_vdst', 15)]
       elif self == Format.MTBUF:
          return [('unsigned', 'dfmt', None),
                  ('unsigned', 'nfmt', None),
index ad1f5b4..eee7f65 100644 (file)
@@ -791,19 +791,19 @@ BEGIN_TEST(assembler.gfx11.ldsdir)
    bld.ldsdir(aco_opcode::lds_direct_load, dst, op).instr->ldsdir().wait_vdst = 6;
 
    //! lds_direct_load v42                                         ; ce10002a
-   bld.ldsdir(aco_opcode::lds_direct_load, dst, op);
+   bld.ldsdir(aco_opcode::lds_direct_load, dst, op).instr->ldsdir().wait_vdst = 0;
 
    //! lds_param_load v42, attr56.x wait_vdst:8                    ; ce08e02a
    bld.ldsdir(aco_opcode::lds_param_load, dst, op, 56, 0).instr->ldsdir().wait_vdst = 8;
 
    //! lds_param_load v42, attr56.x                                ; ce00e02a
-   bld.ldsdir(aco_opcode::lds_param_load, dst, op, 56, 0);
+   bld.ldsdir(aco_opcode::lds_param_load, dst, op, 56, 0).instr->ldsdir().wait_vdst = 0;
 
    //! lds_param_load v42, attr34.y                                ; ce00892a
-   bld.ldsdir(aco_opcode::lds_param_load, dst, op, 34, 1);
+   bld.ldsdir(aco_opcode::lds_param_load, dst, op, 34, 1).instr->ldsdir().wait_vdst = 0;
 
    //! lds_param_load v42, attr12.z                                ; ce00322a
-   bld.ldsdir(aco_opcode::lds_param_load, dst, op, 12, 2);
+   bld.ldsdir(aco_opcode::lds_param_load, dst, op, 12, 2).instr->ldsdir().wait_vdst = 0;
 
    finish_assembler_test();
 END_TEST
index de8d8de..8bec022 100644 (file)
@@ -306,3 +306,129 @@ BEGIN_TEST(insert_nops.vmem_to_scalar_write)
 
    finish_insert_nops_test();
 END_TEST
+
+BEGIN_TEST(insert_nops.lds_direct_valu)
+   if (!setup_cs(NULL, GFX11))
+      return;
+
+   /* WaW */
+   //>> p_unit_test 0
+   //! v1: %0:v[0] = v_mov_b32 0
+   //! v1: %0:v[0] = lds_direct_load %0:m0 wait_vdst:0
+   bld.pseudo(aco_opcode::p_unit_test, Operand::c32(0));
+   bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(256), v1), Operand::zero());
+   bld.ldsdir(aco_opcode::lds_direct_load, Definition(PhysReg(256), v1), Operand(m0, s1));
+
+   /* WaR */
+   //! p_unit_test 1
+   //! v1: %0:v[1] = v_mov_b32 %0:v[0]
+   //! v1: %0:v[0] = lds_direct_load %0:m0 wait_vdst:0
+   bld.pseudo(aco_opcode::p_unit_test, Operand::c32(1));
+   bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(257), v1), Operand(PhysReg(256), v1));
+   bld.ldsdir(aco_opcode::lds_direct_load, Definition(PhysReg(256), v1), Operand(m0, s1));
+
+   /* No hazard. */
+   //! p_unit_test 2
+   //! v1: %0:v[1] = v_mov_b32 0
+   //! v1: %0:v[0] = lds_direct_load %0:m0
+   bld.pseudo(aco_opcode::p_unit_test, Operand::c32(2));
+   bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(257), v1), Operand::zero());
+   bld.ldsdir(aco_opcode::lds_direct_load, Definition(PhysReg(256), v1), Operand(m0, s1));
+
+   /* multiple hazards, nearest should be considered */
+   //! p_unit_test 3
+   //! v1: %0:v[1] = v_mov_b32 %0:v[0]
+   //! v1: %0:v[0] = v_mov_b32 0
+   //! v1: %0:v[0] = lds_direct_load %0:m0 wait_vdst:0
+   bld.pseudo(aco_opcode::p_unit_test, Operand::c32(3));
+   bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(257), v1), Operand(PhysReg(256), v1));
+   bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(256), v1), Operand::zero());
+   bld.ldsdir(aco_opcode::lds_direct_load, Definition(PhysReg(256), v1), Operand(m0, s1));
+
+   /* independent VALU increase wait_vdst */
+   //! p_unit_test 4
+   //! v1: %0:v[0] = v_mov_b32 0
+   //! v_nop
+   //! v1: %0:v[0] = lds_direct_load %0:m0 wait_vdst:1
+   bld.pseudo(aco_opcode::p_unit_test, Operand::c32(4));
+   bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(256), v1), Operand::zero());
+   bld.vop1(aco_opcode::v_nop);
+   bld.ldsdir(aco_opcode::lds_direct_load, Definition(PhysReg(256), v1), Operand(m0, s1));
+
+   //! p_unit_test 5
+   //! v1: %0:v[0] = v_mov_b32 0
+   //; for i in range(10): insert_pattern('v_nop')
+   //! v1: %0:v[0] = lds_direct_load %0:m0 wait_vdst:10
+   bld.pseudo(aco_opcode::p_unit_test, Operand::c32(5)); /* 10 independent VALU -> wait_vdst:10 */
+   bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(256), v1), Operand::zero());
+   for (unsigned i = 0; i < 10; i++)
+      bld.vop1(aco_opcode::v_nop);
+   bld.ldsdir(aco_opcode::lds_direct_load, Definition(PhysReg(256), v1), Operand(m0, s1));
+
+   //! p_unit_test 6
+   //! v1: %0:v[0] = v_mov_b32 0
+   //; for i in range(20): insert_pattern('v_nop')
+   //! v1: %0:v[0] = lds_direct_load %0:m0
+   bld.pseudo(aco_opcode::p_unit_test, Operand::c32(6)); /* 20 independent VALU: wait_vdst stays at its default */
+   bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(256), v1), Operand::zero());
+   for (unsigned i = 0; i < 20; i++)
+      bld.vop1(aco_opcode::v_nop);
+   bld.ldsdir(aco_opcode::lds_direct_load, Definition(PhysReg(256), v1), Operand(m0, s1));
+
+   /* transcendental requires wait_vdst=0 */
+   //! p_unit_test 7
+   //! v1: %0:v[0] = v_mov_b32 0
+   //! v_nop
+   //! v1: %0:v[1] = v_sqrt_f32 %0:v[1]
+   //! v1: %0:v[0] = lds_direct_load %0:m0 wait_vdst:0
+   bld.pseudo(aco_opcode::p_unit_test, Operand::c32(7));
+   bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(256), v1), Operand::zero());
+   bld.vop1(aco_opcode::v_nop);
+   bld.vop1(aco_opcode::v_sqrt_f32, Definition(PhysReg(257), v1), Operand(PhysReg(257), v1));
+   bld.ldsdir(aco_opcode::lds_direct_load, Definition(PhysReg(256), v1), Operand(m0, s1));
+
+   //! p_unit_test 8
+   //! v1: %0:v[0] = v_sqrt_f32 %0:v[0]
+   //! v_nop
+   //! v1: %0:v[0] = lds_direct_load %0:m0 wait_vdst:0
+   bld.pseudo(aco_opcode::p_unit_test, Operand::c32(8)); /* the hazardous VALU is itself transcendental */
+   bld.vop1(aco_opcode::v_sqrt_f32, Definition(PhysReg(256), v1), Operand(PhysReg(256), v1));
+   bld.vop1(aco_opcode::v_nop);
+   bld.ldsdir(aco_opcode::lds_direct_load, Definition(PhysReg(256), v1), Operand(m0, s1));
+
+   /* transcendental is fine if it's before the instruction */
+   //! p_unit_test 9
+   //! v1: %0:v[1] = v_sqrt_f32 %0:v[1]
+   //! v1: %0:v[0] = v_mov_b32 0
+   //! v_nop
+   //! v1: %0:v[0] = lds_direct_load %0:m0 wait_vdst:1
+   bld.pseudo(aco_opcode::p_unit_test, Operand::c32(9));
+   bld.vop1(aco_opcode::v_sqrt_f32, Definition(PhysReg(257), v1), Operand(PhysReg(257), v1));
+   bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(256), v1), Operand::zero());
+   bld.vop1(aco_opcode::v_nop);
+   bld.ldsdir(aco_opcode::lds_direct_load, Definition(PhysReg(256), v1), Operand(m0, s1));
+
+   /* non-VALU does not increase wait_vdst */
+   //! p_unit_test 10
+   //! v1: %0:v[0] = v_mov_b32 0
+   //! s1: %0:m0 = s_mov_b32 0
+   //! v1: %0:v[0] = lds_direct_load %0:m0 wait_vdst:0
+   bld.pseudo(aco_opcode::p_unit_test, Operand::c32(10));
+   bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(256), v1), Operand::zero());
+   bld.sop1(aco_opcode::s_mov_b32, Definition(m0, s1), Operand::zero());
+   bld.ldsdir(aco_opcode::lds_direct_load, Definition(PhysReg(256), v1), Operand(m0, s1));
+
+   /* consider instructions which wait on vdst */
+   //! p_unit_test 11
+   //! v1: %0:v[0] = v_mov_b32 0
+   //! v_nop
+   //! s_waitcnt_depctr va_vdst(0)
+   //! v1: %0:v[0] = lds_direct_load %0:m0
+   bld.pseudo(aco_opcode::p_unit_test, Operand::c32(11));
+   bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(256), v1), Operand::zero());
+   bld.vop1(aco_opcode::v_nop);
+   bld.sopp(aco_opcode::s_waitcnt_depctr, -1, 0x0fff);
+   bld.ldsdir(aco_opcode::lds_direct_load, Definition(PhysReg(256), v1), Operand(m0, s1));
+
+   finish_insert_nops_test();
+END_TEST