aco/gfx11: workaround VALUTransUseHazard
authorRhys Perry <pendingchaos02@gmail.com>
Thu, 13 Oct 2022 16:55:57 +0000 (17:55 +0100)
committerMarge Bot <emma+marge@anholt.net>
Wed, 19 Oct 2022 02:46:03 +0000 (02:46 +0000)
fossil-db (gfx1100):
Totals from 116990 (86.64% of 135032) affected shaders:
Instrs: 67942325 -> 69493991 (+2.28%)
CodeSize: 366448984 -> 372655648 (+1.69%)
Latency: 673236871 -> 673269808 (+0.00%); split: -0.00%, +0.00%
InvThroughput: 128266905 -> 128270175 (+0.00%); split: -0.00%, +0.00%

Signed-off-by: Rhys Perry <pendingchaos02@gmail.com>
Reviewed-by: Daniel Schürmann <daniel@schuermann.dev>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/18273>

src/amd/compiler/README-ISA.md
src/amd/compiler/aco_insert_NOPs.cpp
src/amd/compiler/tests/test_insert_nops.cpp

index ad85160..518946e 100644 (file)
@@ -311,3 +311,12 @@ LDSDIR instruction writing a VGPR after it's used by a VMEM/DS instruction.
 Mitigated by:
 Waiting for the VMEM/DS instruction to finish, a VALU or export instruction, or
 `s_waitcnt_depctr 0xffe3`.
+
+### VALUTransUseHazard
+
+Triggered by:
+A VALU instrction reading a VGPR written by a transcendental VALU instruction without 6+ VALU or 2+
+transcendental instructions in-between.
+
+Mitigated by:
+A va_vdst=0 wait: `s_waitcnt_deptr 0x0fff`
index d8f5405..bc956a0 100644 (file)
@@ -194,6 +194,91 @@ struct NOP_ctx_gfx10 {
    }
 };
 
+template <int Max> struct VGPRCounterMap {
+public:
+   int base = 0;
+   BITSET_DECLARE(resident, 256);
+   int val[256];
+
+   /* Initializes all counters to Max. */
+   VGPRCounterMap() { BITSET_ZERO(resident); }
+
+   /* Increase all counters, clamping at Max. */
+   void inc() { base++; }
+
+   /* Set counter to 0. */
+   void set(unsigned idx)
+   {
+      val[idx] = -base;
+      BITSET_SET(resident, idx);
+   }
+
+   void set(PhysReg reg, unsigned bytes)
+   {
+      if (reg.reg() < 256)
+         return;
+
+      for (unsigned i = 0; i < DIV_ROUND_UP(bytes, 4); i++)
+         set(reg.reg() - 256 + i);
+   }
+
+   /* Reset all counters to Max. */
+   void reset()
+   {
+      base = 0;
+      BITSET_ZERO(resident);
+   }
+
+   void reset(PhysReg reg, unsigned bytes)
+   {
+      if (reg.reg() < 256)
+         return;
+
+      for (unsigned i = 0; i < DIV_ROUND_UP(bytes, 4); i++)
+         BITSET_CLEAR(resident, reg.reg() - 256 + i);
+   }
+
+   uint8_t get(unsigned idx)
+   {
+      return BITSET_TEST(resident, idx) ? MIN2(val[idx] + base, Max) : Max;
+   }
+
+   uint8_t get(PhysReg reg, unsigned offset = 0)
+   {
+      assert(reg.reg() >= 256);
+      return get(reg.reg() - 256 + offset);
+   }
+
+   void join_min(const VGPRCounterMap& other)
+   {
+      unsigned i;
+      BITSET_FOREACH_SET(i, other.resident, 256)
+      {
+         if (BITSET_TEST(resident, i))
+            val[i] = MIN2(val[i] + base, other.val[i] + other.base) - base;
+         else
+            val[i] = other.val[i] + other.base - base;
+      }
+      BITSET_OR(resident, resident, other.resident);
+   }
+
+   bool operator==(const VGPRCounterMap& other) const
+   {
+      if (!BITSET_EQUAL(resident, other.resident))
+         return false;
+
+      unsigned i;
+      BITSET_FOREACH_SET(i, other.resident, 256)
+      {
+         if (!BITSET_TEST(resident, i))
+            return false;
+         if (val[i] + base != other.val[i] + other.base)
+            return false;
+      }
+      return true;
+   }
+};
+
 struct NOP_ctx_gfx11 {
    /* VcmpxPermlaneHazard */
    bool has_Vcmpx = false;
@@ -203,12 +288,18 @@ struct NOP_ctx_gfx11 {
    std::bitset<256> vgpr_used_by_vmem_store;
    std::bitset<256> vgpr_used_by_ds;
 
+   /* VALUTransUseHazard */
+   VGPRCounterMap<15> valu_since_wr_by_trans;
+   VGPRCounterMap<2> trans_since_wr_by_trans;
+
    void join(const NOP_ctx_gfx11& other)
    {
       has_Vcmpx |= other.has_Vcmpx;
       vgpr_used_by_vmem_load |= other.vgpr_used_by_vmem_load;
       vgpr_used_by_vmem_store |= other.vgpr_used_by_vmem_store;
       vgpr_used_by_ds |= other.vgpr_used_by_ds;
+      valu_since_wr_by_trans.join_min(other.valu_since_wr_by_trans);
+      trans_since_wr_by_trans.join_min(other.trans_since_wr_by_trans);
    }
 
    bool operator==(const NOP_ctx_gfx11& other)
@@ -216,7 +307,9 @@ struct NOP_ctx_gfx11 {
       return has_Vcmpx == other.has_Vcmpx &&
              vgpr_used_by_vmem_load == other.vgpr_used_by_vmem_load &&
              vgpr_used_by_vmem_store == other.vgpr_used_by_vmem_store &&
-             vgpr_used_by_ds == other.vgpr_used_by_ds;
+             vgpr_used_by_ds == other.vgpr_used_by_ds &&
+             valu_since_wr_by_trans == other.valu_since_wr_by_trans &&
+             trans_since_wr_by_trans == other.trans_since_wr_by_trans;
    }
 };
 
@@ -1010,6 +1103,51 @@ handle_instruction_gfx11(State& state, NOP_ctx_gfx11& ctx, aco_ptr<Instruction>&
       ldsdir->wait_vdst = MIN2(ldsdir->wait_vdst, count);
    }
 
+   /* VALUTransUseHazard
+    * VALU reads VGPR written by transcendental instruction without 6+ VALU or 2+ transcendental
+    * in-between.
+    */
+   unsigned va_vdst = 15;
+   if (instr->isVALU() || instr->isVINTERP_INREG()) {
+      uint8_t num_valu = 15;
+      uint8_t num_trans = 15;
+      for (Operand& op : instr->operands) {
+         if (op.physReg().reg() < 256)
+            continue;
+         for (unsigned i = 0; i < op.size(); i++) {
+            num_valu = std::min(num_valu, ctx.valu_since_wr_by_trans.get(op.physReg(), i));
+            num_trans = std::min(num_trans, ctx.trans_since_wr_by_trans.get(op.physReg(), i));
+         }
+      }
+      if (num_trans <= 1 && num_valu <= 5) {
+         bld.sopp(aco_opcode::s_waitcnt_depctr, -1, 0x0fff);
+         va_vdst = 0;
+      }
+   }
+
+   va_vdst = std::min(va_vdst, parse_vdst_wait(instr));
+   if (va_vdst == 0) {
+      ctx.valu_since_wr_by_trans.reset();
+      ctx.trans_since_wr_by_trans.reset();
+   }
+
+   if (instr->isVALU() || instr->isVINTERP_INREG()) {
+      instr_class cls = instr_info.classes[(int)instr->opcode];
+      bool is_trans = cls == instr_class::valu_transcendental32 ||
+                      cls == instr_class::valu_double_transcendental;
+
+      ctx.valu_since_wr_by_trans.inc();
+      if (is_trans)
+         ctx.trans_since_wr_by_trans.inc();
+
+      if (is_trans) {
+         for (Definition& def : instr->definitions) {
+            ctx.valu_since_wr_by_trans.set(def.physReg(), def.bytes());
+            ctx.trans_since_wr_by_trans.set(def.physReg(), def.bytes());
+         }
+      }
+   }
+
    /* LdsDirectVMEMHazard
     * Handle LDSDIR writing a VGPR after it's used by a VMEM/DS instruction.
     */
index 80ed4b7..41f8b62 100644 (file)
@@ -570,3 +570,87 @@ BEGIN_TEST(insert_nops.lds_direct_vmem)
 
    finish_insert_nops_test();
 END_TEST
+
+BEGIN_TEST(insert_nops.valu_trans_use)
+   if (!setup_cs(NULL, GFX11))
+      return;
+
+   //>> p_unit_test 0
+   //! v1: %0:v[0] = v_rcp_f32 %0:v[1]
+   //! s_waitcnt_depctr va_vdst(0)
+   //! v1: %0:v[1] = v_mov_b32 %0:v[0]
+   bld.pseudo(aco_opcode::p_unit_test, Operand::c32(0));
+   bld.vop1(aco_opcode::v_rcp_f32, Definition(PhysReg(256), v1), Operand(PhysReg(257), v1));
+   bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(257), v1), Operand(PhysReg(256), v1));
+
+   /* Sufficient VALU mitigates the hazard. */
+   //! p_unit_test 1
+   //! v1: %0:v[0] = v_rcp_f32 %0:v[1]
+   //; for i in range(4): insert_pattern('v_nop')
+   //! s_waitcnt_depctr va_vdst(0)
+   //! v1: %0:v[1] = v_mov_b32 %0:v[0]
+   bld.pseudo(aco_opcode::p_unit_test, Operand::c32(1));
+   bld.vop1(aco_opcode::v_rcp_f32, Definition(PhysReg(256), v1), Operand(PhysReg(257), v1));
+   for (unsigned i = 0; i < 4; i++)
+      bld.vop1(aco_opcode::v_nop);
+   bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(257), v1), Operand(PhysReg(256), v1));
+
+   //! p_unit_test 2
+   //! v1: %0:v[0] = v_rcp_f32 %0:v[1]
+   //; for i in range(8): insert_pattern('v_nop')
+   //! v1: %0:v[1] = v_mov_b32 %0:v[0]
+   bld.pseudo(aco_opcode::p_unit_test, Operand::c32(2));
+   bld.vop1(aco_opcode::v_rcp_f32, Definition(PhysReg(256), v1), Operand(PhysReg(257), v1));
+   for (unsigned i = 0; i < 8; i++)
+      bld.vop1(aco_opcode::v_nop);
+   bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(257), v1), Operand(PhysReg(256), v1));
+
+   /* Sufficient transcendental VALU mitigates the hazard. */
+   //! p_unit_test 3
+   //! v1: %0:v[0] = v_rcp_f32 %0:v[1]
+   //! v1: %0:v[2] = v_sqrt_f32 %0:v[3]
+   //! s_waitcnt_depctr va_vdst(0)
+   //! v1: %0:v[1] = v_mov_b32 %0:v[0]
+   bld.pseudo(aco_opcode::p_unit_test, Operand::c32(3));
+   bld.vop1(aco_opcode::v_rcp_f32, Definition(PhysReg(256), v1), Operand(PhysReg(257), v1));
+   bld.vop1(aco_opcode::v_sqrt_f32, Definition(PhysReg(258), v1), Operand(PhysReg(259), v1));
+   bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(257), v1), Operand(PhysReg(256), v1));
+
+   //! p_unit_test 4
+   //! v1: %0:v[0] = v_rcp_f32 %0:v[1]
+   //! v1: %0:v[2] = v_sqrt_f32 %0:v[3]
+   //! v1: %0:v[2] = v_sqrt_f32 %0:v[3]
+   //! v1: %0:v[1] = v_mov_b32 %0:v[0]
+   bld.pseudo(aco_opcode::p_unit_test, Operand::c32(4));
+   bld.vop1(aco_opcode::v_rcp_f32, Definition(PhysReg(256), v1), Operand(PhysReg(257), v1));
+   for (unsigned i = 0; i < 2; i++)
+      bld.vop1(aco_opcode::v_sqrt_f32, Definition(PhysReg(258), v1), Operand(PhysReg(259), v1));
+   bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(257), v1), Operand(PhysReg(256), v1));
+
+   /* Transcendental VALU should be counted towards VALU */
+   //! p_unit_test 5
+   //! v1: %0:v[0] = v_rcp_f32 %0:v[1]
+   //; for i in range(5): insert_pattern('v_nop')
+   //! v1: %0:v[2] = v_sqrt_f32 %0:v[3]
+   //! v1: %0:v[1] = v_mov_b32 %0:v[0]
+   bld.pseudo(aco_opcode::p_unit_test, Operand::c32(5));
+   bld.vop1(aco_opcode::v_rcp_f32, Definition(PhysReg(256), v1), Operand(PhysReg(257), v1));
+   for (unsigned i = 0; i < 5; i++)
+      bld.vop1(aco_opcode::v_nop);
+   bld.vop1(aco_opcode::v_sqrt_f32, Definition(PhysReg(258), v1), Operand(PhysReg(259), v1));
+   bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(257), v1), Operand(PhysReg(256), v1));
+
+   /* non-VALU does not mitigate the hazard. */
+   //! p_unit_test 6
+   //! v1: %0:v[0] = v_rcp_f32 %0:v[1]
+   //; for i in range(8): insert_pattern('s_nop')
+   //! s_waitcnt_depctr va_vdst(0)
+   //! v1: %0:v[1] = v_mov_b32 %0:v[0]
+   bld.pseudo(aco_opcode::p_unit_test, Operand::c32(6));
+   bld.vop1(aco_opcode::v_rcp_f32, Definition(PhysReg(256), v1), Operand(PhysReg(257), v1));
+   for (unsigned i = 0; i < 8; i++)
+      bld.sopp(aco_opcode::s_nop, -1, 0);
+   bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(257), v1), Operand(PhysReg(256), v1));
+
+   finish_insert_nops_test();
+END_TEST