aco/gfx11: workaround VALUMaskWriteHazard
authorRhys Perry <pendingchaos02@gmail.com>
Tue, 27 Sep 2022 14:37:11 +0000 (15:37 +0100)
committerMarge Bot <emma+marge@anholt.net>
Wed, 19 Oct 2022 02:46:03 +0000 (02:46 +0000)
fossil-db (gfx1100):
Totals from 62812 (46.52% of 135032) affected shaders:
Instrs: 43971580 -> 44069887 (+0.22%)
CodeSize: 233473420 -> 233866648 (+0.17%)
Latency: 463487489 -> 463520688 (+0.01%); split: -0.00%, +0.01%
InvThroughput: 86505748 -> 86509679 (+0.00%); split: -0.00%, +0.00%

Signed-off-by: Rhys Perry <pendingchaos02@gmail.com>
Reviewed-by: Daniel Schürmann <daniel@schuermann.dev>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/18273>

src/amd/compiler/README-ISA.md
src/amd/compiler/aco_insert_NOPs.cpp
src/amd/compiler/tests/test_insert_nops.cpp

index d59ca9f..f0cdbf8 100644 (file)
@@ -330,3 +330,11 @@ VALU between the second VGPR write and the current instruction.
 
 Mitigated by:
 A va_vdst=0 wait: `s_waitcnt_deptr 0x0fff`
+
+### VALUMaskWriteHazard
+
+Triggered by:
+SALU writing then reading a SGPR that was previously used as a lane mask for a VALU.
+
+Mitigated by:
+A VALU instruction reading a SGPR or with literal, or a sa_sdst=0 wait: `s_waitcnt_depctr 0xfffe`
index f4ce028..2c460e5 100644 (file)
@@ -292,6 +292,10 @@ struct NOP_ctx_gfx11 {
    VGPRCounterMap<15> valu_since_wr_by_trans;
    VGPRCounterMap<2> trans_since_wr_by_trans;
 
+   /* VALUMaskWriteHazard */
+   std::bitset<128> sgpr_read_by_valu_as_lanemask;
+   std::bitset<128> sgpr_read_by_valu_as_lanemask_then_wr_by_salu;
+
    void join(const NOP_ctx_gfx11& other)
    {
       has_Vcmpx |= other.has_Vcmpx;
@@ -300,6 +304,9 @@ struct NOP_ctx_gfx11 {
       vgpr_used_by_ds |= other.vgpr_used_by_ds;
       valu_since_wr_by_trans.join_min(other.valu_since_wr_by_trans);
       trans_since_wr_by_trans.join_min(other.trans_since_wr_by_trans);
+      sgpr_read_by_valu_as_lanemask |= other.sgpr_read_by_valu_as_lanemask;
+      sgpr_read_by_valu_as_lanemask_then_wr_by_salu |=
+         other.sgpr_read_by_valu_as_lanemask_then_wr_by_salu;
    }
 
    bool operator==(const NOP_ctx_gfx11& other)
@@ -309,7 +316,10 @@ struct NOP_ctx_gfx11 {
              vgpr_used_by_vmem_store == other.vgpr_used_by_vmem_store &&
              vgpr_used_by_ds == other.vgpr_used_by_ds &&
              valu_since_wr_by_trans == other.valu_since_wr_by_trans &&
-             trans_since_wr_by_trans == other.trans_since_wr_by_trans;
+             trans_since_wr_by_trans == other.trans_since_wr_by_trans &&
+             sgpr_read_by_valu_as_lanemask == other.sgpr_read_by_valu_as_lanemask &&
+             sgpr_read_by_valu_as_lanemask_then_wr_by_salu ==
+                other.sgpr_read_by_valu_as_lanemask_then_wr_by_salu;
    }
 };
 
@@ -729,6 +739,24 @@ check_written_regs(const aco_ptr<Instruction>& instr, const std::bitset<N>& chec
 }
 
 template <std::size_t N>
+bool
+check_read_regs(const aco_ptr<Instruction>& instr, const std::bitset<N>& check_regs)
+{
+   return std::any_of(instr->operands.begin(), instr->operands.end(),
+                      [&check_regs](const Operand& op) -> bool
+                      {
+                         if (op.isConstant())
+                            return false;
+                         bool writes_any = false;
+                         for (unsigned i = 0; i < op.size(); i++) {
+                            unsigned op_reg = op.physReg() + i;
+                            writes_any |= op_reg < check_regs.size() && check_regs[op_reg];
+                         }
+                         return writes_any;
+                      });
+}
+
+template <std::size_t N>
 void
 mark_read_regs(const aco_ptr<Instruction>& instr, std::bitset<N>& reg_reads)
 {
@@ -1265,6 +1293,21 @@ handle_instruction_gfx11(State& state, NOP_ctx_gfx11& ctx, aco_ptr<Instruction>&
       va_vdst = 0;
    }
 
+   /* VALUMaskWriteHazard
+    * VALU reads SGPR as a lane mask and later written by SALU cannot safely be read by SALU.
+    */
+   if (state.program->wave_size == 64 && instr->isSALU() &&
+       check_written_regs(instr, ctx.sgpr_read_by_valu_as_lanemask)) {
+      ctx.sgpr_read_by_valu_as_lanemask_then_wr_by_salu = ctx.sgpr_read_by_valu_as_lanemask;
+      ctx.sgpr_read_by_valu_as_lanemask.reset();
+   } else if (state.program->wave_size == 64 && instr->isSALU() &&
+              check_read_regs(instr, ctx.sgpr_read_by_valu_as_lanemask_then_wr_by_salu)) {
+      bld.sopp(aco_opcode::s_waitcnt_depctr, -1, 0xfffe);
+      ctx.sgpr_read_by_valu_as_lanemask_then_wr_by_salu.reset();
+   } else if (instr->opcode == aco_opcode::s_waitcnt_depctr && (instr->sopp().imm & 0x1) == 0) {
+      ctx.sgpr_read_by_valu_as_lanemask_then_wr_by_salu.reset();
+   }
+
    va_vdst = std::min(va_vdst, parse_vdst_wait(instr));
    if (va_vdst == 0) {
       ctx.valu_since_wr_by_trans.reset();
@@ -1286,6 +1329,28 @@ handle_instruction_gfx11(State& state, NOP_ctx_gfx11& ctx, aco_ptr<Instruction>&
             ctx.trans_since_wr_by_trans.set(def.physReg(), def.bytes());
          }
       }
+
+      if (state.program->wave_size == 64) {
+         for (Operand& op : instr->operands) {
+            if (op.isLiteral() || (!op.isConstant() && op.physReg().reg() < 128))
+               ctx.sgpr_read_by_valu_as_lanemask.reset();
+         }
+         switch (instr->opcode) {
+         case aco_opcode::v_addc_co_u32:
+         case aco_opcode::v_subb_co_u32:
+         case aco_opcode::v_subbrev_co_u32:
+         case aco_opcode::v_cndmask_b16:
+         case aco_opcode::v_cndmask_b32:
+         case aco_opcode::v_div_fmas_f32:
+         case aco_opcode::v_div_fmas_f64:
+            if (instr->operands.back().physReg() != exec) {
+               ctx.sgpr_read_by_valu_as_lanemask.set(instr->operands.back().physReg().reg());
+               ctx.sgpr_read_by_valu_as_lanemask.set(instr->operands.back().physReg().reg() + 1);
+            }
+            break;
+         default: break;
+         }
+      }
    }
 
    /* LdsDirectVMEMHazard
index bedf267..8fc3a18 100644 (file)
@@ -920,3 +920,72 @@ BEGIN_TEST(insert_nops.valu_partial_forwarding.control_flow)
 
    finish_insert_nops_test();
 END_TEST
+
+BEGIN_TEST(insert_nops.valu_mask_write)
+   if (!setup_cs(NULL, GFX11))
+      return;
+
+   /* Basic case. */
+   //>> p_unit_test 0
+   //! v1: %0:v[0] = v_cndmask_b32 0, 0, %0:s[0-1]
+   //! s1: %0:s[1] = s_mov_b32 0
+   //! s_waitcnt_depctr sa_sdst(0)
+   //! s1: %0:s[2] = s_mov_b32 %0:s[1]
+   bld.pseudo(aco_opcode::p_unit_test, Operand::c32(0));
+   bld.vop2_e64(aco_opcode::v_cndmask_b32, Definition(PhysReg(256), v1),
+                Operand::zero(), Operand::zero(), Operand(PhysReg(0), s2));
+   bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg(1), s1), Operand::zero());
+   bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg(2), s1), Operand(PhysReg(1), s1));
+
+   /* Mitigation. */
+   //! p_unit_test 1
+   //! v1: %0:v[0] = v_cndmask_b32 0, 0, %0:s[0-1]
+   //! v1: %0:v[1] = v_mov_b32 %0:s[1]
+   //! s1: %0:s[1] = s_mov_b32 0
+   //! s1: %0:s[2] = s_mov_b32 %0:s[1]
+   bld.pseudo(aco_opcode::p_unit_test, Operand::c32(1));
+   bld.vop2_e64(aco_opcode::v_cndmask_b32, Definition(PhysReg(256), v1),
+                Operand::zero(), Operand::zero(), Operand(PhysReg(0), s2));
+   bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(257), v1), Operand(PhysReg(1), s1));
+   bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg(1), s1), Operand::zero());
+   bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg(2), s1), Operand(PhysReg(1), s1));
+
+   //! p_unit_test 2
+   //! v1: %0:v[0] = v_cndmask_b32 0, 0, %0:s[0-1]
+   //! s1: %0:s[1] = s_mov_b32 0
+   //! s_waitcnt_depctr sa_sdst(0)
+   //! s1: %0:s[2] = s_mov_b32 %0:s[1]
+   //! s1: %0:s[2] = s_mov_b32 %0:s[1]
+   bld.pseudo(aco_opcode::p_unit_test, Operand::c32(2));
+   bld.vop2_e64(aco_opcode::v_cndmask_b32, Definition(PhysReg(256), v1),
+                Operand::zero(), Operand::zero(), Operand(PhysReg(0), s2));
+   bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg(1), s1), Operand::zero());
+   bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg(2), s1), Operand(PhysReg(1), s1));
+   bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg(2), s1), Operand(PhysReg(1), s1));
+
+   //! p_unit_test 3
+   //! v1: %0:v[0] = v_cndmask_b32 0, 0, %0:s[0-1]
+   //! s1: %0:s[1] = s_mov_b32 0
+   //! s_waitcnt_depctr sa_sdst(0)
+   //! s1: %0:s[2] = s_mov_b32 %0:s[1]
+   bld.pseudo(aco_opcode::p_unit_test, Operand::c32(3));
+   bld.vop2_e64(aco_opcode::v_cndmask_b32, Definition(PhysReg(256), v1),
+                Operand::zero(), Operand::zero(), Operand(PhysReg(0), s2));
+   bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg(1), s1), Operand::zero());
+   bld.sopp(aco_opcode::s_waitcnt_depctr, -1, 0xfffe);
+   bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg(2), s1), Operand(PhysReg(1), s1));
+
+   /* Instruction which is both involved in the hazard and is a mitigation. */
+   //! p_unit_test 4
+   //! v1: %0:v[0] = v_cndmask_b32 %0:s[2], 0, %0:s[0-1]
+   //! s1: %0:s[1] = s_mov_b32 0
+   //! s_waitcnt_depctr sa_sdst(0)
+   //! s1: %0:s[2] = s_mov_b32 %0:s[1]
+   bld.pseudo(aco_opcode::p_unit_test, Operand::c32(4));
+   bld.vop2_e64(aco_opcode::v_cndmask_b32, Definition(PhysReg(256), v1),
+                Operand(PhysReg(2), s1), Operand::zero(), Operand(PhysReg(0), s2));
+   bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg(1), s1), Operand::zero());
+   bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg(2), s1), Operand(PhysReg(1), s1));
+
+   finish_insert_nops_test();
+END_TEST