}
};
+/* Saturating per-VGPR counters (256 entries), each clamped to Max when read.
+ *
+ * A counter's value is val[idx] + base, so inc() advances every counter by
+ * bumping base alone. A VGPR whose bit is clear in "resident" has no stored
+ * value (its val[] slot may be uninitialized) and reads as Max.
+ */
+template <int Max> struct VGPRCounterMap {
+public:
+   int base = 0;
+   BITSET_DECLARE(resident, 256);
+   int val[256];
+
+   /* Initializes all counters to Max. */
+   VGPRCounterMap() { BITSET_ZERO(resident); }
+
+   /* Increase all counters, clamping at Max (the clamp is applied by get()). */
+   void inc() { base++; }
+
+   /* Set counter to 0. */
+   void set(unsigned idx)
+   {
+      assert(idx < 256);
+      /* Stored relative to base so that val[idx] + base == 0 right now. */
+      val[idx] = -base;
+      BITSET_SET(resident, idx);
+   }
+
+   /* Set the counters of the DIV_ROUND_UP(bytes, 4) VGPRs starting at reg to 0.
+    * PhysReg values below 256 are ignored; VGPR i is addressed as PhysReg 256 + i.
+    */
+   void set(PhysReg reg, unsigned bytes)
+   {
+      if (reg.reg() < 256)
+         return;
+
+      for (unsigned i = 0; i < DIV_ROUND_UP(bytes, 4); i++)
+         set(reg.reg() - 256 + i);
+   }
+
+   /* Reset all counters to Max. */
+   void reset()
+   {
+      base = 0;
+      BITSET_ZERO(resident);
+   }
+
+   /* Reset the counters of the DIV_ROUND_UP(bytes, 4) VGPRs starting at reg to Max.
+    * PhysReg values below 256 are ignored.
+    */
+   void reset(PhysReg reg, unsigned bytes)
+   {
+      if (reg.reg() < 256)
+         return;
+
+      for (unsigned i = 0; i < DIV_ROUND_UP(bytes, 4); i++)
+         BITSET_CLEAR(resident, reg.reg() - 256 + i);
+   }
+
+   /* Returns the counter of VGPR idx, clamped to Max; Max if it was never set. */
+   uint8_t get(unsigned idx) const
+   {
+      assert(idx < 256);
+      return BITSET_TEST(resident, idx) ? MIN2(val[idx] + base, Max) : Max;
+   }
+
+   /* Returns the counter of the VGPR "offset" registers after reg (reg must be a VGPR). */
+   uint8_t get(PhysReg reg, unsigned offset = 0) const
+   {
+      assert(reg.reg() >= 256);
+      return get(reg.reg() - 256 + offset);
+   }
+
+   /* Per register, keep the smaller of this map's and other's counter.
+    * Registers that are not resident in other count as Max and so never lower a value.
+    */
+   void join_min(const VGPRCounterMap& other)
+   {
+      unsigned i;
+      BITSET_FOREACH_SET(i, other.resident, 256)
+      {
+         /* Values are rebased onto this map's base before being stored. */
+         if (BITSET_TEST(resident, i))
+            val[i] = MIN2(val[i] + base, other.val[i] + other.base) - base;
+         else
+            val[i] = other.val[i] + other.base - base;
+      }
+      BITSET_OR(resident, resident, other.resident);
+   }
+
+   /* Two maps are equal when they track the same registers and every tracked
+    * register reads the same clamped value. Counters that have both saturated at
+    * Max are indistinguishable through get(), so they compare equal as well.
+    */
+   bool operator==(const VGPRCounterMap& other) const
+   {
+      if (!BITSET_EQUAL(resident, other.resident))
+         return false;
+
+      /* Residency is identical at this point, so only the values need checking. */
+      unsigned i;
+      BITSET_FOREACH_SET(i, other.resident, 256)
+      {
+         if (get(i) != other.get(i))
+            return false;
+      }
+      return true;
+   }
+};
+
struct NOP_ctx_gfx11 {
/* VcmpxPermlaneHazard */
bool has_Vcmpx = false;
std::bitset<256> vgpr_used_by_vmem_store;
std::bitset<256> vgpr_used_by_ds;
+ /* VALUTransUseHazard
+  * Per-VGPR counters that are set to 0 for the registers defined by a transcendental
+  * VALU instruction. valu_since_wr_by_trans is incremented for every VALU/VINTERP_INREG
+  * instruction (saturating at 15), trans_since_wr_by_trans only for transcendental ones
+  * (saturating at 2). join() keeps the per-register minimum of both contexts.
+  */
+ VGPRCounterMap<15> valu_since_wr_by_trans;
+ VGPRCounterMap<2> trans_since_wr_by_trans;
+
void join(const NOP_ctx_gfx11& other)
{
has_Vcmpx |= other.has_Vcmpx;
vgpr_used_by_vmem_load |= other.vgpr_used_by_vmem_load;
vgpr_used_by_vmem_store |= other.vgpr_used_by_vmem_store;
vgpr_used_by_ds |= other.vgpr_used_by_ds;
+ valu_since_wr_by_trans.join_min(other.valu_since_wr_by_trans);
+ trans_since_wr_by_trans.join_min(other.trans_since_wr_by_trans);
}
bool operator==(const NOP_ctx_gfx11& other)
return has_Vcmpx == other.has_Vcmpx &&
vgpr_used_by_vmem_load == other.vgpr_used_by_vmem_load &&
vgpr_used_by_vmem_store == other.vgpr_used_by_vmem_store &&
- vgpr_used_by_ds == other.vgpr_used_by_ds;
+ vgpr_used_by_ds == other.vgpr_used_by_ds &&
+ valu_since_wr_by_trans == other.valu_since_wr_by_trans &&
+ trans_since_wr_by_trans == other.trans_since_wr_by_trans;
}
};
ldsdir->wait_vdst = MIN2(ldsdir->wait_vdst, count);
}
+ /* VALUTransUseHazard
+ * VALU reads VGPR written by transcendental instruction without 6+ VALU or 2+ transcendental
+ * in-between.
+ */
+ unsigned va_vdst = 15;
+ if (instr->isVALU() || instr->isVINTERP_INREG()) {
+ uint8_t num_valu = 15;
+ uint8_t num_trans = 15;
+ for (Operand& op : instr->operands) {
+ if (op.physReg().reg() < 256)
+ continue;
+ for (unsigned i = 0; i < op.size(); i++) {
+ num_valu = std::min(num_valu, ctx.valu_since_wr_by_trans.get(op.physReg(), i));
+ num_trans = std::min(num_trans, ctx.trans_since_wr_by_trans.get(op.physReg(), i));
+ }
+ }
+ if (num_trans <= 1 && num_valu <= 5) {
+ bld.sopp(aco_opcode::s_waitcnt_depctr, -1, 0x0fff);
+ va_vdst = 0;
+ }
+ }
+
+ va_vdst = std::min(va_vdst, parse_vdst_wait(instr));
+ if (va_vdst == 0) {
+ ctx.valu_since_wr_by_trans.reset();
+ ctx.trans_since_wr_by_trans.reset();
+ }
+
+ if (instr->isVALU() || instr->isVINTERP_INREG()) {
+ instr_class cls = instr_info.classes[(int)instr->opcode];
+ bool is_trans = cls == instr_class::valu_transcendental32 ||
+ cls == instr_class::valu_double_transcendental;
+
+ ctx.valu_since_wr_by_trans.inc();
+ if (is_trans)
+ ctx.trans_since_wr_by_trans.inc();
+
+ if (is_trans) {
+ for (Definition& def : instr->definitions) {
+ ctx.valu_since_wr_by_trans.set(def.physReg(), def.bytes());
+ ctx.trans_since_wr_by_trans.set(def.physReg(), def.bytes());
+ }
+ }
+ }
+
/* LdsDirectVMEMHazard
* Handle LDSDIR writing a VGPR after it's used by a VMEM/DS instruction.
*/
finish_insert_nops_test();
END_TEST
+
+BEGIN_TEST(insert_nops.valu_trans_use)
+ if (!setup_cs(NULL, GFX11))
+ return;
+ /* Exercises the VALUTransUseHazard handling on GFX11: each unit test writes v[0] with a
+  * transcendental instruction (v_rcp_f32), optionally emits other instructions, and then
+  * reads v[0] with v_mov_b32. Unit test 0 is the back-to-back case, where a
+  * s_waitcnt_depctr va_vdst(0) is expected in front of the read.
+  */
+ //>> p_unit_test 0
+ //! v1: %0:v[0] = v_rcp_f32 %0:v[1]
+ //! s_waitcnt_depctr va_vdst(0)
+ //! v1: %0:v[1] = v_mov_b32 %0:v[0]
+ bld.pseudo(aco_opcode::p_unit_test, Operand::c32(0));
+ bld.vop1(aco_opcode::v_rcp_f32, Definition(PhysReg(256), v1), Operand(PhysReg(257), v1));
+ bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(257), v1), Operand(PhysReg(256), v1));
+
+ /* Sufficient VALU mitigates the hazard.
+  * Unit test 1 puts only 4 v_nop in between, so the wait is still expected;
+  * unit test 2 puts 8 in between and expects no wait.
+  */
+ //! p_unit_test 1
+ //! v1: %0:v[0] = v_rcp_f32 %0:v[1]
+ //; for i in range(4): insert_pattern('v_nop')
+ //! s_waitcnt_depctr va_vdst(0)
+ //! v1: %0:v[1] = v_mov_b32 %0:v[0]
+ bld.pseudo(aco_opcode::p_unit_test, Operand::c32(1));
+ bld.vop1(aco_opcode::v_rcp_f32, Definition(PhysReg(256), v1), Operand(PhysReg(257), v1));
+ for (unsigned i = 0; i < 4; i++)
+ bld.vop1(aco_opcode::v_nop);
+ bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(257), v1), Operand(PhysReg(256), v1));
+
+ //! p_unit_test 2
+ //! v1: %0:v[0] = v_rcp_f32 %0:v[1]
+ //; for i in range(8): insert_pattern('v_nop')
+ //! v1: %0:v[1] = v_mov_b32 %0:v[0]
+ bld.pseudo(aco_opcode::p_unit_test, Operand::c32(2));
+ bld.vop1(aco_opcode::v_rcp_f32, Definition(PhysReg(256), v1), Operand(PhysReg(257), v1));
+ for (unsigned i = 0; i < 8; i++)
+ bld.vop1(aco_opcode::v_nop);
+ bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(257), v1), Operand(PhysReg(256), v1));
+
+ /* Sufficient transcendental VALU mitigates the hazard.
+  * Unit test 3 has a single v_sqrt_f32 in between, so the wait is still expected;
+  * unit test 4 has two and expects no wait.
+  */
+ //! p_unit_test 3
+ //! v1: %0:v[0] = v_rcp_f32 %0:v[1]
+ //! v1: %0:v[2] = v_sqrt_f32 %0:v[3]
+ //! s_waitcnt_depctr va_vdst(0)
+ //! v1: %0:v[1] = v_mov_b32 %0:v[0]
+ bld.pseudo(aco_opcode::p_unit_test, Operand::c32(3));
+ bld.vop1(aco_opcode::v_rcp_f32, Definition(PhysReg(256), v1), Operand(PhysReg(257), v1));
+ bld.vop1(aco_opcode::v_sqrt_f32, Definition(PhysReg(258), v1), Operand(PhysReg(259), v1));
+ bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(257), v1), Operand(PhysReg(256), v1));
+
+ //! p_unit_test 4
+ //! v1: %0:v[0] = v_rcp_f32 %0:v[1]
+ //! v1: %0:v[2] = v_sqrt_f32 %0:v[3]
+ //! v1: %0:v[2] = v_sqrt_f32 %0:v[3]
+ //! v1: %0:v[1] = v_mov_b32 %0:v[0]
+ bld.pseudo(aco_opcode::p_unit_test, Operand::c32(4));
+ bld.vop1(aco_opcode::v_rcp_f32, Definition(PhysReg(256), v1), Operand(PhysReg(257), v1));
+ for (unsigned i = 0; i < 2; i++)
+ bld.vop1(aco_opcode::v_sqrt_f32, Definition(PhysReg(258), v1), Operand(PhysReg(259), v1));
+ bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(257), v1), Operand(PhysReg(256), v1));
+
+ /* Transcendental VALU should be counted towards VALU
+  * 5 v_nop plus 1 v_sqrt_f32 in between are expected to need no wait.
+  */
+ //! p_unit_test 5
+ //! v1: %0:v[0] = v_rcp_f32 %0:v[1]
+ //; for i in range(5): insert_pattern('v_nop')
+ //! v1: %0:v[2] = v_sqrt_f32 %0:v[3]
+ //! v1: %0:v[1] = v_mov_b32 %0:v[0]
+ bld.pseudo(aco_opcode::p_unit_test, Operand::c32(5));
+ bld.vop1(aco_opcode::v_rcp_f32, Definition(PhysReg(256), v1), Operand(PhysReg(257), v1));
+ for (unsigned i = 0; i < 5; i++)
+ bld.vop1(aco_opcode::v_nop);
+ bld.vop1(aco_opcode::v_sqrt_f32, Definition(PhysReg(258), v1), Operand(PhysReg(259), v1));
+ bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(257), v1), Operand(PhysReg(256), v1));
+
+ /* non-VALU does not mitigate the hazard.
+  * 8 s_nop in between still leave the wait in front of the read.
+  */
+ //! p_unit_test 6
+ //! v1: %0:v[0] = v_rcp_f32 %0:v[1]
+ //; for i in range(8): insert_pattern('s_nop')
+ //! s_waitcnt_depctr va_vdst(0)
+ //! v1: %0:v[1] = v_mov_b32 %0:v[0]
+ bld.pseudo(aco_opcode::p_unit_test, Operand::c32(6));
+ bld.vop1(aco_opcode::v_rcp_f32, Definition(PhysReg(256), v1), Operand(PhysReg(257), v1));
+ for (unsigned i = 0; i < 8; i++)
+ bld.sopp(aco_opcode::s_nop, -1, 0);
+ bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(257), v1), Operand(PhysReg(256), v1));
+
+ finish_insert_nops_test();
+END_TEST