VGPRCounterMap<15> valu_since_wr_by_trans;
VGPRCounterMap<2> trans_since_wr_by_trans;
+ /* VALUMaskWriteHazard */
+ std::bitset<128> sgpr_read_by_valu_as_lanemask;
+ std::bitset<128> sgpr_read_by_valu_as_lanemask_then_wr_by_salu;
+
void join(const NOP_ctx_gfx11& other)
{
has_Vcmpx |= other.has_Vcmpx;
vgpr_used_by_ds |= other.vgpr_used_by_ds;
valu_since_wr_by_trans.join_min(other.valu_since_wr_by_trans);
trans_since_wr_by_trans.join_min(other.trans_since_wr_by_trans);
+ sgpr_read_by_valu_as_lanemask |= other.sgpr_read_by_valu_as_lanemask;
+ sgpr_read_by_valu_as_lanemask_then_wr_by_salu |=
+ other.sgpr_read_by_valu_as_lanemask_then_wr_by_salu;
}
bool operator==(const NOP_ctx_gfx11& other)
vgpr_used_by_vmem_store == other.vgpr_used_by_vmem_store &&
vgpr_used_by_ds == other.vgpr_used_by_ds &&
valu_since_wr_by_trans == other.valu_since_wr_by_trans &&
- trans_since_wr_by_trans == other.trans_since_wr_by_trans;
+ trans_since_wr_by_trans == other.trans_since_wr_by_trans &&
+ sgpr_read_by_valu_as_lanemask == other.sgpr_read_by_valu_as_lanemask &&
+ sgpr_read_by_valu_as_lanemask_then_wr_by_salu ==
+ other.sgpr_read_by_valu_as_lanemask_then_wr_by_salu;
}
};
}
template <std::size_t N>
+bool
+check_read_regs(const aco_ptr<Instruction>& instr, const std::bitset<N>& check_regs)
+{
+ return std::any_of(instr->operands.begin(), instr->operands.end(),
+ [&check_regs](const Operand& op) -> bool
+ {
+ if (op.isConstant())
+ return false;
+ bool writes_any = false;
+ for (unsigned i = 0; i < op.size(); i++) {
+ unsigned op_reg = op.physReg() + i;
+ writes_any |= op_reg < check_regs.size() && check_regs[op_reg];
+ }
+ return writes_any;
+ });
+}
+
+template <std::size_t N>
void
mark_read_regs(const aco_ptr<Instruction>& instr, std::bitset<N>& reg_reads)
{
va_vdst = 0;
}
+ /* VALUMaskWriteHazard
+ * VALU reads SGPR as a lane mask and later written by SALU cannot safely be read by SALU.
+ */
+ if (state.program->wave_size == 64 && instr->isSALU() &&
+ check_written_regs(instr, ctx.sgpr_read_by_valu_as_lanemask)) {
+ ctx.sgpr_read_by_valu_as_lanemask_then_wr_by_salu = ctx.sgpr_read_by_valu_as_lanemask;
+ ctx.sgpr_read_by_valu_as_lanemask.reset();
+ } else if (state.program->wave_size == 64 && instr->isSALU() &&
+ check_read_regs(instr, ctx.sgpr_read_by_valu_as_lanemask_then_wr_by_salu)) {
+ bld.sopp(aco_opcode::s_waitcnt_depctr, -1, 0xfffe);
+ ctx.sgpr_read_by_valu_as_lanemask_then_wr_by_salu.reset();
+ } else if (instr->opcode == aco_opcode::s_waitcnt_depctr && (instr->sopp().imm & 0x1) == 0) {
+ ctx.sgpr_read_by_valu_as_lanemask_then_wr_by_salu.reset();
+ }
+
va_vdst = std::min(va_vdst, parse_vdst_wait(instr));
if (va_vdst == 0) {
ctx.valu_since_wr_by_trans.reset();
ctx.trans_since_wr_by_trans.set(def.physReg(), def.bytes());
}
}
+
+ if (state.program->wave_size == 64) {
+ for (Operand& op : instr->operands) {
+ if (op.isLiteral() || (!op.isConstant() && op.physReg().reg() < 128))
+ ctx.sgpr_read_by_valu_as_lanemask.reset();
+ }
+ switch (instr->opcode) {
+ case aco_opcode::v_addc_co_u32:
+ case aco_opcode::v_subb_co_u32:
+ case aco_opcode::v_subbrev_co_u32:
+ case aco_opcode::v_cndmask_b16:
+ case aco_opcode::v_cndmask_b32:
+ case aco_opcode::v_div_fmas_f32:
+ case aco_opcode::v_div_fmas_f64:
+ if (instr->operands.back().physReg() != exec) {
+ ctx.sgpr_read_by_valu_as_lanemask.set(instr->operands.back().physReg().reg());
+ ctx.sgpr_read_by_valu_as_lanemask.set(instr->operands.back().physReg().reg() + 1);
+ }
+ break;
+ default: break;
+ }
+ }
}
/* LdsDirectVMEMHazard
finish_insert_nops_test();
END_TEST
+
+BEGIN_TEST(insert_nops.valu_mask_write)
+ if (!setup_cs(NULL, GFX11))
+ return;
+
+ /* Basic case. */
+ //>> p_unit_test 0
+ //! v1: %0:v[0] = v_cndmask_b32 0, 0, %0:s[0-1]
+ //! s1: %0:s[1] = s_mov_b32 0
+ //! s_waitcnt_depctr sa_sdst(0)
+ //! s1: %0:s[2] = s_mov_b32 %0:s[1]
+ bld.pseudo(aco_opcode::p_unit_test, Operand::c32(0));
+ bld.vop2_e64(aco_opcode::v_cndmask_b32, Definition(PhysReg(256), v1),
+ Operand::zero(), Operand::zero(), Operand(PhysReg(0), s2));
+ bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg(1), s1), Operand::zero());
+ bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg(2), s1), Operand(PhysReg(1), s1));
+
+ /* Mitigation. */
+ //! p_unit_test 1
+ //! v1: %0:v[0] = v_cndmask_b32 0, 0, %0:s[0-1]
+ //! v1: %0:v[1] = v_mov_b32 %0:s[1]
+ //! s1: %0:s[1] = s_mov_b32 0
+ //! s1: %0:s[2] = s_mov_b32 %0:s[1]
+ bld.pseudo(aco_opcode::p_unit_test, Operand::c32(1));
+ bld.vop2_e64(aco_opcode::v_cndmask_b32, Definition(PhysReg(256), v1),
+ Operand::zero(), Operand::zero(), Operand(PhysReg(0), s2));
+ bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(257), v1), Operand(PhysReg(1), s1));
+ bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg(1), s1), Operand::zero());
+ bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg(2), s1), Operand(PhysReg(1), s1));
+
+ //! p_unit_test 2
+ //! v1: %0:v[0] = v_cndmask_b32 0, 0, %0:s[0-1]
+ //! s1: %0:s[1] = s_mov_b32 0
+ //! s_waitcnt_depctr sa_sdst(0)
+ //! s1: %0:s[2] = s_mov_b32 %0:s[1]
+ //! s1: %0:s[2] = s_mov_b32 %0:s[1]
+ bld.pseudo(aco_opcode::p_unit_test, Operand::c32(2));
+ bld.vop2_e64(aco_opcode::v_cndmask_b32, Definition(PhysReg(256), v1),
+ Operand::zero(), Operand::zero(), Operand(PhysReg(0), s2));
+ bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg(1), s1), Operand::zero());
+ bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg(2), s1), Operand(PhysReg(1), s1));
+ bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg(2), s1), Operand(PhysReg(1), s1));
+
+ //! p_unit_test 3
+ //! v1: %0:v[0] = v_cndmask_b32 0, 0, %0:s[0-1]
+ //! s1: %0:s[1] = s_mov_b32 0
+ //! s_waitcnt_depctr sa_sdst(0)
+ //! s1: %0:s[2] = s_mov_b32 %0:s[1]
+ bld.pseudo(aco_opcode::p_unit_test, Operand::c32(3));
+ bld.vop2_e64(aco_opcode::v_cndmask_b32, Definition(PhysReg(256), v1),
+ Operand::zero(), Operand::zero(), Operand(PhysReg(0), s2));
+ bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg(1), s1), Operand::zero());
+ bld.sopp(aco_opcode::s_waitcnt_depctr, -1, 0xfffe);
+ bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg(2), s1), Operand(PhysReg(1), s1));
+
+ /* Instruction which is both involved in the hazard and is a mitigation. */
+ //! p_unit_test 4
+ //! v1: %0:v[0] = v_cndmask_b32 %0:s[2], 0, %0:s[0-1]
+ //! s1: %0:s[1] = s_mov_b32 0
+ //! s_waitcnt_depctr sa_sdst(0)
+ //! s1: %0:s[2] = s_mov_b32 %0:s[1]
+ bld.pseudo(aco_opcode::p_unit_test, Operand::c32(4));
+ bld.vop2_e64(aco_opcode::v_cndmask_b32, Definition(PhysReg(256), v1),
+ Operand(PhysReg(2), s1), Operand::zero(), Operand(PhysReg(0), s2));
+ bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg(1), s1), Operand::zero());
+ bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg(2), s1), Operand(PhysReg(1), s1));
+
+ finish_insert_nops_test();
+END_TEST