/* VcmpxPermlaneHazard */
bool has_Vcmpx = false;
- void join(const NOP_ctx_gfx11& other) { has_Vcmpx |= other.has_Vcmpx; }
+ /* LdsDirectVMEMHazard */
+ std::bitset<256> vgpr_used_by_vmem_load;
+ std::bitset<256> vgpr_used_by_vmem_store;
+ std::bitset<256> vgpr_used_by_ds;
- bool operator==(const NOP_ctx_gfx11& other) { return has_Vcmpx == other.has_Vcmpx; }
+   /* Merge the hazard state of `other` into this context: a flag or VGPR bit
+    * ends up set if it is set in either of the two contexts.
+    */
+   void join(const NOP_ctx_gfx11& other)
+   {
+      has_Vcmpx = has_Vcmpx || other.has_Vcmpx;
+      vgpr_used_by_vmem_load = vgpr_used_by_vmem_load | other.vgpr_used_by_vmem_load;
+      vgpr_used_by_vmem_store = vgpr_used_by_vmem_store | other.vgpr_used_by_vmem_store;
+      vgpr_used_by_ds = vgpr_used_by_ds | other.vgpr_used_by_ds;
+   }
+
+   /* Two contexts are equal when the Vcmpx flag and all three VGPR usage sets
+    * match. Const-qualified so it can also be called on a const context; the
+    * comparison never modifies either side.
+    */
+   bool operator==(const NOP_ctx_gfx11& other) const
+   {
+      return has_Vcmpx == other.has_Vcmpx &&
+             vgpr_used_by_vmem_load == other.vgpr_used_by_vmem_load &&
+             vgpr_used_by_vmem_store == other.vgpr_used_by_vmem_store &&
+             vgpr_used_by_ds == other.vgpr_used_by_ds;
+   }
};
int
}
}
+/* Set one bit in `set` for every dword covered by a `bytes`-sized access that
+ * starts at `reg`. Bit N corresponds to register index 256 + N; registers
+ * with an index below 256 are ignored.
+ */
+void
+fill_vgpr_bitset(std::bitset<256>& set, PhysReg reg, unsigned bytes)
+{
+   if (reg.reg() >= 256) {
+      const unsigned num_dwords = DIV_ROUND_UP(bytes, 4);
+      for (unsigned dword = 0; dword < num_dwords; dword++)
+         set.set(reg.reg() - 256 + dword);
+   }
+}
+
/* GFX11 */
unsigned
parse_vdst_wait(aco_ptr<Instruction>& instr)
LDSDIR_instruction* ldsdir = &instr->ldsdir();
ldsdir->wait_vdst = MIN2(ldsdir->wait_vdst, count);
}
+
+   /* LdsDirectVMEMHazard
+    * Handle LDSDIR writing a VGPR after it's used by a VMEM/DS instruction.
+    *
+    * VGPRs touched by memory instructions are recorded in three sets, one per
+    * wait that is able to clear them again:
+    *   vgpr_used_by_vmem_load  - cleared by s_waitcnt with vmcnt(0)
+    *   vgpr_used_by_vmem_store - cleared by s_waitcnt_vscnt 0
+    *   vgpr_used_by_ds         - cleared by s_waitcnt with lgkmcnt(0)
+    */
+   if (instr->isVMEM() || instr->isFlatLike()) {
+      /* Result registers only exist on instructions that return data, so they
+       * belong to the load set. Putting them in the store set would let
+       * s_waitcnt_vscnt drop them and keep s_waitcnt vmcnt(0) from doing so.
+       */
+      for (Definition& def : instr->definitions)
+         fill_vgpr_bitset(ctx.vgpr_used_by_vmem_load, def.physReg(), def.bytes());
+      if (instr->definitions.empty()) {
+         /* No result: the operands are tracked in the store set. */
+         for (Operand& op : instr->operands)
+            fill_vgpr_bitset(ctx.vgpr_used_by_vmem_store, op.physReg(), op.bytes());
+      } else {
+         for (Operand& op : instr->operands)
+            fill_vgpr_bitset(ctx.vgpr_used_by_vmem_load, op.physReg(), op.bytes());
+      }
+   }
+   if (instr->isDS() || instr->isFlat()) {
+      for (Definition& def : instr->definitions)
+         fill_vgpr_bitset(ctx.vgpr_used_by_ds, def.physReg(), def.bytes());
+      for (Operand& op : instr->operands)
+         fill_vgpr_bitset(ctx.vgpr_used_by_ds, op.physReg(), op.bytes());
+   }
+   /* These instructions end the hazard for every tracked VGPR. For
+    * s_waitcnt_depctr that only holds when bits 2-4 of the immediate are zero.
+    */
+   if (instr->isVALU() || instr->isVINTERP_INREG() || instr->isEXP() ||
+       (instr->opcode == aco_opcode::s_waitcnt_depctr && ((instr->sopp().imm >> 2) & 0x7) == 0)) {
+      ctx.vgpr_used_by_vmem_load.reset();
+      ctx.vgpr_used_by_vmem_store.reset();
+      ctx.vgpr_used_by_ds.reset();
+   } else if (instr->opcode == aco_opcode::s_waitcnt) {
+      wait_imm imm(GFX11, instr->sopp().imm);
+      if (imm.vm == 0)
+         ctx.vgpr_used_by_vmem_load.reset();
+      if (imm.lgkm == 0)
+         ctx.vgpr_used_by_ds.reset();
+   } else if (instr->opcode == aco_opcode::s_waitcnt_vscnt && instr->sopk().imm == 0) {
+      ctx.vgpr_used_by_vmem_store.reset();
+   }
+   if (instr->isLDSDIR()) {
+      /* Index of the LDSDIR destination inside the VGPR bitsets. */
+      const unsigned dst_vgpr = instr->definitions[0].physReg().reg() - 256;
+      if (ctx.vgpr_used_by_vmem_load[dst_vgpr] || ctx.vgpr_used_by_vmem_store[dst_vgpr] ||
+          ctx.vgpr_used_by_ds[dst_vgpr]) {
+         /* 0xffe3 has bits 2-4 cleared, i.e. the same field the
+          * s_waitcnt_depctr check above tests for zero.
+          */
+         bld.sopp(aco_opcode::s_waitcnt_depctr, -1, 0xffe3);
+         ctx.vgpr_used_by_vmem_load.reset();
+         ctx.vgpr_used_by_vmem_store.reset();
+         ctx.vgpr_used_by_ds.reset();
+      }
+   }
}
template <typename Ctx>
using namespace aco;
-void create_mubuf(unsigned offset)
+void create_mubuf(unsigned offset, PhysReg dst=PhysReg(256), PhysReg vaddr=PhysReg(256)) /* emits a buffer_load_dword: result goes to `dst`, `vaddr` is the VGPR operand; both default to PhysReg(256) */
{
- bld.mubuf(aco_opcode::buffer_load_dword, Definition(PhysReg(256), v1), Operand(PhysReg(0), s4),
- Operand(PhysReg(256), v1), Operand::zero(), offset, true);
+ bld.mubuf(aco_opcode::buffer_load_dword, Definition(dst, v1), Operand(PhysReg(0), s4),
+ Operand(vaddr, v1), Operand::zero(), offset, true);
}
void create_mubuf_store(PhysReg src=PhysReg(256))
finish_insert_nops_test();
END_TEST
+
+BEGIN_TEST(insert_nops.lds_direct_vmem)
+ if (!setup_cs(NULL, GFX11))
+ return;
+
+ /* WaR: the VMEM load uses v0 as an operand, then LDSDIR overwrites v0 -> s_waitcnt_depctr expected */
+ //>> p_unit_test 0
+ //! v1: %0:v[1] = buffer_load_dword %0:s[0-3], %0:v[0], 0 offen
+ //! s_waitcnt_depctr vm_vsrc(0)
+ //! v1: %0:v[0] = lds_direct_load %0:m0
+ bld.pseudo(aco_opcode::p_unit_test, Operand::c32(0));
+ create_mubuf(0, PhysReg(257));
+ bld.ldsdir(aco_opcode::lds_direct_load, Definition(PhysReg(256), v1), Operand(m0, s1));
+
+ /* WaW: the VMEM load writes v0, then LDSDIR overwrites v0 -> s_waitcnt_depctr expected */
+ //! p_unit_test 1
+ //! v1: %0:v[0] = buffer_load_dword %0:s[0-3], %0:v[1], 0 offen
+ //! s_waitcnt_depctr vm_vsrc(0)
+ //! v1: %0:v[0] = lds_direct_load %0:m0
+ bld.pseudo(aco_opcode::p_unit_test, Operand::c32(1));
+ create_mubuf(0, PhysReg(256), PhysReg(257));
+ bld.ldsdir(aco_opcode::lds_direct_load, Definition(PhysReg(256), v1), Operand(m0, s1));
+
+ /* no hazard: the VMEM load only touches v1 while LDSDIR writes v0 */
+ //! p_unit_test 2
+ //! v1: %0:v[1] = buffer_load_dword %0:s[0-3], %0:v[1], 0 offen
+ //! v1: %0:v[0] = lds_direct_load %0:m0
+ bld.pseudo(aco_opcode::p_unit_test, Operand::c32(2));
+ create_mubuf(0, PhysReg(257), PhysReg(257));
+ bld.ldsdir(aco_opcode::lds_direct_load, Definition(PhysReg(256), v1), Operand(m0, s1));
+
+ /* no hazard: a VALU instruction between the VMEM load and LDSDIR clears the tracked VGPRs */
+ //! p_unit_test 3
+ //! v1: %0:v[1] = buffer_load_dword %0:s[0-3], %0:v[0], 0 offen
+ //! v_nop
+ //! v1: %0:v[0] = lds_direct_load %0:m0
+ bld.pseudo(aco_opcode::p_unit_test, Operand::c32(3));
+ create_mubuf(0, PhysReg(257));
+ bld.vop1(aco_opcode::v_nop);
+ bld.ldsdir(aco_opcode::lds_direct_load, Definition(PhysReg(256), v1), Operand(m0, s1));
+
+ /* WaR: the DS read uses v0 as an operand, then LDSDIR overwrites v0 -> s_waitcnt_depctr expected */
+ //! p_unit_test 4
+ //! v1: %0:v[1] = ds_read_b32 %0:v[0]
+ //! s_waitcnt_depctr vm_vsrc(0)
+ //! v1: %0:v[0] = lds_direct_load %0:m0
+ bld.pseudo(aco_opcode::p_unit_test, Operand::c32(4));
+ bld.ds(aco_opcode::ds_read_b32, Definition(PhysReg(257), v1), Operand(PhysReg(256), v1));
+ bld.ldsdir(aco_opcode::lds_direct_load, Definition(PhysReg(256), v1), Operand(m0, s1));
+
+ /* WaW: the DS read writes v0, then LDSDIR overwrites v0 -> s_waitcnt_depctr expected */
+ //! p_unit_test 5
+ //! v1: %0:v[0] = ds_read_b32 %0:v[1]
+ //! s_waitcnt_depctr vm_vsrc(0)
+ //! v1: %0:v[0] = lds_direct_load %0:m0
+ bld.pseudo(aco_opcode::p_unit_test, Operand::c32(5));
+ bld.ds(aco_opcode::ds_read_b32, Definition(PhysReg(256), v1), Operand(PhysReg(257), v1));
+ bld.ldsdir(aco_opcode::lds_direct_load, Definition(PhysReg(256), v1), Operand(m0, s1));
+
+ /* no hazard: the DS read only touches v1 while LDSDIR writes v0 */
+ //! p_unit_test 6
+ //! v1: %0:v[1] = ds_read_b32 %0:v[1]
+ //! v1: %0:v[0] = lds_direct_load %0:m0
+ bld.pseudo(aco_opcode::p_unit_test, Operand::c32(6));
+ bld.ds(aco_opcode::ds_read_b32, Definition(PhysReg(257), v1), Operand(PhysReg(257), v1));
+ bld.ldsdir(aco_opcode::lds_direct_load, Definition(PhysReg(256), v1), Operand(m0, s1));
+
+ /* no hazard: a VALU instruction between the DS read and LDSDIR clears the tracked VGPRs */
+ //! p_unit_test 7
+ //! v1: %0:v[1] = ds_read_b32 %0:v[0]
+ //! v_nop
+ //! v1: %0:v[0] = lds_direct_load %0:m0
+ bld.pseudo(aco_opcode::p_unit_test, Operand::c32(7));
+ bld.ds(aco_opcode::ds_read_b32, Definition(PhysReg(257), v1), Operand(PhysReg(256), v1));
+ bld.vop1(aco_opcode::v_nop);
+ bld.ldsdir(aco_opcode::lds_direct_load, Definition(PhysReg(256), v1), Operand(m0, s1));
+
+ /* no hazard: VMEM/LDS with the correct waitcnt in-between
+  * (tests 8-10: vmcnt(0) after a load, vscnt 0 after a store, lgkmcnt(0) after a DS read) */
+ //! p_unit_test 8
+ //! v1: %0:v[1] = buffer_load_dword %0:s[0-3], %0:v[0], 0 offen
+ //! s_waitcnt vmcnt(0)
+ //! v1: %0:v[0] = lds_direct_load %0:m0
+ bld.pseudo(aco_opcode::p_unit_test, Operand::c32(8));
+ create_mubuf(0, PhysReg(257));
+ bld.sopp(aco_opcode::s_waitcnt, -1, 0x3ff);
+ bld.ldsdir(aco_opcode::lds_direct_load, Definition(PhysReg(256), v1), Operand(m0, s1));
+
+ //! p_unit_test 9
+ //! buffer_store_dword %0:s[0-3], %0:v[0], 0, %0:v[0] offen
+ //! s1: %0:null = s_waitcnt_vscnt imm:0
+ //! v1: %0:v[0] = lds_direct_load %0:m0
+ bld.pseudo(aco_opcode::p_unit_test, Operand::c32(9));
+ create_mubuf_store();
+ bld.sopk(aco_opcode::s_waitcnt_vscnt, Definition(sgpr_null, s1), 0);
+ bld.ldsdir(aco_opcode::lds_direct_load, Definition(PhysReg(256), v1), Operand(m0, s1));
+
+ //! p_unit_test 10
+ //! v1: %0:v[1] = ds_read_b32 %0:v[0]
+ //! s_waitcnt lgkmcnt(0)
+ //! v1: %0:v[0] = lds_direct_load %0:m0
+ bld.pseudo(aco_opcode::p_unit_test, Operand::c32(10));
+ bld.ds(aco_opcode::ds_read_b32, Definition(PhysReg(257), v1), Operand(PhysReg(256), v1));
+ bld.sopp(aco_opcode::s_waitcnt, -1, 0xfc0f);
+ bld.ldsdir(aco_opcode::lds_direct_load, Definition(PhysReg(256), v1), Operand(m0, s1));
+
+ /* VMEM/LDS with the wrong waitcnt in-between
+  * (tests 11-13: the wait does not match the pending instruction, so s_waitcnt_depctr is still expected) */
+ //! p_unit_test 11
+ //! v1: %0:v[1] = buffer_load_dword %0:s[0-3], %0:v[0], 0 offen
+ //! s1: %0:null = s_waitcnt_vscnt imm:0
+ //! s_waitcnt_depctr vm_vsrc(0)
+ //! v1: %0:v[0] = lds_direct_load %0:m0
+ bld.pseudo(aco_opcode::p_unit_test, Operand::c32(11));
+ create_mubuf(0, PhysReg(257));
+ bld.sopk(aco_opcode::s_waitcnt_vscnt, Definition(sgpr_null, s1), 0);
+ bld.ldsdir(aco_opcode::lds_direct_load, Definition(PhysReg(256), v1), Operand(m0, s1));
+
+ //! p_unit_test 12
+ //! buffer_store_dword %0:s[0-3], %0:v[0], 0, %0:v[0] offen
+ //! s_waitcnt lgkmcnt(0)
+ //! s_waitcnt_depctr vm_vsrc(0)
+ //! v1: %0:v[0] = lds_direct_load %0:m0
+ bld.pseudo(aco_opcode::p_unit_test, Operand::c32(12));
+ create_mubuf_store();
+ bld.sopp(aco_opcode::s_waitcnt, -1, 0xfc0f);
+ bld.ldsdir(aco_opcode::lds_direct_load, Definition(PhysReg(256), v1), Operand(m0, s1));
+
+ //! p_unit_test 13
+ //! v1: %0:v[1] = ds_read_b32 %0:v[0]
+ //! s_waitcnt vmcnt(0)
+ //! s_waitcnt_depctr vm_vsrc(0)
+ //! v1: %0:v[0] = lds_direct_load %0:m0
+ bld.pseudo(aco_opcode::p_unit_test, Operand::c32(13));
+ bld.ds(aco_opcode::ds_read_b32, Definition(PhysReg(257), v1), Operand(PhysReg(256), v1));
+ bld.sopp(aco_opcode::s_waitcnt, -1, 0x3ff);
+ bld.ldsdir(aco_opcode::lds_direct_load, Definition(PhysReg(256), v1), Operand(m0, s1));
+
+ finish_insert_nops_test();
+END_TEST