From: Rhys Perry Date: Thu, 25 Aug 2022 11:26:06 +0000 (+0100) Subject: aco/gfx11: workaround LdsDirectVMEMHazard X-Git-Tag: upstream/22.3.5~1567 X-Git-Url: http://review.tizen.org/git/?a=commitdiff_plain;h=98ee3e1468cf55b66a71a3150f4d8f5a0ec7a918;p=platform%2Fupstream%2Fmesa.git aco/gfx11: workaround LdsDirectVMEMHazard fossil-db (gfx1100): Totals from 27217 (20.16% of 135032) affected shaders: Instrs: 18010853 -> 18047277 (+0.20%) CodeSize: 99369568 -> 99515264 (+0.15%) Latency: 207454040 -> 207464932 (+0.01%); split: -0.00%, +0.01% InvThroughput: 39810158 -> 39810628 (+0.00%); split: -0.00%, +0.00% Signed-off-by: Rhys Perry Reviewed-by: Daniel Schürmann Part-of: --- diff --git a/src/amd/compiler/README-ISA.md b/src/amd/compiler/README-ISA.md index 48f0924..ad85160 100644 --- a/src/amd/compiler/README-ISA.md +++ b/src/amd/compiler/README-ISA.md @@ -302,3 +302,12 @@ LDSDIR instruction writing a VGPR soon after it's used by a VALU instruction. Mitigated by: A vdst wait, preferably using the LDSDIR's field. + +### LdsDirectVMEMHazard + +Triggered by: +LDSDIR instruction writing a VGPR after it's used by a VMEM/DS instruction. + +Mitigated by: +Waiting for the VMEM/DS instruction to finish, a VALU or export instruction, or +`s_waitcnt_depctr 0xffe3`. 
diff --git a/src/amd/compiler/aco_insert_NOPs.cpp b/src/amd/compiler/aco_insert_NOPs.cpp index 66a3590..d8f5405 100644 --- a/src/amd/compiler/aco_insert_NOPs.cpp +++ b/src/amd/compiler/aco_insert_NOPs.cpp @@ -198,9 +198,26 @@ struct NOP_ctx_gfx11 { /* VcmpxPermlaneHazard */ bool has_Vcmpx = false; - void join(const NOP_ctx_gfx11& other) { has_Vcmpx |= other.has_Vcmpx; } + /* LdsDirectVMEMHazard: bit i of each set stands for register 256 + i */ + std::bitset<256> vgpr_used_by_vmem_load; + std::bitset<256> vgpr_used_by_vmem_store; + std::bitset<256> vgpr_used_by_ds; - bool operator==(const NOP_ctx_gfx11& other) { return has_Vcmpx == other.has_Vcmpx; } + void join(const NOP_ctx_gfx11& other) /* merge: a hazard present in either context is kept */ + { + has_Vcmpx |= other.has_Vcmpx; + vgpr_used_by_vmem_load |= other.vgpr_used_by_vmem_load; + vgpr_used_by_vmem_store |= other.vgpr_used_by_vmem_store; + vgpr_used_by_ds |= other.vgpr_used_by_ds; + } + + bool operator==(const NOP_ctx_gfx11& other) /* equal only if every tracked hazard matches */ + { + return has_Vcmpx == other.has_Vcmpx && + vgpr_used_by_vmem_load == other.vgpr_used_by_vmem_load && + vgpr_used_by_vmem_store == other.vgpr_used_by_vmem_store && + vgpr_used_by_ds == other.vgpr_used_by_ds; + } }; int @@ -866,6 +883,15 @@ handle_instruction_gfx10(State& state, NOP_ctx_gfx10& ctx, aco_ptr& } } +void +fill_vgpr_bitset(std::bitset<256>& set, PhysReg reg, unsigned bytes) +{ + if (reg.reg() < 256) /* registers below 256 have no bit in the set */ + return; + for (unsigned i = 0; i < DIV_ROUND_UP(bytes, 4); i++) /* one bit per started group of 4 bytes */ + set.set(reg.reg() - 256 + i); +} + /* GFX11 */ unsigned parse_vdst_wait(aco_ptr& instr) @@ -983,6 +1009,51 @@ handle_instruction_gfx11(State& state, NOP_ctx_gfx11& ctx, aco_ptr& LDSDIR_instruction* ldsdir = &instr->ldsdir(); ldsdir->wait_vdst = MIN2(ldsdir->wait_vdst, count); } + + /* LdsDirectVMEMHazard + * Handle LDSDIR writing a VGPR after it's used by a VMEM/DS instruction. 
+ */ + if (instr->isVMEM() || instr->isFlatLike()) { + for (Definition& def : instr->definitions) /* cleared by vmcnt(0), like the operands of a load */ + fill_vgpr_bitset(ctx.vgpr_used_by_vmem_load, def.physReg(), def.bytes()); + if (instr->definitions.empty()) { /* no result: operands go to the set cleared by vscnt(0) */ + for (Operand& op : instr->operands) + fill_vgpr_bitset(ctx.vgpr_used_by_vmem_store, op.physReg(), op.bytes()); + } else { + for (Operand& op : instr->operands) + fill_vgpr_bitset(ctx.vgpr_used_by_vmem_load, op.physReg(), op.bytes()); + } + } + if (instr->isDS() || instr->isFlat()) { + for (Definition& def : instr->definitions) + fill_vgpr_bitset(ctx.vgpr_used_by_ds, def.physReg(), def.bytes()); + for (Operand& op : instr->operands) + fill_vgpr_bitset(ctx.vgpr_used_by_ds, op.physReg(), op.bytes()); + } + if (instr->isVALU() || instr->isVINTERP_INREG() || instr->isEXP() || + (instr->opcode == aco_opcode::s_waitcnt_depctr && ((instr->sopp().imm >> 2) & 0x7) == 0)) { + ctx.vgpr_used_by_vmem_load.reset(); /* these instructions clear every tracked use */ + ctx.vgpr_used_by_vmem_store.reset(); + ctx.vgpr_used_by_ds.reset(); + } else if (instr->opcode == aco_opcode::s_waitcnt) { + wait_imm imm(GFX11, instr->sopp().imm); + if (imm.vm == 0) + ctx.vgpr_used_by_vmem_load.reset(); + if (imm.lgkm == 0) + ctx.vgpr_used_by_ds.reset(); + } else if (instr->opcode == aco_opcode::s_waitcnt_vscnt && instr->sopk().imm == 0) { + ctx.vgpr_used_by_vmem_store.reset(); + } + if (instr->isLDSDIR()) { /* wait only if the destination VGPR is still tracked */ + if (ctx.vgpr_used_by_vmem_load[instr->definitions[0].physReg().reg() - 256] || + ctx.vgpr_used_by_vmem_store[instr->definitions[0].physReg().reg() - 256] || + ctx.vgpr_used_by_ds[instr->definitions[0].physReg().reg() - 256]) { + bld.sopp(aco_opcode::s_waitcnt_depctr, -1, 0xffe3); + ctx.vgpr_used_by_vmem_load.reset(); + ctx.vgpr_used_by_vmem_store.reset(); + ctx.vgpr_used_by_ds.reset(); + } + } } template diff --git a/src/amd/compiler/tests/test_insert_nops.cpp b/src/amd/compiler/tests/test_insert_nops.cpp index 8bec022..80ed4b7 100644 --- a/src/amd/compiler/tests/test_insert_nops.cpp +++ b/src/amd/compiler/tests/test_insert_nops.cpp 
@@ -25,10 +25,10 @@ using namespace aco; -void create_mubuf(unsigned offset) +void create_mubuf(unsigned offset, PhysReg dst=PhysReg(256), PhysReg vaddr=PhysReg(256)) { - bld.mubuf(aco_opcode::buffer_load_dword, Definition(PhysReg(256), v1), Operand(PhysReg(0), s4), - Operand(PhysReg(256), v1), Operand::zero(), offset, true); + bld.mubuf(aco_opcode::buffer_load_dword, Definition(dst, v1), Operand(PhysReg(0), s4), + Operand(vaddr, v1), Operand::zero(), offset, true); } void create_mubuf_store(PhysReg src=PhysReg(256)) @@ -432,3 +432,141 @@ BEGIN_TEST(insert_nops.lds_direct_valu) finish_insert_nops_test(); END_TEST + +BEGIN_TEST(insert_nops.lds_direct_vmem) + if (!setup_cs(NULL, GFX11)) + return; + + /* WaR: VMEM reads v[0] as its address before LDSDIR writes v[0] */ + //>> p_unit_test 0 + //! v1: %0:v[1] = buffer_load_dword %0:s[0-3], %0:v[0], 0 offen + //! s_waitcnt_depctr vm_vsrc(0) + //! v1: %0:v[0] = lds_direct_load %0:m0 + bld.pseudo(aco_opcode::p_unit_test, Operand::c32(0)); + create_mubuf(0, PhysReg(257)); + bld.ldsdir(aco_opcode::lds_direct_load, Definition(PhysReg(256), v1), Operand(m0, s1)); + + /* WaW: VMEM and LDSDIR both write v[0] */ + //! p_unit_test 1 + //! v1: %0:v[0] = buffer_load_dword %0:s[0-3], %0:v[1], 0 offen + //! s_waitcnt_depctr vm_vsrc(0) + //! v1: %0:v[0] = lds_direct_load %0:m0 + bld.pseudo(aco_opcode::p_unit_test, Operand::c32(1)); + create_mubuf(0, PhysReg(256), PhysReg(257)); + bld.ldsdir(aco_opcode::lds_direct_load, Definition(PhysReg(256), v1), Operand(m0, s1)); + + /* no hazard: VMEM only uses v[1], LDSDIR writes v[0] */ + //! p_unit_test 2 + //! v1: %0:v[1] = buffer_load_dword %0:s[0-3], %0:v[1], 0 offen + //! v1: %0:v[0] = lds_direct_load %0:m0 + bld.pseudo(aco_opcode::p_unit_test, Operand::c32(2)); + create_mubuf(0, PhysReg(257), PhysReg(257)); + bld.ldsdir(aco_opcode::lds_direct_load, Definition(PhysReg(256), v1), Operand(m0, s1)); + + /* no hazard: a VALU (v_nop) between VMEM and LDSDIR, no extra wait expected */ + //! p_unit_test 3 + //! v1: %0:v[1] = buffer_load_dword %0:s[0-3], %0:v[0], 0 offen + //! v_nop + //! 
v1: %0:v[0] = lds_direct_load %0:m0 + bld.pseudo(aco_opcode::p_unit_test, Operand::c32(3)); + create_mubuf(0, PhysReg(257)); + bld.vop1(aco_opcode::v_nop); + bld.ldsdir(aco_opcode::lds_direct_load, Definition(PhysReg(256), v1), Operand(m0, s1)); + + /* WaR: DS reads v[0] as its address before LDSDIR writes v[0] */ + //! p_unit_test 4 + //! v1: %0:v[1] = ds_read_b32 %0:v[0] + //! s_waitcnt_depctr vm_vsrc(0) + //! v1: %0:v[0] = lds_direct_load %0:m0 + bld.pseudo(aco_opcode::p_unit_test, Operand::c32(4)); + bld.ds(aco_opcode::ds_read_b32, Definition(PhysReg(257), v1), Operand(PhysReg(256), v1)); + bld.ldsdir(aco_opcode::lds_direct_load, Definition(PhysReg(256), v1), Operand(m0, s1)); + + /* WaW: DS and LDSDIR both write v[0] */ + //! p_unit_test 5 + //! v1: %0:v[0] = ds_read_b32 %0:v[1] + //! s_waitcnt_depctr vm_vsrc(0) + //! v1: %0:v[0] = lds_direct_load %0:m0 + bld.pseudo(aco_opcode::p_unit_test, Operand::c32(5)); + bld.ds(aco_opcode::ds_read_b32, Definition(PhysReg(256), v1), Operand(PhysReg(257), v1)); + bld.ldsdir(aco_opcode::lds_direct_load, Definition(PhysReg(256), v1), Operand(m0, s1)); + + /* no hazard: DS only uses v[1], LDSDIR writes v[0] */ + //! p_unit_test 6 + //! v1: %0:v[1] = ds_read_b32 %0:v[1] + //! v1: %0:v[0] = lds_direct_load %0:m0 + bld.pseudo(aco_opcode::p_unit_test, Operand::c32(6)); + bld.ds(aco_opcode::ds_read_b32, Definition(PhysReg(257), v1), Operand(PhysReg(257), v1)); + bld.ldsdir(aco_opcode::lds_direct_load, Definition(PhysReg(256), v1), Operand(m0, s1)); + + /* no hazard: a VALU (v_nop) between DS and LDSDIR, no extra wait expected */ + //! p_unit_test 7 + //! v1: %0:v[1] = ds_read_b32 %0:v[0] + //! v_nop + //! v1: %0:v[0] = lds_direct_load %0:m0 + bld.pseudo(aco_opcode::p_unit_test, Operand::c32(7)); + bld.ds(aco_opcode::ds_read_b32, Definition(PhysReg(257), v1), Operand(PhysReg(256), v1)); + bld.vop1(aco_opcode::v_nop); + bld.ldsdir(aco_opcode::lds_direct_load, Definition(PhysReg(256), v1), Operand(m0, s1)); + + /* no hazard with the matching wait: vmcnt (load), vscnt (store), lgkmcnt (DS) */ + //! p_unit_test 8 + //! v1: %0:v[1] = buffer_load_dword %0:s[0-3], %0:v[0], 0 offen + //! 
s_waitcnt vmcnt(0) + //! v1: %0:v[0] = lds_direct_load %0:m0 + bld.pseudo(aco_opcode::p_unit_test, Operand::c32(8)); + create_mubuf(0, PhysReg(257)); + bld.sopp(aco_opcode::s_waitcnt, -1, 0x3ff); + bld.ldsdir(aco_opcode::lds_direct_load, Definition(PhysReg(256), v1), Operand(m0, s1)); + + //! p_unit_test 9 + //! buffer_store_dword %0:s[0-3], %0:v[0], 0, %0:v[0] offen + //! s1: %0:null = s_waitcnt_vscnt imm:0 + //! v1: %0:v[0] = lds_direct_load %0:m0 + bld.pseudo(aco_opcode::p_unit_test, Operand::c32(9)); + create_mubuf_store(); + bld.sopk(aco_opcode::s_waitcnt_vscnt, Definition(sgpr_null, s1), 0); + bld.ldsdir(aco_opcode::lds_direct_load, Definition(PhysReg(256), v1), Operand(m0, s1)); + + //! p_unit_test 10 + //! v1: %0:v[1] = ds_read_b32 %0:v[0] + //! s_waitcnt lgkmcnt(0) + //! v1: %0:v[0] = lds_direct_load %0:m0 + bld.pseudo(aco_opcode::p_unit_test, Operand::c32(10)); + bld.ds(aco_opcode::ds_read_b32, Definition(PhysReg(257), v1), Operand(PhysReg(256), v1)); + bld.sopp(aco_opcode::s_waitcnt, -1, 0xfc0f); + bld.ldsdir(aco_opcode::lds_direct_load, Definition(PhysReg(256), v1), Operand(m0, s1)); + + /* a wait on a different counter in-between does not remove the hazard */ + //! p_unit_test 11 + //! v1: %0:v[1] = buffer_load_dword %0:s[0-3], %0:v[0], 0 offen + //! s1: %0:null = s_waitcnt_vscnt imm:0 + //! s_waitcnt_depctr vm_vsrc(0) + //! v1: %0:v[0] = lds_direct_load %0:m0 + bld.pseudo(aco_opcode::p_unit_test, Operand::c32(11)); + create_mubuf(0, PhysReg(257)); + bld.sopk(aco_opcode::s_waitcnt_vscnt, Definition(sgpr_null, s1), 0); + bld.ldsdir(aco_opcode::lds_direct_load, Definition(PhysReg(256), v1), Operand(m0, s1)); + + //! p_unit_test 12 + //! buffer_store_dword %0:s[0-3], %0:v[0], 0, %0:v[0] offen + //! s_waitcnt lgkmcnt(0) + //! s_waitcnt_depctr vm_vsrc(0) + //! 
v1: %0:v[0] = lds_direct_load %0:m0 + bld.pseudo(aco_opcode::p_unit_test, Operand::c32(12)); + create_mubuf_store(); + bld.sopp(aco_opcode::s_waitcnt, -1, 0xfc0f); + bld.ldsdir(aco_opcode::lds_direct_load, Definition(PhysReg(256), v1), Operand(m0, s1)); + + //! p_unit_test 13 + //! v1: %0:v[1] = ds_read_b32 %0:v[0] + //! s_waitcnt vmcnt(0) + //! s_waitcnt_depctr vm_vsrc(0) + //! v1: %0:v[0] = lds_direct_load %0:m0 + bld.pseudo(aco_opcode::p_unit_test, Operand::c32(13)); + bld.ds(aco_opcode::ds_read_b32, Definition(PhysReg(257), v1), Operand(PhysReg(256), v1)); + bld.sopp(aco_opcode::s_waitcnt, -1, 0x3ff); + bld.ldsdir(aco_opcode::lds_direct_load, Definition(PhysReg(256), v1), Operand(m0, s1)); + + finish_insert_nops_test(); +END_TEST