From b0b48b768785fc69f23fb115641c4a4a7a941099 Mon Sep 17 00:00:00 2001 From: Rhys Perry Date: Thu, 13 Oct 2022 17:55:57 +0100 Subject: [PATCH] aco/gfx11: workaround VALUTransUseHazard MIME-Version: 1.0 Content-Type: text/plain; charset=utf8 Content-Transfer-Encoding: 8bit fossil-db (gfx1100): Totals from 116990 (86.64% of 135032) affected shaders: Instrs: 67942325 -> 69493991 (+2.28%) CodeSize: 366448984 -> 372655648 (+1.69%) Latency: 673236871 -> 673269808 (+0.00%); split: -0.00%, +0.00% InvThroughput: 128266905 -> 128270175 (+0.00%); split: -0.00%, +0.00% Signed-off-by: Rhys Perry Reviewed-by: Daniel Schürmann Part-of: --- src/amd/compiler/README-ISA.md | 9 ++ src/amd/compiler/aco_insert_NOPs.cpp | 140 +++++++++++++++++++++++++++- src/amd/compiler/tests/test_insert_nops.cpp | 84 +++++++++++++++++ 3 files changed, 232 insertions(+), 1 deletion(-) diff --git a/src/amd/compiler/README-ISA.md b/src/amd/compiler/README-ISA.md index ad85160..518946e 100644 --- a/src/amd/compiler/README-ISA.md +++ b/src/amd/compiler/README-ISA.md @@ -311,3 +311,12 @@ LDSDIR instruction writing a VGPR after it's used by a VMEM/DS instruction. Mitigated by: Waiting for the VMEM/DS instruction to finish, a VALU or export instruction, or `s_waitcnt_depctr 0xffe3`. + +### VALUTransUseHazard + +Triggered by: +A VALU instrction reading a VGPR written by a transcendental VALU instruction without 6+ VALU or 2+ +transcendental instructions in-between. + +Mitigated by: +A va_vdst=0 wait: `s_waitcnt_deptr 0x0fff` diff --git a/src/amd/compiler/aco_insert_NOPs.cpp b/src/amd/compiler/aco_insert_NOPs.cpp index d8f5405..bc956a0 100644 --- a/src/amd/compiler/aco_insert_NOPs.cpp +++ b/src/amd/compiler/aco_insert_NOPs.cpp @@ -194,6 +194,91 @@ struct NOP_ctx_gfx10 { } }; +template struct VGPRCounterMap { +public: + int base = 0; + BITSET_DECLARE(resident, 256); + int val[256]; + + /* Initializes all counters to Max. */ + VGPRCounterMap() { BITSET_ZERO(resident); } + + /* Increase all counters, clamping at Max. */ + void inc() { base++; } + + /* Set counter to 0. */ + void set(unsigned idx) + { + val[idx] = -base; + BITSET_SET(resident, idx); + } + + void set(PhysReg reg, unsigned bytes) + { + if (reg.reg() < 256) + return; + + for (unsigned i = 0; i < DIV_ROUND_UP(bytes, 4); i++) + set(reg.reg() - 256 + i); + } + + /* Reset all counters to Max. */ + void reset() + { + base = 0; + BITSET_ZERO(resident); + } + + void reset(PhysReg reg, unsigned bytes) + { + if (reg.reg() < 256) + return; + + for (unsigned i = 0; i < DIV_ROUND_UP(bytes, 4); i++) + BITSET_CLEAR(resident, reg.reg() - 256 + i); + } + + uint8_t get(unsigned idx) + { + return BITSET_TEST(resident, idx) ? MIN2(val[idx] + base, Max) : Max; + } + + uint8_t get(PhysReg reg, unsigned offset = 0) + { + assert(reg.reg() >= 256); + return get(reg.reg() - 256 + offset); + } + + void join_min(const VGPRCounterMap& other) + { + unsigned i; + BITSET_FOREACH_SET(i, other.resident, 256) + { + if (BITSET_TEST(resident, i)) + val[i] = MIN2(val[i] + base, other.val[i] + other.base) - base; + else + val[i] = other.val[i] + other.base - base; + } + BITSET_OR(resident, resident, other.resident); + } + + bool operator==(const VGPRCounterMap& other) const + { + if (!BITSET_EQUAL(resident, other.resident)) + return false; + + unsigned i; + BITSET_FOREACH_SET(i, other.resident, 256) + { + if (!BITSET_TEST(resident, i)) + return false; + if (val[i] + base != other.val[i] + other.base) + return false; + } + return true; + } +}; + struct NOP_ctx_gfx11 { /* VcmpxPermlaneHazard */ bool has_Vcmpx = false; @@ -203,12 +288,18 @@ struct NOP_ctx_gfx11 { std::bitset<256> vgpr_used_by_vmem_store; std::bitset<256> vgpr_used_by_ds; + /* VALUTransUseHazard */ + VGPRCounterMap<15> valu_since_wr_by_trans; + VGPRCounterMap<2> trans_since_wr_by_trans; + void join(const NOP_ctx_gfx11& other) { has_Vcmpx |= other.has_Vcmpx; vgpr_used_by_vmem_load |= other.vgpr_used_by_vmem_load; vgpr_used_by_vmem_store |= other.vgpr_used_by_vmem_store; vgpr_used_by_ds |= other.vgpr_used_by_ds; + valu_since_wr_by_trans.join_min(other.valu_since_wr_by_trans); + trans_since_wr_by_trans.join_min(other.trans_since_wr_by_trans); } bool operator==(const NOP_ctx_gfx11& other) @@ -216,7 +307,9 @@ struct NOP_ctx_gfx11 { return has_Vcmpx == other.has_Vcmpx && vgpr_used_by_vmem_load == other.vgpr_used_by_vmem_load && vgpr_used_by_vmem_store == other.vgpr_used_by_vmem_store && - vgpr_used_by_ds == other.vgpr_used_by_ds; + vgpr_used_by_ds == other.vgpr_used_by_ds && + valu_since_wr_by_trans == other.valu_since_wr_by_trans && + trans_since_wr_by_trans == other.trans_since_wr_by_trans; } }; @@ -1010,6 +1103,51 @@ handle_instruction_gfx11(State& state, NOP_ctx_gfx11& ctx, aco_ptr& ldsdir->wait_vdst = MIN2(ldsdir->wait_vdst, count); } + /* VALUTransUseHazard + * VALU reads VGPR written by transcendental instruction without 6+ VALU or 2+ transcendental + * in-between. + */ + unsigned va_vdst = 15; + if (instr->isVALU() || instr->isVINTERP_INREG()) { + uint8_t num_valu = 15; + uint8_t num_trans = 15; + for (Operand& op : instr->operands) { + if (op.physReg().reg() < 256) + continue; + for (unsigned i = 0; i < op.size(); i++) { + num_valu = std::min(num_valu, ctx.valu_since_wr_by_trans.get(op.physReg(), i)); + num_trans = std::min(num_trans, ctx.trans_since_wr_by_trans.get(op.physReg(), i)); + } + } + if (num_trans <= 1 && num_valu <= 5) { + bld.sopp(aco_opcode::s_waitcnt_depctr, -1, 0x0fff); + va_vdst = 0; + } + } + + va_vdst = std::min(va_vdst, parse_vdst_wait(instr)); + if (va_vdst == 0) { + ctx.valu_since_wr_by_trans.reset(); + ctx.trans_since_wr_by_trans.reset(); + } + + if (instr->isVALU() || instr->isVINTERP_INREG()) { + instr_class cls = instr_info.classes[(int)instr->opcode]; + bool is_trans = cls == instr_class::valu_transcendental32 || + cls == instr_class::valu_double_transcendental; + + ctx.valu_since_wr_by_trans.inc(); + if (is_trans) + ctx.trans_since_wr_by_trans.inc(); + + if (is_trans) { + for (Definition& def : instr->definitions) { + ctx.valu_since_wr_by_trans.set(def.physReg(), def.bytes()); + ctx.trans_since_wr_by_trans.set(def.physReg(), def.bytes()); + } + } + } + /* LdsDirectVMEMHazard * Handle LDSDIR writing a VGPR after it's used by a VMEM/DS instruction. */ diff --git a/src/amd/compiler/tests/test_insert_nops.cpp b/src/amd/compiler/tests/test_insert_nops.cpp index 80ed4b7..41f8b62 100644 --- a/src/amd/compiler/tests/test_insert_nops.cpp +++ b/src/amd/compiler/tests/test_insert_nops.cpp @@ -570,3 +570,87 @@ BEGIN_TEST(insert_nops.lds_direct_vmem) finish_insert_nops_test(); END_TEST + +BEGIN_TEST(insert_nops.valu_trans_use) + if (!setup_cs(NULL, GFX11)) + return; + + //>> p_unit_test 0 + //! v1: %0:v[0] = v_rcp_f32 %0:v[1] + //! s_waitcnt_depctr va_vdst(0) + //! v1: %0:v[1] = v_mov_b32 %0:v[0] + bld.pseudo(aco_opcode::p_unit_test, Operand::c32(0)); + bld.vop1(aco_opcode::v_rcp_f32, Definition(PhysReg(256), v1), Operand(PhysReg(257), v1)); + bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(257), v1), Operand(PhysReg(256), v1)); + + /* Sufficient VALU mitigates the hazard. */ + //! p_unit_test 1 + //! v1: %0:v[0] = v_rcp_f32 %0:v[1] + //; for i in range(4): insert_pattern('v_nop') + //! s_waitcnt_depctr va_vdst(0) + //! v1: %0:v[1] = v_mov_b32 %0:v[0] + bld.pseudo(aco_opcode::p_unit_test, Operand::c32(1)); + bld.vop1(aco_opcode::v_rcp_f32, Definition(PhysReg(256), v1), Operand(PhysReg(257), v1)); + for (unsigned i = 0; i < 4; i++) + bld.vop1(aco_opcode::v_nop); + bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(257), v1), Operand(PhysReg(256), v1)); + + //! p_unit_test 2 + //! v1: %0:v[0] = v_rcp_f32 %0:v[1] + //; for i in range(8): insert_pattern('v_nop') + //! v1: %0:v[1] = v_mov_b32 %0:v[0] + bld.pseudo(aco_opcode::p_unit_test, Operand::c32(2)); + bld.vop1(aco_opcode::v_rcp_f32, Definition(PhysReg(256), v1), Operand(PhysReg(257), v1)); + for (unsigned i = 0; i < 8; i++) + bld.vop1(aco_opcode::v_nop); + bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(257), v1), Operand(PhysReg(256), v1)); + + /* Sufficient transcendental VALU mitigates the hazard. */ + //! p_unit_test 3 + //! v1: %0:v[0] = v_rcp_f32 %0:v[1] + //! v1: %0:v[2] = v_sqrt_f32 %0:v[3] + //! s_waitcnt_depctr va_vdst(0) + //! v1: %0:v[1] = v_mov_b32 %0:v[0] + bld.pseudo(aco_opcode::p_unit_test, Operand::c32(3)); + bld.vop1(aco_opcode::v_rcp_f32, Definition(PhysReg(256), v1), Operand(PhysReg(257), v1)); + bld.vop1(aco_opcode::v_sqrt_f32, Definition(PhysReg(258), v1), Operand(PhysReg(259), v1)); + bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(257), v1), Operand(PhysReg(256), v1)); + + //! p_unit_test 4 + //! v1: %0:v[0] = v_rcp_f32 %0:v[1] + //! v1: %0:v[2] = v_sqrt_f32 %0:v[3] + //! v1: %0:v[2] = v_sqrt_f32 %0:v[3] + //! v1: %0:v[1] = v_mov_b32 %0:v[0] + bld.pseudo(aco_opcode::p_unit_test, Operand::c32(4)); + bld.vop1(aco_opcode::v_rcp_f32, Definition(PhysReg(256), v1), Operand(PhysReg(257), v1)); + for (unsigned i = 0; i < 2; i++) + bld.vop1(aco_opcode::v_sqrt_f32, Definition(PhysReg(258), v1), Operand(PhysReg(259), v1)); + bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(257), v1), Operand(PhysReg(256), v1)); + + /* Transcendental VALU should be counted towards VALU */ + //! p_unit_test 5 + //! v1: %0:v[0] = v_rcp_f32 %0:v[1] + //; for i in range(5): insert_pattern('v_nop') + //! v1: %0:v[2] = v_sqrt_f32 %0:v[3] + //! v1: %0:v[1] = v_mov_b32 %0:v[0] + bld.pseudo(aco_opcode::p_unit_test, Operand::c32(5)); + bld.vop1(aco_opcode::v_rcp_f32, Definition(PhysReg(256), v1), Operand(PhysReg(257), v1)); + for (unsigned i = 0; i < 5; i++) + bld.vop1(aco_opcode::v_nop); + bld.vop1(aco_opcode::v_sqrt_f32, Definition(PhysReg(258), v1), Operand(PhysReg(259), v1)); + bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(257), v1), Operand(PhysReg(256), v1)); + + /* non-VALU does not mitigate the hazard. */ + //! p_unit_test 6 + //! v1: %0:v[0] = v_rcp_f32 %0:v[1] + //; for i in range(8): insert_pattern('s_nop') + //! s_waitcnt_depctr va_vdst(0) + //! v1: %0:v[1] = v_mov_b32 %0:v[0] + bld.pseudo(aco_opcode::p_unit_test, Operand::c32(6)); + bld.vop1(aco_opcode::v_rcp_f32, Definition(PhysReg(256), v1), Operand(PhysReg(257), v1)); + for (unsigned i = 0; i < 8; i++) + bld.sopp(aco_opcode::s_nop, -1, 0); + bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(257), v1), Operand(PhysReg(256), v1)); + + finish_insert_nops_test(); +END_TEST -- 2.7.4