From 6f368431fd75e83eb47bb3ece58d0de81ba494b4 Mon Sep 17 00:00:00 2001 From: Rhys Perry Date: Tue, 27 Sep 2022 15:37:11 +0100 Subject: [PATCH] aco/gfx11: workaround VALUMaskWriteHazard MIME-Version: 1.0 Content-Type: text/plain; charset=utf8 Content-Transfer-Encoding: 8bit fossil-db (gfx1100): Totals from 62812 (46.52% of 135032) affected shaders: Instrs: 43971580 -> 44069887 (+0.22%) CodeSize: 233473420 -> 233866648 (+0.17%) Latency: 463487489 -> 463520688 (+0.01%); split: -0.00%, +0.01% InvThroughput: 86505748 -> 86509679 (+0.00%); split: -0.00%, +0.00% Signed-off-by: Rhys Perry Reviewed-by: Daniel Schürmann Part-of: --- src/amd/compiler/README-ISA.md | 8 ++++ src/amd/compiler/aco_insert_NOPs.cpp | 67 +++++++++++++++++++++++++++- src/amd/compiler/tests/test_insert_nops.cpp | 69 +++++++++++++++++++++++++++++ 3 files changed, 143 insertions(+), 1 deletion(-) diff --git a/src/amd/compiler/README-ISA.md b/src/amd/compiler/README-ISA.md index d59ca9f..f0cdbf8 100644 --- a/src/amd/compiler/README-ISA.md +++ b/src/amd/compiler/README-ISA.md @@ -330,3 +330,11 @@ VALU between the second VGPR write and the current instruction. Mitigated by: A va_vdst=0 wait: `s_waitcnt_deptr 0x0fff` + +### VALUMaskWriteHazard + +Triggered by: +SALU writing then reading a SGPR that was previously used as a lane mask for a VALU. + +Mitigated by: +A VALU instruction reading a SGPR or with literal, or a sa_sdst=0 wait: `s_waitcnt_depctr 0xfffe` diff --git a/src/amd/compiler/aco_insert_NOPs.cpp b/src/amd/compiler/aco_insert_NOPs.cpp index f4ce028..2c460e5 100644 --- a/src/amd/compiler/aco_insert_NOPs.cpp +++ b/src/amd/compiler/aco_insert_NOPs.cpp @@ -292,6 +292,10 @@ struct NOP_ctx_gfx11 { VGPRCounterMap<15> valu_since_wr_by_trans; VGPRCounterMap<2> trans_since_wr_by_trans; + /* VALUMaskWriteHazard */ + std::bitset<128> sgpr_read_by_valu_as_lanemask; + std::bitset<128> sgpr_read_by_valu_as_lanemask_then_wr_by_salu; + void join(const NOP_ctx_gfx11& other) { has_Vcmpx |= other.has_Vcmpx; @@ -300,6 +304,9 @@ struct NOP_ctx_gfx11 { vgpr_used_by_ds |= other.vgpr_used_by_ds; valu_since_wr_by_trans.join_min(other.valu_since_wr_by_trans); trans_since_wr_by_trans.join_min(other.trans_since_wr_by_trans); + sgpr_read_by_valu_as_lanemask |= other.sgpr_read_by_valu_as_lanemask; + sgpr_read_by_valu_as_lanemask_then_wr_by_salu |= + other.sgpr_read_by_valu_as_lanemask_then_wr_by_salu; } bool operator==(const NOP_ctx_gfx11& other) @@ -309,7 +316,10 @@ struct NOP_ctx_gfx11 { vgpr_used_by_vmem_store == other.vgpr_used_by_vmem_store && vgpr_used_by_ds == other.vgpr_used_by_ds && valu_since_wr_by_trans == other.valu_since_wr_by_trans && - trans_since_wr_by_trans == other.trans_since_wr_by_trans; + trans_since_wr_by_trans == other.trans_since_wr_by_trans && + sgpr_read_by_valu_as_lanemask == other.sgpr_read_by_valu_as_lanemask && + sgpr_read_by_valu_as_lanemask_then_wr_by_salu == + other.sgpr_read_by_valu_as_lanemask_then_wr_by_salu; } }; @@ -729,6 +739,24 @@ check_written_regs(const aco_ptr& instr, const std::bitset& chec } template +bool +check_read_regs(const aco_ptr& instr, const std::bitset& check_regs) +{ + return std::any_of(instr->operands.begin(), instr->operands.end(), + [&check_regs](const Operand& op) -> bool + { + if (op.isConstant()) + return false; + bool writes_any = false; + for (unsigned i = 0; i < op.size(); i++) { + unsigned op_reg = op.physReg() + i; + writes_any |= op_reg < check_regs.size() && check_regs[op_reg]; + } + return writes_any; + }); +} + +template void mark_read_regs(const aco_ptr& instr, std::bitset& reg_reads) { @@ -1265,6 +1293,21 @@ handle_instruction_gfx11(State& state, NOP_ctx_gfx11& ctx, aco_ptr& va_vdst = 0; } + /* VALUMaskWriteHazard + * VALU reads SGPR as a lane mask and later written by SALU cannot safely be read by SALU. + */ + if (state.program->wave_size == 64 && instr->isSALU() && + check_written_regs(instr, ctx.sgpr_read_by_valu_as_lanemask)) { + ctx.sgpr_read_by_valu_as_lanemask_then_wr_by_salu = ctx.sgpr_read_by_valu_as_lanemask; + ctx.sgpr_read_by_valu_as_lanemask.reset(); + } else if (state.program->wave_size == 64 && instr->isSALU() && + check_read_regs(instr, ctx.sgpr_read_by_valu_as_lanemask_then_wr_by_salu)) { + bld.sopp(aco_opcode::s_waitcnt_depctr, -1, 0xfffe); + ctx.sgpr_read_by_valu_as_lanemask_then_wr_by_salu.reset(); + } else if (instr->opcode == aco_opcode::s_waitcnt_depctr && (instr->sopp().imm & 0x1) == 0) { + ctx.sgpr_read_by_valu_as_lanemask_then_wr_by_salu.reset(); + } + va_vdst = std::min(va_vdst, parse_vdst_wait(instr)); if (va_vdst == 0) { ctx.valu_since_wr_by_trans.reset(); @@ -1286,6 +1329,28 @@ handle_instruction_gfx11(State& state, NOP_ctx_gfx11& ctx, aco_ptr& ctx.trans_since_wr_by_trans.set(def.physReg(), def.bytes()); } } + + if (state.program->wave_size == 64) { + for (Operand& op : instr->operands) { + if (op.isLiteral() || (!op.isConstant() && op.physReg().reg() < 128)) + ctx.sgpr_read_by_valu_as_lanemask.reset(); + } + switch (instr->opcode) { + case aco_opcode::v_addc_co_u32: + case aco_opcode::v_subb_co_u32: + case aco_opcode::v_subbrev_co_u32: + case aco_opcode::v_cndmask_b16: + case aco_opcode::v_cndmask_b32: + case aco_opcode::v_div_fmas_f32: + case aco_opcode::v_div_fmas_f64: + if (instr->operands.back().physReg() != exec) { + ctx.sgpr_read_by_valu_as_lanemask.set(instr->operands.back().physReg().reg()); + ctx.sgpr_read_by_valu_as_lanemask.set(instr->operands.back().physReg().reg() + 1); + } + break; + default: break; + } + } } /* LdsDirectVMEMHazard diff --git a/src/amd/compiler/tests/test_insert_nops.cpp b/src/amd/compiler/tests/test_insert_nops.cpp index bedf267..8fc3a18 100644 --- a/src/amd/compiler/tests/test_insert_nops.cpp +++ b/src/amd/compiler/tests/test_insert_nops.cpp @@ -920,3 +920,72 @@ BEGIN_TEST(insert_nops.valu_partial_forwarding.control_flow) finish_insert_nops_test(); END_TEST + +BEGIN_TEST(insert_nops.valu_mask_write) + if (!setup_cs(NULL, GFX11)) + return; + + /* Basic case. */ + //>> p_unit_test 0 + //! v1: %0:v[0] = v_cndmask_b32 0, 0, %0:s[0-1] + //! s1: %0:s[1] = s_mov_b32 0 + //! s_waitcnt_depctr sa_sdst(0) + //! s1: %0:s[2] = s_mov_b32 %0:s[1] + bld.pseudo(aco_opcode::p_unit_test, Operand::c32(0)); + bld.vop2_e64(aco_opcode::v_cndmask_b32, Definition(PhysReg(256), v1), + Operand::zero(), Operand::zero(), Operand(PhysReg(0), s2)); + bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg(1), s1), Operand::zero()); + bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg(2), s1), Operand(PhysReg(1), s1)); + + /* Mitigation. */ + //! p_unit_test 1 + //! v1: %0:v[0] = v_cndmask_b32 0, 0, %0:s[0-1] + //! v1: %0:v[1] = v_mov_b32 %0:s[1] + //! s1: %0:s[1] = s_mov_b32 0 + //! s1: %0:s[2] = s_mov_b32 %0:s[1] + bld.pseudo(aco_opcode::p_unit_test, Operand::c32(1)); + bld.vop2_e64(aco_opcode::v_cndmask_b32, Definition(PhysReg(256), v1), + Operand::zero(), Operand::zero(), Operand(PhysReg(0), s2)); + bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(257), v1), Operand(PhysReg(1), s1)); + bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg(1), s1), Operand::zero()); + bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg(2), s1), Operand(PhysReg(1), s1)); + + //! p_unit_test 2 + //! v1: %0:v[0] = v_cndmask_b32 0, 0, %0:s[0-1] + //! s1: %0:s[1] = s_mov_b32 0 + //! s_waitcnt_depctr sa_sdst(0) + //! s1: %0:s[2] = s_mov_b32 %0:s[1] + //! s1: %0:s[2] = s_mov_b32 %0:s[1] + bld.pseudo(aco_opcode::p_unit_test, Operand::c32(2)); + bld.vop2_e64(aco_opcode::v_cndmask_b32, Definition(PhysReg(256), v1), + Operand::zero(), Operand::zero(), Operand(PhysReg(0), s2)); + bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg(1), s1), Operand::zero()); + bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg(2), s1), Operand(PhysReg(1), s1)); + bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg(2), s1), Operand(PhysReg(1), s1)); + + //! p_unit_test 3 + //! v1: %0:v[0] = v_cndmask_b32 0, 0, %0:s[0-1] + //! s1: %0:s[1] = s_mov_b32 0 + //! s_waitcnt_depctr sa_sdst(0) + //! s1: %0:s[2] = s_mov_b32 %0:s[1] + bld.pseudo(aco_opcode::p_unit_test, Operand::c32(3)); + bld.vop2_e64(aco_opcode::v_cndmask_b32, Definition(PhysReg(256), v1), + Operand::zero(), Operand::zero(), Operand(PhysReg(0), s2)); + bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg(1), s1), Operand::zero()); + bld.sopp(aco_opcode::s_waitcnt_depctr, -1, 0xfffe); + bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg(2), s1), Operand(PhysReg(1), s1)); + + /* Instruction which is both involved in the hazard and is a mitigation. */ + //! p_unit_test 4 + //! v1: %0:v[0] = v_cndmask_b32 %0:s[2], 0, %0:s[0-1] + //! s1: %0:s[1] = s_mov_b32 0 + //! s_waitcnt_depctr sa_sdst(0) + //! s1: %0:s[2] = s_mov_b32 %0:s[1] + bld.pseudo(aco_opcode::p_unit_test, Operand::c32(4)); + bld.vop2_e64(aco_opcode::v_cndmask_b32, Definition(PhysReg(256), v1), + Operand(PhysReg(2), s1), Operand::zero(), Operand(PhysReg(0), s2)); + bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg(1), s1), Operand::zero()); + bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg(2), s1), Operand(PhysReg(1), s1)); + + finish_insert_nops_test(); +END_TEST -- 2.7.4