From 2bd16256a6a8f830dc43aa7224879d11edb9583a Mon Sep 17 00:00:00 2001 From: Rhys Perry Date: Thu, 25 Aug 2022 20:04:01 +0100 Subject: [PATCH] aco: fix VMEMtoScalarWriteHazard s_waitcnt mitigation MIME-Version: 1.0 Content-Type: text/plain; charset=utf8 Content-Transfer-Encoding: 8bit It doesn't make sense for a "s_waitcnt vmcnt(0)" to affect a store or DS instruction. LLVM checks for "s_waitcnt vmcnt(0) lgkmcnt(0) expcnt(0)" but ignores s_waitcnt_vscnt (which I assume is a bug). No fossil-db changes. Signed-off-by: Rhys Perry Reviewed-by: Daniel Schürmann Fixes: bcf94bb933e ("aco: properly recognize that s_waitcnt mitigates VMEMtoScalarWriteHazard") Part-of: --- src/amd/compiler/README-ISA.md | 2 +- src/amd/compiler/aco_insert_NOPs.cpp | 51 ++++++++++++++++++++++++++---------- 2 files changed, 38 insertions(+), 15 deletions(-) diff --git a/src/amd/compiler/README-ISA.md b/src/amd/compiler/README-ISA.md index 00bb15e..f247436 100644 --- a/src/amd/compiler/README-ISA.md +++ b/src/amd/compiler/README-ISA.md @@ -220,7 +220,7 @@ VMEM/FLAT/GLOBAL/SCRATCH/DS instruction reads an SGPR (or EXEC, or M0). Then, a SALU/SMEM instruction writes the same SGPR. Mitigated by: -A VALU instruction or an `s_waitcnt vmcnt(0)` between the two instructions. +A VALU instruction or an `s_waitcnt` between the two instructions. ### SMEMtoVectorWriteHazard diff --git a/src/amd/compiler/aco_insert_NOPs.cpp b/src/amd/compiler/aco_insert_NOPs.cpp index dab408e..0742190 100644 --- a/src/amd/compiler/aco_insert_NOPs.cpp +++ b/src/amd/compiler/aco_insert_NOPs.cpp @@ -159,6 +159,8 @@ struct NOP_ctx_gfx10 { bool has_NSA_MIMG = false; bool has_writelane = false; std::bitset<128> sgprs_read_by_VMEM; + std::bitset<128> sgprs_read_by_VMEM_store; + std::bitset<128> sgprs_read_by_DS; std::bitset<128> sgprs_read_by_SMEM; void join(const NOP_ctx_gfx10& other) @@ -172,6 +174,8 @@ struct NOP_ctx_gfx10 { has_NSA_MIMG |= other.has_NSA_MIMG; has_writelane |= other.has_writelane; sgprs_read_by_VMEM |= other.sgprs_read_by_VMEM; + sgprs_read_by_DS |= other.sgprs_read_by_DS; + sgprs_read_by_VMEM_store |= other.sgprs_read_by_VMEM_store; sgprs_read_by_SMEM |= other.sgprs_read_by_SMEM; } @@ -183,6 +187,8 @@ struct NOP_ctx_gfx10 { has_branch_after_DS == other.has_branch_after_DS && has_NSA_MIMG == other.has_NSA_MIMG && has_writelane == other.has_writelane && sgprs_read_by_VMEM == other.sgprs_read_by_VMEM && + sgprs_read_by_DS == other.sgprs_read_by_DS && + sgprs_read_by_VMEM_store == other.sgprs_read_by_VMEM_store && sgprs_read_by_SMEM == other.sgprs_read_by_SMEM; } }; @@ -583,6 +589,16 @@ mark_read_regs(const aco_ptr& instr, std::bitset& reg_reads) } } +template +void +mark_read_regs_exec(State& state, const aco_ptr& instr, std::bitset& reg_reads) +{ + mark_read_regs(instr, reg_reads); + reg_reads.set(exec); + if (state.program->wave_size == 64) + reg_reads.set(exec_hi); +} + bool VALU_writes_sgpr(aco_ptr& instr) { @@ -641,31 +657,36 @@ handle_instruction_gfx10(State& state, NOP_ctx_gfx10& ctx, aco_ptr& Builder bld(state.program, &new_instructions); /* VMEMtoScalarWriteHazard - * Handle EXEC/M0/SGPR write following a VMEM instruction without a VALU or "waitcnt vmcnt(0)" + * Handle EXEC/M0/SGPR write following a VMEM/DS instruction without a VALU or "waitcnt vmcnt(0)" * in-between. */ if (instr->isVMEM() || instr->isFlatLike() || instr->isDS()) { - /* Remember all SGPRs that are read by the VMEM instruction */ - mark_read_regs(instr, ctx.sgprs_read_by_VMEM); - ctx.sgprs_read_by_VMEM.set(exec); - if (state.program->wave_size == 64) - ctx.sgprs_read_by_VMEM.set(exec_hi); + /* Remember all SGPRs that are read by the VMEM/DS instruction */ + if (instr->isVMEM() || instr->isFlatLike()) + mark_read_regs_exec( + state, instr, + instr->definitions.empty() ? ctx.sgprs_read_by_VMEM_store : ctx.sgprs_read_by_VMEM); + if (instr->isFlat() || instr->isDS()) + mark_read_regs_exec(state, instr, ctx.sgprs_read_by_DS); } else if (instr->isSALU() || instr->isSMEM()) { if (instr->opcode == aco_opcode::s_waitcnt) { - /* Hazard is mitigated by "s_waitcnt vmcnt(0)" */ - uint16_t imm = instr->sopp().imm; - unsigned vmcnt = (imm & 0xF) | ((imm & (0x3 << 14)) >> 10); - if (vmcnt == 0) + wait_imm imm(state.program->gfx_level, instr->sopp().imm); + if (imm.vm == 0) ctx.sgprs_read_by_VMEM.reset(); - } else if (instr->opcode == aco_opcode::s_waitcnt_depctr) { + } else if (instr->opcode == aco_opcode::s_waitcnt_depctr && instr->sopp().imm == 0xffe3) { /* Hazard is mitigated by a s_waitcnt_depctr with a magic imm */ - if (instr->sopp().imm == 0xffe3) - ctx.sgprs_read_by_VMEM.reset(); + ctx.sgprs_read_by_VMEM.reset(); + ctx.sgprs_read_by_DS.reset(); + ctx.sgprs_read_by_VMEM_store.reset(); } /* Check if SALU writes an SGPR that was previously read by the VALU */ - if (check_written_regs(instr, ctx.sgprs_read_by_VMEM)) { + if (check_written_regs(instr, ctx.sgprs_read_by_VMEM) || + check_written_regs(instr, ctx.sgprs_read_by_DS) || + check_written_regs(instr, ctx.sgprs_read_by_VMEM_store)) { ctx.sgprs_read_by_VMEM.reset(); + ctx.sgprs_read_by_DS.reset(); + ctx.sgprs_read_by_VMEM_store.reset(); /* Insert s_waitcnt_depctr instruction with magic imm to mitigate the problem */ aco_ptr depctr{ @@ -677,6 +698,8 @@ handle_instruction_gfx10(State& state, NOP_ctx_gfx10& ctx, aco_ptr& } else if (instr->isVALU()) { /* Hazard is mitigated by any VALU instruction */ ctx.sgprs_read_by_VMEM.reset(); + ctx.sgprs_read_by_DS.reset(); + ctx.sgprs_read_by_VMEM_store.reset(); } /* VcmpxPermlaneHazard -- 2.7.4