From 194f3e4c69b10b9dee4d577ef02218bf37702860 Mon Sep 17 00:00:00 2001 From: Rhys Perry Date: Mon, 22 Feb 2021 11:12:15 +0000 Subject: [PATCH] aco: fix NSA MIMG followed by MUBUF/MTBUF MIME-Version: 1.0 Content-Type: text/plain; charset=utf8 Content-Transfer-Encoding: 8bit No fossil-db changes on GFX10. Signed-off-by: Rhys Perry Reviewed-by: Timur Kristóf Fixes: c353895c922 ("aco: use non-sequential addressing") Part-of: --- src/amd/compiler/aco_assembler.cpp | 21 ++++++++++++--------- src/amd/compiler/aco_insert_NOPs.cpp | 19 +++++++++++++++++++ src/amd/compiler/aco_ir.h | 2 ++ 3 files changed, 33 insertions(+), 9 deletions(-) diff --git a/src/amd/compiler/aco_assembler.cpp b/src/amd/compiler/aco_assembler.cpp index 994ed28..19b63d1 100644 --- a/src/amd/compiler/aco_assembler.cpp +++ b/src/amd/compiler/aco_assembler.cpp @@ -48,6 +48,15 @@ static uint32_t get_sdwa_sel(unsigned sel, PhysReg reg) return sel & sdwa_asuint; } +unsigned get_mimg_nsa_dwords(const Instruction *instr) { + unsigned addr_dwords = instr->operands.size() - 3; + for (unsigned i = 1; i < addr_dwords; i++) { + if (instr->operands[3 + i].physReg() != instr->operands[3].physReg().advance(i * 4)) + return DIV_ROUND_UP(addr_dwords - 1, 4); + } + return 0; +} + void emit_instruction(asm_context& ctx, std::vector<uint32_t>& out, Instruction* instr) { /* lower remaining pseudo-instructions */ @@ -412,14 +421,8 @@ void emit_instruction(asm_context& ctx, std::vector<uint32_t>& out, Instruction* break; } case Format::MIMG: { - unsigned use_nsa = false; - unsigned addr_dwords = instr->operands.size() - 3; - for (unsigned i = 1; i < addr_dwords; i++) { - if (instr->operands[3 + i].physReg() != instr->operands[3].physReg().advance(i * 4)) - use_nsa = true; - } - assert(!use_nsa || ctx.chip_class >= GFX10); - unsigned nsa_dwords = use_nsa ?
DIV_ROUND_UP(addr_dwords - 1, 4) : 0; + unsigned nsa_dwords = get_mimg_nsa_dwords(instr); + assert(!nsa_dwords || ctx.chip_class >= GFX10); MIMG_instruction& mimg = instr->mimg(); uint32_t encoding = (0b111100 << 26); @@ -463,7 +466,7 @@ void emit_instruction(asm_context& ctx, std::vector<uint32_t>& out, Instruction* if (nsa_dwords) { out.resize(out.size() + nsa_dwords); std::vector<uint32_t>::iterator nsa = std::prev(out.end(), nsa_dwords); - for (unsigned i = 0; i < addr_dwords - 1; i++) + for (unsigned i = 0; i < instr->operands.size() - 4u; i++) nsa[i / 4] |= (0xFF & instr->operands[4 + i].physReg().reg()) << (i % 4 * 8); } break; diff --git a/src/amd/compiler/aco_insert_NOPs.cpp b/src/amd/compiler/aco_insert_NOPs.cpp index b621316..15c41a7 100644 --- a/src/amd/compiler/aco_insert_NOPs.cpp +++ b/src/amd/compiler/aco_insert_NOPs.cpp @@ -25,6 +25,7 @@ #include #include "aco_ir.h" +#include "aco_builder.h" #include #include @@ -149,6 +150,7 @@ struct NOP_ctx_gfx10 { bool has_branch_after_VMEM = false; bool has_DS = false; bool has_branch_after_DS = false; + bool has_NSA_MIMG = false; std::bitset<128> sgprs_read_by_VMEM; std::bitset<128> sgprs_read_by_SMEM; @@ -159,6 +161,7 @@ struct NOP_ctx_gfx10 { has_branch_after_VMEM |= other.has_branch_after_VMEM; has_DS |= other.has_DS; has_branch_after_DS |= other.has_branch_after_DS; + has_NSA_MIMG |= other.has_NSA_MIMG; sgprs_read_by_VMEM |= other.sgprs_read_by_VMEM; sgprs_read_by_SMEM |= other.sgprs_read_by_SMEM; } @@ -172,6 +175,7 @@ struct NOP_ctx_gfx10 { has_branch_after_VMEM == other.has_branch_after_VMEM && has_DS == other.has_DS && has_branch_after_DS == other.has_branch_after_DS && + has_NSA_MIMG == other.has_NSA_MIMG && sgprs_read_by_VMEM == other.sgprs_read_by_VMEM && sgprs_read_by_SMEM == other.sgprs_read_by_SMEM; } @@ -737,6 +741,21 @@ void handle_instruction_gfx10(Program *program, Block *cur_block, NOP_ctx_gfx10 wait->imm = 0; new_instructions.emplace_back(std::move(wait)); } + + /* NSAToVMEMBug + * Handles NSA MIMG (4 or more
dwords) immediately followed by MUBUF/MTBUF (with offset[2:1] != 0). + */ + if (instr->isMIMG() && get_mimg_nsa_dwords(instr.get()) > 1) { + ctx.has_NSA_MIMG = true; + } else if (ctx.has_NSA_MIMG) { + ctx.has_NSA_MIMG = false; + + if (instr->isMUBUF() || instr->isMTBUF()) { + uint32_t offset = instr->isMUBUF() ? instr->mubuf().offset : instr->mtbuf().offset; + if (offset & 6) + Builder(program, &new_instructions).sopp(aco_opcode::s_nop, -1, 0); + } + } } template diff --git a/src/amd/compiler/aco_ir.h b/src/amd/compiler/aco_ir.h index ff3f3e6..cf69e90 100644 --- a/src/amd/compiler/aco_ir.h +++ b/src/amd/compiler/aco_ir.h @@ -1615,6 +1615,8 @@ bool needs_exec_mask(const Instruction* instr); uint32_t get_reduction_identity(ReduceOp op, unsigned idx); +unsigned get_mimg_nsa_dwords(const Instruction *instr); + enum block_kind { /* uniform indicates that leaving this block, * all actives lanes stay active */ -- 2.7.4