From 2f424c83e072f6a21d15af1064f6e744e801fbfa Mon Sep 17 00:00:00 2001 From: Samuel Pitoiset Date: Fri, 27 Mar 2020 15:16:39 +0100 Subject: [PATCH] aco: only break SMEM clauses if XNACK is enabled (mostly APUs) MIME-Version: 1.0 Content-Type: text/plain; charset=utf8 Content-Transfer-Encoding: 8bit According to LLVM, it seems only required for APUs like RAVEN, but we still ensure that SMEM stores are in their own clause. pipeline-db (VEGA10): Totals from affected shaders: SGPRS: 1775364 -> 1775364 (0.00 %) VGPRS: 1287176 -> 1287176 (0.00 %) Spilled SGPRs: 725 -> 725 (0.00 %) Spilled VGPRs: 0 -> 0 (0.00 %) Code Size: 65386620 -> 65107460 (-0.43 %) bytes Max Waves: 287099 -> 287099 (0.00 %) pipeline-db (POLARIS10): Totals from affected shaders: SGPRS: 1797743 -> 1797743 (0.00 %) VGPRS: 1271108 -> 1271108 (0.00 %) Spilled SGPRs: 730 -> 730 (0.00 %) Spilled VGPRs: 0 -> 0 (0.00 %) Code Size: 64046244 -> 63782324 (-0.41 %) bytes Max Waves: 254875 -> 254875 (0.00 %) This only affects GFX6-GFX9 chips because the compiler uses a different pass for GFX10. Signed-off-by: Samuel Pitoiset Reviewed-by: Daniel Schürmann Tested-by: Marge Bot Part-of: --- src/amd/compiler/aco_insert_NOPs.cpp | 77 ++++++++++++++-------- .../compiler/aco_instruction_selection_setup.cpp | 20 ++++++ src/amd/compiler/aco_ir.h | 3 +- src/amd/compiler/aco_live_var_analysis.cpp | 6 +- 4 files changed, 75 insertions(+), 31 deletions(-) diff --git a/src/amd/compiler/aco_insert_NOPs.cpp b/src/amd/compiler/aco_insert_NOPs.cpp index 75dbe85..bb703d7 100644 --- a/src/amd/compiler/aco_insert_NOPs.cpp +++ b/src/amd/compiler/aco_insert_NOPs.cpp @@ -274,6 +274,41 @@ bool test_bitset_range(BITSET_WORD *words, unsigned start, unsigned size) { } } +/* A SMEM clause is any group of consecutive SMEM instructions. The + * instructions in this group may return out of order and/or may be replayed. + * + * To fix this potential hazard correctly, we have to make sure that when a + * clause has more than one instruction, no instruction in the clause writes + * to a register that is read by another instruction in the clause (including + * itself). In this case, we have to break the SMEM clause by inserting non + * SMEM instructions. + * + * SMEM clauses are only present on GFX8+, and only matter when XNACK is set. + */ +void handle_smem_clause_hazards(Program *program, NOP_ctx_gfx6 &ctx, + aco_ptr& instr, int *NOPs) +{ + /* break off from previous SMEM clause if needed */ + if (!*NOPs & (ctx.smem_clause || ctx.smem_write)) { + /* Don't allow clauses with store instructions since the clause's + * instructions may use the same address. */ + if (ctx.smem_write || instr->definitions.empty() || instr_info.is_atomic[(unsigned)instr->opcode]) { + *NOPs = 1; + } else if (program->xnack_enabled) { + for (Operand op : instr->operands) { + if (!op.isConstant() && test_bitset_range(ctx.smem_clause_write, op.physReg(), op.size())) { + *NOPs = 1; + break; + } + } + + Definition def = instr->definitions[0]; + if (!*NOPs && test_bitset_range(ctx.smem_clause_read_write, def.physReg(), def.size())) + *NOPs = 1; + } + } +} + /* TODO: we don't handle accessing VCC using the actual SGPR instead of using the alias */ void handle_instruction_gfx6(Program *program, Block *cur_block, NOP_ctx_gfx6 &ctx, aco_ptr& instr, std::vector>& new_instructions) @@ -300,24 +335,7 @@ void handle_instruction_gfx6(Program *program, Block *cur_block, NOP_ctx_gfx6 &c } } - /* break off from prevous SMEM clause if needed */ - if (!NOPs & (ctx.smem_clause || ctx.smem_write)) { - /* Don't allow clauses with store instructions since the clause's - * instructions may use the same address. */ - if (ctx.smem_write || instr->definitions.empty() || instr_info.is_atomic[(unsigned)instr->opcode]) { - NOPs = 1; - } else { - for (Operand op : instr->operands) { - if (!op.isConstant() && test_bitset_range(ctx.smem_clause_write, op.physReg(), op.size())) { - NOPs = 1; - break; - } - } - Definition def = instr->definitions[0]; - if (!NOPs && test_bitset_range(ctx.smem_clause_read_write, def.physReg(), def.size())) - NOPs = 1; - } - } + handle_smem_clause_hazards(program, ctx, instr, &NOPs); } else if (instr->isSALU()) { if (instr->opcode == aco_opcode::s_setreg_b32 || instr->opcode == aco_opcode::s_setreg_imm32_b32 || instr->opcode == aco_opcode::s_getreg_b32) { @@ -414,8 +432,11 @@ void handle_instruction_gfx6(Program *program, Block *cur_block, NOP_ctx_gfx6 &c if ((ctx.smem_clause || ctx.smem_write) && (NOPs || instr->format != Format::SMEM)) { ctx.smem_clause = false; ctx.smem_write = false; - BITSET_ZERO(ctx.smem_clause_read_write); - BITSET_ZERO(ctx.smem_clause_write); + + if (program->xnack_enabled) { + BITSET_ZERO(ctx.smem_clause_read_write); + BITSET_ZERO(ctx.smem_clause_write); + } } if (instr->format == Format::SMEM) { @@ -424,15 +445,17 @@ void handle_instruction_gfx6(Program *program, Block *cur_block, NOP_ctx_gfx6 &c } else { ctx.smem_clause = true; - for (Operand op : instr->operands) { - if (!op.isConstant()) { - set_bitset_range(ctx.smem_clause_read_write, op.physReg(), op.size()); + if (program->xnack_enabled) { + for (Operand op : instr->operands) { + if (!op.isConstant()) { + set_bitset_range(ctx.smem_clause_read_write, op.physReg(), op.size()); + } } - } - Definition def = instr->definitions[0]; - set_bitset_range(ctx.smem_clause_read_write, def.physReg(), def.size()); - set_bitset_range(ctx.smem_clause_write, def.physReg(), def.size()); + Definition def = instr->definitions[0]; + set_bitset_range(ctx.smem_clause_read_write, def.physReg(), def.size()); + set_bitset_range(ctx.smem_clause_write, def.physReg(), def.size()); + } } } else if (instr->isVALU()) { for (Definition def : instr->definitions) { diff --git a/src/amd/compiler/aco_instruction_selection_setup.cpp b/src/amd/compiler/aco_instruction_selection_setup.cpp index d365f79..462cd48 100644 --- a/src/amd/compiler/aco_instruction_selection_setup.cpp +++ b/src/amd/compiler/aco_instruction_selection_setup.cpp @@ -1150,6 +1150,24 @@ setup_nir(isel_context *ctx, nir_shader *nir) nir_index_ssa_defs(func); } +void +setup_xnack(Program *program) +{ + switch (program->family) { + /* GFX8 APUs */ + case CHIP_CARRIZO: + case CHIP_STONEY: + /* GFX9 APUS */ + case CHIP_RAVEN: + case CHIP_RAVEN2: + case CHIP_RENOIR: + program->xnack_enabled = true; + break; + default: + break; + } +} + isel_context setup_isel_context(Program* program, unsigned shader_count, @@ -1308,6 +1326,8 @@ setup_isel_context(Program* program, ctx.block->loop_nest_depth = 0; ctx.block->kind = block_kind_top_level; + setup_xnack(program); + return ctx; } diff --git a/src/amd/compiler/aco_ir.h b/src/amd/compiler/aco_ir.h index 73a1d39..ace84db 100644 --- a/src/amd/compiler/aco_ir.h +++ b/src/amd/compiler/aco_ir.h @@ -1252,8 +1252,9 @@ public: uint16_t vgpr_alloc_granule; /* minus one. must be power of two */ unsigned workgroup_size; /* if known; otherwise UINT_MAX */ + bool xnack_enabled = false; + bool needs_vcc = false; - bool needs_xnack_mask = false; bool needs_flat_scr = false; uint32_t allocateId() diff --git a/src/amd/compiler/aco_live_var_analysis.cpp b/src/amd/compiler/aco_live_var_analysis.cpp index e223d6d..106c5eb 100644 --- a/src/amd/compiler/aco_live_var_analysis.cpp +++ b/src/amd/compiler/aco_live_var_analysis.cpp @@ -302,19 +302,19 @@ uint16_t get_extra_sgprs(Program *program) { if (program->chip_class >= GFX10) { assert(!program->needs_flat_scr); - assert(!program->needs_xnack_mask); + assert(!program->xnack_enabled); return 2; } else if (program->chip_class >= GFX8) { if (program->needs_flat_scr) return 6; - else if (program->needs_xnack_mask) + else if (program->xnack_enabled) return 4; else if (program->needs_vcc) return 2; else return 0; } else { - assert(!program->needs_xnack_mask); + assert(!program->xnack_enabled); if (program->needs_flat_scr) return 4; else if (program->needs_vcc) -- 2.7.4