From 3bca0af25dbf6d6b162463138100abb20bc1a1cc Mon Sep 17 00:00:00 2001 From: =?utf8?q?Daniel=20Sch=C3=BCrmann?= Date: Tue, 14 Jan 2020 13:14:38 +0100 Subject: [PATCH] aco: ignore parallelcopies to the same register on jump threading The more conservative lowering to CSSA inserts unnecessary parallelcopies which might get coalesced and can be ignored on jump threading. v2: outline is_empty_block() check. Reviewed-by: Rhys Perry Tested-by: Marge Bot Part-of: --- src/amd/compiler/aco_ssa_elimination.cpp | 69 ++++++++++++++++++-------------- 1 file changed, 38 insertions(+), 31 deletions(-) diff --git a/src/amd/compiler/aco_ssa_elimination.cpp b/src/amd/compiler/aco_ssa_elimination.cpp index 95e2e84..94f5a1a 100644 --- a/src/amd/compiler/aco_ssa_elimination.cpp +++ b/src/amd/compiler/aco_ssa_elimination.cpp @@ -111,6 +111,35 @@ void insert_parallelcopies(ssa_elimination_ctx& ctx) } } +bool is_empty_block(Block* block, bool ignore_exec_writes) +{ + /* check if this block is empty and the exec mask is not needed */ + for (aco_ptr& instr : block->instructions) { + switch (instr->opcode) { + case aco_opcode::p_linear_phi: + case aco_opcode::p_phi: + case aco_opcode::p_logical_start: + case aco_opcode::p_logical_end: + case aco_opcode::p_branch: + break; + case aco_opcode::p_parallelcopy: + for (unsigned i = 0; i < instr->definitions.size(); i++) { + if (ignore_exec_writes && instr->definitions[i].physReg() == exec) + continue; + if (instr->definitions[i].physReg() != instr->operands[i].physReg()) + return false; + } + break; + case aco_opcode::s_andn2_b64: + case aco_opcode::s_andn2_b32: + if (ignore_exec_writes && instr->definitions[0].physReg() == exec) + break; + default: + return false; + } + } + return true; +} void try_remove_merge_block(ssa_elimination_ctx& ctx, Block* block) { @@ -120,22 +149,9 @@ void try_remove_merge_block(ssa_elimination_ctx& ctx, Block* block) !(ctx.program->blocks[block->linear_succs[0]].kind & block_kind_merge)) return; - /* check if this block is empty and the exec mask is not needed */ - for (aco_ptr& instr : block->instructions) { - if (instr->opcode == aco_opcode::p_parallelcopy) { - if (instr->definitions[0].physReg() == exec) - continue; - else - return; - } - - if (instr->opcode != aco_opcode::p_linear_phi && - instr->opcode != aco_opcode::p_phi && - instr->opcode != aco_opcode::p_logical_start && - instr->opcode != aco_opcode::p_logical_end && - instr->opcode != aco_opcode::p_branch) - return; - } + /* check if this block is empty */ + if (!is_empty_block(block, true)) + return; /* keep the branch instruction and remove the rest */ aco_ptr branch = std::move(block->instructions.back()); @@ -146,18 +162,13 @@ void try_remove_merge_block(ssa_elimination_ctx& ctx, Block* block) void try_remove_invert_block(ssa_elimination_ctx& ctx, Block* block) { assert(block->linear_succs.size() == 2); + /* only remove this block if the successor got removed as well */ if (block->linear_succs[0] != block->linear_succs[1]) return; - /* check if we can remove this block */ - for (aco_ptr& instr : block->instructions) { - if (instr->opcode != aco_opcode::p_linear_phi && - instr->opcode != aco_opcode::p_phi && - (instr->opcode != aco_opcode::s_andn2_b64 || ctx.program->wave_size != 64) && - (instr->opcode != aco_opcode::s_andn2_b32 || ctx.program->wave_size != 32) && - instr->opcode != aco_opcode::p_branch) - return; - } + /* check if block is otherwise empty */ + if (!is_empty_block(block, true)) + return; unsigned succ_idx = block->linear_succs[0]; assert(block->linear_preds.size() == 2); @@ -179,12 +190,8 @@ void try_remove_invert_block(ssa_elimination_ctx& ctx, Block* block) void try_remove_simple_block(ssa_elimination_ctx& ctx, Block* block) { - for (aco_ptr& instr : block->instructions) { - if (instr->opcode != aco_opcode::p_logical_start && - instr->opcode != aco_opcode::p_logical_end && - instr->opcode != aco_opcode::p_branch) - return; - } + if (!is_empty_block(block, false)) + return; Block& pred = ctx.program->blocks[block->linear_preds[0]]; Block& succ = ctx.program->blocks[block->linear_succs[0]]; -- 2.7.4