From d068eb53e84ca1e44ad96c31dab63476880b3c72 Mon Sep 17 00:00:00 2001
From: Rhys Perry
Date: Wed, 23 Feb 2022 17:29:25 +0000
Subject: [PATCH] aco/insert_exec_mask: optimize top-level transition to exact before demote
MIME-Version: 1.0
Content-Type: text/plain; charset=utf8
Content-Transfer-Encoding: 8bit

fossil-db (Sienna Cichlid):
Totals from 5767 (3.55% of 162293) affected shaders:
Instrs: 3264949 -> 3257527 (-0.23%); split: -0.23%, +0.00%
CodeSize: 17835692 -> 17806004 (-0.17%); split: -0.17%, +0.00%
Latency: 45990060 -> 45987924 (-0.00%); split: -0.00%, +0.00%
InvThroughput: 7643850 -> 7643835 (-0.00%); split: -0.00%, +0.00%
Copies: 193641 -> 186219 (-3.83%); split: -3.84%, +0.01%

Signed-off-by: Rhys Perry
Reviewed-by: Daniel Schürmann
Part-of:
---
 src/amd/compiler/aco_insert_exec_mask.cpp | 13 ++++++++++---
 1 file changed, 10 insertions(+), 3 deletions(-)

diff --git a/src/amd/compiler/aco_insert_exec_mask.cpp b/src/amd/compiler/aco_insert_exec_mask.cpp
index 04928c3..e1dd392 100644
--- a/src/amd/compiler/aco_insert_exec_mask.cpp
+++ b/src/amd/compiler/aco_insert_exec_mask.cpp
@@ -585,8 +585,7 @@ process_instructions(exec_ctx& ctx, Block* block, std::vector<aco_ptr<Instructio
 
       } else if (instr->opcode == aco_opcode::p_demote_to_helper) {
          /* turn demote into discard_if with only exact masks */
-         assert((ctx.info[block->index].exec[0].second & (mask_type_exact | mask_type_global)) ==
-                (mask_type_exact | mask_type_global));
+         assert(ctx.info[block->index].exec[0].second == (mask_type_exact | mask_type_global));
 
          int num;
          Temp cond, exit_cond;
@@ -605,7 +604,15 @@ process_instructions(exec_ctx& ctx, Block* block, std::vector<aco_ptr<Instructio
             }
          } else {
             /* demote_if: transition to exact */
-            transition_to_Exact(ctx, bld, block->index);
+            if (block->kind & block_kind_top_level && ctx.info[block->index].exec.size() == 2 &&
+                ctx.info[block->index].exec.back().second & mask_type_global) {
+               /* We don't need to actually copy anything into exact, since the s_andn2
+                * instructions later will do that.
+                */
+               ctx.info[block->index].exec.pop_back();
+            } else {
+               transition_to_Exact(ctx, bld, block->index);
+            }
             assert(instr->operands[0].isTemp());
             cond = instr->operands[0].getTemp();
             num = ctx.info[block->index].exec.size() - 1;
-- 
2.7.4