From 5f66723188806e7a1792988ec3f75b979127edd6 Mon Sep 17 00:00:00 2001 From: =?utf8?q?Daniel=20Sch=C3=BCrmann?= Date: Sun, 3 Sep 2023 11:05:08 +0200 Subject: [PATCH] aco/insert_exec_mask: Simplify WQM handling (1/2) by using p_end_wqm as indicator for when to end WQM mode. Totals from 10049 (13.12% of 76572) affected shaders: (GFX11) MaxWaves: 301126 -> 301136 (+0.00%) Instrs: 7061909 -> 7049272 (-0.18%); split: -0.21%, +0.03% CodeSize: 37720684 -> 37664244 (-0.15%); split: -0.18%, +0.03% VGPRs: 357204 -> 357180 (-0.01%); split: -0.13%, +0.12% Latency: 62757830 -> 62827080 (+0.11%); split: -0.06%, +0.17% InvThroughput: 8589248 -> 8589963 (+0.01%); split: -0.02%, +0.02% VClause: 132541 -> 132547 (+0.00%); split: -0.03%, +0.03% SClause: 322916 -> 322964 (+0.01%); split: -0.04%, +0.05% Copies: 546446 -> 547657 (+0.22%); split: -0.13%, +0.35% Branches: 189527 -> 188293 (-0.65%) PreSGPRs: 332792 -> 332529 (-0.08%); split: -0.08%, +0.00% Part-of: --- src/amd/compiler/aco_insert_exec_mask.cpp | 64 ++++++++++++------------------- 1 file changed, 24 insertions(+), 40 deletions(-) diff --git a/src/amd/compiler/aco_insert_exec_mask.cpp b/src/amd/compiler/aco_insert_exec_mask.cpp index 517e27c..6a31a01 100644 --- a/src/amd/compiler/aco_insert_exec_mask.cpp +++ b/src/amd/compiler/aco_insert_exec_mask.cpp @@ -281,9 +281,8 @@ add_coupling_code(exec_ctx& ctx, Block* block, std::vector<aco_ptr<Instruction>> if (ctx.handle_wqm) { ctx.info[idx].exec.emplace_back(start_exec, mask_type_global | mask_type_exact); - /* if this block needs WQM, initialize already */ - if (ctx.info[idx].block_needs & WQM) - transition_to_WQM(ctx, bld, idx); + /* Initialize WQM already */ + transition_to_WQM(ctx, bld, idx); } else { uint8_t mask = mask_type_global; if (ctx.program->needs_wqm) { @@ -474,13 +473,14 @@ add_coupling_code(exec_ctx& ctx, Block* block, std::vector<aco_ptr<Instruction>> i++; } - /* try to satisfy the block's needs */ if (ctx.handle_wqm) { + /* End WQM handling if not needed anymore */ if (block->kind & block_kind_top_level 
&& ctx.info[idx].exec.size() == 2) { - if (ctx.info[idx].block_needs == 0 || ctx.info[idx].block_needs == Exact) { + if (block->instructions[i]->opcode == aco_opcode::p_end_wqm) { ctx.info[idx].exec.back().second |= mask_type_global; transition_to_Exact(ctx, bld, idx); ctx.handle_wqm = false; + i++; } } } @@ -557,21 +557,16 @@ process_instructions(exec_ctx& ctx, Block* block, std::vectoropcode == aco_opcode::p_discard_if) { Operand current_exec = Operand(exec, bld.lm); - if (ctx.info[block->index].exec.size() >= 2) { - if (needs == WQM) { - /* Preserve the WQM mask */ - ctx.info[block->index].exec[1].second &= ~mask_type_global; - } else if (block->kind & block_kind_top_level) { - /* Transition to Exact without extra instruction. Since needs != WQM, we won't need - * WQM again. - */ - ctx.info[block->index].exec.resize(1); - assert(ctx.info[block->index].exec[0].second == - (mask_type_exact | mask_type_global)); - current_exec = get_exec_op(ctx.info[block->index].exec.back().first); - ctx.info[block->index].exec[0].first = Operand(bld.lm); - state = Exact; - } + if (block->instructions[idx + 1]->opcode == aco_opcode::p_end_wqm) { + /* Transition to Exact without extra instruction. */ + ctx.info[block->index].exec.resize(1); + assert(ctx.info[block->index].exec[0].second == (mask_type_exact | mask_type_global)); + current_exec = get_exec_op(ctx.info[block->index].exec[0].first); + ctx.info[block->index].exec[0].first = Operand(bld.lm); + state = Exact; + } else if (ctx.info[block->index].exec.size() >= 2 && needs == WQM) { + /* Preserve the WQM mask */ + ctx.info[block->index].exec[1].second &= ~mask_type_global; } Temp cond, exit_cond; @@ -688,7 +683,15 @@ process_instructions(exec_ctx& ctx, Block* block, std::vectoropcode == aco_opcode::p_end_wqm) { + assert(block->kind & block_kind_top_level); + assert(ctx.info[block->index].exec.size() <= 2); + /* This instruction indicates the end of WQM mode. 
*/ + ctx.info[block->index].exec.back().second |= mask_type_global; + transition_to_Exact(ctx, bld, block->index); + state = Exact; + ctx.handle_wqm = false; continue; } @@ -705,25 +708,6 @@ add_branch_code(exec_ctx& ctx, Block* block) if (block->linear_succs.empty()) return; - /* try to disable wqm handling */ - if (ctx.handle_wqm && block->kind & block_kind_top_level) { - if (ctx.info[idx].exec.size() == 3) { - assert(ctx.info[idx].exec[1].second & mask_type_wqm); - ctx.info[idx].exec.pop_back(); - } - assert(ctx.info[idx].exec.size() <= 2); - - if (!(ctx.info[idx].instr_needs.back() & WQM)) { - /* transition to Exact if the branch doesn't need WQM */ - aco_ptr<Instruction> branch = std::move(block->instructions.back()); - block->instructions.pop_back(); - ctx.info[idx].exec.back().second |= mask_type_global; - transition_to_Exact(ctx, bld, idx); - bld.insert(std::move(branch)); - ctx.handle_wqm = false; - } - } - if (block->kind & block_kind_loop_preheader) { /* collect information about the succeeding loop */ bool has_divergent_break = false; -- 2.7.4