From fb622775b5e583784cd836afa4e00faf538ae178 Mon Sep 17 00:00:00 2001
From: =?utf8?q?Daniel=20Sch=C3=BCrmann?=
Date: Tue, 5 Oct 2021 13:09:02 +0100
Subject: [PATCH] aco/optimizer: optimize extract(extract())

Totals from 53 (0.04% of 134572) affected shaders: (GFX10.3)
SpillVGPRs: 1780 -> 1776 (-0.22%); split: -0.34%, +0.11%
CodeSize: 968352 -> 963196 (-0.53%); split: -0.55%, +0.02%
Scratch: 180224 -> 178176 (-1.14%)
Instrs: 169800 -> 169158 (-0.38%); split: -0.39%, +0.01%
Latency: 6186064 -> 6141408 (-0.72%); split: -1.16%, +0.44%
InvThroughput: 2605044 -> 2582967 (-0.85%); split: -1.37%, +0.52%
VClause: 4851 -> 4866 (+0.31%); split: -0.16%, +0.47%
SClause: 1744 -> 1746 (+0.11%)
Copies: 42874 -> 42325 (-1.28%); split: -1.40%, +0.12%
Branches: 5762 -> 5765 (+0.05%); split: -0.02%, +0.07%

Reviewed-by: Rhys Perry
Part-of:
---
 src/amd/compiler/aco_optimizer.cpp | 40 +++++++++++++++++++++++++++++++++++---
 1 file changed, 37 insertions(+), 3 deletions(-)

diff --git a/src/amd/compiler/aco_optimizer.cpp b/src/amd/compiler/aco_optimizer.cpp
index bc842a5..1fd3cf6 100644
--- a/src/amd/compiler/aco_optimizer.cpp
+++ b/src/amd/compiler/aco_optimizer.cpp
@@ -527,6 +527,7 @@ pseudo_propagate_temp(opt_ctx& ctx, aco_ptr& instr, Temp temp, unsi
          return false;
       break;
    case aco_opcode::p_extract_vector:
+   case aco_opcode::p_extract:
       if (temp.type() == RegType::sgpr && !can_accept_sgpr)
          return false;
       break;
@@ -991,9 +992,21 @@ can_apply_extract(opt_ctx& ctx, aco_ptr& instr, unsigned idx, ssa_i
               can_use_opsel(ctx.program->chip_class, instr->opcode, idx, sel.offset()) &&
               !(instr->vop3().opsel & (1 << idx))) {
       return true;
-   } else {
-      return false;
+   } else if (instr->opcode == aco_opcode::p_extract) {
+      SubdwordSel instrSel = parse_extract(instr.get());
+
+      /* the outer offset must be within extracted range */
+      if (instrSel.offset() >= sel.size())
+         return false;
+
+      /* don't remove the sign-extension when increasing the size further */
+      if (instrSel.size() > sel.size() && !instrSel.sign_extend() && sel.sign_extend())
+         return false;
+
+      return true;
    }
+
+   return false;
 }
 
 /* Combine an p_extract (or p_insert, in some cases) instruction with instr.
@@ -1033,6 +1046,18 @@ apply_extract(opt_ctx& ctx, aco_ptr& instr, unsigned idx, ssa_info&
    } else if (instr->isVOP3()) {
       if (sel.offset())
          instr->vop3().opsel |= 1 << idx;
+   } else if (instr->opcode == aco_opcode::p_extract) {
+      SubdwordSel instrSel = parse_extract(instr.get());
+
+      unsigned size = std::min(sel.size(), instrSel.size());
+      unsigned offset = sel.offset() + instrSel.offset();
+      unsigned sign_extend =
+         instrSel.sign_extend() && (sel.sign_extend() || instrSel.size() <= sel.size());
+
+      instr->operands[1] = Operand::c32(offset / size);
+      instr->operands[2] = Operand::c32(size * 8u);
+      instr->operands[3] = Operand::c32(sign_extend);
+      return;
    }
 
    /* output modifier and label_vopc seem to be the only one worth keeping at the moment */
@@ -3406,8 +3431,17 @@ combine_instruction(opt_ctx& ctx, aco_ptr& instr)
    if (instr->isSDWA() || instr->isDPP())
       return;
 
-   if (instr->opcode == aco_opcode::p_extract)
+   if (instr->opcode == aco_opcode::p_extract) {
+      ssa_info& info = ctx.info[instr->operands[0].tempId()];
+      if (info.is_extract() && can_apply_extract(ctx, instr, 0, info)) {
+         apply_extract(ctx, instr, 0, info);
+         if (--ctx.uses[instr->operands[0].tempId()])
+            ctx.uses[info.instr->operands[0].tempId()]++;
+         instr->operands[0].setTemp(info.instr->operands[0].getTemp());
+      }
+
       apply_ds_extract(ctx, instr);
+   }
 
    /* TODO: There are still some peephole optimizations that could be done:
     * - abs(a - b) -> s_absdiff_i32
-- 
2.7.4