From 1762e6b5406bf6c0ebec84a21fa8eb62f812dd2b Mon Sep 17 00:00:00 2001
From: =?utf8?q?Timur=20Krist=C3=B3f?=
Date: Wed, 13 Apr 2022 19:29:30 +0200
Subject: [PATCH] aco: Improve SCC nocompare optimization when SCC is clobbered.
MIME-Version: 1.0
Content-Type: text/plain; charset=utf8
Content-Transfer-Encoding: 8bit

When SCC is clobbered between s_cmp and its operand's writer, the current
optimization that eliminates s_cmp won't kick in. However, when s_cmp is the
only user of its operand temporary, it is possible to "pull down" the
instruction that wrote the operand.

Fossil DB stats on Navi 21:

Totals from 63302 (46.92% of 134906) affected shaders:
CodeSize: 176689272 -> 176418332 (-0.15%)
Instrs: 33552237 -> 33484502 (-0.20%)
Latency: 205847485 -> 205816205 (-0.02%); split: -0.02%, +0.00%
InvThroughput: 34321285 -> 34319908 (-0.00%); split: -0.00%, +0.00%

Signed-off-by: Timur Kristóf
Reviewed-by: Rhys Perry
Part-of:
---
 src/amd/compiler/aco_optimizer_postRA.cpp        | 48 ++++++++++++++++++++++--
 src/amd/compiler/tests/test_optimizer_postRA.cpp | 19 ++++++++++
 2 files changed, 64 insertions(+), 3 deletions(-)

diff --git a/src/amd/compiler/aco_optimizer_postRA.cpp b/src/amd/compiler/aco_optimizer_postRA.cpp
index a59ee83..2c4ab83 100644
--- a/src/amd/compiler/aco_optimizer_postRA.cpp
+++ b/src/amd/compiler/aco_optimizer_postRA.cpp
@@ -267,10 +267,9 @@ try_optimize_scc_nocompare(pr_opt_ctx& ctx, aco_ptr<Instruction>& instr)
    if (ctx.uses[instr->operands[0].tempId()] > 1)
       return;
 
-   /* Make sure both SCC and Operand 0 are written by the same instruction. */
+   /* Find the writer instruction of Operand 0. */
    Idx wr_idx = last_writer_idx(ctx, instr->operands[0]);
-   Idx sccwr_idx = last_writer_idx(ctx, scc, s1);
-   if (!wr_idx.found() || wr_idx != sccwr_idx)
+   if (!wr_idx.found())
       return;
 
    Instruction* wr_instr = ctx.get(wr_idx);
@@ -313,6 +312,49 @@ try_optimize_scc_nocompare(pr_opt_ctx& ctx, aco_ptr<Instruction>& instr)
    default: return;
    }
 
+   /* Check whether both SCC and Operand 0 are written by the same instruction. */
+   Idx sccwr_idx = last_writer_idx(ctx, scc, s1);
+   if (wr_idx != sccwr_idx) {
+      /* Check whether the current instruction is the only user of its first operand. */
+      if (ctx.uses[wr_instr->definitions[1].tempId()] ||
+          ctx.uses[wr_instr->definitions[0].tempId()] > 1)
+         return;
+
+      /* Check whether the operands of the writer are clobbered. */
+      for (const Operand& op : wr_instr->operands) {
+         if (!op.isConstant() && is_clobbered_since(ctx, op, wr_idx))
+            return;
+      }
+
+      aco_opcode pulled_opcode = wr_instr->opcode;
+      if (instr->opcode == aco_opcode::s_cmp_eq_u32 ||
+          instr->opcode == aco_opcode::s_cmp_eq_i32 ||
+          instr->opcode == aco_opcode::s_cmp_eq_u64) {
+         /* When s_cmp_eq is used, it effectively inverts the SCC def.
+          * However, we can't simply invert the opcodes here because that
+          * would change the meaning of the program.
+          */
+         return;
+      }
+
+      Definition scc_def = instr->definitions[0];
+      ctx.uses[wr_instr->definitions[0].tempId()]--;
+
+      /* Copy the writer instruction, but use SCC from the current instr.
+       * This means that the original instruction will be eliminated.
+       */
+      if (wr_instr->format == Format::SOP2) {
+         instr.reset(create_instruction<SOP2_instruction>(pulled_opcode, Format::SOP2, 2, 2));
+         instr->operands[1] = wr_instr->operands[1];
+      } else if (wr_instr->format == Format::SOP1) {
+         instr.reset(create_instruction<SOP1_instruction>(pulled_opcode, Format::SOP1, 1, 2));
+      }
+      instr->definitions[0] = wr_instr->definitions[0];
+      instr->definitions[1] = scc_def;
+      instr->operands[0] = wr_instr->operands[0];
+      return;
+   }
+
    /* Use the SCC def from wr_instr */
    ctx.uses[instr->operands[0].tempId()]--;
    instr->operands[0] = Operand(wr_instr->definitions[1].getTemp(), scc);
diff --git a/src/amd/compiler/tests/test_optimizer_postRA.cpp b/src/amd/compiler/tests/test_optimizer_postRA.cpp
index 468a24c..da56693 100644
--- a/src/amd/compiler/tests/test_optimizer_postRA.cpp
+++ b/src/amd/compiler/tests/test_optimizer_postRA.cpp
@@ -241,6 +241,25 @@ BEGIN_TEST(optimizer_postRA.scc_nocmp_opt)
    //; del d, e, f, g, h, x
 
    {
+      /* SCC is overwritten in between, optimize by pulling down */
+
+      //! s1: %h:s[3], s1: %x:scc = s_add_u32 %a:s[0], 1
+      //! s1: %d:s[2], s1: %e:scc = s_bfe_u32 %a:s[0], 0x40018
+      //! s2: %f:vcc = p_cbranch_z %g:scc
+      //! p_unit_test 5, %f:vcc, %h:s[3]
+      auto salu = bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1, reg_s2), bld.def(s1, scc), op_in_0,
+                           Operand::c32(0x40018u));
+      auto ovrw = bld.sop2(aco_opcode::s_add_u32, bld.def(s1, reg_s3), bld.def(s1, scc), op_in_0,
+                           Operand::c32(1u));
+      auto scmp = bld.sopc(aco_opcode::s_cmp_lg_u32, bld.def(s1, scc), Operand(salu, reg_s2),
+                           Operand::zero());
+      auto br = bld.branch(aco_opcode::p_cbranch_z, bld.def(s2, vcc), bld.scc(scmp));
+      writeout(5, Operand(br, vcc), Operand(ovrw, reg_s3));
+   }
+
+   //; del d, e, f, g, h, x
+
+   {
       //! s1: %d:s[2], s1: %e:scc = s_bfe_u32 %a:s[0], 0x40018
       //! s1: %f:s[4] = s_cselect_b32 %z:s[6], %a:s[0], %e:scc
       //! p_unit_test 6, %f:s[4]
--
2.7.4
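
For illustration, here is a rough sketch of the transformation exercised by the
new test at p_unit_test 5. The "before" listing is an assumed reading of what
the builder calls emit prior to the pass (it is not taken verbatim from the
compiler's output), using the test's own register and temp names; the "after"
listing follows the test's expectations:

  before (assumed):
    s1: %d:s[2], s1: %e:scc = s_bfe_u32 %a:s[0], 0x40018
    s1: %h:s[3], s1: %x:scc = s_add_u32 %a:s[0], 1      ; clobbers SCC
    s1: %g:scc = s_cmp_lg_u32 %d:s[2], 0
    s2: %f:vcc = p_cbranch_z %g:scc

  after (as the test expects):
    s1: %h:s[3], s1: %x:scc = s_add_u32 %a:s[0], 1
    s1: %d:s[2], s1: %e:scc = s_bfe_u32 %a:s[0], 0x40018
    s2: %f:vcc = p_cbranch_z %g:scc

The s_bfe_u32 is re-created in place of the s_cmp, below the clobbering
s_add_u32, and its SCC definition takes over the role of the compare result,
so the s_cmp can be dropped. This is only legal when the s_cmp is the sole
user of the writer's result, the writer's own SCC def is otherwise unused, and
the writer's operands are not clobbered in between, which is what the new
checks in try_optimize_scc_nocompare verify.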