aco: Improve SCC nocompare optimization when SCC is clobbered.
author Timur Kristóf <timur.kristof@gmail.com>
Wed, 13 Apr 2022 17:29:30 +0000 (19:29 +0200)
committer Marge Bot <emma+marge@anholt.net>
Sat, 20 Aug 2022 15:27:40 +0000 (15:27 +0000)
When SCC is clobbered between s_cmp and its operand's writer,
the current optimization that eliminates s_cmp won't kick in.

However, when s_cmp is the only user of its operand temporary,
it is possible to "pull down" the instruction that wrote the operand.

Fossil DB stats on Navi 21:

Totals from 63302 (46.92% of 134906) affected shaders:
CodeSize: 176689272 -> 176418332 (-0.15%)
Instrs: 33552237 -> 33484502 (-0.20%)
Latency: 205847485 -> 205816205 (-0.02%); split: -0.02%, +0.00%
InvThroughput: 34321285 -> 34319908 (-0.00%); split: -0.00%, +0.00%

Signed-off-by: Timur Kristóf <timur.kristof@gmail.com>
Reviewed-by: Rhys Perry <pendingchaos02@gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/16266>

src/amd/compiler/aco_optimizer_postRA.cpp
src/amd/compiler/tests/test_optimizer_postRA.cpp

index a59ee83..2c4ab83 100644 (file)
@@ -267,10 +267,9 @@ try_optimize_scc_nocompare(pr_opt_ctx& ctx, aco_ptr<Instruction>& instr)
       if (ctx.uses[instr->operands[0].tempId()] > 1)
          return;
 
-      /* Make sure both SCC and Operand 0 are written by the same instruction. */
+      /* Find the writer instruction of Operand 0. */
       Idx wr_idx = last_writer_idx(ctx, instr->operands[0]);
-      Idx sccwr_idx = last_writer_idx(ctx, scc, s1);
-      if (!wr_idx.found() || wr_idx != sccwr_idx)
+      if (!wr_idx.found())
          return;
 
       Instruction* wr_instr = ctx.get(wr_idx);
@@ -313,6 +312,49 @@ try_optimize_scc_nocompare(pr_opt_ctx& ctx, aco_ptr<Instruction>& instr)
       default: return;
       }
 
+      /* Check whether both SCC and Operand 0 are written by the same instruction. */
+      Idx sccwr_idx = last_writer_idx(ctx, scc, s1);
+      if (wr_idx != sccwr_idx) {
+         /* Check whether the current instruction is the only user of its first operand. */
+         if (ctx.uses[wr_instr->definitions[1].tempId()] ||
+             ctx.uses[wr_instr->definitions[0].tempId()] > 1)
+            return;
+
+         /* Check whether the operands of the writer are clobbered. */
+         for (const Operand& op : wr_instr->operands) {
+            if (!op.isConstant() && is_clobbered_since(ctx, op, wr_idx))
+               return;
+         }
+
+         aco_opcode pulled_opcode = wr_instr->opcode;
+         if (instr->opcode == aco_opcode::s_cmp_eq_u32 ||
+             instr->opcode == aco_opcode::s_cmp_eq_i32 ||
+             instr->opcode == aco_opcode::s_cmp_eq_u64) {
+            /* When s_cmp_eq is used, it effectively inverts the SCC def.
+             * However, we can't simply invert the opcodes here because that
+             * would change the meaning of the program.
+             */
+            return;
+         }
+
+         Definition scc_def = instr->definitions[0];
+         ctx.uses[wr_instr->definitions[0].tempId()]--;
+
+         /* Copy the writer instruction, but use SCC from the current instr.
+          * This means that the original instruction will be eliminated.
+          */
+         if (wr_instr->format == Format::SOP2) {
+            instr.reset(create_instruction<SOP2_instruction>(pulled_opcode, Format::SOP2, 2, 2));
+            instr->operands[1] = wr_instr->operands[1];
+         } else if (wr_instr->format == Format::SOP1) {
+            instr.reset(create_instruction<SOP1_instruction>(pulled_opcode, Format::SOP1, 1, 2));
+         }
+         instr->definitions[0] = wr_instr->definitions[0];
+         instr->definitions[1] = scc_def;
+         instr->operands[0] = wr_instr->operands[0];
+         return;
+      }
+
       /* Use the SCC def from wr_instr */
       ctx.uses[instr->operands[0].tempId()]--;
       instr->operands[0] = Operand(wr_instr->definitions[1].getTemp(), scc);
index 468a24c..da56693 100644 (file)
@@ -241,6 +241,25 @@ BEGIN_TEST(optimizer_postRA.scc_nocmp_opt)
     //; del d, e, f, g, h, x
 
     {
+       /* SCC is overwritten in between, optimize by pulling down */
+
+       //! s1: %h:s[3], s1: %x:scc = s_add_u32 %a:s[0], 1
+       //! s1: %d:s[2], s1: %e:scc = s_bfe_u32 %a:s[0], 0x40018
+       //! s2: %f:vcc = p_cbranch_z %g:scc
+       //! p_unit_test 5, %f:vcc, %h:s[3]
+       auto salu = bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1, reg_s2), bld.def(s1, scc), op_in_0,
+                            Operand::c32(0x40018u));
+       auto ovrw = bld.sop2(aco_opcode::s_add_u32, bld.def(s1, reg_s3), bld.def(s1, scc), op_in_0,
+                            Operand::c32(1u));
+       auto scmp = bld.sopc(aco_opcode::s_cmp_lg_u32, bld.def(s1, scc), Operand(salu, reg_s2),
+                            Operand::zero());
+       auto br = bld.branch(aco_opcode::p_cbranch_z, bld.def(s2, vcc), bld.scc(scmp));
+       writeout(5, Operand(br, vcc), Operand(ovrw, reg_s3));
+    }
+
+    //; del d, e, f, g, h, x
+
+    {
         //! s1: %d:s[2], s1: %e:scc = s_bfe_u32 %a:s[0], 0x40018
         //! s1: %f:s[4] = s_cselect_b32 %z:s[6], %a:s[0], %e:scc
         //! p_unit_test 6, %f:s[4]