aco/ra: rework fixed operands
authorRhys Perry <pendingchaos02@gmail.com>
Fri, 24 Jun 2022 11:36:24 +0000 (12:36 +0100)
committerMarge Bot <emma+marge@anholt.net>
Thu, 1 Sep 2022 11:22:46 +0000 (11:22 +0000)
This moves all fixed operands at once, so they don't interfere with one
another.

Signed-off-by: Rhys Perry <pendingchaos02@gmail.com>
Reviewed-by: Daniel Schürmann <daniel@schuermann.dev>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/17493>

src/amd/compiler/aco_register_allocation.cpp
src/amd/compiler/tests/test_regalloc.cpp

index 1af93c9..5fb867c 100644 (file)
@@ -1918,9 +1918,6 @@ bool
 operand_can_use_reg(amd_gfx_level gfx_level, aco_ptr<Instruction>& instr, unsigned idx, PhysReg reg,
                     RegClass rc)
 {
-   if (instr->operands[idx].isFixed())
-      return instr->operands[idx].physReg() == reg;
-
    bool is_writelane = instr->opcode == aco_opcode::v_writelane_b32 ||
                        instr->opcode == aco_opcode::v_writelane_b32_e64;
    if (gfx_level <= GFX9 && is_writelane && idx <= 1) {
@@ -1953,37 +1950,77 @@ operand_can_use_reg(amd_gfx_level gfx_level, aco_ptr<Instruction>& instr, unsign
 }
 
 void
-get_reg_for_operand(ra_ctx& ctx, RegisterFile& register_file,
-                    std::vector<std::pair<Operand, Definition>>& parallelcopy,
-                    aco_ptr<Instruction>& instr, Operand& operand, unsigned operand_index)
+handle_fixed_operands(ra_ctx& ctx, RegisterFile& register_file,
+                      std::vector<std::pair<Operand, Definition>>& parallelcopy,
+                      aco_ptr<Instruction>& instr)
 {
-   /* check if the operand is fixed */
-   PhysReg src = ctx.assignments[operand.tempId()].reg;
-   PhysReg dst;
-   if (operand.isFixed()) {
-      assert(operand.physReg() != src);
+   assert(instr->operands.size() <= 64);
 
-      /* check if target reg is blocked, and move away the blocking var */
-      if (register_file.test(operand.physReg(), operand.bytes())) {
-         PhysRegInterval target{operand.physReg(), operand.size()};
+   RegisterFile tmp_file(register_file);
 
-         RegisterFile tmp_file(register_file);
+   uint64_t mask = 0;
+   for (unsigned i = 0; i < instr->operands.size(); i++) {
+      Operand& op = instr->operands[i];
 
-         std::vector<unsigned> blocking_vars = collect_vars(ctx, tmp_file, target);
+      if (!op.isTemp() || !op.isFixed())
+         continue;
 
-         tmp_file.clear(src, operand.regClass()); // TODO: try to avoid moving block vars to src
-         tmp_file.block(operand.physReg(), operand.regClass());
+      PhysReg src = ctx.assignments[op.tempId()].reg;
 
-         get_regs_for_copies(ctx, tmp_file, parallelcopy, blocking_vars, instr, PhysRegInterval());
+      if (op.physReg() == src) {
+         tmp_file.block(op.physReg(), op.regClass());
+         continue;
       }
-      dst = operand.physReg();
 
-   } else {
-      /* clear the operand in case it's only a stride mismatch */
-      register_file.clear(src, operand.regClass());
-      dst = get_reg(ctx, register_file, operand.getTemp(), parallelcopy, instr, operand_index);
+      bool found = false;
+      u_foreach_bit64 (j, mask) {
+         if (instr->operands[j].tempId() == op.tempId() &&
+             instr->operands[j].physReg() == op.physReg()) {
+            found = true;
+            break;
+         }
+      }
+      if (found)
+         continue; /* the copy is already added to the list */
+
+      /* clear from register_file so fixed operands are not collected be collect_vars() */
+      tmp_file.clear(src, op.regClass()); // TODO: try to avoid moving block vars to src
+
+      mask |= (uint64_t)1 << i;
+
+      Operand pc_op(instr->operands[i].getTemp(), src);
+      Definition pc_def = Definition(op.physReg(), pc_op.regClass());
+      parallelcopy.emplace_back(pc_op, pc_def);
+   }
+
+   if (!mask)
+      return;
+
+   std::vector<unsigned> blocking_vars;
+   u_foreach_bit64 (i, mask) {
+      Operand& op = instr->operands[i];
+      PhysRegInterval target{op.physReg(), op.size()};
+      std::vector<unsigned> blocking_vars2 = collect_vars(ctx, tmp_file, target);
+      blocking_vars.insert(blocking_vars.end(), blocking_vars2.begin(), blocking_vars2.end());
+
+      /* prevent get_regs_for_copies() from using these registers */
+      tmp_file.block(op.physReg(), op.regClass());
    }
 
+   get_regs_for_copies(ctx, tmp_file, parallelcopy, blocking_vars, instr, PhysRegInterval());
+   update_renames(ctx, register_file, parallelcopy, instr, rename_not_killed_ops | fill_killed_ops);
+}
+
+void
+get_reg_for_operand(ra_ctx& ctx, RegisterFile& register_file,
+                    std::vector<std::pair<Operand, Definition>>& parallelcopy,
+                    aco_ptr<Instruction>& instr, Operand& operand, unsigned operand_index)
+{
+   /* clear the operand in case it's only a stride mismatch */
+   PhysReg src = ctx.assignments[operand.tempId()].reg;
+   register_file.clear(src, operand.regClass());
+   PhysReg dst = get_reg(ctx, register_file, operand.getTemp(), parallelcopy, instr, operand_index);
+
    Operand pc_op = operand;
    pc_op.setFixed(src);
    Definition pc_def = Definition(dst, pc_op.regClass());
@@ -2757,6 +2794,7 @@ register_allocation(Program* program, std::vector<IDSet>& live_out_per_block, ra
          bool temp_in_scc = register_file[scc];
 
          /* handle operands */
+         bool fixed = false;
          for (unsigned i = 0; i < instr->operands.size(); ++i) {
             auto& operand = instr->operands[i];
             if (!operand.isTemp())
@@ -2766,6 +2804,18 @@ register_allocation(Program* program, std::vector<IDSet>& live_out_per_block, ra
             operand.setTemp(read_variable(ctx, operand.getTemp(), block.index));
             assert(ctx.assignments[operand.tempId()].assigned);
 
+            fixed |=
+               operand.isFixed() && ctx.assignments[operand.tempId()].reg != operand.physReg();
+         }
+
+         if (fixed)
+            handle_fixed_operands(ctx, register_file, parallelcopy, instr);
+
+         for (unsigned i = 0; i < instr->operands.size(); ++i) {
+            auto& operand = instr->operands[i];
+            if (!operand.isTemp() || operand.isFixed())
+               continue;
+
             PhysReg reg = ctx.assignments[operand.tempId()].reg;
             if (operand_can_use_reg(program->gfx_level, instr, i, reg, operand.regClass()))
                operand.setFixed(reg);
index da4f1e2..2886120 100644 (file)
@@ -89,7 +89,7 @@ BEGIN_TEST(regalloc.precolor.swap)
    //! s2: %op1:s[2-3] = p_unit_test
    Temp op1 = bld.pseudo(aco_opcode::p_unit_test, bld.def(s2));
 
-   //! s2: %op1_2:s[0-1], s2: %op0_2:s[2-3] = p_parallelcopy %op1:s[2-3], %op0:s[0-1]
+   //! s2: %op0_2:s[2-3], s2: %op1_2:s[0-1] = p_parallelcopy %op0:s[0-1], %op1:s[2-3]
    //! p_unit_test %op0_2:s[2-3], %op1_2:s[0-1]
    Operand op(inputs[0]);
    op.setFixed(PhysReg(2));
@@ -103,7 +103,7 @@ BEGIN_TEST(regalloc.precolor.blocking_vector)
    if (!setup_cs("s2 s1", GFX10))
       return;
 
-   //! s2: %tmp0_2:s[2-3], s1: %tmp1_2:s[1] = p_parallelcopy %tmp0:s[0-1], %tmp1:s[2]
+   //! s1: %tmp1_2:s[1], s2: %tmp0_2:s[2-3] = p_parallelcopy %tmp1:s[2], %tmp0:s[0-1]
    //! p_unit_test %tmp1_2:s[1]
    Operand op(inputs[1]);
    op.setFixed(PhysReg(1));
@@ -120,7 +120,7 @@ BEGIN_TEST(regalloc.precolor.vector.test)
    if (!setup_cs("s2 s1 s1", GFX10))
       return;
 
-   //! s1: %tmp2_2:s[0], s2: %tmp0_2:s[2-3] = p_parallelcopy %tmp2:s[3], %tmp0:s[0-1]
+   //! s2: %tmp0_2:s[2-3], s1: %tmp2_2:s[0] = p_parallelcopy %tmp0:s[0-1], %tmp2:s[3]
    //! p_unit_test %tmp0_2:s[2-3]
    Operand op(inputs[0]);
    op.setFixed(PhysReg(2));
@@ -137,7 +137,7 @@ BEGIN_TEST(regalloc.precolor.vector.collect)
    if (!setup_cs("s2 s1 s1", GFX10))
       return;
 
-   //! s1: %tmp1_2:s[0], s1: %tmp2_2:s[1], s2: %tmp0_2:s[2-3] = p_parallelcopy %tmp1:s[2], %tmp2:s[3], %tmp0:s[0-1]
+   //! s2: %tmp0_2:s[2-3], s1: %tmp1_2:s[0], s1: %tmp2_2:s[1] = p_parallelcopy %tmp0:s[0-1], %tmp1:s[2], %tmp2:s[3]
    //! p_unit_test %tmp0_2:s[2-3]
    Operand op(inputs[0]);
    op.setFixed(PhysReg(2));
@@ -154,13 +154,40 @@ BEGIN_TEST(regalloc.precolor.vgpr_move)
    if (!setup_cs("v1 v1", GFX10))
       return;
 
-   //! v1: %tmp0_2:v[1], v1: %tmp1_2:v[0] = p_parallelcopy %tmp0:v[0], %tmp1:v[1]
+   //! v1: %tmp1_2:v[0], v1: %tmp0_2:v[1] = p_parallelcopy %tmp1:v[1], %tmp0:v[0]
    //! p_unit_test %tmp0_2:v[1], %tmp1_2:v[0]
    bld.pseudo(aco_opcode::p_unit_test, inputs[0], Operand(inputs[1], PhysReg(256)));
 
    finish_ra_test(ra_test_policy());
 END_TEST
 
+BEGIN_TEST(regalloc.precolor.multiple_operands)
+   //>> v1: %tmp0:v[0], v1: %tmp1:v[1], v1: %tmp2:v[2], v1: %tmp3:v[3] = p_startpgm
+   if (!setup_cs("v1 v1 v1 v1", GFX10))
+      return;
+
+   //! v1: %tmp3_2:v[0], v1: %tmp0_2:v[1], v1: %tmp1_2:v[2], v1: %tmp2_2:v[3] = p_parallelcopy %tmp3:v[3], %tmp0:v[0], %tmp1:v[1], %tmp2:v[2]
+   //! p_unit_test %tmp3_2:v[0], %tmp0_2:v[1], %tmp1_2:v[2], %tmp2_2:v[3]
+   bld.pseudo(aco_opcode::p_unit_test, Operand(inputs[3], PhysReg(256+0)),
+              Operand(inputs[0], PhysReg(256+1)), Operand(inputs[1], PhysReg(256+2)),
+              Operand(inputs[2], PhysReg(256+3)));
+
+   finish_ra_test(ra_test_policy());
+END_TEST
+
+BEGIN_TEST(regalloc.precolor.different_regs)
+   //>> v1: %tmp0:v[0] = p_startpgm
+   if (!setup_cs("v1", GFX10))
+      return;
+
+   //! v1: %tmp1:v[1], v1: %tmp2:v[2] = p_parallelcopy %tmp0:v[0], %tmp0:v[0]
+   //! p_unit_test %tmp1:v[1], %tmp1:v[1], %tmp1:v[1]
+   bld.pseudo(aco_opcode::p_unit_test, Operand(inputs[0], PhysReg(256+0)),
+              Operand(inputs[0], PhysReg(256+1)), Operand(inputs[0], PhysReg(256+2)));
+
+   finish_ra_test(ra_test_policy());
+END_TEST
+
 BEGIN_TEST(regalloc.scratch_sgpr.create_vector)
    if (!setup_cs("v1 s1", GFX7))
       return;