From 34424b81df6e5ffb2d22c572864ab6f6b4ac1abb Mon Sep 17 00:00:00 2001
From: Rhys Perry <pendingchaos02@gmail.com>
Date: Fri, 7 Feb 2020 11:55:43 +0000
Subject: [PATCH] aco: make PhysReg in units of bytes
MIME-Version: 1.0
Content-Type: text/plain; charset=utf8
Content-Transfer-Encoding: 8bit

Reviewed-by: Daniel Schürmann <daniel@schuermann.dev>
Reviewed-By: Timur Kristóf <timur.kristof@gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/4002>
---
 src/amd/compiler/aco_assembler.cpp           |  2 +-
 src/amd/compiler/aco_insert_waitcnt.cpp      |  4 +--
 src/amd/compiler/aco_ir.h                    | 18 ++++++------
 src/amd/compiler/aco_lower_to_hw_instr.cpp   | 10 +++----
 src/amd/compiler/aco_print_ir.cpp            |  2 +-
 src/amd/compiler/aco_register_allocation.cpp | 42 ++++++++++++++--------------
 6 files changed, 40 insertions(+), 38 deletions(-)

diff --git a/src/amd/compiler/aco_assembler.cpp b/src/amd/compiler/aco_assembler.cpp
index 9544fc3..c46208b 100644
--- a/src/amd/compiler/aco_assembler.cpp
+++ b/src/amd/compiler/aco_assembler.cpp
@@ -156,7 +156,7 @@ void emit_instruction(asm_context& ctx, std::vector<uint32_t>& out, Instruction*
          encoding |= instr->operands.size() ? (instr->operands[0].physReg() >> 1) << 9 : 0;
          if (instr->operands.size() >= 2) {
             if (!instr->operands[1].isConstant() || instr->operands[1].constantValue() >= 1024) {
-               encoding |= instr->operands[1].physReg().reg;
+               encoding |= instr->operands[1].physReg().reg();
             } else {
                encoding |= instr->operands[1].constantValue() >> 2;
                encoding |= 1 << 8;
diff --git a/src/amd/compiler/aco_insert_waitcnt.cpp b/src/amd/compiler/aco_insert_waitcnt.cpp
index c0a93e3..fc874ae 100644
--- a/src/amd/compiler/aco_insert_waitcnt.cpp
+++ b/src/amd/compiler/aco_insert_waitcnt.cpp
@@ -685,7 +685,7 @@ void insert_wait_entry(wait_ctx& ctx, PhysReg reg, RegClass rc, wait_event event
    wait_entry new_entry(event, imm, !rc.is_linear(), wait_on_read);
 
    for (unsigned i = 0; i < rc.size(); i++) {
-      auto it = ctx.gpr_map.emplace(PhysReg{reg.reg+i}, new_entry);
+      auto it = ctx.gpr_map.emplace(PhysReg{reg.reg()+i}, new_entry);
       if (!it.second)
          it.first->second.join(new_entry);
    }
@@ -696,7 +696,7 @@ void insert_wait_entry(wait_ctx& ctx, PhysReg reg, RegClass rc, wait_event event
          unsigned i = u_bit_scan(&counters_todo);
          ctx.unwaited_instrs[i].insert(std::make_pair(ctx.gen_instr, 0u));
          for (unsigned j = 0; j < rc.size(); j++)
-            ctx.reg_instrs[i][PhysReg{reg.reg+j}].insert(ctx.gen_instr);
+            ctx.reg_instrs[i][PhysReg{reg.reg()+j}].insert(ctx.gen_instr);
       }
    }
 }
diff --git a/src/amd/compiler/aco_ir.h b/src/amd/compiler/aco_ir.h
index a625250..986b2d6 100644
--- a/src/amd/compiler/aco_ir.h
+++ b/src/amd/compiler/aco_ir.h
@@ -267,10 +267,12 @@ private:
  */
 struct PhysReg {
    constexpr PhysReg() = default;
-   explicit constexpr PhysReg(unsigned r) : reg(r) {}
-   constexpr operator unsigned() const { return reg; }
+   explicit constexpr PhysReg(unsigned r) : reg_b(r << 2) {}
+   constexpr unsigned reg() const { return reg_b >> 2; }
+   constexpr unsigned byte() const { return reg_b & 0x3; }
+   constexpr operator unsigned() const { return reg(); }
 
-   uint16_t reg = 0;
+   uint16_t reg_b = 0;
 };
 
 /* helper expressions for special registers */
@@ -475,12 +477,12 @@ public:
    constexpr uint64_t constantValue64(bool signext=false) const noexcept
    {
       if (is64BitConst_) {
-         if (reg_.reg <= 192)
-            return reg_.reg - 128;
-         else if (reg_.reg <= 208)
-            return 0xFFFFFFFFFFFFFFFF - (reg_.reg - 193);
+         if (reg_ <= 192)
+            return reg_ - 128;
+         else if (reg_ <= 208)
+            return 0xFFFFFFFFFFFFFFFF - (reg_ - 193);
 
-         switch (reg_.reg) {
+         switch (reg_) {
          case 240:
             return 0x3FE0000000000000;
          case 241:
diff --git a/src/amd/compiler/aco_lower_to_hw_instr.cpp b/src/amd/compiler/aco_lower_to_hw_instr.cpp
index 606f2fd..0dcdf70 100644
--- a/src/amd/compiler/aco_lower_to_hw_instr.cpp
+++ b/src/amd/compiler/aco_lower_to_hw_instr.cpp
@@ -195,15 +195,15 @@ void emit_int64_op(lower_context *ctx, PhysReg dst_reg, PhysReg src0_reg, PhysRe
 {
    Builder bld(ctx->program, &ctx->instructions);
    Definition dst[] = {Definition(dst_reg, v1), Definition(PhysReg{dst_reg+1}, v1)};
-   RegClass src0_rc = src0_reg.reg >= 256 ? v1 : s1;
+   RegClass src0_rc = src0_reg.reg() >= 256 ? v1 : s1;
    Operand src0[] = {Operand(src0_reg, src0_rc), Operand(PhysReg{src0_reg+1}, src0_rc)};
    Operand src1[] = {Operand(src1_reg, v1), Operand(PhysReg{src1_reg+1}, v1)};
-   Operand src0_64 = Operand(src0_reg, src0_reg.reg >= 256 ? v2 : s2);
+   Operand src0_64 = Operand(src0_reg, src0_reg.reg() >= 256 ? v2 : s2);
    Operand src1_64 = Operand(src1_reg, v2);
 
    if (src0_rc == s1 &&
        (op == imul64 || op == umin64 || op == umax64 || op == imin64 || op == imax64)) {
-      assert(vtmp.reg != 0);
+      assert(vtmp.reg() != 0);
       bld.vop1(aco_opcode::v_mov_b32, Definition(vtmp, v1), src0[0]);
       bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg{vtmp+1}, v1), src0[1]);
       src0_reg = vtmp;
@@ -211,7 +211,7 @@ void emit_int64_op(lower_context *ctx, PhysReg dst_reg, PhysReg src0_reg, PhysRe
       src0[1] = Operand(PhysReg{vtmp+1}, v1);
       src0_64 = Operand(vtmp, v2);
    } else if (src0_rc == s1 && op == iadd64) {
-      assert(vtmp.reg != 0);
+      assert(vtmp.reg() != 0);
       bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg{vtmp+1}, v1), src0[1]);
       src0[1] = Operand(PhysReg{vtmp+1}, v1);
    }
@@ -330,7 +330,7 @@ void emit_op(lower_context *ctx, PhysReg dst_reg, PhysReg src0_reg, PhysReg src1
    Builder bld(ctx->program, &ctx->instructions);
    RegClass rc = RegClass(RegType::vgpr, size);
    Definition dst(dst_reg, rc);
-   Operand src0(src0_reg, RegClass(src0_reg.reg >= 256 ? RegType::vgpr : RegType::sgpr, size));
+   Operand src0(src0_reg, RegClass(src0_reg.reg() >= 256 ? RegType::vgpr : RegType::sgpr, size));
    Operand src1(src1_reg, rc);
 
    aco_opcode opcode = get_reduce_opcode(ctx->program->chip_class, op);
diff --git a/src/amd/compiler/aco_print_ir.cpp b/src/amd/compiler/aco_print_ir.cpp
index 8f89236..7564b52 100644
--- a/src/amd/compiler/aco_print_ir.cpp
+++ b/src/amd/compiler/aco_print_ir.cpp
@@ -126,7 +126,7 @@ static void print_operand(const Operand *operand, FILE *output)
    if (operand->isLiteral()) {
       fprintf(output, "0x%x", operand->constantValue());
    } else if (operand->isConstant()) {
-      print_constant(operand->physReg().reg, output);
+      print_constant(operand->physReg().reg(), output);
    } else if (operand->isUndefined()) {
       print_reg_class(operand->regClass(), output);
       fprintf(output, "undef");
diff --git a/src/amd/compiler/aco_register_allocation.cpp b/src/amd/compiler/aco_register_allocation.cpp
index c3726ac..1a67751 100644
--- a/src/amd/compiler/aco_register_allocation.cpp
+++ b/src/amd/compiler/aco_register_allocation.cpp
@@ -212,7 +212,7 @@ void update_renames(ra_ctx& ctx, RegisterFile& reg_file,
       // FIXME: if a definition got moved, change the target location and remove the parallelcopy
       copy.second.setTemp(Temp(ctx.program->allocateId(), copy.second.regClass()));
       ctx.assignments[copy.second.tempId()] = {copy.second.physReg(), copy.second.regClass()};
-      for (unsigned i = copy.second.physReg().reg; i < copy.second.physReg() + copy.second.size(); i++)
+      for (unsigned i = copy.second.physReg().reg(); i < copy.second.physReg() + copy.second.size(); i++)
          reg_file[i] = copy.second.tempId();
       /* check if we moved an operand */
       for (Operand& op : instr->operands) {
@@ -223,8 +223,8 @@ void update_renames(ra_ctx& ctx, RegisterFile& reg_file,
             for (std::pair<Operand, Definition>& pc : parallelcopies) {
                PhysReg def_reg = pc.second.physReg();
                omit_renaming &= def_reg > copy.first.physReg() ?
-                                (copy.first.physReg() + copy.first.size() <= def_reg.reg) :
-                                (def_reg + pc.second.size() <= copy.first.physReg().reg);
+                                (copy.first.physReg() + copy.first.size() <= def_reg.reg()) :
+                                (def_reg + pc.second.size() <= copy.first.physReg().reg());
             }
             if (omit_renaming)
                continue;
@@ -492,7 +492,7 @@ std::pair<PhysReg, bool> get_reg_impl(ra_ctx& ctx,
           instr->operands[j].physReg() >= lb &&
           instr->operands[j].physReg() < ub) {
          assert(instr->operands[j].isFixed());
-         assert(reg_file[instr->operands[j].physReg().reg] == 0);
+         assert(reg_file[instr->operands[j].physReg()] == 0);
          reg_file.fill(instr->operands[j].physReg(), instr->operands[j].size(), 0xFFFFFFFF);
          killed_ops += instr->operands[j].getTemp().size();
       }
@@ -879,7 +879,7 @@ bool get_reg_specified(ra_ctx& ctx,
       ub = ctx.program->max_reg_demand.sgpr;
    }
 
-   uint32_t reg_lo = reg.reg;
+   uint32_t reg_lo = reg.reg();
    uint32_t reg_hi = reg + (size - 1);
 
    if (reg_lo < lb || reg_hi >= ub || reg_lo > reg_hi)
@@ -930,7 +930,7 @@ void handle_pseudo(ra_ctx& ctx,
       return;
 
    Pseudo_instruction *pi = (Pseudo_instruction *)instr;
-   if (reg_file[scc.reg]) {
+   if (reg_file[scc.reg()]) {
       pi->tmp_in_scc = true;
 
       int reg = ctx.max_used_sgpr;
@@ -1385,7 +1385,7 @@ void register_allocation(Program *program, std::vector<std::set<Temp>> live_out_
       /* fill in sgpr_live_in */
       for (unsigned i = 0; i <= ctx.max_used_sgpr; i++)
          sgpr_live_in[block.index][i] = register_file[i];
-      sgpr_live_in[block.index][127] = register_file[scc.reg];
+      sgpr_live_in[block.index][127] = register_file[scc.reg()];
 
       /* Handle all other instructions of the block */
       for (; it != block.instructions.end(); ++it) {
@@ -1445,8 +1445,8 @@ void register_allocation(Program *program, std::vector<std::set<Temp>> live_out_
 
                } else {
                   /* check if target reg is blocked, and move away the blocking var */
-                  if (register_file[operand.physReg().reg]) {
-                     uint32_t blocking_id = register_file[operand.physReg().reg];
+                  if (register_file[operand.physReg().reg()]) {
+                     uint32_t blocking_id = register_file[operand.physReg().reg()];
                      RegClass rc = ctx.assignments[blocking_id].second;
                      Operand pc_op = Operand(Temp{blocking_id, rc});
                      pc_op.setFixed(operand.physReg());
@@ -1503,7 +1503,7 @@ void register_allocation(Program *program, std::vector<std::set<Temp>> live_out_
                    (instr->isVMEM() && i == 3 && program->chip_class == GFX6) ||
                    (instr->format == Format::DS && static_cast<DS_instruction*>(instr.get())->gds)) {
                   for (unsigned j = 0; j < operand.size(); j++)
-                     ctx.war_hint.set(operand.physReg().reg + j);
+                     ctx.war_hint.set(operand.physReg().reg() + j);
                }
             }
             std::map<unsigned, phi_info>::iterator phi = phi_map.find(operand.getTemp().id());
@@ -1563,11 +1563,11 @@ void register_allocation(Program *program, std::vector<std::set<Temp>> live_out_
 
             adjust_max_used_regs(ctx, definition.regClass(), definition.physReg());
             /* check if the target register is blocked */
-            if (register_file[definition.physReg().reg] != 0) {
+            if (register_file[definition.physReg().reg()] != 0) {
                /* create parallelcopy pair to move blocking var */
                Temp tmp = {register_file[definition.physReg()], ctx.assignments[register_file[definition.physReg()]].second};
                Operand pc_op = Operand(tmp);
-               pc_op.setFixed(ctx.assignments[register_file[definition.physReg().reg]].first);
+               pc_op.setFixed(ctx.assignments[register_file[definition.physReg().reg()]].first);
                RegClass rc = pc_op.regClass();
                tmp = Temp{program->allocateId(), rc};
                Definition pc_def = Definition(tmp);
@@ -1621,7 +1621,7 @@ void register_allocation(Program *program, std::vector<std::set<Temp>> live_out_
                continue;
 
             /* find free reg */
-            if (definition.hasHint() && register_file[definition.physReg().reg] == 0)
+            if (definition.hasHint() && register_file[definition.physReg().reg()] == 0)
                definition.setFixed(definition.physReg());
             else if (instr->opcode == aco_opcode::p_split_vector) {
                PhysReg reg = PhysReg{instr->operands[0].physReg() + i * definition.size()};
@@ -1632,7 +1632,7 @@ void register_allocation(Program *program, std::vector<std::set<Temp>> live_out_
                PhysReg reg;
                if (instr->operands[0].isKillBeforeDef() && instr->operands[0].getTemp().type() == definition.getTemp().type()) {
                   reg = instr->operands[0].physReg();
-                  assert(register_file[reg.reg] == 0);
+                  assert(register_file[reg.reg()] == 0);
                } else {
                   reg = get_reg(ctx, register_file, definition.regClass(), parallelcopy, instr);
                }
@@ -1642,8 +1642,8 @@ void register_allocation(Program *program, std::vector<std::set<Temp>> live_out_
                if (instr->operands[0].isKillBeforeDef() &&
                    instr->operands[0].getTemp().type() == definition.getTemp().type()) {
                   reg = instr->operands[0].physReg();
-                  reg.reg += definition.size() * instr->operands[1].constantValue();
-                  assert(register_file[reg.reg] == 0);
+                  reg = PhysReg(reg.reg() + definition.size() * instr->operands[1].constantValue());
+                  assert(register_file[reg.reg()] == 0);
                } else {
                   reg = get_reg(ctx, register_file, definition.regClass(), parallelcopy, instr);
                }
@@ -1676,7 +1676,7 @@ void register_allocation(Program *program, std::vector<std::set<Temp>> live_out_
                       op.getTemp().type() == definition.getTemp().type() &&
                       ctx.assignments.find(op.tempId()) != ctx.assignments.end()) {
                      PhysReg reg = ctx.assignments[op.tempId()].first;
-                     reg.reg = reg - k + offset;
+                     reg = PhysReg(reg.reg() - k + offset);
                      if (get_reg_specified(ctx, register_file, definition.regClass(), parallelcopy, instr, reg)) {
                         definition.setFixed(reg);
                         break;
@@ -1688,7 +1688,7 @@ void register_allocation(Program *program, std::vector<std::set<Temp>> live_out_
                   std::pair<PhysReg, bool> res = get_reg_vec(ctx, register_file, vec->definitions[0].regClass());
                   PhysReg reg = res.first;
                   if (res.second) {
-                     reg.reg += offset;
+                     reg = PhysReg(reg.reg() + offset);
                   } else {
                      reg = get_reg(ctx, register_file, definition.regClass(), parallelcopy, instr);
                   }
@@ -1726,17 +1726,17 @@ void register_allocation(Program *program, std::vector<std::set<Temp>> live_out_
          if (!parallelcopy.empty()) {
             aco_ptr<Pseudo_instruction> pc;
             pc.reset(create_instruction<Pseudo_instruction>(aco_opcode::p_parallelcopy, Format::PSEUDO, parallelcopy.size(), parallelcopy.size()));
-            bool temp_in_scc = register_file[scc.reg];
+            bool temp_in_scc = register_file[scc.reg()];
             bool sgpr_operands_alias_defs = false;
             uint64_t sgpr_operands[4] = {0, 0, 0, 0};
             for (unsigned i = 0; i < parallelcopy.size(); i++) {
                if (temp_in_scc && parallelcopy[i].first.isTemp() && parallelcopy[i].first.getTemp().type() == RegType::sgpr) {
                   if (!sgpr_operands_alias_defs) {
-                     unsigned reg = parallelcopy[i].first.physReg().reg;
+                     unsigned reg = parallelcopy[i].first.physReg().reg();
                      unsigned size = parallelcopy[i].first.getTemp().size();
                      sgpr_operands[reg / 64u] |= ((1u << size) - 1) << (reg % 64u);
 
-                     reg = parallelcopy[i].second.physReg().reg;
+                     reg = parallelcopy[i].second.physReg().reg();
                      size = parallelcopy[i].second.getTemp().size();
                      if (sgpr_operands[reg / 64u] & ((1u << size) - 1) << (reg % 64u))
                         sgpr_operands_alias_defs = true;
-- 
2.7.4