From 4e55b5b8514c49c303d8af526e9748f39e54133f Mon Sep 17 00:00:00 2001
From: Rhys Perry <pendingchaos02@gmail.com>
Date: Fri, 17 Jun 2022 11:23:00 +0100
Subject: [PATCH] aco: update assembler for GFX11
MIME-Version: 1.0
Content-Type: text/plain; charset=utf8
Content-Transfer-Encoding: 8bit

Signed-off-by: Rhys Perry <pendingchaos02@gmail.com>
Reviewed-by: Daniel SchÃ¼rmann <daniel@schuermann.dev>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/17333>
---
 src/amd/compiler/aco_assembler.cpp | 160 ++++++++++++++++++++++++++-----------
 src/amd/compiler/aco_ir.h          |   3 +-
 2 files changed, 117 insertions(+), 46 deletions(-)

diff --git a/src/amd/compiler/aco_assembler.cpp b/src/amd/compiler/aco_assembler.cpp
index de73b0f..d4e1c06 100644
--- a/src/amd/compiler/aco_assembler.cpp
+++ b/src/amd/compiler/aco_assembler.cpp
@@ -77,6 +77,12 @@ get_mimg_nsa_dwords(const Instruction* instr)
 uint32_t
 reg(asm_context& ctx, PhysReg reg)
 {
+   if (ctx.gfx_level >= GFX11) {
+      if (reg == m0)
+         return sgpr_null.reg();
+      else if (reg == sgpr_null)
+         return m0.reg();
+   }
    return reg.reg();
 }
 
@@ -231,11 +237,11 @@ emit_instruction(asm_context& ctx, std::vector<uint32_t>& out, Instruction* inst
       } else {
          encoding = (0b111101 << 26);
          assert(!smem.nv); /* Non-volatile is not supported on GFX10 */
-         encoding |= smem.dlc ? 1 << 14 : 0;
+         encoding |= smem.dlc ? 1 << (ctx.gfx_level >= GFX11 ? 13 : 14) : 0;
       }
 
       encoding |= opcode << 18;
-      encoding |= smem.glc ? 1 << 16 : 0;
+      encoding |= smem.glc ? 1 << (ctx.gfx_level >= GFX11 ? 14 : 16) : 0;
 
       if (ctx.gfx_level <= GFX9) {
          if (instr->operands.size() >= 2)
@@ -395,28 +401,42 @@ emit_instruction(asm_context& ctx, std::vector<uint32_t>& out, Instruction* inst
    case Format::MUBUF: {
       MUBUF_instruction& mubuf = instr->mubuf();
       uint32_t encoding = (0b111000 << 26);
+      if (ctx.gfx_level >= GFX11 && mubuf.lds) /* GFX11 has separate opcodes for LDS loads */
+         opcode = opcode == 0 ? 0x32 : (opcode + 0x1d);
+      else
+         encoding |= (mubuf.lds ? 1 : 0) << 16;
       encoding |= opcode << 18;
-      encoding |= (mubuf.lds ? 1 : 0) << 16;
       encoding |= (mubuf.glc ? 1 : 0) << 14;
-      encoding |= (mubuf.idxen ? 1 : 0) << 13;
+      if (ctx.gfx_level <= GFX10_3)
+         encoding |= (mubuf.idxen ? 1 : 0) << 13;
       assert(!mubuf.addr64 || ctx.gfx_level <= GFX7);
       if (ctx.gfx_level == GFX6 || ctx.gfx_level == GFX7)
          encoding |= (mubuf.addr64 ? 1 : 0) << 15;
-      encoding |= (mubuf.offen ? 1 : 0) << 12;
+      if (ctx.gfx_level <= GFX10_3)
+         encoding |= (mubuf.offen ? 1 : 0) << 12;
       if (ctx.gfx_level == GFX8 || ctx.gfx_level == GFX9) {
          assert(!mubuf.dlc); /* Device-level coherent is not supported on GFX9 and lower */
          encoding |= (mubuf.slc ? 1 : 0) << 17;
+      } else if (ctx.gfx_level >= GFX11) {
+         encoding |= (mubuf.slc ? 1 : 0) << 12;
+         encoding |= (mubuf.dlc ? 1 : 0) << 13;
       } else if (ctx.gfx_level >= GFX10) {
          encoding |= (mubuf.dlc ? 1 : 0) << 15;
       }
       encoding |= 0x0FFF & mubuf.offset;
       out.push_back(encoding);
       encoding = 0;
-      if (ctx.gfx_level <= GFX7 || ctx.gfx_level >= GFX10) {
+      if (ctx.gfx_level <= GFX7 || (ctx.gfx_level >= GFX10 && ctx.gfx_level <= GFX10_3)) {
          encoding |= (mubuf.slc ? 1 : 0) << 22;
       }
       encoding |= reg(ctx, instr->operands[2]) << 24;
-      encoding |= (mubuf.tfe ? 1 : 0) << 23;
+      if (ctx.gfx_level >= GFX11) {
+         encoding |= (mubuf.tfe ? 1 : 0) << 21;
+         encoding |= (mubuf.offen ? 1 : 0) << 22;
+         encoding |= (mubuf.idxen ? 1 : 0) << 23;
+      } else {
+         encoding |= (mubuf.tfe ? 1 : 0) << 23;
+      }
       encoding |= (reg(ctx, instr->operands[0]) >> 2) << 16;
       if (instr->operands.size() > 3 && !mubuf.lds)
          encoding |= reg(ctx, instr->operands[3], 8) << 8;
@@ -433,14 +453,22 @@ emit_instruction(asm_context& ctx, std::vector<uint32_t>& out, Instruction* inst
       uint32_t encoding = (0b111010 << 26);
       assert(img_format <= 0x7F);
       assert(!mtbuf.dlc || ctx.gfx_level >= GFX10);
-      encoding |= (mtbuf.dlc ? 1 : 0) << 15; /* DLC bit replaces one bit of the OPCODE on GFX10 */
+      if (ctx.gfx_level >= GFX11) {
+         encoding |= (mtbuf.slc ? 1 : 0) << 12;
+         encoding |= (mtbuf.dlc ? 1 : 0) << 13;
+      } else {
+         /* DLC bit replaces one bit of the OPCODE on GFX10 */
+         encoding |= (mtbuf.dlc ? 1 : 0) << 15;
+      }
+      if (ctx.gfx_level <= GFX10_3) {
+         encoding |= (mtbuf.idxen ? 1 : 0) << 13;
+         encoding |= (mtbuf.offen ? 1 : 0) << 12;
+      }
       encoding |= (mtbuf.glc ? 1 : 0) << 14;
-      encoding |= (mtbuf.idxen ? 1 : 0) << 13;
-      encoding |= (mtbuf.offen ? 1 : 0) << 12;
       encoding |= 0x0FFF & mtbuf.offset;
       encoding |= (img_format << 19); /* Handles both the GFX10 FORMAT and the old NFMT+DFMT */
 
-      if (ctx.gfx_level == GFX8 || ctx.gfx_level == GFX9) {
+      if (ctx.gfx_level == GFX8 || ctx.gfx_level == GFX9 || ctx.gfx_level >= GFX11) {
          encoding |= opcode << 15;
       } else {
          encoding |= (opcode & 0x07) << 16; /* 3 LSBs of 4-bit OPCODE */
@@ -450,8 +478,14 @@ emit_instruction(asm_context& ctx, std::vector<uint32_t>& out, Instruction* inst
       encoding = 0;
 
       encoding |= reg(ctx, instr->operands[2]) << 24;
-      encoding |= (mtbuf.tfe ? 1 : 0) << 23;
-      encoding |= (mtbuf.slc ? 1 : 0) << 22;
+      if (ctx.gfx_level >= GFX11) {
+         encoding |= (mtbuf.tfe ? 1 : 0) << 21;
+         encoding |= (mtbuf.offen ? 1 : 0) << 22;
+         encoding |= (mtbuf.idxen ? 1 : 0) << 23;
+      } else {
+         encoding |= (mtbuf.tfe ? 1 : 0) << 23;
+         encoding |= (mtbuf.slc ? 1 : 0) << 22;
+      }
       encoding |= (reg(ctx, instr->operands[0]) >> 2) << 16;
       if (instr->operands.size() > 3)
          encoding |= reg(ctx, instr->operands[3], 8) << 8;
@@ -472,27 +506,44 @@ emit_instruction(asm_context& ctx, std::vector<uint32_t>& out, Instruction* inst
 
       MIMG_instruction& mimg = instr->mimg();
       uint32_t encoding = (0b111100 << 26);
-      encoding |= mimg.slc ? 1 << 25 : 0;
-      encoding |= (opcode & 0x7f) << 18;
-      encoding |= (opcode >> 7) & 1;
-      encoding |= mimg.lwe ? 1 << 17 : 0;
-      encoding |= mimg.tfe ? 1 << 16 : 0;
-      encoding |= mimg.glc ? 1 << 13 : 0;
-      encoding |= mimg.unrm ? 1 << 12 : 0;
-      if (ctx.gfx_level <= GFX9) {
-         assert(!mimg.dlc); /* Device-level coherent is not supported on GFX9 and lower */
-         assert(!mimg.r128);
-         encoding |= mimg.a16 ? 1 << 15 : 0;
-         encoding |= mimg.da ? 1 << 14 : 0;
+      if (ctx.gfx_level >= GFX11) { /* GFX11: rearranges most fields */
+         assert(nsa_dwords <= 1);
+         encoding |= nsa_dwords;
+         encoding |= mimg.dim << 2;
+         encoding |= mimg.unrm ? 1 << 7 : 0;
+         encoding |= (0xF & mimg.dmask) << 8;
+         encoding |= mimg.slc ? 1 << 12 : 0;
+         encoding |= mimg.dlc ? 1 << 13 : 0;
+         encoding |= mimg.glc ? 1 << 14 : 0;
+         encoding |= mimg.r128 ? 1 << 15 : 0;
+         encoding |= mimg.a16 ? 1 << 16 : 0;
+         encoding |= mimg.d16 ? 1 << 17 : 0;
+         encoding |= (opcode & 0xFF) << 18;
       } else {
-         encoding |= mimg.r128 ? 1 << 15
-                               : 0; /* GFX10: A16 moved to 2nd word, R128 replaces it in 1st word */
-         encoding |= nsa_dwords << 1;
-         encoding |= mimg.dim << 3; /* GFX10: dimensionality instead of declare array */
-         encoding |= mimg.dlc ? 1 << 7 : 0;
+         encoding |= mimg.slc ? 1 << 25 : 0;
+         encoding |= (opcode & 0x7f) << 18;
+         encoding |= (opcode >> 7) & 1;
+         encoding |= mimg.lwe ? 1 << 17 : 0;
+         encoding |= mimg.tfe ? 1 << 16 : 0;
+         encoding |= mimg.glc ? 1 << 13 : 0;
+         encoding |= mimg.unrm ? 1 << 12 : 0;
+         if (ctx.gfx_level <= GFX9) {
+            assert(!mimg.dlc); /* Device-level coherent is not supported on GFX9 and lower */
+            assert(!mimg.r128);
+            encoding |= mimg.a16 ? 1 << 15 : 0;
+            encoding |= mimg.da ? 1 << 14 : 0;
+         } else {
+            encoding |= mimg.r128
+                           ? 1 << 15
+                           : 0; /* GFX10: A16 moved to 2nd word, R128 replaces it in 1st word */
+            encoding |= nsa_dwords << 1;
+            encoding |= mimg.dim << 3; /* GFX10: dimensionality instead of declare array */
+            encoding |= mimg.dlc ? 1 << 7 : 0;
+         }
+         encoding |= (0xF & mimg.dmask) << 8;
       }
-      encoding |= (0xF & mimg.dmask) << 8;
       out.push_back(encoding);
+
       encoding = reg(ctx, instr->operands[3], 8); /* VADDR */
       if (!instr->definitions.empty()) {
          encoding |= reg(ctx, instr->definitions[0], 8) << 8; /* VDATA */
@@ -500,14 +551,23 @@ emit_instruction(asm_context& ctx, std::vector<uint32_t>& out, Instruction* inst
          encoding |= reg(ctx, instr->operands[2], 8) << 8; /* VDATA */
       }
       encoding |= (0x1F & (reg(ctx, instr->operands[0]) >> 2)) << 16; /* T# (resource) */
-      if (!instr->operands[1].isUndefined())
-         encoding |= (0x1F & (reg(ctx, instr->operands[1]) >> 2)) << 21; /* sampler */
 
       assert(!mimg.d16 || ctx.gfx_level >= GFX9);
-      encoding |= mimg.d16 ? 1 << 31 : 0;
-      if (ctx.gfx_level >= GFX10) {
-         /* GFX10: A16 still exists, but is in a different place */
-         encoding |= mimg.a16 ? 1 << 30 : 0;
+      if (ctx.gfx_level >= GFX11) {
+         if (!instr->operands[1].isUndefined())
+            encoding |= (0x1F & (reg(ctx, instr->operands[1]) >> 2)) << 26; /* sampler */
+
+         encoding |= mimg.tfe ? 1 << 21 : 0;
+         encoding |= mimg.lwe ? 1 << 22 : 0;
+      } else {
+         if (!instr->operands[1].isUndefined())
+            encoding |= (0x1F & (reg(ctx, instr->operands[1]) >> 2)) << 21; /* sampler */
+
+         encoding |= mimg.d16 ? 1 << 31 : 0;
+         if (ctx.gfx_level >= GFX10) {
+            /* GFX10: A16 still exists, but is in a different place */
+            encoding |= mimg.a16 ? 1 << 30 : 0;
+         }
       }
 
       out.push_back(encoding);
@@ -542,15 +602,15 @@ emit_instruction(asm_context& ctx, std::vector<uint32_t>& out, Instruction* inst
          encoding |= flat.offset & 0xfff;
       }
       if (instr->isScratch())
-         encoding |= 1 << 14;
+         encoding |= 1 << (ctx.gfx_level >= GFX11 ? 16 : 14);
       else if (instr->isGlobal())
-         encoding |= 2 << 14;
+         encoding |= 2 << (ctx.gfx_level >= GFX11 ? 16 : 14);
       encoding |= flat.lds ? 1 << 13 : 0;
-      encoding |= flat.glc ? 1 << 16 : 0;
-      encoding |= flat.slc ? 1 << 17 : 0;
+      encoding |= flat.glc ? 1 << (ctx.gfx_level >= GFX11 ? 14 : 16) : 0;
+      encoding |= flat.slc ? 1 << (ctx.gfx_level >= GFX11 ? 15 : 17) : 0;
       if (ctx.gfx_level >= GFX10) {
          assert(!flat.nv);
-         encoding |= flat.dlc ? 1 << 12 : 0;
+         encoding |= flat.dlc ? 1 << (ctx.gfx_level >= GFX11 ? 13 : 12) : 0;
       } else {
          assert(!flat.dlc);
       }
@@ -575,7 +635,10 @@ emit_instruction(asm_context& ctx, std::vector<uint32_t>& out, Instruction* inst
          else
             encoding |= reg(ctx, sgpr_null) << 16;
       }
-      encoding |= flat.nv ? 1 << 23 : 0;
+      if (ctx.gfx_level >= GFX11 && instr->isScratch())
+         encoding |= !instr->operands[0].isUndefined() ? 1 << 23 : 0;
+      else
+         encoding |= flat.nv ? 1 << 23 : 0;
       out.push_back(encoding);
       break;
    }
@@ -588,9 +651,13 @@ emit_instruction(asm_context& ctx, std::vector<uint32_t>& out, Instruction* inst
          encoding = (0b111110 << 26);
       }
 
-      encoding |= exp.valid_mask ? 0b1 << 12 : 0;
+      if (ctx.gfx_level >= GFX11) {
+         encoding |= exp.row_en ? 0b1 << 13 : 0;
+      } else {
+         encoding |= exp.valid_mask ? 0b1 << 12 : 0;
+         encoding |= exp.compressed ? 0b1 << 10 : 0;
+      }
       encoding |= exp.done ? 0b1 << 11 : 0;
-      encoding |= exp.compressed ? 0b1 << 10 : 0;
       encoding |= exp.dest << 4;
       encoding |= exp.enabled_mask;
       out.push_back(encoding);
@@ -607,6 +674,9 @@ emit_instruction(asm_context& ctx, std::vector<uint32_t>& out, Instruction* inst
          unreachable("Pseudo instructions should be lowered before assembly.");
       break;
    default:
+      /* TODO: VOP3/VOP3P can use DPP8/16 on GFX11 (encoding of src0 and DPP8/16 word seems same
+       * except abs/neg is ignored). src2 cannot be literal and src0/src1 must be VGPR.
+       */
       if (instr->isVOP3()) {
          VOP3_instruction& vop3 = instr->vop3();
 
diff --git a/src/amd/compiler/aco_ir.h b/src/amd/compiler/aco_ir.h
index 585238b..f320763 100644
--- a/src/amd/compiler/aco_ir.h
+++ b/src/amd/compiler/aco_ir.h
@@ -1659,7 +1659,8 @@ struct Export_instruction : public Instruction {
    bool compressed : 1;
    bool done : 1;
    bool valid_mask : 1;
-   uint8_t padding0 : 5;
+   bool row_en : 1;
+   uint8_t padding0 : 4;
    uint8_t padding1;
 };
 static_assert(sizeof(Export_instruction) == sizeof(Instruction) + 4, "Unexpected padding");
-- 
2.7.4