aco: add p_dual_src_export_gfx11 for dual source blending on GFX11
author Samuel Pitoiset <samuel.pitoiset@gmail.com>
Wed, 16 Nov 2022 14:18:54 +0000 (15:18 +0100)
committer Eric Engestrom <eric@engestrom.ch>
Thu, 17 Nov 2022 14:05:04 +0000 (14:05 +0000)
Dual source blending must be in strict WQM mode.

Cc: 22.3 mesa-stable
Signed-off-by: Samuel Pitoiset <samuel.pitoiset@gmail.com>
Reviewed-by: Rhys Perry <pendingchaos02@gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/19643>
(cherry picked from commit bb90d29660bb44326188809da2deec0675c8264a)

.pick_status.json
src/amd/compiler/aco_insert_exec_mask.cpp
src/amd/compiler/aco_ir.h
src/amd/compiler/aco_lower_to_hw_instr.cpp
src/amd/compiler/aco_opcodes.py
src/amd/compiler/aco_optimizer.cpp
src/amd/compiler/aco_validate.cpp

index f8bd985..e223d99 100644 (file)
         "description": "aco: add p_dual_src_export_gfx11 for dual source blending on GFX11",
         "nominated": true,
         "nomination_type": 0,
-        "resolution": 0,
+        "resolution": 1,
         "main_sha": null,
         "because_sha": null
     },
index 6e7cf49..09f2d63 100644 (file)
@@ -103,7 +103,8 @@ needs_exact(aco_ptr<Instruction>& instr)
        * emitted inside the same block, the main FS will always jump to the PS
        * epilog without considering the exec mask.
        */
-      return instr->isEXP() || instr->opcode == aco_opcode::p_jump_to_epilog;
+      return instr->isEXP() || instr->opcode == aco_opcode::p_jump_to_epilog ||
+             instr->opcode == aco_opcode::p_dual_src_export_gfx11;
    }
 }
 
index 9a10c67..fad6272 100644 (file)
@@ -1857,7 +1857,8 @@ inline bool
 is_dead(const std::vector<uint16_t>& uses, const Instruction* instr)
 {
    if (instr->definitions.empty() || instr->isBranch() ||
-       instr->opcode == aco_opcode::p_init_scratch)
+       instr->opcode == aco_opcode::p_init_scratch ||
+       instr->opcode == aco_opcode::p_dual_src_export_gfx11)
       return false;
 
    if (std::any_of(instr->definitions.begin(), instr->definitions.end(),
index 7984237..08eabda 100644 (file)
@@ -2431,6 +2431,85 @@ lower_to_hw_instr(Program* program)
                }
                break;
             }
+            case aco_opcode::p_dual_src_export_gfx11: {
+               PhysReg dst0 = instr->definitions[0].physReg();
+               PhysReg dst1 = instr->definitions[1].physReg();
+               Definition tmp = instr->definitions[2];
+               Definition exec_tmp = instr->definitions[3];
+               Definition clobber_vcc = instr->definitions[4];
+               Definition clobber_scc = instr->definitions[5];
+
+               assert(tmp.regClass() == v1);
+               assert(exec_tmp.regClass() == bld.lm);
+               assert(clobber_vcc.regClass() == bld.lm && clobber_vcc.physReg() == vcc);
+               assert(clobber_scc.isFixed() && clobber_scc.physReg() == scc);
+
+               bld.sop1(Builder::s_mov, Definition(exec_tmp.physReg(), bld.lm),
+                        Operand(exec, bld.lm));
+               bld.sop1(Builder::s_wqm, Definition(exec, bld.lm), clobber_scc,
+                        Operand(exec, bld.lm));
+
+               uint8_t enabled_channels = 0;
+               Operand mrt0[4], mrt1[4];
+
+               bld.sop1(aco_opcode::s_mov_b32, Definition(clobber_vcc.physReg(), s1),
+                        Operand::c32(0x55555555));
+               if (ctx.program->wave_size == 64)
+                  bld.sop1(aco_opcode::s_mov_b32, Definition(clobber_vcc.physReg().advance(4), s1),
+                           Operand::c32(0x55555555));
+
+               for (unsigned i = 0; i < 4; i++) {
+                  if (instr->operands[i].isUndefined() && instr->operands[i + 4].isUndefined()) {
+                     mrt0[i] = instr->operands[i];
+                     mrt1[i] = instr->operands[i + 4];
+                     continue;
+                  }
+
+                  Operand src0 = instr->operands[i];
+                  Operand src1 = instr->operands[i + 4];
+
+                  /* Swap odd, even lanes of mrt0. */
+                  Builder::Result ret =
+                     bld.vop1_dpp8(aco_opcode::v_mov_b32, Definition(dst0, v1), src0);
+                  for (unsigned j = 0; j < 8; j++) {
+                     ret.instr->dpp8().lane_sel[j] = j ^ 1;
+                  }
+
+                  /* Swap even lanes between mrt0 and mrt1. */
+                  bld.vop2(aco_opcode::v_cndmask_b32, tmp, Operand(dst0, v1), src1,
+                           Operand(clobber_vcc.physReg(), bld.lm));
+                  bld.vop2(aco_opcode::v_cndmask_b32, Definition(dst1, v1), src1, Operand(dst0, v1),
+                           Operand(clobber_vcc.physReg(), bld.lm));
+
+                  /* Swap odd, even lanes of mrt0 again. */
+                  ret = bld.vop1_dpp8(aco_opcode::v_mov_b32, Definition(dst0, v1),
+                                      Operand(tmp.physReg(), v1));
+                  for (unsigned j = 0; j < 8; j++) {
+                     ret.instr->dpp8().lane_sel[j] = j ^ 1;
+                  }
+
+                  mrt0[i] = Operand(dst0, v1);
+                  mrt1[i] = Operand(dst1, v1);
+
+                  enabled_channels |= 1 << i;
+
+                  dst0 = dst0.advance(4);
+                  dst1 = dst1.advance(4);
+               }
+
+               bld.sop1(Builder::s_mov, Definition(exec, bld.lm),
+                        Operand(exec_tmp.physReg(), bld.lm));
+
+               /* Force export all channels when everything is undefined. */
+               if (!enabled_channels)
+                  enabled_channels = 0xf;
+
+               bld.exp(aco_opcode::exp, mrt0[0], mrt0[1], mrt0[2], mrt0[3], enabled_channels,
+                       V_008DFC_SQ_EXP_MRT + 21, false);
+               bld.exp(aco_opcode::exp, mrt1[0], mrt1[1], mrt1[2], mrt1[3], enabled_channels,
+                       V_008DFC_SQ_EXP_MRT + 22, false);
+               break;
+            }
             default: break;
             }
          } else if (instr->isBranch()) {
index cd89b40..2c11cf2 100644 (file)
@@ -340,6 +340,9 @@ opcode("p_jump_to_epilog")
 #dst0=result, dst1=exec_tmp, dst2=clobber_scc, src0=linear_vgpr, src1=attribute, src2=component, src3=dpp_ctrl, src4=m0
 opcode("p_interp_gfx11")
 
+# performs dual source MRTs swizzling and emits exports on GFX11
+opcode("p_dual_src_export_gfx11")
+
 # SOP2 instructions: 2 scalar inputs, 1 scalar output (+optional scc)
 SOP2 = {
   # GFX6, GFX7, GFX8, GFX9, GFX10,GFX11,name
index 0c128db..adb959a 100644 (file)
@@ -673,7 +673,8 @@ alu_can_accept_constant(aco_opcode opcode, unsigned operand)
    case aco_opcode::v_readfirstlane_b32:
    case aco_opcode::p_extract:
    case aco_opcode::p_insert: return operand != 0;
-   case aco_opcode::p_interp_gfx11: return false;
+   case aco_opcode::p_interp_gfx11:
+   case aco_opcode::p_dual_src_export_gfx11: return false;
    default: return true;
    }
 }
index d0367e7..b2aa99d 100644 (file)
@@ -262,6 +262,7 @@ validate_ir(Program* program)
                bool can_be_undef = is_phi(instr) || instr->isEXP() || instr->isReduction() ||
                                    instr->opcode == aco_opcode::p_create_vector ||
                                    instr->opcode == aco_opcode::p_jump_to_epilog ||
+                                   instr->opcode == aco_opcode::p_dual_src_export_gfx11 ||
                                    (instr->opcode == aco_opcode::p_interp_gfx11 && i == 0) ||
                                    (flat && i == 1) || (instr->isMIMG() && (i == 1 || i == 2)) ||
                                    ((instr->isMUBUF() || instr->isMTBUF()) && i == 1) ||
@@ -526,6 +527,26 @@ validate_ir(Program* program)
                            instr->operands[i].isUndefined(),
                         "Other operands of p_jump_to_epilog must be VGPRs or undef", instr.get());
                }
+            } else if (instr->opcode == aco_opcode::p_dual_src_export_gfx11) {
+               check(instr->definitions.size() == 6,
+                     "p_dual_src_export_gfx11 must have 6 definitions", instr.get());
+               check(instr->definitions[2].getTemp().type() == RegType::vgpr &&
+                        instr->definitions[2].getTemp().size() == 1,
+                     "Third definition of p_dual_src_export_gfx11 must be a v1", instr.get());
+               check(instr->definitions[3].getTemp().type() == RegType::sgpr &&
+                        instr->definitions[3].getTemp().size() == 2,
+                     "Fourth definition of p_dual_src_export_gfx11 must be a s2", instr.get());
+               check(instr->definitions[4].physReg() == vcc,
+                     "Fifth definition of p_dual_src_export_gfx11 must be vcc", instr.get());
+               check(instr->definitions[5].physReg() == scc,
+                     "Sixth definition of p_dual_src_export_gfx11 must be scc", instr.get());
+               check(instr->operands.size() == 8, "p_dual_src_export_gfx11 must have 8 operands",
+                     instr.get());
+               for (unsigned i = 0; i < instr->operands.size(); i++) {
+                  check(instr->operands[i].getTemp().type() == RegType::vgpr ||
+                           instr->operands[i].isUndefined(),
+                        "Operands of p_dual_src_export_gfx11 must be VGPRs or undef", instr.get());
+               }
             }
             break;
          }