aco/gfx11: optimize dual source export
author Georg Lehmann <dadschoorse@gmail.com>
Wed, 4 Oct 2023 09:51:21 +0000 (11:51 +0200)
committer Marge Bot <emma+marge@anholt.net>
Thu, 5 Oct 2023 10:37:34 +0000 (10:37 +0000)
We can combine dpp with the v_cndmask_b32.

Foz-DB Navi31:
Totals from 222 (0.28% of 79330) affected shaders:
Instrs: 564392 -> 563373 (-0.18%); split: -0.19%, +0.01%
CodeSize: 2867040 -> 2864728 (-0.08%); split: -0.09%, +0.01%
Latency: 4278957 -> 4275199 (-0.09%); split: -0.09%, +0.00%
InvThroughput: 586636 -> 585824 (-0.14%); split: -0.14%, +0.00%
SClause: 20210 -> 20211 (+0.00%); split: -0.02%, +0.02%
Copies: 39763 -> 39778 (+0.04%); split: -0.13%, +0.17%
PreVGPRs: 13924 -> 13922 (-0.01%)

Reviewed-by: Rhys Perry <pendingchaos02@gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/25541>

src/amd/compiler/aco_instruction_selection.cpp
src/amd/compiler/aco_lower_to_hw_instr.cpp
src/amd/compiler/aco_validate.cpp

index c398488..bf28ae1 100644 (file)
@@ -8057,7 +8057,7 @@ create_fs_dual_src_export_gfx11(isel_context* ctx, const struct aco_export_mrt*
    RegClass type = RegClass(RegType::vgpr, util_bitcount(mrt0->enabled_channels));
    exp->definitions[0] = bld.def(type); /* mrt0 */
    exp->definitions[1] = bld.def(type); /* mrt1 */
-   exp->definitions[2] = bld.def(v1);
+   exp->definitions[2] = bld.def(bld.lm);
    exp->definitions[3] = bld.def(bld.lm);
    exp->definitions[4] = bld.def(bld.lm, vcc);
    exp->definitions[5] = bld.def(s1, scc);
index e26860d..5300708 100644 (file)
@@ -2797,13 +2797,13 @@ lower_to_hw_instr(Program* program)
             case aco_opcode::p_dual_src_export_gfx11: {
                PhysReg dst0 = instr->definitions[0].physReg();
                PhysReg dst1 = instr->definitions[1].physReg();
-               Definition tmp = instr->definitions[2];
-               Definition exec_tmp = instr->definitions[3];
+               Definition exec_tmp = instr->definitions[2];
+               Definition not_vcc_tmp = instr->definitions[3];
                Definition clobber_vcc = instr->definitions[4];
                Definition clobber_scc = instr->definitions[5];
 
-               assert(tmp.regClass() == v1);
                assert(exec_tmp.regClass() == bld.lm);
+               assert(not_vcc_tmp.regClass() == bld.lm);
                assert(clobber_vcc.regClass() == bld.lm && clobber_vcc.physReg() == vcc);
                assert(clobber_scc.isFixed() && clobber_scc.physReg() == scc);
 
@@ -2821,6 +2821,12 @@ lower_to_hw_instr(Program* program)
                   bld.sop1(aco_opcode::s_mov_b32, Definition(clobber_vcc.physReg().advance(4), s1),
                            Operand::c32(0x55555555));
 
+               Operand src_even = Operand(clobber_vcc.physReg(), bld.lm);
+
+               bld.sop1(Builder::s_not, not_vcc_tmp, clobber_scc, src_even);
+
+               Operand src_odd = Operand(not_vcc_tmp.physReg(), bld.lm);
+
                for (unsigned i = 0; i < 4; i++) {
                   if (instr->operands[i].isUndefined() && instr->operands[i + 4].isUndefined()) {
                      mrt0[i] = instr->operands[i];
@@ -2831,22 +2837,14 @@ lower_to_hw_instr(Program* program)
                   Operand src0 = instr->operands[i];
                   Operand src1 = instr->operands[i + 4];
 
-                  uint32_t lane_sel_xor1 = 0;
-                  for (unsigned j = 0; j < 8; j++)
-                     lane_sel_xor1 |= (j ^ 1) << (j * 3);
-
-                  /* Swap odd, even lanes of mrt0. */
-                  bld.vop1_dpp8(aco_opcode::v_mov_b32, Definition(dst0, v1), src0, lane_sel_xor1);
-
-                  /* Swap even lanes between mrt0 and mrt1. */
-                  bld.vop2(aco_opcode::v_cndmask_b32, tmp, Operand(dst0, v1), src1,
-                           Operand(clobber_vcc.physReg(), bld.lm));
-                  bld.vop2(aco_opcode::v_cndmask_b32, Definition(dst1, v1), src1, Operand(dst0, v1),
-                           Operand(clobber_vcc.physReg(), bld.lm));
-
-                  /* Swap odd, even lanes of mrt0 again. */
-                  bld.vop1_dpp8(aco_opcode::v_mov_b32, Definition(dst0, v1),
-                                Operand(tmp.physReg(), v1), lane_sel_xor1);
+                  /*      | even lanes | odd lanes
+                   * mrt0 | src0 even  | src1 even
+                   * mrt1 | src0 odd   | src1 odd
+                   */
+                  bld.vop2_dpp(aco_opcode::v_cndmask_b32, Definition(dst0, v1), src1, src0,
+                               src_even, dpp_row_xmask(1));
+                  bld.vop2_e64_dpp(aco_opcode::v_cndmask_b32, Definition(dst1, v1), src0, src1,
+                                   src_odd, dpp_row_xmask(1));
 
                   mrt0[i] = Operand(dst0, v1);
                   mrt1[i] = Operand(dst1, v1);
index 0525849..4a43cdf 100644 (file)
@@ -565,9 +565,9 @@ validate_ir(Program* program)
             } else if (instr->opcode == aco_opcode::p_dual_src_export_gfx11) {
                check(instr->definitions.size() == 6,
                      "p_dual_src_export_gfx11 must have 6 definitions", instr.get());
-               check(instr->definitions[2].regClass().type() == RegType::vgpr &&
-                        instr->definitions[2].regClass().size() == 1,
-                     "Third definition of p_dual_src_export_gfx11 must be a v1", instr.get());
+               check(instr->definitions[2].regClass() == program->lane_mask,
+                     "Third definition of p_dual_src_export_gfx11 must be a lane mask",
+                     instr.get());
                check(instr->definitions[3].regClass() == program->lane_mask,
                      "Fourth definition of p_dual_src_export_gfx11 must be a lane mask",
                      instr.get());