aco/gfx11: fix FS input loads in quad-divergent control flow

author Rhys Perry <pendingchaos02@gmail.com>
Wed, 26 Oct 2022 20:11:31 +0000 (21:11 +0100)

committer Marge Bot <emma+marge@anholt.net>
Tue, 1 Nov 2022 12:42:43 +0000 (12:42 +0000)
diff --git a/src/amd/compiler/aco_builder_h.py b/src/amd/compiler/aco_builder_h.py

index af91669..cff50c8 100644 (file)
--- a/src/amd/compiler/aco_builder_h.py
+++ b/src/amd/compiler/aco_builder_h.py
@@ -522,7 +522,7 @@ public:
     }
  <%
  import itertools
-formats = [("pseudo", [Format.PSEUDO], 'Pseudo_instruction', list(itertools.product(range(5), range(6))) + [(8, 1), (1, 8)]),
+formats = [("pseudo", [Format.PSEUDO], 'Pseudo_instruction', list(itertools.product(range(5), range(6))) + [(8, 1), (1, 8), (2, 6)]),
             ("sop1", [Format.SOP1], 'SOP1_instruction', [(0, 1), (1, 0), (1, 1), (2, 1), (3, 2)]),
             ("sop2", [Format.SOP2], 'SOP2_instruction', itertools.product([1, 2], [2, 3])),
             ("sopk", [Format.SOPK], 'SOPK_instruction', itertools.product([0, 1, 2], [0, 1])),
diff --git a/src/amd/compiler/aco_instruction_selection.cpp b/src/amd/compiler/aco_instruction_selection.cpp

index 352f793..399fbc7 100644 (file)
--- a/src/amd/compiler/aco_instruction_selection.cpp
+++ b/src/amd/compiler/aco_instruction_selection.cpp
@@ -72,6 +72,8 @@ struct if_context {
     bool divergent_old;
     bool exec_potentially_empty_discard_old;
     bool exec_potentially_empty_break_old;
+   bool had_divergent_discard_old;
+   bool had_divergent_discard_then;
     uint16_t exec_potentially_empty_break_depth_old;
  
     unsigned BB_if_idx;
@@ -5306,6 +5308,13 @@ visit_store_output(isel_context* ctx, nir_intrinsic_instr* instr)
     }
  }
  
+bool
+in_exec_divergent_or_in_loop(isel_context* ctx)
+{
+   return ctx->block->loop_nest_depth || ctx->cf_info.parent_if.is_divergent ||
+          ctx->cf_info.had_divergent_discard;
+}
+
  void
  emit_interp_instr_gfx11(isel_context* ctx, unsigned idx, unsigned component, Temp src, Temp dst,
                          Temp prim_mask)
@@ -5315,7 +5324,16 @@ emit_interp_instr_gfx11(isel_context* ctx, unsigned idx, unsigned component, Tem
  
     Builder bld(ctx->program, ctx->block);
  
-   //TODO: this doesn't work in quad-divergent control flow
+   if (in_exec_divergent_or_in_loop(ctx)) {
+      Operand prim_mask_op = bld.m0(prim_mask);
+      prim_mask_op.setLateKill(true); /* we don't want the bld.lm definition to use m0 */
+      Operand coord2_op(coord2);
+      coord2_op.setLateKill(true); /* we re-use the destination reg in the middle */
+      bld.pseudo(aco_opcode::p_interp_gfx11, Definition(dst), bld.def(bld.lm),
+                 Operand(v1.as_linear()), Operand::c32(idx), Operand::c32(component), coord1,
+                 coord2_op, prim_mask_op);
+      return;
+   }
  
     Temp p = bld.ldsdir(aco_opcode::lds_param_load, bld.def(v1), bld.m0(prim_mask), idx, component);
  
@@ -5385,13 +5403,22 @@ emit_interp_mov_instr(isel_context* ctx, unsigned idx, unsigned component, unsig
  {
     Builder bld(ctx->program, ctx->block);
     if (ctx->options->gfx_level >= GFX11) {
-      //TODO: this doesn't work in quad-divergent control flow and ignores vertex_id
-      Temp p = bld.ldsdir(aco_opcode::lds_param_load, bld.def(v1), bld.m0(prim_mask), idx, component);
+      // TODO: this ignores vertex_id
        uint16_t dpp_ctrl = dpp_quad_perm(0, 0, 0, 0);
-      Temp res = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), p, dpp_ctrl);
+      if (in_exec_divergent_or_in_loop(ctx)) {
+         Operand prim_mask_op = bld.m0(prim_mask);
+         prim_mask_op.setLateKill(true); /* we don't want the bld.lm definition to use m0 */
+         bld.pseudo(aco_opcode::p_interp_gfx11, Definition(dst), bld.def(bld.lm),
+                    Operand(v1.as_linear()), Operand::c32(idx), Operand::c32(component),
+                    Operand::c32(dpp_ctrl), prim_mask_op);
+      } else {
+         Temp p =
+            bld.ldsdir(aco_opcode::lds_param_load, bld.def(v1), bld.m0(prim_mask), idx, component);
+         Temp res = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), p, dpp_ctrl);
  
-      /* lds_param_load must be done in WQM, and the result kept valid for helper lanes. */
-      emit_wqm(bld, res, dst, true);
+         /* lds_param_load must be done in WQM, and the result kept valid for helper lanes. */
+         emit_wqm(bld, res, dst, true);
+      }
     } else {
        bld.vintrp(aco_opcode::v_interp_mov_f32, Definition(dst), Operand::c32(vertex_id),
                   bld.m0(prim_mask), idx, component);
@@ -5825,7 +5852,8 @@ visit_load_input(isel_context* ctx, nir_intrinsic_instr* instr)
              unsigned chan_component = (component + i) % 4;
              unsigned chan_idx = idx + (component + i) / 4;
              vec->operands[i] = Operand(bld.tmp(instr->dest.ssa.bit_size == 16 ? v2b : v1));
-            emit_interp_mov_instr(ctx, chan_idx, chan_component, vertex_id, vec->operands[i].getTemp(), prim_mask);
+            emit_interp_mov_instr(ctx, chan_idx, chan_component, vertex_id,
+                                  vec->operands[i].getTemp(), prim_mask);
           }
           vec->definitions[0] = Definition(dst);
           bld.insert(std::move(vec));
@@ -8980,6 +9008,7 @@ visit_intrinsic(isel_context* ctx, nir_intrinsic_instr* instr)
  
        if (ctx->block->loop_nest_depth || ctx->cf_info.parent_if.is_divergent)
           ctx->cf_info.exec_potentially_empty_discard = true;
+
        ctx->block->kind |= block_kind_uses_discard;
        ctx->program->needs_exact = true;
        break;
@@ -8992,6 +9021,7 @@ visit_intrinsic(isel_context* ctx, nir_intrinsic_instr* instr)
  
        if (ctx->block->loop_nest_depth || ctx->cf_info.parent_if.is_divergent)
           ctx->cf_info.exec_potentially_empty_discard = true;
+
        ctx->block->kind |= block_kind_uses_discard;
        ctx->program->needs_exact = true;
        break;
@@ -9007,12 +9037,15 @@ visit_intrinsic(isel_context* ctx, nir_intrinsic_instr* instr)
           assert(src.regClass() == bld.lm);
           cond =
              bld.sop2(Builder::s_and, bld.def(bld.lm), bld.def(s1, scc), src, Operand(exec, bld.lm));
+
+         ctx->cf_info.had_divergent_discard |= nir_src_is_divergent(instr->src[0]);
        }
  
        bld.pseudo(aco_opcode::p_discard_if, cond);
  
        if (ctx->block->loop_nest_depth || ctx->cf_info.parent_if.is_divergent)
           ctx->cf_info.exec_potentially_empty_discard = true;
+      ctx->cf_info.had_divergent_discard |= in_exec_divergent_or_in_loop(ctx);
        ctx->block->kind |= block_kind_uses_discard;
        ctx->program->needs_exact = true;
        break;
@@ -10554,6 +10587,7 @@ begin_divergent_if_then(isel_context* ctx, if_context* ic, Temp cond,
     ic->exec_potentially_empty_break_old = ctx->cf_info.exec_potentially_empty_break;
     ic->exec_potentially_empty_break_depth_old = ctx->cf_info.exec_potentially_empty_break_depth;
     ic->divergent_old = ctx->cf_info.parent_if.is_divergent;
+   ic->had_divergent_discard_old = ctx->cf_info.had_divergent_discard;
     ctx->cf_info.parent_if.is_divergent = true;
  
     /* divergent branches use cbranch_execz */
@@ -10621,6 +10655,9 @@ begin_divergent_if_else(isel_context* ctx, if_context* ic,
     ctx->cf_info.exec_potentially_empty_break = false;
     ctx->cf_info.exec_potentially_empty_break_depth = UINT16_MAX;
  
+   ic->had_divergent_discard_then = ctx->cf_info.had_divergent_discard;
+   ctx->cf_info.had_divergent_discard = ic->had_divergent_discard_old;
+
     /** emit logical else block */
     ctx->program->next_divergent_if_logical_depth++;
     Block* BB_else_logical = ctx->program->create_and_insert_block();
@@ -10683,6 +10720,7 @@ end_divergent_if(isel_context* ctx, if_context* ic)
        ctx->cf_info.exec_potentially_empty_break = false;
        ctx->cf_info.exec_potentially_empty_break_depth = UINT16_MAX;
     }
+   ctx->cf_info.had_divergent_discard |= ic->had_divergent_discard_then;
  }
  
  static void
@@ -10709,6 +10747,8 @@ begin_uniform_if_then(isel_context* ctx, if_context* ic, Temp cond)
     ctx->cf_info.has_branch = false;
     ctx->cf_info.parent_loop.has_divergent_branch = false;
  
+   ic->had_divergent_discard_old = ctx->cf_info.had_divergent_discard;
+
     /** emit then block */
     ctx->program->next_uniform_if_depth++;
     Block* BB_then = ctx->program->create_and_insert_block();
@@ -10742,6 +10782,9 @@ begin_uniform_if_else(isel_context* ctx, if_context* ic)
     ctx->cf_info.has_branch = false;
     ctx->cf_info.parent_loop.has_divergent_branch = false;
  
+   ic->had_divergent_discard_then = ctx->cf_info.had_divergent_discard;
+   ctx->cf_info.had_divergent_discard = ic->had_divergent_discard_old;
+
     /** emit else block */
     Block* BB_else = ctx->program->create_and_insert_block();
     add_edge(ic->BB_if_idx, BB_else);
@@ -10770,6 +10813,7 @@ end_uniform_if(isel_context* ctx, if_context* ic)
  
     ctx->cf_info.has_branch &= ic->uniform_has_then_branch;
     ctx->cf_info.parent_loop.has_divergent_branch &= ic->then_branch_divergent;
+   ctx->cf_info.had_divergent_discard |= ic->had_divergent_discard_then;
  
     /** emit endif merge block */
     ctx->program->next_uniform_if_depth--;
diff --git a/src/amd/compiler/aco_instruction_selection.h b/src/amd/compiler/aco_instruction_selection.h

index 4c37178..5370034 100644 (file)
--- a/src/amd/compiler/aco_instruction_selection.h
+++ b/src/amd/compiler/aco_instruction_selection.h
@@ -74,6 +74,7 @@ struct isel_context {
        struct {
           bool is_divergent = false;
        } parent_if;
+      bool had_divergent_discard = false;
        bool exec_potentially_empty_discard =
           false; /* set to false when loop_nest_depth==0 && parent_if.is_divergent==false */
        uint16_t exec_potentially_empty_break_depth = UINT16_MAX;
diff --git a/src/amd/compiler/aco_lower_to_hw_instr.cpp b/src/amd/compiler/aco_lower_to_hw_instr.cpp

index 028876e..9732933 100644 (file)
--- a/src/amd/compiler/aco_lower_to_hw_instr.cpp
+++ b/src/amd/compiler/aco_lower_to_hw_instr.cpp
@@ -2378,6 +2378,54 @@ lower_to_hw_instr(Program* program)
                 bld.sop1(aco_opcode::s_setpc_b64, instr->operands[0]);
                 break;
              }
+            case aco_opcode::p_interp_gfx11: {
+               assert(instr->definitions[0].regClass() == v1 ||
+                      instr->definitions[0].regClass() == v2b);
+               assert(instr->definitions[1].regClass() == bld.lm);
+               assert(instr->operands[0].regClass() == v1.as_linear());
+               assert(instr->operands[1].isConstant());
+               assert(instr->operands[2].isConstant());
+               assert(instr->operands.back().physReg() == m0);
+               Definition dst = instr->definitions[0];
+               PhysReg exec_tmp = instr->definitions[1].physReg();
+               PhysReg lin_vgpr = instr->operands[0].physReg();
+               unsigned attribute = instr->operands[1].constantValue();
+               unsigned component = instr->operands[2].constantValue();
+               uint16_t dpp_ctrl = 0;
+               Operand coord1, coord2;
+               if (instr->operands.size() == 6) {
+                  assert(instr->operands[3].regClass() == v1);
+                  assert(instr->operands[4].regClass() == v1);
+                  coord1 = instr->operands[3];
+                  coord2 = instr->operands[4];
+               } else {
+                  assert(instr->operands[3].isConstant());
+                  dpp_ctrl = instr->operands[3].constantValue();
+               }
+
+               bld.sop1(Builder::s_mov, Definition(exec_tmp, bld.lm), Operand(exec, bld.lm));
+               bld.sop1(Builder::s_wqm, Definition(exec, bld.lm), Operand(exec, bld.lm));
+               bld.ldsdir(aco_opcode::lds_param_load, Definition(lin_vgpr, v1), Operand(m0, s1),
+                          attribute, component);
+               bld.sop1(Builder::s_mov, Definition(exec, bld.lm), Operand(exec_tmp, bld.lm));
+
+               Operand p(lin_vgpr, v1);
+               Operand dst_op(dst.physReg(), v1);
+               if (instr->operands.size() == 5) {
+                  bld.vop1_dpp(aco_opcode::v_mov_b32, Definition(dst), p, dpp_ctrl);
+               } else if (dst.regClass() == v2b) {
+                  bld.vinterp_inreg(aco_opcode::v_interp_p10_f16_f32_inreg, Definition(dst), p,
+                                    coord1, p);
+                  bld.vinterp_inreg(aco_opcode::v_interp_p2_f16_f32_inreg, Definition(dst), p,
+                                    coord2, dst_op);
+               } else {
+                  bld.vinterp_inreg(aco_opcode::v_interp_p10_f32_inreg, Definition(dst), p, coord1,
+                                    p);
+                  bld.vinterp_inreg(aco_opcode::v_interp_p2_f32_inreg, Definition(dst), p, coord2,
+                                    dst_op);
+               }
+               break;
+            }
              default: break;
              }
           } else if (instr->isBranch()) {
diff --git a/src/amd/compiler/aco_opcodes.py b/src/amd/compiler/aco_opcodes.py

index 52a52af..2594d4f 100644 (file)
--- a/src/amd/compiler/aco_opcodes.py
+++ b/src/amd/compiler/aco_opcodes.py
@@ -335,6 +335,11 @@ opcode("p_init_scratch")
  # jumps to a shader epilog
  opcode("p_jump_to_epilog")
  
+# loads and interpolates a fragment shader input; the parameter load runs with exec temporarily switched to whole-quad mode
+# interpolation form: dst0=result, dst1=exec_tmp, src0=linear_vgpr, src1=attribute, src2=component, src3=coord1, src4=coord2, src5=m0
+# DPP mov form: dst0=result, dst1=exec_tmp, src0=linear_vgpr, src1=attribute, src2=component, src3=dpp_ctrl, src4=m0
+opcode("p_interp_gfx11")
+
  # SOP2 instructions: 2 scalar inputs, 1 scalar output (+optional scc)
  SOP2 = {
    # GFX6, GFX7, GFX8, GFX9, GFX10,GFX11,name
diff --git a/src/amd/compiler/aco_optimizer.cpp b/src/amd/compiler/aco_optimizer.cpp

index a79af92..0c128db 100644 (file)
--- a/src/amd/compiler/aco_optimizer.cpp
+++ b/src/amd/compiler/aco_optimizer.cpp
@@ -673,6 +673,7 @@ alu_can_accept_constant(aco_opcode opcode, unsigned operand)
     case aco_opcode::v_readfirstlane_b32:
     case aco_opcode::p_extract:
     case aco_opcode::p_insert: return operand != 0;
+   case aco_opcode::p_interp_gfx11: return false;
     default: return true;
     }
  }
diff --git a/src/amd/compiler/aco_reduce_assign.cpp b/src/amd/compiler/aco_reduce_assign.cpp

index 3bc63d8..3c31b46 100644 (file)
--- a/src/amd/compiler/aco_reduce_assign.cpp
+++ b/src/amd/compiler/aco_reduce_assign.cpp
@@ -45,11 +45,13 @@ setup_reduce_temp(Program* program)
     std::vector<bool> hasReductions(program->blocks.size());
     for (Block& block : program->blocks) {
        for (aco_ptr<Instruction>& instr : block.instructions) {
-         if (instr->format != Format::PSEUDO_REDUCTION)
-            continue;
-
-         maxSize = MAX2(maxSize, instr->operands[0].size());
-         hasReductions[block.index] = true;
+         if (instr->opcode == aco_opcode::p_interp_gfx11) {
+            maxSize = MAX2(maxSize, 1);
+            hasReductions[block.index] = true;
+         } else if (instr->format == Format::PSEUDO_REDUCTION) {
+            maxSize = MAX2(maxSize, instr->operands[0].size());
+            hasReductions[block.index] = true;
+         }
        }
     }
  
@@ -92,10 +94,10 @@ setup_reduce_temp(Program* program)
        std::vector<aco_ptr<Instruction>>::iterator it;
        for (it = block.instructions.begin(); it != block.instructions.end(); ++it) {
           Instruction* instr = (*it).get();
-         if (instr->format != Format::PSEUDO_REDUCTION)
+         if (instr->format != Format::PSEUDO_REDUCTION &&
+             instr->opcode != aco_opcode::p_interp_gfx11)
              continue;
  
-         ReduceOp op = instr->reduction().reduce_op;
           reduceTmp_in_loop |= block.loop_nest_depth > 0;
  
           if ((int)last_top_level_block_idx != inserted_at) {
@@ -122,22 +124,26 @@ setup_reduce_temp(Program* program)
           }
  
           /* same as before, except for the vector temporary instead of the reduce temporary */
-         unsigned cluster_size = instr->reduction().cluster_size;
-         bool need_vtmp = op == imul32 || op == fadd64 || op == fmul64 || op == fmin64 ||
-                          op == fmax64 || op == umin64 || op == umax64 || op == imin64 ||
-                          op == imax64 || op == imul64;
-         bool gfx10_need_vtmp = op == imul8 || op == imax8 || op == imin8 || op == umin8 ||
-                                op == imul16 || op == imax16 || op == imin16 || op == umin16 ||
-                                op == iadd64;
-
-         if (program->gfx_level >= GFX10 && cluster_size == 64)
-            need_vtmp = true;
-         if (program->gfx_level >= GFX10 && gfx10_need_vtmp)
-            need_vtmp = true;
-         if (program->gfx_level <= GFX7)
-            need_vtmp = true;
-
-         need_vtmp |= cluster_size == 32;
+         bool need_vtmp = false;
+         if (instr->isReduction()) {
+            ReduceOp op = instr->reduction().reduce_op;
+            unsigned cluster_size = instr->reduction().cluster_size;
+            need_vtmp = op == imul32 || op == fadd64 || op == fmul64 || op == fmin64 ||
+                        op == fmax64 || op == umin64 || op == umax64 || op == imin64 ||
+                        op == imax64 || op == imul64;
+            bool gfx10_need_vtmp = op == imul8 || op == imax8 || op == imin8 || op == umin8 ||
+                                   op == imul16 || op == imax16 || op == imin16 || op == umin16 ||
+                                   op == iadd64;
+
+            if (program->gfx_level >= GFX10 && cluster_size == 64)
+               need_vtmp = true;
+            if (program->gfx_level >= GFX10 && gfx10_need_vtmp)
+               need_vtmp = true;
+            if (program->gfx_level <= GFX7)
+               need_vtmp = true;
+
+            need_vtmp |= cluster_size == 32;
+         }
  
           vtmp_in_loop |= need_vtmp && block.loop_nest_depth > 0;
           if (need_vtmp && (int)last_top_level_block_idx != vtmp_inserted_at) {
@@ -158,9 +164,15 @@ setup_reduce_temp(Program* program)
              }
           }
  
-         instr->operands[1] = Operand(reduceTmp);
-         if (need_vtmp)
-            instr->operands[2] = Operand(vtmp);
+         if (instr->isReduction()) {
+            instr->operands[1] = Operand(reduceTmp);
+            if (need_vtmp)
+               instr->operands[2] = Operand(vtmp);
+         } else {
+            assert(instr->opcode == aco_opcode::p_interp_gfx11);
+            instr->operands[0] = Operand(reduceTmp);
+            instr->operands[0].setLateKill(true);
+         }
        }
     }
  }
diff --git a/src/amd/compiler/aco_register_allocation.cpp b/src/amd/compiler/aco_register_allocation.cpp

index 9cbc208..0a3401e 100644 (file)
--- a/src/amd/compiler/aco_register_allocation.cpp
+++ b/src/amd/compiler/aco_register_allocation.cpp
@@ -607,7 +607,9 @@ get_subdword_definition_info(Program* program, const aco_ptr<Instruction>& instr
     amd_gfx_level gfx_level = program->gfx_level;
  
     if (instr->isPseudo()) {
-      if (gfx_level >= GFX8)
+      if (instr->opcode == aco_opcode::p_interp_gfx11)
+         return std::make_pair(4u, 4u);
+      else if (gfx_level >= GFX8)
           return std::make_pair(rc.bytes() % 2 == 0 ? 2 : 1, rc.bytes());
        else
           return std::make_pair(4, rc.size() * 4u);
diff --git a/src/amd/compiler/aco_validate.cpp b/src/amd/compiler/aco_validate.cpp

index fef525d..d0367e7 100644 (file)
--- a/src/amd/compiler/aco_validate.cpp
+++ b/src/amd/compiler/aco_validate.cpp
@@ -262,6 +262,7 @@ validate_ir(Program* program)
                 bool can_be_undef = is_phi(instr) || instr->isEXP() || instr->isReduction() ||
                                     instr->opcode == aco_opcode::p_create_vector ||
                                     instr->opcode == aco_opcode::p_jump_to_epilog ||
+                                   (instr->opcode == aco_opcode::p_interp_gfx11 && i == 0) ||
                                     (flat && i == 1) || (instr->isMIMG() && (i == 1 || i == 2)) ||
                                     ((instr->isMUBUF() || instr->isMTBUF()) && i == 1) ||
                                     (instr->isScratch() && i == 0);
author	Rhys Perry <pendingchaos02@gmail.com>
	Wed, 26 Oct 2022 20:11:31 +0000 (21:11 +0100)
committer	Marge Bot <emma+marge@anholt.net>
	Tue, 1 Nov 2022 12:42:43 +0000 (12:42 +0000)
src/amd/compiler/aco_builder_h.py		patch | blob | history
src/amd/compiler/aco_instruction_selection.cpp		patch | blob | history
src/amd/compiler/aco_instruction_selection.h		patch | blob | history
src/amd/compiler/aco_lower_to_hw_instr.cpp		patch | blob | history
src/amd/compiler/aco_opcodes.py		patch | blob | history
src/amd/compiler/aco_optimizer.cpp		patch | blob | history
src/amd/compiler/aco_reduce_assign.cpp		patch | blob | history
src/amd/compiler/aco_register_allocation.cpp		patch | blob | history
src/amd/compiler/aco_validate.cpp		patch | blob | history