ir3: Add subgroup pseudoinstructions
author Connor Abbott <cwabbott0@gmail.com>
Mon, 31 May 2021 10:09:42 +0000 (12:09 +0200)
committer Marge Bot <eric+marge@anholt.net>
Thu, 8 Jul 2021 16:02:41 +0000 (16:02 +0000)
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/6752>

src/freedreno/ir3/disasm-a3xx.c
src/freedreno/ir3/instr-a3xx.h
src/freedreno/ir3/ir3.h
src/freedreno/ir3/ir3_compiler_nir.c
src/freedreno/ir3/ir3_dce.c
src/freedreno/ir3/ir3_lower_subgroups.c [new file with mode: 0644]
src/freedreno/ir3/ir3_sched.c
src/freedreno/ir3/ir3_validate.c
src/freedreno/ir3/meson.build

index 0651fe6..3be1a3e 100644 (file)
@@ -191,6 +191,13 @@ static const struct opc_info {
        OPC(1, OPC_SWZ,          swz),
        OPC(1, OPC_SCT,          sct),
        OPC(1, OPC_GAT,          gat),
+       OPC(1, OPC_BALLOT_MACRO, ballot.macro),
+       OPC(1, OPC_ANY_MACRO,    any.macro),
+       OPC(1, OPC_ALL_MACRO,    all.macro),
+       OPC(1, OPC_ELECT_MACRO,  elect.macro),
+       OPC(1, OPC_READ_COND_MACRO, read_cond.macro),
+       OPC(1, OPC_READ_FIRST_MACRO, read_first.macro),
+       OPC(1, OPC_SWZ_SHARED_MACRO, swz_shared.macro),
 
        /* category 2: */
        OPC(2, OPC_ADD_F,        add.f),
index 8fb954e..3400d35 100644 (file)
@@ -113,6 +113,15 @@ typedef enum {
        OPC_MOV_RELGPR      = _OPC(1, 43),
        OPC_MOV_RELCONST    = _OPC(1, 44),
 
+       /* Macros that expand to an if statement + move */
+       OPC_BALLOT_MACRO    = _OPC(1, 50),
+       OPC_ANY_MACRO       = _OPC(1, 51),
+       OPC_ALL_MACRO       = _OPC(1, 52),
+       OPC_ELECT_MACRO     = _OPC(1, 53),
+       OPC_READ_COND_MACRO = _OPC(1, 54),
+       OPC_READ_FIRST_MACRO = _OPC(1, 55),
+       OPC_SWZ_SHARED_MACRO = _OPC(1, 56),
+
        /* category 2: */
        OPC_ADD_F           = _OPC(2, 0),
        OPC_MIN_F           = _OPC(2, 1),
index a742af0..3e0b4f0 100644 (file)
@@ -1458,6 +1458,8 @@ __ssa_srcp_n(struct ir3_instruction *instr, unsigned n)
        list_for_each_entry_rev(struct ir3_instruction, __instr, __list, node)
 #define foreach_instr_safe(__instr, __list) \
        list_for_each_entry_safe(struct ir3_instruction, __instr, __list, node)
+/* Variant of foreach_instr_safe that takes an explicit starting list node
+ * (__start) in addition to the list head. Presumably iteration begins at
+ * __start rather than at the first entry and tolerates removal of the current
+ * entry -- confirm against list_for_each_entry_from_safe in util/list.h.
+ */
+#define foreach_instr_from_safe(__instr, __start, __list) \
+       list_for_each_entry_from_safe(struct ir3_instruction, __instr, __start, __list, node)
 
 /* iterators for blocks: */
 #define foreach_block(__block, __list) \
@@ -1526,6 +1528,9 @@ bool ir3_postsched(struct ir3 *ir, struct ir3_shader_variant *v);
 /* register assignment: */
 int ir3_ra(struct ir3_shader_variant *v);
 
+/* lower subgroup ops: */
+bool ir3_lower_subgroups(struct ir3 *ir);
+
 /* legalize: */
 bool ir3_legalize(struct ir3 *ir, struct ir3_shader_variant *so, int *max_bary);
 
@@ -1692,6 +1697,20 @@ ir3_MOVMSK(struct ir3_block *block, unsigned components)
 }
 
 static inline struct ir3_instruction *
+ir3_BALLOT_MACRO(struct ir3_block *block, struct ir3_instruction *src, unsigned components)
+{
+       struct ir3_instruction *instr = ir3_instr_create(block, OPC_BALLOT_MACRO, 1, 1);
+
+       struct ir3_register *dst = __ssa_dst(instr);
+       dst->flags |= IR3_REG_SHARED;
+       dst->wrmask = (1 << components) - 1;
+
+       __ssa_src(instr, src, 0);
+
+       return instr;
+}
+
+static inline struct ir3_instruction *
 ir3_NOP(struct ir3_block *block)
 {
        return ir3_instr_create(block, OPC_NOP, 0, 0);
@@ -1852,6 +1871,21 @@ INSTR0(PREDF)
 INSTR0(PREDE)
 INSTR0(GETONE)
 
+/* cat1 macros */
+INSTR1(ANY_MACRO)
+INSTR1(ALL_MACRO)
+INSTR1(READ_FIRST_MACRO)
+INSTR2(READ_COND_MACRO)
+
+/* Build an ELECT_MACRO in `block`. The macro takes no sources; it only gets
+ * an SSA destination.
+ */
+static inline struct ir3_instruction *
+ir3_ELECT_MACRO(struct ir3_block *block)
+{
+       struct ir3_instruction *elect = ir3_instr_create(block, OPC_ELECT_MACRO, 1, 0);
+
+       __ssa_dst(elect);
+
+       return elect;
+}
+
 /* cat2 instructions, most 2 src but some 1 src: */
 INSTR2(ADD_F)
 INSTR2(MIN_F)
index 6b219a2..ef44485 100644 (file)
@@ -4054,6 +4054,8 @@ ir3_compile_shader_nir(struct ir3_compiler *compiler,
 
        IR3_PASS(ir, ir3_postsched, so);
 
+       IR3_PASS(ir, ir3_lower_subgroups);
+
        if (so->type == MESA_SHADER_FRAGMENT)
                pack_inlocs(ctx);
 
index a45a2e4..76aaebd 100644 (file)
@@ -158,9 +158,6 @@ find_and_remove_unused(struct ir3 *ir, struct ir3_shader_variant *so)
                }
        }
 
-       /* note that we can end up with unused indirects, but we should
-        * not end up with unused predicates.
-        */
        for (i = 0; i < ir->a0_users_count; i++) {
                struct ir3_instruction *instr = ir->a0_users[i];
                if (instr && (instr->flags & IR3_INSTR_UNUSED))
@@ -173,6 +170,12 @@ find_and_remove_unused(struct ir3 *ir, struct ir3_shader_variant *so)
                        ir->a1_users[i] = NULL;
        }
 
+       for (i = 0; i < ir->predicates_count; i++) {
+               struct ir3_instruction *instr = ir->predicates[i];
+               if (instr && (instr->flags & IR3_INSTR_UNUSED))
+                       ir->predicates[i] = NULL;
+       }
+
        /* cleanup unused inputs: */
        foreach_input_n (in, n, ir)
                if (in->flags & IR3_INSTR_UNUSED)
diff --git a/src/freedreno/ir3/ir3_lower_subgroups.c b/src/freedreno/ir3/ir3_lower_subgroups.c
new file mode 100644 (file)
index 0000000..2efdf09
--- /dev/null
@@ -0,0 +1,254 @@
+/*
+ * Copyright (C) 2021 Valve Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "ir3.h"
+
+/* Lower several macro-instructions needed for shader subgroup support that
+ * must be turned into if statements. We do this after RA and post-RA
+ * scheduling to give the scheduler a chance to rearrange them, because RA
+ * may need to insert OPC_META_READ_FIRST to handle splitting live ranges, and
+ * also because some (e.g. BALLOT and READ_FIRST) must produce a shared
+ * register that cannot be spilled to a normal register until after the if,
+ * which makes implementing spilling more complicated if they are already
+ * lowered.
+ */
+
+/* Swap the first occurrence of old_pred in block's predecessor array for
+ * new_pred. Does nothing if old_pred is not a predecessor of block.
+ */
+static void
+replace_pred(struct ir3_block *block, struct ir3_block *old_pred,
+             struct ir3_block *new_pred)
+{
+       unsigned idx = 0;
+
+       /* Scan forward to the first slot holding old_pred (or off the end). */
+       while (idx < block->predecessors_count &&
+              block->predecessors[idx] != old_pred)
+               idx++;
+
+       if (idx < block->predecessors_count)
+               block->predecessors[idx] = new_pred;
+}
+
+/* Same as replace_pred(), but operating on the physical predecessor array:
+ * only the first slot equal to old_pred is rewritten to new_pred.
+ */
+static void
+replace_physical_pred(struct ir3_block *block, struct ir3_block *old_pred,
+                      struct ir3_block *new_pred)
+{
+       const unsigned count = block->physical_predecessors_count;
+
+       for (unsigned n = 0; n < count; n++) {
+               if (block->physical_predecessors[n] != old_pred)
+                       continue;
+
+               block->physical_predecessors[n] = new_pred;
+               break;
+       }
+}
+
+/* Append to `block` a MOV that writes the immediate `immed` to the register
+ * described by `dst` (same number, flags and write mask). The move is typed
+ * U16 when dst is a half register and U32 otherwise, and is repeated so that
+ * it covers every component up to the highest bit set in the write mask.
+ */
+static void
+mov_immed(struct ir3_register *dst, struct ir3_block *block, unsigned immed)
+{
+       const unsigned half = dst->flags & IR3_REG_HALF;
+       struct ir3_instruction *mov = ir3_instr_create(block, OPC_MOV, 1, 1);
+
+       struct ir3_register *mov_dst = ir3_dst_create(mov, dst->num, dst->flags);
+       mov_dst->wrmask = dst->wrmask;
+
+       /* The immediate source carries the same half-ness as the destination. */
+       struct ir3_register *imm =
+               ir3_src_create(mov, INVALID_REG, half | IR3_REG_IMMED);
+       imm->uim_val = immed;
+
+       if (half)
+               mov->cat1.dst_type = TYPE_U16;
+       else
+               mov->cat1.dst_type = TYPE_U32;
+       mov->cat1.src_type = mov->cat1.dst_type;
+
+       mov->repeat = util_last_bit(mov_dst->wrmask) - 1;
+}
+
+/* Split before_block at instr, producing the shape of an if statement:
+ *
+ *    before_block -> body -> tail
+ *         \__________________^
+ *
+ * Two new blocks are created and linked into the block list right after
+ * before_block. `instr` and everything following it in before_block move to
+ * the tail block, which also takes over before_block's original successors,
+ * branch type and condition. The (empty) body block is returned through
+ * `then`; the tail block is the return value. The caller is expected to set
+ * before_block's branch type/condition afterwards.
+ */
+static struct ir3_block *
+split_block(struct ir3 *ir, struct ir3_block *before_block,
+            struct ir3_instruction *instr, struct ir3_block **then)
+{
+       struct ir3_block *body = ir3_block_create(ir);
+       struct ir3_block *tail = ir3_block_create(ir);
+
+       /* Block list order becomes: before_block, body, tail. */
+       list_add(&body->node, &before_block->node);
+       list_add(&tail->node, &body->node);
+
+       /* Hand before_block's outgoing edges over to tail, fixing up the
+        * matching predecessor entry in each successor.
+        */
+       for (unsigned s = 0; s < ARRAY_SIZE(before_block->successors); s++) {
+               struct ir3_block *succ = before_block->successors[s];
+
+               tail->successors[s] = succ;
+               if (succ)
+                       replace_pred(succ, before_block, tail);
+       }
+
+       for (unsigned s = 0; s < ARRAY_SIZE(before_block->physical_successors); s++) {
+               struct ir3_block *succ = before_block->physical_successors[s];
+
+               tail->physical_successors[s] = succ;
+               if (succ)
+                       replace_physical_pred(succ, before_block, tail);
+       }
+
+       /* before_block now branches to either body or tail. */
+       before_block->successors[0] = body;
+       before_block->successors[1] = tail;
+       before_block->physical_successors[0] = body;
+       before_block->physical_successors[1] = tail;
+       ir3_block_add_predecessor(body, before_block);
+       ir3_block_add_predecessor(tail, before_block);
+       ir3_block_add_physical_predecessor(body, before_block);
+       ir3_block_add_physical_predecessor(tail, before_block);
+
+       /* body always falls through to tail. */
+       body->successors[0] = tail;
+       body->physical_successors[0] = tail;
+       ir3_block_add_predecessor(tail, body);
+       ir3_block_add_physical_predecessor(tail, body);
+
+       /* Move instr and every instruction after it into tail, keeping order. */
+       foreach_instr_from_safe (moved, &instr->node, &before_block->instr_list) {
+               moved->block = tail;
+               list_del(&moved->node);
+               list_addtail(&moved->node, &tail->instr_list);
+       }
+
+       /* tail ends the way before_block used to end. */
+       tail->brtype = before_block->brtype;
+       tail->condition = before_block->condition;
+
+       *then = body;
+       return tail;
+}
+
+/* Lower one instruction if it is a subgroup macro.
+ *
+ * The current block is split around the macro into before/then/after blocks,
+ * the branch type and condition of the "before" block are set according to
+ * the opcode, and the real instruction(s) implementing the macro are emitted.
+ * On success the macro is unlinked from its instruction list and *block is
+ * updated to the block that now holds the instructions that followed it.
+ *
+ * Returns false, leaving everything untouched, if instr is not a macro.
+ */
+static bool
+lower_instr(struct ir3 *ir, struct ir3_block **block,
+            struct ir3_instruction *instr)
+{
+       switch (instr->opc) {
+       case OPC_BALLOT_MACRO:
+       case OPC_ANY_MACRO:
+       case OPC_ALL_MACRO:
+       case OPC_ELECT_MACRO:
+       case OPC_READ_COND_MACRO:
+       case OPC_READ_FIRST_MACRO:
+       case OPC_SWZ_SHARED_MACRO:
+               break;
+       default:
+               return false;
+       }
+
+       struct ir3_block *before_block = *block;
+       struct ir3_block *then_block;
+       struct ir3_block *after_block =
+               split_block(ir, before_block, instr, &then_block);
+
+       /* For ballot, the destination must be initialized to 0 before we do
+        * the movmsk because the condition may be 0 and then the movmsk will
+        * be skipped. Because it's a shared register we have to wrap the
+        * initialization in a getone block.
+        */
+       if (instr->opc == OPC_BALLOT_MACRO) {
+               before_block->brtype = IR3_BRANCH_GETONE;
+               before_block->condition = NULL;
+               mov_immed(instr->dsts[0], then_block, 0);
+               before_block = after_block;
+               after_block = split_block(ir, before_block, instr, &then_block);
+       }
+
+       /* Condition feeding the branch at the end of before_block, if any. */
+       switch (instr->opc) {
+       case OPC_BALLOT_MACRO:
+       case OPC_READ_COND_MACRO:
+       case OPC_ANY_MACRO:
+       case OPC_ALL_MACRO:
+               before_block->condition = instr->srcs[0]->def->instr;
+               break;
+       default:
+               before_block->condition = NULL;
+               break;
+       }
+
+       /* Kind of branch that guards then_block. */
+       switch (instr->opc) {
+       case OPC_BALLOT_MACRO:
+       case OPC_READ_COND_MACRO:
+               before_block->brtype = IR3_BRANCH_COND;
+               break;
+       case OPC_ANY_MACRO:
+               before_block->brtype = IR3_BRANCH_ANY;
+               break;
+       case OPC_ALL_MACRO:
+               before_block->brtype = IR3_BRANCH_ALL;
+               break;
+       case OPC_ELECT_MACRO:
+       case OPC_READ_FIRST_MACRO:
+       case OPC_SWZ_SHARED_MACRO:
+               before_block->brtype = IR3_BRANCH_GETONE;
+               break;
+       default:
+               unreachable("bad opcode");
+       }
+
+       /* Emit the instruction(s) that actually implement the macro. */
+       switch (instr->opc) {
+       case OPC_ALL_MACRO:
+       case OPC_ANY_MACRO:
+       case OPC_ELECT_MACRO:
+               /* Result is 0 unless the branch into then_block is taken. */
+               mov_immed(instr->dsts[0], then_block, 1);
+               mov_immed(instr->dsts[0], before_block, 0);
+               break;
+
+       case OPC_BALLOT_MACRO: {
+               unsigned comp_count = util_last_bit(instr->dsts[0]->wrmask);
+               struct ir3_instruction *movmsk = ir3_instr_create(then_block, OPC_MOVMSK, 1, 0);
+               ir3_dst_create(movmsk, instr->dsts[0]->num, instr->dsts[0]->flags);
+               movmsk->repeat = comp_count - 1;
+               break;
+       }
+
+       case OPC_READ_COND_MACRO:
+       case OPC_READ_FIRST_MACRO: {
+               struct ir3_instruction *mov = ir3_instr_create(then_block, OPC_MOV, 1, 1);
+               /* READ_COND's first source is the condition; the value is second. */
+               unsigned src = instr->opc == OPC_READ_COND_MACRO ? 1 : 0;
+               ir3_dst_create(mov, instr->dsts[0]->num, instr->dsts[0]->flags);
+               struct ir3_register *new_src = ir3_src_create(mov, 0, 0);
+               *new_src = *instr->srcs[src];
+               mov->cat1.dst_type = mov->cat1.src_type = TYPE_U32;
+               break;
+       }
+
+       case OPC_SWZ_SHARED_MACRO: {
+               struct ir3_instruction *swz =
+                       ir3_instr_create(then_block, OPC_SWZ, 2, 2);
+               ir3_dst_create(swz, instr->dsts[0]->num, instr->dsts[0]->flags);
+               ir3_dst_create(swz, instr->dsts[1]->num, instr->dsts[1]->flags);
+               ir3_src_create(swz, instr->srcs[0]->num, instr->srcs[0]->flags);
+               ir3_src_create(swz, instr->srcs[1]->num, instr->srcs[1]->flags);
+               swz->cat1.dst_type = swz->cat1.src_type = TYPE_U32;
+               swz->repeat = 1;
+               break;
+       }
+
+       default:
+               unreachable("bad opcode");
+       }
+
+       *block = after_block;
+       list_delinit(&instr->node);
+       return true;
+}
+
+/* Lower every subgroup macro found in *block.
+ *
+ * Each lowering splits the block, so on return *block points at the last
+ * block created (the one holding the tail of the original instruction list).
+ * Returns true if at least one macro was lowered.
+ */
+static bool
+lower_block(struct ir3 *ir, struct ir3_block **block)
+{
+       bool progress = false;
+       bool inner_progress;
+
+       do {
+               inner_progress = false;
+
+               foreach_instr_safe (instr, &(*block)->instr_list) {
+                       if (!lower_instr(ir, block, instr))
+                               continue;
+
+                       /* Lowering moved the rest of the instructions into a new
+                        * block and changed *block, so the iterator state is stale.
+                        * In particular, if the macro was the last instruction of
+                        * its block, the saved "next" is the old block's list head,
+                        * which never matches the new block's head and would make
+                        * this loop run forever. Restart on the new block instead.
+                        */
+                       progress = inner_progress = true;
+                       break;
+               }
+       } while (inner_progress);
+
+       return progress;
+}
+
+/* Pass entry point: lower the subgroup macros in every block of `ir`.
+ *
+ * lower_block() may insert new blocks after the current one and advances
+ * `block` to the last block it created, so the walk resumes after them.
+ * Returns true if anything was lowered.
+ */
+bool
+ir3_lower_subgroups(struct ir3 *ir)
+{
+       bool any_lowered = false;
+
+       foreach_block (block, &ir->block_list) {
+               if (lower_block(ir, &block))
+                       any_lowered = true;
+       }
+
+       return any_lowered;
+}
+
index 69e1df0..e8f979c 100644 (file)
@@ -919,6 +919,9 @@ split_pred(struct ir3_sched_ctx *ctx)
        for (i = 0; i < ir->predicates_count; i++) {
                struct ir3_instruction *predicated = ir->predicates[i];
 
+               if (!predicated)
+                       continue;
+
                /* skip instructions already scheduled: */
                if (is_scheduled(predicated))
                        continue;
index a500d9b..4eb1b67 100644 (file)
@@ -200,12 +200,18 @@ validate_instr(struct ir3_validate_ctx *ctx, struct ir3_instruction *instr)
         */
        switch (opc_cat(instr->opc)) {
        case 1: /* move instructions */
-               if (instr->opc == OPC_MOVMSK) {
+               if (instr->opc == OPC_MOVMSK || instr->opc == OPC_BALLOT_MACRO) {
                        validate_assert(ctx, instr->dsts_count == 1);
-                       validate_assert(ctx, instr->srcs_count == 0);
                        validate_assert(ctx, instr->dsts[0]->flags & IR3_REG_SHARED);
                        validate_assert(ctx, !(instr->dsts[0]->flags & IR3_REG_HALF));
                        validate_assert(ctx, util_is_power_of_two_or_zero(instr->dsts[0]->wrmask + 1));
+               } else if (instr->opc == OPC_ANY_MACRO || instr->opc == OPC_ALL_MACRO ||
+                                  instr->opc == OPC_READ_FIRST_MACRO ||
+                                  instr->opc == OPC_READ_COND_MACRO) {
+                       /* nothing yet */
+               } else if (instr->opc == OPC_ELECT_MACRO) {
+                       validate_assert(ctx, instr->dsts_count == 1);
+                       validate_assert(ctx, !(instr->dsts[0]->flags & IR3_REG_SHARED));
                } else {
                        foreach_dst (dst, instr)
                                validate_reg_size(ctx, dst, instr->cat1.dst_type);
index ea98381..312cdbf 100644 (file)
@@ -88,6 +88,7 @@ libfreedreno_ir3_files = files(
   'ir3_legalize.c',
   'ir3_liveness.c',
   'ir3_lower_parallelcopy.c',
+  'ir3_lower_subgroups.c',
   'ir3_merge_regs.c',
   'ir3_nir.c',
   'ir3_nir.h',