From 17f7453d45c79bb8c52c4f9d491a3b63c1fcb76a Mon Sep 17 00:00:00 2001 From: Connor Abbott Date: Mon, 31 May 2021 12:09:42 +0200 Subject: [PATCH] ir3: Add subgroup pseudoinstructions Part-of: --- src/freedreno/ir3/disasm-a3xx.c | 7 + src/freedreno/ir3/instr-a3xx.h | 9 ++ src/freedreno/ir3/ir3.h | 34 +++++ src/freedreno/ir3/ir3_compiler_nir.c | 2 + src/freedreno/ir3/ir3_dce.c | 9 +- src/freedreno/ir3/ir3_lower_subgroups.c | 254 ++++++++++++++++++++++++++++++++ src/freedreno/ir3/ir3_sched.c | 3 + src/freedreno/ir3/ir3_validate.c | 10 +- src/freedreno/ir3/meson.build | 1 + 9 files changed, 324 insertions(+), 5 deletions(-) create mode 100644 src/freedreno/ir3/ir3_lower_subgroups.c diff --git a/src/freedreno/ir3/disasm-a3xx.c b/src/freedreno/ir3/disasm-a3xx.c index 0651fe6..3be1a3e 100644 --- a/src/freedreno/ir3/disasm-a3xx.c +++ b/src/freedreno/ir3/disasm-a3xx.c @@ -191,6 +191,13 @@ static const struct opc_info { OPC(1, OPC_SWZ, swz), OPC(1, OPC_SCT, sct), OPC(1, OPC_GAT, gat), + OPC(1, OPC_BALLOT_MACRO, ballot.macro), + OPC(1, OPC_ANY_MACRO, any.macro), + OPC(1, OPC_ALL_MACRO, all.macro), + OPC(1, OPC_ELECT_MACRO, elect.macro), + OPC(1, OPC_READ_COND_MACRO, read_cond.macro), + OPC(1, OPC_READ_FIRST_MACRO, read_first.macro), + OPC(1, OPC_SWZ_SHARED_MACRO, swz_shared.macro), /* category 2: */ OPC(2, OPC_ADD_F, add.f), diff --git a/src/freedreno/ir3/instr-a3xx.h b/src/freedreno/ir3/instr-a3xx.h index 8fb954e..3400d35 100644 --- a/src/freedreno/ir3/instr-a3xx.h +++ b/src/freedreno/ir3/instr-a3xx.h @@ -113,6 +113,15 @@ typedef enum { OPC_MOV_RELGPR = _OPC(1, 43), OPC_MOV_RELCONST = _OPC(1, 44), + /* Macros that expand to an if statement + move */ + OPC_BALLOT_MACRO = _OPC(1, 50), + OPC_ANY_MACRO = _OPC(1, 51), + OPC_ALL_MACRO = _OPC(1, 52), + OPC_ELECT_MACRO = _OPC(1, 53), + OPC_READ_COND_MACRO = _OPC(1, 54), + OPC_READ_FIRST_MACRO = _OPC(1, 55), + OPC_SWZ_SHARED_MACRO = _OPC(1, 56), + /* category 2: */ OPC_ADD_F = _OPC(2, 0), OPC_MIN_F = _OPC(2, 1), diff --git 
a/src/freedreno/ir3/ir3.h b/src/freedreno/ir3/ir3.h index a742af0..3e0b4f0 100644 --- a/src/freedreno/ir3/ir3.h +++ b/src/freedreno/ir3/ir3.h @@ -1458,6 +1458,8 @@ __ssa_srcp_n(struct ir3_instruction *instr, unsigned n) list_for_each_entry_rev(struct ir3_instruction, __instr, __list, node) #define foreach_instr_safe(__instr, __list) \ list_for_each_entry_safe(struct ir3_instruction, __instr, __list, node) +#define foreach_instr_from_safe(__instr, __start, __list) \ + list_for_each_entry_from_safe(struct ir3_instruction, __instr, __start, __list, node) /* iterators for blocks: */ #define foreach_block(__block, __list) \ @@ -1526,6 +1528,9 @@ bool ir3_postsched(struct ir3 *ir, struct ir3_shader_variant *v); /* register assignment: */ int ir3_ra(struct ir3_shader_variant *v); +/* lower subgroup ops: */ +bool ir3_lower_subgroups(struct ir3 *ir); + /* legalize: */ bool ir3_legalize(struct ir3 *ir, struct ir3_shader_variant *so, int *max_bary); @@ -1692,6 +1697,20 @@ ir3_MOVMSK(struct ir3_block *block, unsigned components) } static inline struct ir3_instruction * +ir3_BALLOT_MACRO(struct ir3_block *block, struct ir3_instruction *src, unsigned components) +{ + struct ir3_instruction *instr = ir3_instr_create(block, OPC_BALLOT_MACRO, 1, 1); +/* Builds a BALLOT_MACRO with one dst and one SSA src: the dst is flagged IR3_REG_SHARED and its wrmask has the low `components` bits set. NOTE(review): the shift assumes components < 32 - confirm against callers. */ + struct ir3_register *dst = __ssa_dst(instr); + dst->flags |= IR3_REG_SHARED; + dst->wrmask = (1 << components) - 1; + + __ssa_src(instr, src, 0); + + return instr; +} + +static inline struct ir3_instruction * ir3_NOP(struct ir3_block *block) { return ir3_instr_create(block, OPC_NOP, 0, 0); @@ -1852,6 +1871,21 @@ INSTR0(PREDF) INSTR0(PREDE) INSTR0(GETONE) +/* cat1 macros */ +INSTR1(ANY_MACRO) +INSTR1(ALL_MACRO) +INSTR1(READ_FIRST_MACRO) +INSTR2(READ_COND_MACRO) +/* Creates an ELECT_MACRO in block with a single SSA destination and no sources, and returns it. */ +static inline struct ir3_instruction * +ir3_ELECT_MACRO(struct ir3_block *block) +{ + struct ir3_instruction *instr = + ir3_instr_create(block, OPC_ELECT_MACRO, 1, 0); + __ssa_dst(instr); + return instr; +} + /* cat2 instructions, most 2 src but some 1 src: */ INSTR2(ADD_F) 
INSTR2(MIN_F) diff --git a/src/freedreno/ir3/ir3_compiler_nir.c b/src/freedreno/ir3/ir3_compiler_nir.c index 6b219a2..ef44485 100644 --- a/src/freedreno/ir3/ir3_compiler_nir.c +++ b/src/freedreno/ir3/ir3_compiler_nir.c @@ -4054,6 +4054,8 @@ ir3_compile_shader_nir(struct ir3_compiler *compiler, IR3_PASS(ir, ir3_postsched, so); + IR3_PASS(ir, ir3_lower_subgroups); + if (so->type == MESA_SHADER_FRAGMENT) pack_inlocs(ctx); diff --git a/src/freedreno/ir3/ir3_dce.c b/src/freedreno/ir3/ir3_dce.c index a45a2e4..76aaebd 100644 --- a/src/freedreno/ir3/ir3_dce.c +++ b/src/freedreno/ir3/ir3_dce.c @@ -158,9 +158,6 @@ find_and_remove_unused(struct ir3 *ir, struct ir3_shader_variant *so) } } - /* note that we can end up with unused indirects, but we should - * not end up with unused predicates. - */ for (i = 0; i < ir->a0_users_count; i++) { struct ir3_instruction *instr = ir->a0_users[i]; if (instr && (instr->flags & IR3_INSTR_UNUSED)) @@ -173,6 +170,12 @@ find_and_remove_unused(struct ir3 *ir, struct ir3_shader_variant *so) ir->a1_users[i] = NULL; } + for (i = 0; i < ir->predicates_count; i++) { + struct ir3_instruction *instr = ir->predicates[i]; + if (instr && (instr->flags & IR3_INSTR_UNUSED)) + ir->predicates[i] = NULL; + } + /* cleanup unused inputs: */ foreach_input_n (in, n, ir) if (in->flags & IR3_INSTR_UNUSED) diff --git a/src/freedreno/ir3/ir3_lower_subgroups.c b/src/freedreno/ir3/ir3_lower_subgroups.c new file mode 100644 index 0000000..2efdf09 --- /dev/null +++ b/src/freedreno/ir3/ir3_lower_subgroups.c @@ -0,0 +1,254 @@ +/* + * Copyright (C) 2021 Valve Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is 
furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "ir3.h" + +/* Lower several macro-instructions needed for shader subgroup support that + * must be turned into if statements. We do this after RA and post-RA + * scheduling to give the scheduler a chance to rearrange them, because RA + * may need to insert OPC_META_READ_FIRST to handle splitting live ranges, and + * also because some (e.g. BALLOT and READ_FIRST) must produce a shared + * register that cannot be spilled to a normal register until after the if, + * which makes implementing spilling more complicated if they are already + * lowered. 
+ */ +/* Replace the first entry of block->predecessors equal to old_pred with new_pred; does nothing when old_pred is not present. */ +static void +replace_pred(struct ir3_block *block, struct ir3_block *old_pred, + struct ir3_block *new_pred) +{ + for (unsigned i = 0; i < block->predecessors_count; i++) { + if (block->predecessors[i] == old_pred) { + block->predecessors[i] = new_pred; + return; + } + } +} +/* Same as replace_pred, but operates on block->physical_predecessors. */ +static void +replace_physical_pred(struct ir3_block *block, struct ir3_block *old_pred, + struct ir3_block *new_pred) +{ + for (unsigned i = 0; i < block->physical_predecessors_count; i++) { + if (block->physical_predecessors[i] == old_pred) { + block->physical_predecessors[i] = new_pred; + return; + } + } +} +/* Create in block a MOV of the immediate immed into the register described by dst (same num, flags and wrmask). The type is U16 when dst has IR3_REG_HALF, otherwise U32, and repeat is util_last_bit(wrmask) - 1. */ +static void +mov_immed(struct ir3_register *dst, struct ir3_block *block, unsigned immed) +{ + struct ir3_instruction *mov = ir3_instr_create(block, OPC_MOV, 1, 1); + struct ir3_register *mov_dst = ir3_dst_create(mov, dst->num, dst->flags); + mov_dst->wrmask = dst->wrmask; + struct ir3_register *src = + ir3_src_create(mov, INVALID_REG, (dst->flags & IR3_REG_HALF) | IR3_REG_IMMED); + src->uim_val = immed; + mov->cat1.dst_type = (dst->flags & IR3_REG_HALF) ? 
TYPE_U16 : TYPE_U32; + mov->cat1.src_type = mov->cat1.dst_type; + mov->repeat = util_last_bit(mov_dst->wrmask) - 1; +} +/* Split before_block at instr. Creates then_block and after_block, gives after_block the old successors, branch type and condition of before_block, and wires before_block to then_block and after_block, and then_block to after_block. Returns after_block and stores then_block in *then. */ +static struct ir3_block * +split_block(struct ir3 *ir, struct ir3_block *before_block, + struct ir3_instruction *instr, struct ir3_block **then) +{ + struct ir3_block *then_block = ir3_block_create(ir); + struct ir3_block *after_block = ir3_block_create(ir); + list_add(&then_block->node, &before_block->node); + list_add(&after_block->node, &then_block->node); +/* Hand the logical successors over to after_block and repoint their predecessor entries from before_block to after_block. */ + for (unsigned i = 0; i < ARRAY_SIZE(before_block->successors); i++) { + after_block->successors[i] = before_block->successors[i]; + if (after_block->successors[i]) + replace_pred(after_block->successors[i], before_block, after_block); + } + + for (unsigned i = 0; i < ARRAY_SIZE(before_block->physical_successors); i++) { + after_block->physical_successors[i] = before_block->physical_successors[i]; + if (after_block->physical_successors[i]) { + replace_physical_pred(after_block->physical_successors[i], + before_block, after_block); + } + } +/* before_block now has then_block as successor 0 and after_block as successor 1, both logically and physically. */ + before_block->successors[0] = then_block; + before_block->successors[1] = after_block; + before_block->physical_successors[0] = then_block; + before_block->physical_successors[1] = after_block; + ir3_block_add_predecessor(then_block, before_block); + ir3_block_add_predecessor(after_block, before_block); + ir3_block_add_physical_predecessor(then_block, before_block); + ir3_block_add_physical_predecessor(after_block, before_block); + + then_block->successors[0] = after_block; + then_block->physical_successors[0] = after_block; + ir3_block_add_predecessor(after_block, then_block); + ir3_block_add_physical_predecessor(after_block, then_block); +/* Move the instructions visited by the walk that starts at instr out of before_block and onto the tail of after_block. */ + foreach_instr_from_safe (rem_instr, &instr->node, &before_block->instr_list) { + list_del(&rem_instr->node); + list_addtail(&rem_instr->node, &after_block->instr_list); + rem_instr->block = after_block; + } + + after_block->brtype = before_block->brtype; + after_block->condition = before_block->condition; + 
*then = then_block; + return after_block; +} +/* Lower every subgroup macro found in *block into a before/then/after block structure built by split_block. *block is updated to the newest after_block each time a macro is lowered. Returns true if at least one macro was lowered. */ +static bool +lower_block(struct ir3 *ir, struct ir3_block **block) +{ + bool progress = false; +/* The list head below is written in terms of *block, which this loop reassigns to after_block after each lowering. */ + foreach_instr_safe (instr, &(*block)->instr_list) { + switch (instr->opc) { + case OPC_BALLOT_MACRO: + case OPC_ANY_MACRO: + case OPC_ALL_MACRO: + case OPC_ELECT_MACRO: + case OPC_READ_COND_MACRO: + case OPC_READ_FIRST_MACRO: + case OPC_SWZ_SHARED_MACRO: + break; + default: + continue; + } +/* Split the current block at the macro instruction. */ + struct ir3_block *before_block = *block; + struct ir3_block *then_block; + struct ir3_block *after_block = + split_block(ir, before_block, instr, &then_block); + + /* For ballot, the destination must be initialized to 0 before we do + * the movmsk because the condition may be 0 and then the movmsk will + * be skipped. Because it's a shared register we have to wrap the + * initialization in a getone block. + */ + if (instr->opc == OPC_BALLOT_MACRO) { + before_block->brtype = IR3_BRANCH_GETONE; + before_block->condition = NULL; + mov_immed(instr->dsts[0], then_block, 0); + before_block = after_block; + after_block = split_block(ir, before_block, instr, &then_block); + } +/* Branch condition: macros that carry a predicate use the instruction defining srcs[0]; the others have none. */ + switch (instr->opc) { + case OPC_BALLOT_MACRO: + case OPC_READ_COND_MACRO: + case OPC_ANY_MACRO: + case OPC_ALL_MACRO: + before_block->condition = instr->srcs[0]->def->instr; + break; + default: + before_block->condition = NULL; + break; + } +/* Branch type that guards then_block, chosen per macro. */ + switch (instr->opc) { + case OPC_BALLOT_MACRO: + case OPC_READ_COND_MACRO: + before_block->brtype = IR3_BRANCH_COND; + break; + case OPC_ANY_MACRO: + before_block->brtype = IR3_BRANCH_ANY; + break; + case OPC_ALL_MACRO: + before_block->brtype = IR3_BRANCH_ALL; + break; + case OPC_ELECT_MACRO: + case OPC_READ_FIRST_MACRO: + case OPC_SWZ_SHARED_MACRO: + before_block->brtype = IR3_BRANCH_GETONE; + break; + default: + unreachable("bad opcode"); + } +/* Emit the replacement instructions. For any/all/elect the destination gets 0 in before_block and 1 in then_block. */ + switch (instr->opc) { + case OPC_ALL_MACRO: + case OPC_ANY_MACRO: + case OPC_ELECT_MACRO: + mov_immed(instr->dsts[0], then_block, 1); + mov_immed(instr->dsts[0], before_block, 
0); + break; +/* ballot: a MOVMSK into the macro destination, with repeat set to util_last_bit(wrmask) - 1. */ + case OPC_BALLOT_MACRO: { + unsigned comp_count = util_last_bit(instr->dsts[0]->wrmask); + struct ir3_instruction *movmsk = ir3_instr_create(then_block, OPC_MOVMSK, 1, 0); + ir3_dst_create(movmsk, instr->dsts[0]->num, instr->dsts[0]->flags); + movmsk->repeat = comp_count - 1; + break; + } +/* read_cond uses srcs[1] as the value (srcs[0] is the condition); read_first uses srcs[0]. NOTE(review): the types are hard-coded to TYPE_U32 here while mov_immed selects TYPE_U16 for IR3_REG_HALF - confirm half registers cannot reach this path. */ + case OPC_READ_COND_MACRO: + case OPC_READ_FIRST_MACRO: { + struct ir3_instruction *mov = ir3_instr_create(then_block, OPC_MOV, 1, 1); + unsigned src = instr->opc == OPC_READ_COND_MACRO ? 1 : 0; + ir3_dst_create(mov, instr->dsts[0]->num, instr->dsts[0]->flags); + struct ir3_register *new_src = ir3_src_create(mov, 0, 0); + *new_src = *instr->srcs[src]; + mov->cat1.dst_type = mov->cat1.src_type = TYPE_U32; + break; + } +/* swz_shared: a SWZ in then_block that reuses the num and flags of both macro destinations and both macro sources. */ + case OPC_SWZ_SHARED_MACRO: { + struct ir3_instruction *swz = + ir3_instr_create(then_block, OPC_SWZ, 2, 2); + ir3_dst_create(swz, instr->dsts[0]->num, instr->dsts[0]->flags); + ir3_dst_create(swz, instr->dsts[1]->num, instr->dsts[1]->flags); + ir3_src_create(swz, instr->srcs[0]->num, instr->srcs[0]->flags); + ir3_src_create(swz, instr->srcs[1]->num, instr->srcs[1]->flags); + swz->cat1.dst_type = swz->cat1.src_type = TYPE_U32; + swz->repeat = 1; + break; + } + + default: + unreachable("bad opcode"); + } +/* Continue in after_block, which received the instructions moved by split_block, and unlink the macro itself. */ + *block = after_block; + list_delinit(&instr->node); + progress = true; + } + + return progress; +} +/* Pass entry point: runs lower_block on every block in ir->block_list and returns true if any macro was lowered. */ +bool +ir3_lower_subgroups(struct ir3 *ir) +{ + bool progress = false; + + foreach_block (block, &ir->block_list) + progress |= lower_block(ir, &block); + + return progress; +} + diff --git a/src/freedreno/ir3/ir3_sched.c b/src/freedreno/ir3/ir3_sched.c index 69e1df0..e8f979c 100644 --- a/src/freedreno/ir3/ir3_sched.c +++ b/src/freedreno/ir3/ir3_sched.c @@ -919,6 +919,9 @@ split_pred(struct ir3_sched_ctx *ctx) for (i = 0; i < ir->predicates_count; i++) { struct ir3_instruction *predicated = ir->predicates[i]; + if (!predicated) + continue; + /* skip instructions already scheduled: */ if (is_scheduled(predicated)) continue; diff --git 
a/src/freedreno/ir3/ir3_validate.c b/src/freedreno/ir3/ir3_validate.c index a500d9b..4eb1b67 100644 --- a/src/freedreno/ir3/ir3_validate.c +++ b/src/freedreno/ir3/ir3_validate.c @@ -200,12 +200,18 @@ validate_instr(struct ir3_validate_ctx *ctx, struct ir3_instruction *instr) */ switch (opc_cat(instr->opc)) { case 1: /* move instructions */ - if (instr->opc == OPC_MOVMSK) { + if (instr->opc == OPC_MOVMSK || instr->opc == OPC_BALLOT_MACRO) { validate_assert(ctx, instr->dsts_count == 1); - validate_assert(ctx, instr->srcs_count == 0); validate_assert(ctx, instr->dsts[0]->flags & IR3_REG_SHARED); validate_assert(ctx, !(instr->dsts[0]->flags & IR3_REG_HALF)); validate_assert(ctx, util_is_power_of_two_or_zero(instr->dsts[0]->wrmask + 1)); + } else if (instr->opc == OPC_ANY_MACRO || instr->opc == OPC_ALL_MACRO || + instr->opc == OPC_READ_FIRST_MACRO || + instr->opc == OPC_READ_COND_MACRO) { + /* nothing yet */ + } else if (instr->opc == OPC_ELECT_MACRO) { + validate_assert(ctx, instr->dsts_count == 1); + validate_assert(ctx, !(instr->dsts[0]->flags & IR3_REG_SHARED)); } else { foreach_dst (dst, instr) validate_reg_size(ctx, dst, instr->cat1.dst_type); diff --git a/src/freedreno/ir3/meson.build b/src/freedreno/ir3/meson.build index ea98381..312cdbf 100644 --- a/src/freedreno/ir3/meson.build +++ b/src/freedreno/ir3/meson.build @@ -88,6 +88,7 @@ libfreedreno_ir3_files = files( 'ir3_legalize.c', 'ir3_liveness.c', 'ir3_lower_parallelcopy.c', + 'ir3_lower_subgroups.c', 'ir3_merge_regs.c', 'ir3_nir.c', 'ir3_nir.h', -- 2.7.4