From aa7393f81a17b8b3b3510ac78211ef28b3017468 Mon Sep 17 00:00:00 2001 From: Alyssa Rosenzweig Date: Fri, 20 May 2022 12:31:12 -0400 Subject: [PATCH] pan/va: Add flow control merging pass Signed-off-by: Alyssa Rosenzweig Part-of: --- src/panfrost/bifrost/meson.build | 1 + src/panfrost/bifrost/valhall/va_compiler.h | 1 + src/panfrost/bifrost/valhall/va_merge_flow.c | 228 +++++++++++++++++++++++++++ 3 files changed, 230 insertions(+) create mode 100644 src/panfrost/bifrost/valhall/va_merge_flow.c diff --git a/src/panfrost/bifrost/meson.build b/src/panfrost/bifrost/meson.build index c7dfca5..eccb337 100644 --- a/src/panfrost/bifrost/meson.build +++ b/src/panfrost/bifrost/meson.build @@ -51,6 +51,7 @@ libpanfrost_bifrost_files = files( 'valhall/va_lower_isel.c', 'valhall/va_lower_split_64bit.c', 'valhall/va_optimize.c', + 'valhall/va_merge_flow.c', 'valhall/va_pack.c', 'valhall/va_perf.c', 'valhall/va_validate.c', diff --git a/src/panfrost/bifrost/valhall/va_compiler.h b/src/panfrost/bifrost/valhall/va_compiler.h index db91351..9d19be3 100644 --- a/src/panfrost/bifrost/valhall/va_compiler.h +++ b/src/panfrost/bifrost/valhall/va_compiler.h @@ -41,6 +41,7 @@ void va_fuse_add_imm(bi_instr *I); void va_lower_constants(bi_context *ctx, bi_instr *I); void va_lower_isel(bi_instr *I); void va_insert_flow_control_nops(bi_context *ctx); +void va_merge_flow(bi_context *ctx); uint64_t va_pack_instr(const bi_instr *I); static inline unsigned diff --git a/src/panfrost/bifrost/valhall/va_merge_flow.c b/src/panfrost/bifrost/valhall/va_merge_flow.c new file mode 100644 index 0000000..f50ac2f --- /dev/null +++ b/src/panfrost/bifrost/valhall/va_merge_flow.c @@ -0,0 +1,228 @@ +/* + * Copyright (C) 2022 Collabora Ltd. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "va_compiler.h" +#include "valhall_enums.h" +#include "bi_builder.h" + +/* + * Merge NOPs with flow control with nearby instructions to eliminate the NOPs, + * according to the following rules: + * + * 1. Waits may be combined by waiting on a union of the slots. + * 2. Waits may be moved up as far as the first (last) asynchronous instruction + * writing to a slot waited on, at a performance cost. + * 3. Discard may be moved down, at a performance cost. + * 4. Reconverge must be on the last instruction of the block. + * 5. End must be on the last instruction of the block. + * + * For simplicity, this pass only merges within a single basic block. This + * should be sufficient. The algorithm works as follows. + * + * Because reconverge/end must be on the last instruction, there may be only + * one of these in a block. First check for a NOP at the end, and if it has + * reconverge/end flow control, merge with the penultimate instruction. Now we + * only have to worry about waits and discard. + * + * The cost of moving a wait up is assumed to be greater than the cost of moving + * a discard down, so we next move waits while we have more freedom. For each + * wait, merge with an instruction above that either is free or has another + * wait, and merge the flow control, aborting if we hit a message signaling the + * relevant slot. By maintaining the candidate instruction at all steps, this is + * implemented in a linear time walk. + * + * Finally, all that's left are discards. Move discards down to merge with a + * free instruction with a similar linear time walk. + * + * Since discarding helper threads is an optimization (not required for + * correctness), we may remove discards if we believe them harmful to the + * performance of the shader. Keeping with the local structure of the pass, we + * remove discards only if they are at the end of the last block and cannot be + * merged with any other instruction. Such a condition means every following + * instruction has incompatible flow control (wait and end). In practice, this + * means BLEND and ATEST run for helper threads in certain shaders to save a + * NOP, but BLEND is a no-op for helper threads anyway. + * + * One case we don't handle is merging "foo, bar, wait, reconverge" to + * "foo.wait, bar.reconverge". This sequence is rarely generated by the dataflow + * analysis, so we don't care for the complexity. + */ + +/* + * If the last instruction has reconverge or end, try to merge with the + * second to last instruction. + * + * Precondition: block has at least 2 instructions. + */ +static void +merge_end_reconverge(bi_block *block) +{ + bi_instr *last = list_last_entry(&block->instructions, bi_instr, link); + bi_instr *penult = bi_prev_op(last); + + if (last->op != BI_OPCODE_NOP) return; + if (last->flow != VA_FLOW_RECONVERGE && last->flow != VA_FLOW_END) return; + + /* End implies all other flow control, so remove blocking flow control */ + if (last->flow == VA_FLOW_END) { + while (penult->op == BI_OPCODE_NOP) { + bi_remove_instruction(penult); + + /* There may be nothing left */ + if (list_is_singular(&block->instructions)) + return; + + penult = bi_prev_op(last); + } + } + + /* If there is blocking flow control, we can't merge */ + if (penult->flow != VA_FLOW_NONE) return; + + /* Else, merge */ + penult->flow = last->flow; + bi_remove_instruction(last); +} + +static bool +is_wait_or_none(enum va_flow flow) +{ + return (flow <= VA_FLOW_WAIT); +} + +/* + * Calculate the union of two waits. We may wait on any combination of slots #0, + * #1, #2 or the entirety of 0126 and 01267. If we wait on the entirety, the + * union is trivial. If we do not wait on slot #6 (by extension slot #7), we + * wait only on slots #0, #1, and #2, for which the waits are encoded as a + * bitset and the union is just a bitwise OR. + */ +static enum va_flow +union_waits(enum va_flow x, enum va_flow y) +{ + assert(is_wait_or_none(x) && is_wait_or_none(y)); + + if ((x == VA_FLOW_WAIT) || (y == VA_FLOW_WAIT)) + return VA_FLOW_WAIT; + else if ((x == VA_FLOW_WAIT0126) || (y == VA_FLOW_WAIT0126)) + return VA_FLOW_WAIT0126; + else + return x | y; +} + +static void +merge_waits(bi_block *block) +{ + /* Most recent instruction with which we can merge, or NULL if none */ + bi_instr *last_free = NULL; + + bi_foreach_instr_in_block_safe(block, I) { + if (last_free != NULL && + I->op == BI_OPCODE_NOP && is_wait_or_none(I->flow)) { + + /* Merge waits with compatible instructions */ + last_free->flow = union_waits(last_free->flow, I->flow); + bi_remove_instruction(I); + continue; + } + + /* Don't move waits past async instructions, since they might be what + * we're waiting for. If we wanted to optimize this case, we could check + * the signaled slots. + */ + if (bi_opcode_props[I->op].message) + last_free = NULL; + + /* We can only merge with instructions whose flow control is a wait. + * This includes such an instruction after merging in a wait. It also + * includes async instructions. + */ + if (is_wait_or_none(I->flow)) + last_free = I; + } +} + +static bool +bi_is_first_instr(bi_block *block, bi_instr *I) +{ + return block->instructions.next == &I->link; +} + +static void +merge_discard(bi_block *block) +{ + bi_instr *last_free = NULL; + + bi_foreach_instr_in_block_safe_rev(block, I) { + if ((I->op == BI_OPCODE_NOP) && (I->flow == VA_FLOW_DISCARD)) { + /* Try to merge with the instruction *preceding* discard, because + * because flow control happens at the end of an instruction and + * discard is a NOP. + */ + if (!bi_is_first_instr(block, I)) { + bi_instr *prev = bi_prev_op(I); + + if (prev->flow == VA_FLOW_NONE) { + prev->flow = VA_FLOW_DISCARD; + bi_remove_instruction(I); + continue; + } + } + + /* Or try to merge with the next instruction with no flow control */ + if (last_free != NULL) { + last_free->flow = VA_FLOW_DISCARD; + bi_remove_instruction(I); + continue; + } + + /* If there's nowhere to merge and this is the end of the shader, just + * remove the discard. + */ + if (!block->successors[0] && !block->successors[1]) { + bi_remove_instruction(I); + continue; + } + } + + /* Allow merging into free instructions */ + if (I->flow == VA_FLOW_NONE) + last_free = I; + } +} + +void +va_merge_flow(bi_context *ctx) +{ + bi_foreach_block(ctx, block) { + /* If there are less than 2 instructions, there's nothing to merge */ + if (list_is_empty(&block->instructions)) continue; + if (list_is_singular(&block->instructions)) continue; + + merge_end_reconverge(block); + merge_waits(block); + + if (ctx->stage == MESA_SHADER_FRAGMENT && !ctx->inputs->is_blend) + merge_discard(block); + } +} -- 2.7.4