--- /dev/null
+/*
+ * Copyright (C) 2022 Collabora Ltd.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "va_compiler.h"
+#include "valhall_enums.h"
+
+/*
+ * Valhall sources may marked as the last use of a register, according
+ * to the following rules:
+ *
+ * 1. The last use of a register should be marked allowing the hardware
+ * to elide register writes.
+ * 2. Staging sources may be read at any time before the asynchronous
+ * instruction completes. If a register is used as both a staging source and
+ * a regular source, the regular source cannot be marked until the program
+ * waits for the asynchronous instruction.
+ * 3. Marking a register pair marks both registers in the pair.
+ *
+ * Last use information follows immediately from (post-RA) liveness analysis:
+ * a register is dead immediately after its last use.
+ *
+ * Staging information follows from scoreboard analysis: do not mark registers
+ * that are read by a pending asynchronous instruction. Note that the Valhall
+ * scoreboard analysis does not track reads, so we handle that with our own
+ * (simplified) scoreboard analysis.
+ *
+ * Register pairs are marked conservatively: if either register in a pair cannot
+ * be marked, do not mark either register.
+ */
+
+static uint64_t
+bi_staging_read_mask(const bi_instr *I)
+{
+ uint64_t mask = 0;
+
+ bi_foreach_src(I, s) {
+ if (bi_is_staging_src(I, s) && !bi_is_null(I->src[s])) {
+ assert(I->src[s].type == BI_INDEX_REGISTER);
+ unsigned reg = I->src[s].value;
+ unsigned count = bi_count_read_registers(I, s);
+
+ mask |= (BITFIELD64_MASK(count) << reg);
+ }
+ }
+
+ return mask;
+}
+
+static bool
+bi_writes_reg(const bi_instr *I, unsigned reg)
+{
+ bi_foreach_dest(I, d) {
+ if (bi_is_null(I->dest[d]))
+ continue;
+
+ assert(I->dest[d].type == BI_INDEX_REGISTER);
+
+ unsigned count = bi_count_write_registers(I, d);
+
+ if (reg >= I->dest[d].value && (reg - I->dest[d].value) < count)
+ return true;
+ }
+
+ return false;
+}
+
+static unsigned
+waits_on_slot(enum va_flow flow, unsigned slot)
+{
+ return (flow == VA_FLOW_WAIT) || (flow == VA_FLOW_WAIT0126) ||
+ (va_flow_is_wait_or_none(flow) && (flow & BITFIELD_BIT(slot)));
+}
+
+static void
+scoreboard_update(struct bi_scoreboard_state *st, const bi_instr *I)
+{
+ /* Mark read staging registers */
+ st->read[I->slot] |= bi_staging_read_mask(I);
+
+ /* Unmark registers after they are waited on */
+ for (unsigned i = 0; i < VA_NUM_GENERAL_SLOTS; ++i) {
+ if (waits_on_slot(I->flow, i))
+ st->read[i] = 0;
+ }
+}
+
+static void
+va_analyze_scoreboard_reads(bi_context *ctx)
+{
+ u_worklist worklist;
+ bi_worklist_init(ctx, &worklist);
+
+ bi_foreach_block(ctx, block) {
+ bi_worklist_push_tail(&worklist, block);
+
+ /* Reset analysis from previous pass */
+ block->scoreboard_in = (struct bi_scoreboard_state){ 0 };
+ block->scoreboard_out = (struct bi_scoreboard_state){ 0 };
+ }
+
+ /* Perform forward data flow analysis to calculate dependencies */
+ while (!u_worklist_is_empty(&worklist)) {
+ /* Pop from the front for forward analysis */
+ bi_block *blk = bi_worklist_pop_head(&worklist);
+
+ bi_foreach_predecessor(blk, pred) {
+ for (unsigned i = 0; i < VA_NUM_GENERAL_SLOTS; ++i)
+ blk->scoreboard_in.read[i] |= (*pred)->scoreboard_out.read[i];
+ }
+
+ struct bi_scoreboard_state state = blk->scoreboard_in;
+
+ bi_foreach_instr_in_block(blk, I)
+ scoreboard_update(&state, I);
+
+ /* If there was progress, reprocess successors */
+ if (memcmp(&state, &blk->scoreboard_out, sizeof(state)) != 0) {
+ bi_foreach_successor(blk, succ)
+ bi_worklist_push_tail(&worklist, succ);
+ }
+
+ blk->scoreboard_out = state;
+ }
+
+ u_worklist_fini(&worklist);
+}
+
+void
+va_mark_last(bi_context *ctx)
+{
+ /* Analyze the shader globally */
+ bi_postra_liveness(ctx);
+ va_analyze_scoreboard_reads(ctx);
+
+ bi_foreach_block(ctx, block) {
+ uint64_t live = block->reg_live_out;
+
+ /* Mark all last uses */
+ bi_foreach_instr_in_block_rev(block, I) {
+ bi_foreach_src(I, s) {
+ if (I->src[s].type != BI_INDEX_REGISTER)
+ continue;
+
+ unsigned nr = bi_count_read_registers(I, s);
+ uint64_t mask = BITFIELD64_MASK(nr) << I->src[s].value;
+
+ /* If the register dead after this instruction, it's the last use */
+ I->src[s].discard = (live & mask) == 0;
+
+ /* If the register is overwritten this cycle, it is implicitly
+ * discarded, but that won't show up in the liveness analysis.
+ */
+ I->src[s].discard |= bi_writes_reg(I, I->src[s].value);
+ }
+
+ live = bi_postra_liveness_ins(live, I);
+ }
+
+ struct bi_scoreboard_state st = block->scoreboard_in;
+
+ bi_foreach_instr_in_block(block, I) {
+ /* Unmark registers read by a pending async instruction */
+ bi_foreach_src(I, s) {
+ if (!I->src[s].discard)
+ continue;
+
+ assert(I->src[s].type == BI_INDEX_REGISTER);
+
+ uint64_t pending_regs = st.read[0] | st.read[1] | st.read[2];
+ bool pending = (pending_regs & BITFIELD64_BIT(I->src[s].value));
+
+ if (bi_is_staging_src(I, s) || pending)
+ I->src[s].discard = false;
+ }
+
+ /* Unmark register pairs where one half must be preserved */
+ bi_foreach_src(I, s) {
+ /* Only look for "real" architectural registers */
+ if (s >= 3)
+ break;
+
+ if (va_src_info(I->op, s).size == VA_SIZE_64) {
+ bool both_discard = I->src[s].discard && I->src[s + 1].discard;
+
+ I->src[s + 0].discard = both_discard;
+ I->src[s + 1].discard = both_discard;
+ }
+ }
+
+ scoreboard_update(&st, I);
+ }
+ }
+}