--- /dev/null
+/*
+ * Copyright (C) 2022 Alyssa Rosenzweig <alyssa@rosenzweig.io>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "agx_compiler.h"
+#include "agx_builder.h"
+
+/*
+ * Emits code for
+ *
+ * for (int i = 0; i < n; ++i)
+ * registers[dests[i]] = registers[srcs[i]];
+ *
+ * ...with all copies happening in parallel.
+ *
+ * That is, emit machine instructions equivalent to a parallel copy. This is
+ * used to lower not only parallel copies but also collects and splits, which
+ * also have parallel copy semantics.
+ *
+ * We only handle register-register copies, not general agx_index sources. This
+ * suffices for internal use by register allocation.
+ */
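+
+/*
+ * For example, the parallel copy { 0 <- 1, 1 <- 0 } swaps two registers.
+ * Lowering it as two sequential moves
+ *
+ *    mov r0, r1
+ *    mov r1, r0
+ *
+ * would read back the value just written rather than the original r0, so
+ * cycles like this must instead be broken with swaps (do_swap below).
+ */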
+
+static void
+do_copy(agx_builder *b, const struct agx_copy *copy)
+{
+ agx_mov_to(b, agx_register(copy->dest, copy->size),
+ agx_register(copy->src, copy->size));
+}
+
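+/*
+ * Swap the contents of two registers in place with the classic three-XOR
+ * trick, avoiding the need for a free scratch register:
+ *
+ *    x ^= y;
+ *    y ^= x;   (y now holds the original x)
+ *    x ^= y;   (x now holds the original y)
+ */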
+static void
+do_swap(agx_builder *b, const struct agx_copy *copy)
+{
+ if (copy->dest == copy->src)
+ return;
+
+ agx_index x = agx_register(copy->dest, copy->size);
+ agx_index y = agx_register(copy->src, copy->size);
+
+ agx_xor_to(b, x, x, y);
+ agx_xor_to(b, y, x, y);
+ agx_xor_to(b, x, x, y);
+}
+
+struct copy_ctx {
+ /* Number of copies being processed */
+ unsigned entry_count;
+
+ /* For each physreg, the number of pending copy entries that use it as a
+ * source. Once this drops to zero, then the physreg is unblocked and can
+ * be moved to.
+ */
+ unsigned physreg_use_count[AGX_NUM_REGS];
+
+ /* For each physreg, the pending copy_entry that uses it as a dest. */
+ struct agx_copy *physreg_dest[AGX_NUM_REGS];
+
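+   /* The copies to process. Splitting a 32-bit copy appends a new entry, but
+    * since destinations never overlap, the total count cannot exceed the
+    * number of physical registers.
+    */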
+ struct agx_copy entries[AGX_NUM_REGS];
+};
+
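+/*
+ * A copy is blocked if emitting it would overwrite a register still pending
+ * as the source of another copy, clobbering a value that has not moved yet.
+ */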
+static bool
+entry_blocked(struct agx_copy *entry, struct copy_ctx *ctx)
+{
+ for (unsigned i = 0; i < agx_size_align_16(entry->size); i++) {
+ if (ctx->physreg_use_count[entry->dest + i] != 0)
+ return true;
+ }
+
+ return false;
+}
+
+static bool
+is_real(struct agx_copy *entry)
+{
+ /* TODO: Allow immediates in agx_copy */
+ return true;
+}
+
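+/*
+ * Split a 32-bit copy into two independent 16-bit copies: the entry is shrunk
+ * in place to cover the low half and a fresh entry is appended for the high
+ * half. For example, if the copy of physregs {0,1} to {4,5} is blocked only
+ * on 5, splitting lets the 4 <- 0 half proceed immediately.
+ */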
+/* TODO: Generalize to other bit sizes */
+static void
+split_32bit_copy(struct copy_ctx *ctx, struct agx_copy *entry)
+{
+ assert(!entry->done);
+ assert(is_real(entry));
+ assert(agx_size_align_16(entry->size) == 2);
+ struct agx_copy *new_entry = &ctx->entries[ctx->entry_count++];
+
+ new_entry->dest = entry->dest + 1;
+ new_entry->src = entry->src + 1;
+ new_entry->done = false;
+ entry->size = AGX_SIZE_16;
+ new_entry->size = AGX_SIZE_16;
+ ctx->physreg_dest[entry->dest + 1] = new_entry;
+}
+
+void
+agx_emit_parallel_copies(agx_builder *b,
+ struct agx_copy *copies,
+ unsigned num_copies)
+{
+ struct copy_ctx _ctx = {
+ .entry_count = num_copies
+ };
+
+ struct copy_ctx *ctx = &_ctx;
+
+ /* Set up the bookkeeping */
+ memset(ctx->physreg_dest, 0, sizeof(ctx->physreg_dest));
+ memset(ctx->physreg_use_count, 0, sizeof(ctx->physreg_use_count));
+
+ for (unsigned i = 0; i < ctx->entry_count; i++) {
+ struct agx_copy *entry = &copies[i];
+
+ ctx->entries[i] = *entry;
+
+ for (unsigned j = 0; j < agx_size_align_16(entry->size); j++) {
+ if (is_real(entry))
+ ctx->physreg_use_count[entry->src + j]++;
+
+ /* Copies should not have overlapping destinations. */
+ assert(!ctx->physreg_dest[entry->dest + j]);
+ ctx->physreg_dest[entry->dest + j] = entry;
+ }
+ }
+
+ bool progress = true;
+ while (progress) {
+ progress = false;
+
+ /* Step 1: resolve paths in the transfer graph. This means finding
+       * copies whose destinations aren't blocked by something else and then
+ * emitting them, continuing this process until every copy is blocked
+ * and there are only cycles left.
+ *
+ * TODO: We should note that src is also available in dest to unblock
+ * cycles that src is involved in.
+ */
+
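+      /* For example, given { 0 <- 1, 1 <- 2 }, the copy 1 <- 2 is blocked
+       * while physreg 1 is still pending as a source, but 0 <- 1 is free to
+       * go. Emitting it drops physreg 1's use count to zero, unblocking
+       * 1 <- 2 on the next pass.
+       */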
+ for (unsigned i = 0; i < ctx->entry_count; i++) {
+ struct agx_copy *entry = &ctx->entries[i];
+ if (!entry->done && !entry_blocked(entry, ctx)) {
+ entry->done = true;
+ progress = true;
+ do_copy(b, entry);
+ for (unsigned j = 0; j < agx_size_align_16(entry->size); j++) {
+ if (is_real(entry))
+ ctx->physreg_use_count[entry->src + j]--;
+ ctx->physreg_dest[entry->dest + j] = NULL;
+ }
+ }
+ }
+
+ if (progress)
+ continue;
+
+      /* Step 2: Find partially blocked copies and split them. Because the
+       * register file is addressed in 16-bit units, a 32-bit copy may be
+       * blocked on only one of its 16-bit halves, and splitting it helps get
+       * things moving.
+ *
+ * We can skip splitting copies if the source isn't a register,
+ * however, because it does not unblock anything and therefore doesn't
+ * contribute to making forward progress with step 1. These copies
+ * should still be resolved eventually in step 1 because they can't be
+ * part of a cycle.
+ */
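+      /* For example, the 32-bit copy {2,3} <- {0,1} and the 16-bit copy
+       * 0 <- 3 block each other without forming a cycle. Splitting the
+       * 32-bit copy into 2 <- 0 and 3 <- 1 leaves 2 <- 0 unblocked, so
+       * step 1 can make forward progress again.
+       */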
+ for (unsigned i = 0; i < ctx->entry_count; i++) {
+ struct agx_copy *entry = &ctx->entries[i];
+ if (entry->done || (agx_size_align_16(entry->size) != 2))
+ continue;
+
+         if ((ctx->physreg_use_count[entry->dest] == 0 ||
+              ctx->physreg_use_count[entry->dest + 1] == 0) &&
+             is_real(entry)) {
+ split_32bit_copy(ctx, entry);
+ progress = true;
+ }
+ }
+ }
+
+ /* Step 3: resolve cycles through swapping.
+ *
+ * At this point, the transfer graph should consist of only cycles.
+ * The reason is that, given any physreg n_1 that's the source of a
+ * remaining entry, it has a destination n_2, which (because every
+ * copy is blocked) is the source of some other copy whose destination
+    * is n_3, and so we can follow the chain until we get a cycle. If the
+    * chain closed at some node other than n_1:
+ *
+ * n_1 -> n_2 -> ... -> n_i
+ * ^ |
+ * |-------------|
+ *
+ * then n_2 would be the destination of 2 copies, which is illegal
+ * (checked above in an assert). So n_1 must be part of a cycle:
+ *
+ * n_1 -> n_2 -> ... -> n_i
+ * ^ |
+ * |---------------------|
+ *
+    * and this must be the only cycle n_1 is involved in, because any other
+ * path starting from n_1 would also have to end in n_1, resulting in
+ * a node somewhere along the way being the destination of 2 copies
+ * when the 2 paths merge.
+ *
+ * The way we resolve the cycle is through picking a copy (n_1, n_2)
+ * and swapping n_1 and n_2. This moves n_1 to n_2, so n_2 is taken
+ * out of the cycle:
+ *
+ * n_1 -> ... -> n_i
+ * ^ |
+ * |--------------|
+ *
+ * and we can keep repeating this until the cycle is empty.
+ */
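+
+   /* For example, the cycle { 0 <- 1, 1 <- 2, 2 <- 0 } resolves as
+    *
+    *    swap 0, 1   (0 done; the pending 2 <- 0 is rewritten to 2 <- 1)
+    *    swap 1, 2   (1 done; 2 <- 1 becomes the trivial 2 <- 2)
+    */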
+
+ for (unsigned i = 0; i < ctx->entry_count; i++) {
+ struct agx_copy *entry = &ctx->entries[i];
+ if (entry->done)
+ continue;
+
+ assert(is_real(entry));
+
+ /* catch trivial copies */
+ if (entry->dest == entry->src) {
+ entry->done = true;
+ continue;
+ }
+
+ do_swap(b, entry);
+
+ /* Split any blocking copies whose sources are only partially
+ * contained within our destination.
+ */
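+      /* For example, after swapping 16-bit physreg 3, a pending 32-bit copy
+       * reading {2,3} must be split so that its physreg-3 half alone can be
+       * redirected to the swap's source in the update below.
+       */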
+ if (agx_size_align_16(entry->size) == 1) {
+ for (unsigned j = 0; j < ctx->entry_count; j++) {
+ struct agx_copy *blocking = &ctx->entries[j];
+
+ if (blocking->done)
+ continue;
+
+ if (blocking->src <= entry->dest &&
+ blocking->src + 1 >= entry->dest &&
+ agx_size_align_16(blocking->size) == 2) {
+ split_32bit_copy(ctx, blocking);
+ }
+ }
+ }
+
+ /* Update sources of blocking copies.
+ *
+ * Note: at this point, every blocking copy's source should be
+ * contained within our destination.
+ */
+ for (unsigned j = 0; j < ctx->entry_count; j++) {
+ struct agx_copy *blocking = &ctx->entries[j];
+ if (blocking->src >= entry->dest &&
+ blocking->src < entry->dest + agx_size_align_16(entry->size)) {
+ blocking->src = entry->src + (blocking->src - entry->dest);
+ }
+ }
+
+ entry->done = true;
+ }
+}
memcpy(block->regs_out, used_regs, sizeof(used_regs));
}
+/*
+ * Resolve an agx_index of type NORMAL or REGISTER to a physical register, once
+ * registers have been allocated for all SSA values.
+ */
+static unsigned
+agx_index_to_reg(uint8_t *ssa_to_reg, agx_index idx)
+{
+ if (idx.type == AGX_INDEX_NORMAL) {
+ return ssa_to_reg[idx.value];
+ } else {
+ assert(idx.type == AGX_INDEX_REGISTER);
+ return idx.value;
+ }
+}
+
void
agx_ra(agx_context *ctx)
{
agx_foreach_instr_global_safe(ctx, ins) {
/* Lower away RA pseudo-instructions */
if (ins->op == AGX_OPCODE_P_COMBINE) {
- /* TODO: Optimize out the moves! */
- assert(ins->dest[0].type == AGX_INDEX_NORMAL);
- enum agx_size common_size = ins->dest[0].size;
- unsigned base = ssa_to_reg[ins->dest[0].value];
- unsigned size = common_size == AGX_SIZE_32 ? 2 : 1;
-
- /* Move the sources */
- agx_builder b = agx_init_builder(ctx, agx_after_instr(ins));
-
- /* TODO: Eliminate the intermediate copy by handling parallel copies */
- for (unsigned i = 0; i < 4; ++i) {
- if (agx_is_null(ins->src[i])) continue;
- unsigned base = ins->src[i].value;
- if (ins->src[i].type == AGX_INDEX_NORMAL)
- base = ssa_to_reg[base];
- else
- assert(ins->src[i].type == AGX_INDEX_REGISTER);
-
- assert(ins->src[i].size == common_size);
+ unsigned base = agx_index_to_reg(ssa_to_reg, ins->dest[0]);
+ unsigned width = agx_size_align_16(ins->dest[0].size);
- agx_mov_to(&b, agx_register(124*2 + (i * size), common_size),
- agx_register(base, common_size));
- }
+ struct agx_copy copies[4];
+ unsigned n = 0;
+ /* Move the sources */
for (unsigned i = 0; i < 4; ++i) {
if (agx_is_null(ins->src[i])) continue;
- agx_index src = ins->src[i];
-
- if (src.type == AGX_INDEX_NORMAL)
- src = agx_register(alloc[src.value], src.size);
+ assert(ins->src[i].size == ins->dest[0].size);
- agx_mov_to(&b, agx_register(base + (i * size), common_size),
- agx_register(124*2 + (i * size), common_size));
+ copies[n++] = (struct agx_copy) {
+ .dest = base + (i * width),
+               .src = agx_index_to_reg(ssa_to_reg, ins->src[i]),
+ .size = ins->src[i].size
+ };
}
- /* We've lowered away, delete the old */
- agx_remove_instruction(ins);
+         /* Lower away the pseudo-instruction, replacing it with the copies */
+         agx_builder b = agx_init_builder(ctx, agx_after_instr(ins));
+         agx_emit_parallel_copies(&b, copies, n);
+         agx_remove_instruction(ins);
continue;
} else if (ins->op == AGX_OPCODE_P_EXTRACT) {
/* Uses the destination size */
- assert(ins->dest[0].type == AGX_INDEX_NORMAL);
- unsigned base = ins->src[0].value;
-
- if (ins->src[0].type != AGX_INDEX_REGISTER) {
- assert(ins->src[0].type == AGX_INDEX_NORMAL);
- base = alloc[base];
- }
-
- unsigned size = ins->dest[0].size == AGX_SIZE_64 ? 4 : ins->dest[0].size == AGX_SIZE_32 ? 2 : 1;
- unsigned left = ssa_to_reg[ins->dest[0].value];
- unsigned right = ssa_to_reg[ins->src[0].value] + (size * ins->imm);
+ unsigned size = agx_size_align_16(ins->dest[0].size);
+ unsigned left = agx_index_to_reg(ssa_to_reg, ins->dest[0]);
+ unsigned right = agx_index_to_reg(ssa_to_reg, ins->src[0]) + (size * ins->imm);
if (left != right) {
agx_builder b = agx_init_builder(ctx, agx_after_instr(ins));