/*
 * Copyright (C) 2021 Valve Corporation
 * Copyright (C) 2014 Rob Clark <robclark@freedesktop.org>
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */
+#include "ir3_ra.h"
+#include "ir3_shader.h"
+#include "util/rb_tree.h"
#include "util/u_math.h"
-#include "util/register_allocate.h"
-#include "util/ralloc.h"
-#include "util/bitset.h"
-#include "ir3.h"
-#include "ir3_shader.h"
-#include "ir3_ra.h"
/* This file implements an SSA-based register allocator. Unlike other
 * SSA-based allocators, it handles vector split/collect "smartly," meaning
 * that multiple values may share the same register interval. From the
 * perspective of the allocator itself, only the top-level intervals matter,
 * and the allocator is only concerned with allocating top-level intervals,
 * which may mean moving other top-level intervals around. Other intervals,
 * like the destination of a split instruction or the source of a collect
 * instruction, are "locked" to their parent interval. The details of this are
 * mostly handled by ir3_merge_regs and ir3_reg_ctx.
 *
 * We currently don't do any backtracking, but we do use the merge sets as a
 * form of affinity to try to avoid moves from phis/splits/collects. Each
 * merge set is what a more "classic" graph-coloring or live-range based
 * allocator would consider a single register, but here we use it as merely a
 * hint, except when multiple overlapping values are live at the same time.
 * Each merge set has a "preferred" register, and we try to honor that when
 * allocating values in the merge set.
 */

/* ir3_reg_ctx implementation. */
-#ifdef DEBUG
-#define RA_DEBUG (ir3_shader_debug & IR3_DBG_RAMSGS)
-#else
-#define RA_DEBUG 0
-#endif
-#define d(fmt, ...) do { if (RA_DEBUG) { \
- printf("RA: "fmt"\n", ##__VA_ARGS__); \
-} } while (0)
+static int
+ir3_reg_interval_cmp(const struct rb_node *node, const void *data)
+{
+ physreg_t reg = *(const physreg_t *)data;
+ const struct ir3_reg_interval *interval = ir3_rb_node_to_interval_const(node);
+ if (interval->reg->interval_start > reg)
+ return -1;
+ else if (interval->reg->interval_end <= reg)
+ return 1;
+ else
+ return 0;
+}
-#define di(instr, fmt, ...) do { if (RA_DEBUG) { \
- printf("RA: "fmt": ", ##__VA_ARGS__); \
- ir3_print_instr(instr); \
-} } while (0)
+static struct ir3_reg_interval *
+ir3_reg_interval_search(struct rb_tree *tree, unsigned offset)
+{
+ struct rb_node *node = rb_tree_search(tree, &offset, ir3_reg_interval_cmp);
+ return node ? ir3_rb_node_to_interval(node) : NULL;
+}
-/*
- * Register Assignment:
- *
- * Uses the register_allocate util, which implements graph coloring
- * algo with interference classes. To handle the cases where we need
- * consecutive registers (for example, texture sample instructions),
- * we model these as larger (double/quad/etc) registers which conflict
- * with the corresponding registers in other classes.
- *
- * Additionally we create additional classes for half-regs, which
- * do not conflict with the full-reg classes. We do need at least
- * sizes 1-4 (to deal w/ texture sample instructions output to half-
- * reg). At the moment we don't create the higher order half-reg
- * classes as half-reg frequently does not have enough precision
- * for texture coords at higher resolutions.
- *
- * There are some additional cases that we need to handle specially,
- * as the graph coloring algo doesn't understand "partial writes".
- * For example, a sequence like:
- *
- * add r0.z, ...
- * sam (f32)(xy)r0.x, ...
- * ...
- * sam (f32)(xyzw)r0.w, r0.x, ... ; 3d texture, so r0.xyz are coord
- *
- * In this scenario, we treat r0.xyz as class size 3, which is written
- * (from a use/def perspective) at the 'add' instruction and ignore the
- * subsequent partial writes to r0.xy. So the 'add r0.z, ...' is the
- * defining instruction, as it is the first to partially write r0.xyz.
- *
- * To address the fragmentation that this can potentially cause, a
- * two pass register allocation is used. After the first pass the
- * assignment of scalars is discarded, but the assignment of vecN (for
- * N > 1) is used to pre-color in the second pass, which considers
- * only scalars.
- *
- * Arrays of arbitrary size are handled via pre-coloring a consecutive
- * sequence of registers. Additional scalar (single component) reg
- * names are allocated starting at ctx->class_base[total_class_count]
- * (see arr->base), which are pre-colored. In the use/def graph direct
- * access is treated as a single element use/def, and indirect access
- * is treated as use or def of all array elements. (Only the first
- * def is tracked, in case of multiple indirect writes, etc.)
- *
- * TODO arrays that fit in one of the pre-defined class sizes should
- * not need to be pre-colored, but instead could be given a normal
- * vreg name. (Ignoring this for now since it is a good way to work
- * out the kinks with arbitrary sized arrays.)
- *
- * TODO might be easier for debugging to split this into two passes,
- * the first assigning vreg names in a way that we could ir3_print()
- * the result.
+static struct ir3_reg_interval *
+ir3_reg_interval_search_sloppy(struct rb_tree *tree, unsigned offset)
+{
+ struct rb_node *node = rb_tree_search_sloppy(tree, &offset, ir3_reg_interval_cmp);
+ return node ? ir3_rb_node_to_interval(node) : NULL;
+}
+
+/* Get the interval covering the reg, or the closest to the right if it
+ * doesn't exist.
*/
+static struct ir3_reg_interval *
+ir3_reg_interval_search_right(struct rb_tree *tree, unsigned offset)
+{
+ struct ir3_reg_interval *interval = ir3_reg_interval_search_sloppy(tree, offset);
+ if (!interval) {
+ return NULL;
+ } else if (interval->reg->interval_end > offset) {
+ return interval;
+ } else {
+ /* There is no interval covering reg, and ra_file_search_sloppy()
+ * returned the closest range to the left, so the next interval to the
+ * right should be the closest to the right.
+ */
+ return ir3_reg_interval_next_or_null(interval);
+ }
+}
+
+static int
+ir3_reg_interval_insert_cmp(const struct rb_node *_a, const struct rb_node *_b)
+{
+ const struct ir3_reg_interval *a = ir3_rb_node_to_interval_const(_a);
+ const struct ir3_reg_interval *b = ir3_rb_node_to_interval_const(_b);
+ return b->reg->interval_start - a->reg->interval_start;
+}
+static void
+interval_insert(struct ir3_reg_ctx *ctx, struct rb_tree *tree,
+ struct ir3_reg_interval *interval)
+{
+ struct ir3_reg_interval *right =
+ ir3_reg_interval_search_right(tree, interval->reg->interval_start);
+ if (right && right->reg->interval_start < interval->reg->interval_end) {
+ /* We disallow trees where different members have different half-ness.
+ * This means that we can't treat bitcasts as copies like normal
+ * split/collect, so something like this would require an extra copy
+ * in mergedregs mode, and count as 4 half-units of register pressure
+ * instead of 2:
+ *
+ * f16vec2 foo = unpackFloat2x16(bar)
+ * ... = foo.x
+ * ... = bar
+ *
+ * However, relaxing this rule would open a huge can of worms. What
+ * happens when there's a vector of 16 things, and the fifth element
+ * has been bitcasted as a half-reg? Would that element alone have to
+ * be small enough to be used as a half-reg source? Let's keep that
+ * can of worms firmly shut for now.
+ */
+ assert((interval->reg->flags & IR3_REG_HALF) ==
+ (right->reg->flags & IR3_REG_HALF));
-static struct ir3_instruction * name_to_instr(struct ir3_ra_ctx *ctx, unsigned name);
+ if (right->reg->interval_end <= interval->reg->interval_end &&
+ right->reg->interval_start >= interval->reg->interval_start) {
+ /* Check if we're inserting something that's already inserted */
+ assert(interval != right);
-static bool name_is_array(struct ir3_ra_ctx *ctx, unsigned name);
-static struct ir3_array * name_to_array(struct ir3_ra_ctx *ctx, unsigned name);
+ /* "right" is contained in "interval" and must become a child of
+ * it. There may be further children too.
+ */
+ for (struct ir3_reg_interval *next = ir3_reg_interval_next(right);
+ right && right->reg->interval_start < interval->reg->interval_end;
+ right = next, next = ir3_reg_interval_next_or_null(next)) {
+ /* "right" must be contained in "interval." */
+ assert(right->reg->interval_end <= interval->reg->interval_end);
+ assert((interval->reg->flags & IR3_REG_HALF) ==
+ (right->reg->flags & IR3_REG_HALF));
+ if (!right->parent)
+ ctx->interval_delete(ctx, right);
+ right->parent = interval;
+ rb_tree_remove(tree, &right->node);
+ rb_tree_insert(&interval->children, &right->node,
+ ir3_reg_interval_insert_cmp);
+ }
+ } else {
+ /* "right" must contain "interval," since intervals must form a
+ * tree.
+ */
+ assert(right->reg->interval_start <= interval->reg->interval_start);
+ interval->parent = right;
+ interval_insert(ctx, &right->children, interval);
+ return;
+ }
+ }
-/* does it conflict? */
-static inline bool
-intersects(unsigned a_start, unsigned a_end, unsigned b_start, unsigned b_end)
-{
- return !((a_start >= b_end) || (b_start >= a_end));
+ if (!interval->parent)
+ ctx->interval_add(ctx, interval);
+ rb_tree_insert(tree, &interval->node, ir3_reg_interval_insert_cmp);
+ interval->inserted = true;
}
-static bool
-instr_before(struct ir3_instruction *a, struct ir3_instruction *b)
+void
+ir3_reg_interval_insert(struct ir3_reg_ctx *ctx, struct ir3_reg_interval *interval)
{
- if (a->flags & IR3_INSTR_UNUSED)
- return false;
- return (a->ip < b->ip);
+ interval_insert(ctx, &ctx->intervals, interval);
}
-static struct ir3_instruction *
-get_definer(struct ir3_ra_ctx *ctx, struct ir3_instruction *instr,
- int *sz, int *off)
+void
+ir3_reg_interval_remove(struct ir3_reg_ctx *ctx, struct ir3_reg_interval *interval)
{
- struct ir3_ra_instr_data *id = &ctx->instrd[instr->ip];
- struct ir3_instruction *d = NULL;
+ if (interval->parent) {
+ rb_tree_remove(&interval->parent->children, &interval->node);
+ } else {
+ ctx->interval_delete(ctx, interval);
+ rb_tree_remove(&ctx->intervals, &interval->node);
+ }
+
+ rb_tree_foreach_safe(struct ir3_reg_interval, child, &interval->children, node) {
+ rb_tree_remove(&interval->children, &child->node);
+ child->parent = interval->parent;
- if (ctx->scalar_pass) {
- id->defn = instr;
- id->off = 0;
- id->sz = 1; /* considering things as N scalar regs now */
+ if (interval->parent) {
+ rb_tree_insert(&child->parent->children, &child->node,
+ ir3_reg_interval_insert_cmp);
+ } else {
+ ctx->interval_readd(ctx, interval, child);
+ rb_tree_insert(&ctx->intervals, &child->node,
+ ir3_reg_interval_insert_cmp);
+ }
}
- if (id->defn) {
- *sz = id->sz;
- *off = id->off;
- return id->defn;
+ interval->inserted = false;
+}
+
+void
+ir3_reg_interval_remove_all(struct ir3_reg_ctx *ctx, struct ir3_reg_interval *interval)
+{
+ assert(!interval->parent);
+
+ ctx->interval_delete(ctx, interval);
+ rb_tree_remove(&ctx->intervals, &interval->node);
+}
+
+static void
+interval_dump(struct ir3_reg_interval *interval, unsigned indent)
+{
+ for (unsigned i = 0; i < indent; i++)
+ printf("\t");
+ printf("reg %u start %u\n", interval->reg->name, interval->reg->interval_start);
+
+ rb_tree_foreach(struct ir3_reg_interval, child, &interval->children, node) {
+ interval_dump(child, indent + 1);
}
- if (instr->opc == OPC_META_COLLECT) {
- /* What about the case where collect is subset of array, we
- * need to find the distance between where actual array starts
- * and collect.. that probably doesn't happen currently.
- */
- int dsz, doff;
+ for (unsigned i = 0; i < indent; i++)
+ printf("\t");
+ printf("reg %u end %u\n", interval->reg->name, interval->reg->interval_end);
+}
- /* note: don't use foreach_ssa_src as this gets called once
- * while assigning regs (which clears SSA flag)
- */
- foreach_src_n (src, n, instr) {
- struct ir3_instruction *dd;
- if (!src->def)
- continue;
+void
+ir3_reg_interval_dump(struct ir3_reg_interval *interval)
+{
+ interval_dump(interval, 0);
+}
- dd = get_definer(ctx, src->def->instr, &dsz, &doff);
+/* These are the core datastructures used by the register allocator. First
+ * ra_interval and ra_file, which are used for intra-block tracking and use
+ * the ir3_reg_ctx infrastructure:
+ */
- if ((!d) || instr_before(dd, d)) {
- d = dd;
- *sz = dsz;
- *off = doff - n;
- }
- }
+struct ra_interval {
+ struct ir3_reg_interval interval;
- } else if (instr->cp.right || instr->cp.left) {
- /* covers also the meta:fo case, which ends up w/ single
- * scalar instructions for each component:
- */
- struct ir3_instruction *f = ir3_neighbor_first(instr);
+ struct rb_node physreg_node;
+ physreg_t physreg_start, physreg_end;
- /* by definition, the entire sequence forms one linked list
- * of single scalar register nodes (even if some of them may
- * be splits from a texture sample (for example) instr. We
- * just need to walk the list finding the first element of
- * the group defined (lowest ip)
- */
- int cnt = 0;
+ /* True if this is a source of the current instruction which is entirely
+ * killed. This means we can allocate the dest over it, but we can't break
+ * it up.
+ */
+ bool is_killed;
- /* need to skip over unused in the group: */
- while (f && (f->flags & IR3_INSTR_UNUSED)) {
- f = f->cp.right;
- cnt++;
- }
+ /* True if this interval cannot be moved from its position. This is only
+ * used for precolored inputs to ensure that other inputs don't get
+ * allocated on top of them.
+ */
+ bool frozen;
+};
- while (f) {
- if ((!d) || instr_before(f, d))
- d = f;
- if (f == instr)
- *off = cnt;
- f = f->cp.right;
- cnt++;
- }
+struct ra_file {
+ struct ir3_reg_ctx reg_ctx;
+
+ BITSET_DECLARE(available, RA_MAX_FILE_SIZE);
+ BITSET_DECLARE(available_to_evict, RA_MAX_FILE_SIZE);
+
+ struct rb_tree physreg_intervals;
+
+ unsigned size;
+ unsigned start;
+};
+
+/* State for inter-block tracking. When we split a live range to make space
+ * for a vector, we may need to insert fixup code when a block has multiple
+ * predecessors that have moved the same live value to different registers.
+ * This keeps track of state required to do that.
+ */
+
+struct ra_block_state {
+ /* Map of defining ir3_register -> physreg it was allocated to at the end
+ * of the block.
+ */
+ struct hash_table *renames;
+
+ /* For loops, we need to process a block before all its predecessors have
+ * been processed. In particular, we need to pick registers for values
+ * without knowing if all the predecessors have been renamed. This keeps
+ * track of the registers we chose so that when we visit the back-edge we
+ * can move them appropriately. If all predecessors have been visited
+ * before this block is visited then we don't need to fill this out. This
+ * is a map from ir3_register -> physreg.
+ */
+ struct hash_table *entry_regs;
+
+ /* True if the block has been visited and "renames" is complete.
+ */
+ bool visited;
+};
+
+struct ra_parallel_copy {
+ struct ra_interval *interval;
+ physreg_t src;
+};
+
+/* The main context: */
+
+struct ra_ctx {
+ /* r0.x - r47.w. On a6xx with merged-regs, hr0.x-hr47.w go into the bottom
+ * half of this file too.
+ */
+ struct ra_file full;
+
+ /* hr0.x - hr63.w, only used without merged-regs. */
+ struct ra_file half;
+
+ /* Shared regs. */
+ struct ra_file shared;
+
+ struct ir3_liveness *live;
+
+ struct ir3_block *block;
+
+ const struct ir3_compiler *compiler;
+ gl_shader_stage stage;
- *sz = cnt;
+ /* Pending moves of top-level intervals that will be emitted once we're
+ * finished:
+ */
+ DECLARE_ARRAY(struct ra_parallel_copy, parallel_copies);
+
+ struct ra_interval *intervals;
+ struct ra_block_state *blocks;
+
+ bool merged_regs;
+};
+
/* Iterators over a file's top-level intervals in physreg order.
 * Bug fix: the "_rev" variant previously expanded to the forward
 * rb_tree_foreach, silently iterating in the wrong direction; it must use
 * rb_tree_foreach_rev.
 */
#define foreach_interval(interval, file) \
   rb_tree_foreach(struct ra_interval, interval, &(file)->physreg_intervals, physreg_node)
#define foreach_interval_rev(interval, file) \
   rb_tree_foreach_rev(struct ra_interval, interval, &(file)->physreg_intervals, physreg_node)
#define foreach_interval_safe(interval, file) \
   rb_tree_foreach_safe(struct ra_interval, interval, &(file)->physreg_intervals, physreg_node)
#define foreach_interval_rev_safe(interval, file) \
   rb_tree_foreach_rev_safe(struct ra_interval, interval, &(file)->physreg_intervals, physreg_node)
+
+static struct ra_interval *
+rb_node_to_interval(struct rb_node *node)
+{
+ return rb_node_data(struct ra_interval, node, physreg_node);
+}
+
+static const struct ra_interval *
+rb_node_to_interval_const(const struct rb_node *node)
+{
+ return rb_node_data(struct ra_interval, node, physreg_node);
+}
+
+static struct ra_interval *
+ra_interval_next(struct ra_interval *interval)
+{
+ struct rb_node *next = rb_node_next(&interval->physreg_node);
+ return next ? rb_node_to_interval(next) : NULL;
+}
+static struct ra_interval *
+ra_interval_next_or_null(struct ra_interval *interval)
+{
+ return interval ? ra_interval_next(interval) : NULL;
+}
+
+static int
+ra_interval_cmp(const struct rb_node *node, const void *data)
+{
+ physreg_t reg = *(const physreg_t *)data;
+ const struct ra_interval *interval = rb_node_to_interval_const(node);
+ if (interval->physreg_start > reg)
+ return -1;
+ else if (interval->physreg_end <= reg)
+ return 1;
+ else
+ return 0;
+}
+
+static struct ra_interval *
+ra_interval_search_sloppy(struct rb_tree *tree, physreg_t reg)
+{
+ struct rb_node *node = rb_tree_search_sloppy(tree, ®, ra_interval_cmp);
+ return node ? rb_node_to_interval(node) : NULL;
+}
+
+/* Get the interval covering the reg, or the closest to the right if it
+ * doesn't exist.
+ */
+static struct ra_interval *
+ra_interval_search_right(struct rb_tree *tree, physreg_t reg)
+{
+ struct ra_interval *interval = ra_interval_search_sloppy(tree, reg);
+ if (!interval) {
+ return NULL;
+ } else if (interval->physreg_end > reg) {
+ return interval;
} else {
- /* second case is looking directly at the instruction which
- * produces multiple values (eg, texture sample), rather
- * than the split nodes that point back to that instruction.
- * This isn't quite right, because it may be part of a larger
- * group, such as:
- *
- * sam (f32)(xyzw)r0.x, ...
- * add r1.x, ...
- * add r1.y, ...
- * sam (f32)(xyzw)r2.x, r0.w <-- (r0.w, r1.x, r1.y)
- *
- * need to come up with a better way to handle that case.
+ /* There is no interval covering reg, and ra_file_search_sloppy()
+ * returned the closest range to the left, so the next interval to the
+ * right should be the closest to the right.
*/
- if (instr->address) {
- *sz = instr->regs[0]->size;
- } else {
- *sz = util_last_bit(instr->regs[0]->wrmask);
- }
- *off = 0;
- d = instr;
+ return ra_interval_next_or_null(interval);
}
+}
+
+static struct ra_interval *
+ra_file_search_right(struct ra_file *file, physreg_t reg)
+{
+ return ra_interval_search_right(&file->physreg_intervals, reg);
+}
+
+static int
+ra_interval_insert_cmp(const struct rb_node *_a, const struct rb_node *_b)
+{
+ const struct ra_interval *a = rb_node_to_interval_const(_a);
+ const struct ra_interval *b = rb_node_to_interval_const(_b);
+ return b->physreg_start - a->physreg_start;
+}
+
+static struct ra_interval *
+ir3_reg_interval_to_ra_interval(struct ir3_reg_interval *interval)
+{
+ return rb_node_data(struct ra_interval, interval, interval);
+}
- if (d->opc == OPC_META_SPLIT) {
- struct ir3_instruction *dd;
- int dsz, doff;
+static struct ra_file *
+ir3_reg_ctx_to_file(struct ir3_reg_ctx *ctx)
+{
+ return rb_node_data(struct ra_file, ctx, reg_ctx);
+}
- dd = get_definer(ctx, d->regs[1]->def->instr, &dsz, &doff);
+static void
+interval_add(struct ir3_reg_ctx *ctx, struct ir3_reg_interval *_interval)
+{
+ struct ra_interval *interval = ir3_reg_interval_to_ra_interval(_interval);
+ struct ra_file *file = ir3_reg_ctx_to_file(ctx);
- /* by definition, should come before: */
- ra_assert(ctx, instr_before(dd, d));
+ /* We can assume in this case that physreg_start/physreg_end is already
+ * initialized.
+ */
+ for (physreg_t i = interval->physreg_start; i < interval->physreg_end; i++) {
+ BITSET_CLEAR(file->available, i);
+ BITSET_CLEAR(file->available_to_evict, i);
+ }
- *sz = MAX2(*sz, dsz);
+ rb_tree_insert(&file->physreg_intervals, &interval->physreg_node,
+ ra_interval_insert_cmp);
+}
- if (instr->opc == OPC_META_SPLIT)
- *off = MAX2(*off, instr->split.off);
+static void
+interval_delete(struct ir3_reg_ctx *ctx, struct ir3_reg_interval *_interval)
+{
+ struct ra_interval *interval = ir3_reg_interval_to_ra_interval(_interval);
+ struct ra_file *file = ir3_reg_ctx_to_file(ctx);
- d = dd;
+ for (physreg_t i = interval->physreg_start; i < interval->physreg_end; i++) {
+ BITSET_SET(file->available, i);
+ BITSET_SET(file->available_to_evict, i);
}
- ra_assert(ctx, d->opc != OPC_META_SPLIT);
+ rb_tree_remove(&file->physreg_intervals, &interval->physreg_node);
+}
+
+static void
+interval_readd(struct ir3_reg_ctx *ctx, struct ir3_reg_interval *_parent,
+ struct ir3_reg_interval *_child)
+{
+ struct ra_interval *parent = ir3_reg_interval_to_ra_interval(_parent);
+ struct ra_interval *child = ir3_reg_interval_to_ra_interval(_child);
- id->defn = d;
- id->sz = *sz;
- id->off = *off;
+ child->physreg_start = parent->physreg_start +
+ (child->interval.reg->interval_start - parent->interval.reg->interval_start);
+ child->physreg_end = child->physreg_start +
+ (child->interval.reg->interval_end - child->interval.reg->interval_start);
- return d;
+ interval_add(ctx, _child);
}
+
static void
-ra_block_find_definers(struct ir3_ra_ctx *ctx, struct ir3_block *block)
+ra_file_init(struct ra_file *file)
{
- foreach_instr (instr, &block->instr_list) {
- struct ir3_ra_instr_data *id = &ctx->instrd[instr->ip];
- if (instr->regs_count == 0)
- continue;
- /* couple special cases: */
- if (writes_addr0(instr) || writes_addr1(instr) || writes_pred(instr)) {
- id->cls = -1;
- } else if (instr->regs[0]->flags & IR3_REG_ARRAY) {
- id->cls = total_class_count;
- } else {
- /* and the normal case: */
- id->defn = get_definer(ctx, instr, &id->sz, &id->off);
- id->cls = ra_size_to_class(id->sz, is_half(id->defn), is_shared(id->defn));
-
- /* this is a bit of duct-tape.. if we have a scenario like:
- *
- * sam (f32)(x) out.x, ...
- * sam (f32)(x) out.y, ...
- *
- * Then the fanout/split meta instructions for the two different
- * tex instructions end up grouped as left/right neighbors. The
- * upshot is that in when you get_definer() on one of the meta:fo's
- * you get definer as the first sam with sz=2, but when you call
- * get_definer() on the either of the sam's you get itself as the
- * definer with sz=1.
- *
- * (We actually avoid this scenario exactly, the neighbor links
- * prevent one of the output mov's from being eliminated, so this
- * hack should be enough. But probably we need to rethink how we
- * find the "defining" instruction.)
- *
- * TODO how do we figure out offset properly...
- */
- if (id->defn != instr) {
- struct ir3_ra_instr_data *did = &ctx->instrd[id->defn->ip];
- if (did->sz < id->sz) {
- did->sz = id->sz;
- did->cls = id->cls;
- }
- }
- }
+ for (unsigned i = 0; i < file->size; i++) {
+ BITSET_SET(file->available, i);
+ BITSET_SET(file->available_to_evict, i);
}
+
+ file->start = 0;
+
+ rb_tree_init(&file->reg_ctx.intervals);
+ rb_tree_init(&file->physreg_intervals);
+
+ file->reg_ctx.interval_add = interval_add;
+ file->reg_ctx.interval_delete = interval_delete;
+ file->reg_ctx.interval_readd = interval_readd;
}
-/* give each instruction a name (and ip), and count up the # of names
- * of each class
- */
static void
-ra_block_name_instructions(struct ir3_ra_ctx *ctx, struct ir3_block *block)
+ra_file_insert(struct ra_file *file, struct ra_interval *interval)
{
- foreach_instr (instr, &block->instr_list) {
- struct ir3_ra_instr_data *id = &ctx->instrd[instr->ip];
+ assert(interval->physreg_start < interval->physreg_end);
+ assert(interval->physreg_end <= file->size);
+ if (interval->interval.reg->flags & IR3_REG_HALF)
+ assert(interval->physreg_end <= RA_HALF_SIZE);
-#ifdef DEBUG
- instr->name = ~0;
-#endif
+ ir3_reg_interval_insert(&file->reg_ctx, &interval->interval);
+}
- ctx->instr_cnt++;
+static void
+ra_file_remove(struct ra_file *file, struct ra_interval *interval)
+{
+ ir3_reg_interval_remove(&file->reg_ctx, &interval->interval);
+}
- if (!writes_gpr(instr))
- continue;
+static void
+ra_file_mark_killed(struct ra_file *file, struct ra_interval *interval)
+{
+ assert(!interval->interval.parent);
- if (id->defn != instr)
- continue;
+ for (physreg_t i = interval->physreg_start; i < interval->physreg_end; i++) {
+ BITSET_SET(file->available, i);
+ }
- /* In scalar pass, collect/split don't get their own names,
- * but instead inherit them from their src(s):
- *
- * Possibly we don't need this because of scalar_name(), but
- * it does make the ir3_print() dumps easier to read.
- */
- if (ctx->scalar_pass) {
- if (instr->opc == OPC_META_SPLIT) {
- instr->name = instr->regs[1]->def->instr->name + instr->split.off;
- continue;
- }
+ interval->is_killed = true;
+}
- if (instr->opc == OPC_META_COLLECT) {
- instr->name = instr->regs[1]->def->instr->name;
- continue;
- }
- }
+static physreg_t
+ra_interval_get_physreg(const struct ra_interval *interval)
+{
+ unsigned child_start = interval->interval.reg->interval_start;
- /* arrays which don't fit in one of the pre-defined class
- * sizes are pre-colored:
- */
- if ((id->cls >= 0) && (id->cls < total_class_count)) {
- /* in the scalar pass, we generate a name for each
- * scalar component, instr->name is the name of the
- * first component.
- */
- unsigned n = ctx->scalar_pass ? dest_regs(instr) : 1;
- instr->name = ctx->class_alloc_count[id->cls];
- ctx->class_alloc_count[id->cls] += n;
- ctx->alloc_count += n;
- }
+ while (interval->interval.parent) {
+ interval = ir3_reg_interval_to_ra_interval(interval->interval.parent);
}
+
+ return interval->physreg_start +
+ (child_start - interval->interval.reg->interval_start);
+}
+
+static unsigned
+ra_interval_get_num(const struct ra_interval *interval)
+{
+ return ra_physreg_to_num(ra_interval_get_physreg(interval),
+ interval->interval.reg->flags);
}
-/**
- * Set a value for max register target.
- *
- * Currently this just rounds up to a multiple of full-vec4 (ie. the
- * granularity that we configure the hw for.. there is no point to
- * using r3.x if you aren't going to make r3.yzw available). But
- * in reality there seems to be multiple thresholds that affect the
- * number of waves.. and we should round up the target to the next
- * threshold when we round-robin registers, to give postsched more
- * options. When we understand that better, this is where we'd
- * implement that.
- */
static void
-ra_set_register_target(struct ir3_ra_ctx *ctx, unsigned max_target)
+ra_interval_init(struct ra_interval *interval, struct ir3_register *reg)
{
- const unsigned hvec4 = 4;
- const unsigned vec4 = 2 * hvec4;
+ ir3_reg_interval_init(&interval->interval, reg);
+ interval->is_killed = false;
+ interval->frozen = false;
+}
- ctx->max_target = align(max_target, vec4);
+static void
+ra_interval_dump(struct ra_interval *interval)
+{
+ printf("physreg %u ", interval->physreg_start);
- d("New max_target=%u", ctx->max_target);
+ ir3_reg_interval_dump(&interval->interval);
}
-static int
-pick_in_range(BITSET_WORD *regs, unsigned min, unsigned max)
+static void
+ra_file_dump(struct ra_file *file)
{
- for (unsigned i = min; i <= max; i++) {
- if (BITSET_TEST(regs, i)) {
- return i;
- }
+ rb_tree_foreach(struct ra_interval, interval, &file->physreg_intervals, physreg_node) {
+ ra_interval_dump(interval);
}
- return -1;
+
+ unsigned start, end;
+ printf("available:\n");
+ BITSET_FOREACH_RANGE(start, end, file->available, file->size) {
+ printf("%u-%u ", start, end);
+ }
+ printf("\n");
+
+ printf("available to evict:\n");
+ BITSET_FOREACH_RANGE(start, end, file->available_to_evict, file->size) {
+ printf("%u-%u ", start, end);
+ }
+ printf("\n");
+ printf("start: %u\n", file->start);
}
-static int
-pick_in_range_rev(BITSET_WORD *regs, int min, int max)
+static void
+ra_ctx_dump(struct ra_ctx *ctx)
{
- for (int i = max; i >= min; i--) {
- if (BITSET_TEST(regs, i)) {
- return i;
- }
+ printf("full:\n");
+ ra_file_dump(&ctx->full);
+ printf("half:\n");
+ ra_file_dump(&ctx->half);
+ printf("shared:\n");
+ ra_file_dump(&ctx->shared);
+}
+
+static unsigned
+reg_file_size(struct ra_file *file, struct ir3_register *reg)
+{
+ /* Half-regs can only take up the first half of the combined regfile */
+ if (reg->flags & IR3_REG_HALF)
+ return MIN2(file->size, RA_HALF_SIZE);
+ else
+ return file->size;
+}
+
+/* ra_pop_interval/ra_push_interval provide an API to shuffle around multiple
+ * top-level intervals at once. Pop multiple intervals, then push them back in
+ * any order.
+ */
+
+struct ra_removed_interval {
+ struct ra_interval *interval;
+ unsigned size;
+};
+
+static struct ra_removed_interval
+ra_pop_interval(struct ra_ctx *ctx, struct ra_file *file,
+ struct ra_interval *interval)
+{
+ assert(!interval->interval.parent);
+
+ /* Check if we've already moved this reg before */
+ unsigned pcopy_index;
+ for (pcopy_index = 0; pcopy_index < ctx->parallel_copies_count; pcopy_index++) {
+ if (ctx->parallel_copies[pcopy_index].interval == interval)
+ break;
+ }
+
+ if (pcopy_index == ctx->parallel_copies_count) {
+ array_insert(ctx, ctx->parallel_copies, (struct ra_parallel_copy) {
+ .interval = interval,
+ .src = interval->physreg_start,
+ });
}
- return -1;
+
+ ir3_reg_interval_remove_all(&file->reg_ctx, &interval->interval);
+
+ return (struct ra_removed_interval) {
+ .interval = interval,
+ .size = interval->physreg_end - interval->physreg_start,
+ };
}
-/* register selector for the a6xx+ merged register file: */
-static unsigned int
-ra_select_reg_merged(unsigned int n, BITSET_WORD *regs, void *data)
+static void
+ra_push_interval(struct ra_ctx *ctx, struct ra_file *file,
+ const struct ra_removed_interval *removed, physreg_t dst)
{
- struct ir3_ra_ctx *ctx = data;
- struct ra_class *classp = ra_get_node_class(ctx->g, n);
- unsigned int class = ra_class_index(classp);
- bool half, shared;
- int sz = ra_class_to_size(class, &half, &shared);
+ struct ra_interval *interval = removed->interval;
- assert (sz > 0);
+ interval->physreg_start = dst;
+ interval->physreg_end = dst + removed->size;
- /* dimensions within the register class: */
- unsigned max_target, start;
+ ir3_reg_interval_insert(&file->reg_ctx, &interval->interval);
+}
- /* the regs bitset will include *all* of the virtual regs, but we lay
- * out the different classes consecutively in the virtual register
- * space. So we just need to think about the base offset of a given
- * class within the virtual register space, and offset the register
- * space we search within by that base offset.
- */
- unsigned base;
-
- /* TODO I think eventually we want to round-robin in vector pass
- * as well, but needs some more work to calculate # of live vals
- * for this. (Maybe with some work, we could just figure out
- * the scalar target and use that, since that is what we care
- * about in the end.. but that would mean setting up use-def/
- * liveranges for scalar pass before doing vector pass.)
- *
- * For now, in the vector class, just move assignments for scalar
- * vals higher to hopefully prevent them from limiting where vecN
- * values can be placed. Since the scalar values are re-assigned
- * in the 2nd pass, we don't really care where they end up in the
- * vector pass.
- */
- if (!ctx->scalar_pass) {
- base = ctx->set->gpr_to_ra_reg[class][0];
- if (shared) {
- max_target = SHARED_CLASS_REGS(class - SHARED_OFFSET);
- } else if (half) {
- max_target = HALF_CLASS_REGS(class - HALF_OFFSET);
- } else {
- max_target = CLASS_REGS(class);
+/* Pick up the interval and place it at "dst". */
+static void
+ra_move_interval(struct ra_ctx *ctx, struct ra_file *file,
+ struct ra_interval *interval, physreg_t dst)
+{
+ struct ra_removed_interval temp = ra_pop_interval(ctx, file, interval);
+ ra_push_interval(ctx, file, &temp, dst);
+}
+
+static bool
+get_reg_specified(struct ra_file *file, struct ir3_register *reg, physreg_t physreg, bool is_source)
+{
+ for (unsigned i = 0; i < reg_size(reg); i++) {
+ if (!BITSET_TEST(is_source ? file->available_to_evict : file->available, physreg + i))
+ return false;
+ }
+
+ return true;
+}
+
+/* Try to evict any registers conflicting with the proposed spot "physreg" for
+ * "reg". That is, move them to other places so that we can allocate "physreg"
+ * here.
+ */
+
+static bool
+try_evict_regs(struct ra_ctx *ctx, struct ra_file *file,
+ struct ir3_register *reg, physreg_t physreg,
+ unsigned *_eviction_count, bool is_source, bool speculative)
+{
+ BITSET_DECLARE(available_to_evict, RA_MAX_FILE_SIZE);
+ memcpy(available_to_evict, file->available_to_evict, sizeof(available_to_evict));
+
+ for (unsigned i = 0; i < reg_size(reg); i++)
+ BITSET_CLEAR(available_to_evict, physreg + i);
+
+ unsigned eviction_count = 0;
+ /* Iterate over each range conflicting with physreg */
+ for (struct ra_interval *conflicting = ra_file_search_right(file, physreg),
+ *next = ra_interval_next_or_null(conflicting);
+ conflicting != NULL && conflicting->physreg_start < physreg + reg_size(reg);
+ conflicting = next, next = ra_interval_next_or_null(next)) {
+ if (!is_source && conflicting->is_killed)
+ continue;
+
+ if (conflicting->frozen) {
+ assert(speculative);
+ return false;
}
- if ((sz == 1) && !shared) {
- return pick_in_range_rev(regs, base, base + max_target);
- } else {
- return pick_in_range(regs, base, base + max_target);
+ unsigned avail_start, avail_end;
+ bool evicted = false;
+ BITSET_FOREACH_RANGE(avail_start, avail_end, available_to_evict,
+ reg_file_size(file, conflicting->interval.reg)) {
+ unsigned size = avail_end - avail_start;
+
+ /* non-half registers must be aligned */
+ if (!(conflicting->interval.reg->flags & IR3_REG_HALF) && avail_start % 2 == 1) {
+ avail_start++;
+ size--;
+ }
+
+ if (size >= conflicting->physreg_end - conflicting->physreg_start) {
+ for (unsigned i = 0; i < conflicting->physreg_end - conflicting->physreg_start; i++)
+ BITSET_CLEAR(available_to_evict, avail_start + i);
+ eviction_count += conflicting->physreg_end - conflicting->physreg_start;
+ if (!speculative)
+ ra_move_interval(ctx, file, conflicting, avail_start);
+ evicted = true;
+ break;
+ }
}
- } else {
- ra_assert(ctx, sz == 1);
+
+ if (!evicted)
+ return false;
}
- /* NOTE: this is only used in scalar pass, so the register
- * class will be one of the scalar classes (ie. idx==0):
+ *_eviction_count = eviction_count;
+ return true;
+}
+
+static int removed_interval_cmp(const void *_i1, const void *_i2)
+{
+ const struct ra_removed_interval *i1 = _i1;
+ const struct ra_removed_interval *i2 = _i2;
+
+ /* We sort the registers as follows:
+ *
+ * |--------------------------------------------------------------------|
+ * | | | | |
+ * | Half live-through | Half killed | Full killed | Full live-through |
+ * | | | | |
+ * |--------------------------------------------------------------------|
+ * | |
+ * | Destination |
+ * | |
+ * |-----------------|
+ *
+ * Half-registers have to be first so that they stay in the low half of
+ * the register file. Then half and full killed must stay together so that
+ * there's a contiguous range where we can put the register. With this
+ * structure we should be able to accommodate any collection of intervals
+ * such that the total number of half components is within the half limit
+ * and the combined components are within the full limit.
*/
- base = ctx->set->gpr_to_ra_reg[class][0];
- if (shared) {
- max_target = SHARED_CLASS_REGS(0);
- start = 0;
- } else if (half) {
- max_target = ctx->max_target;
- start = ctx->start_search_reg;
+
+ unsigned i1_align = reg_elem_size(i1->interval->interval.reg);
+ unsigned i2_align = reg_elem_size(i2->interval->interval.reg);
+ if (i1_align > i2_align)
+ return 1;
+ if (i1_align < i2_align)
+ return -1;
+
+ if (i1_align == 1) {
+ if (i2->interval->is_killed)
+ return -1;
+ if (i1->interval->is_killed)
+ return 1;
} else {
- max_target = ctx->max_target / 2;
- start = ctx->start_search_reg;
+ if (i2->interval->is_killed)
+ return 1;
+ if (i1->interval->is_killed)
+ return -1;
}
- /* For cat4 instructions, if the src reg is already assigned, and
- * avail to pick, use it. Because this doesn't introduce unnecessary
- * dependencies, and it potentially avoids needing (ss) syncs to
- * for write after read hazards:
- */
- struct ir3_instruction *instr = name_to_instr(ctx, n);
- if (is_sfu(instr)) {
- struct ir3_register *src = instr->regs[1];
- int src_n;
-
- if ((src->flags & IR3_REG_ARRAY) && !(src->flags & IR3_REG_RELATIV)) {
- struct ir3_array *arr = ir3_lookup_array(ctx->ir, src->array.id);
- src_n = arr->base + src->array.offset;
+ return 0;
+}
+
+/* "Compress" all the live intervals so that there is enough space for the
+ * destination register. As there can be gaps when a more-aligned interval
+ * follows a less-aligned interval, this also sorts them to remove such
+ * "padding", which may be required when space is very tight. This isn't
+ * amazing, but should be used only as a last resort in case the register file
+ * is almost full and badly fragmented.
+ *
+ * Return the physreg to use.
+ */
+static physreg_t
+compress_regs_left(struct ra_ctx *ctx, struct ra_file *file, unsigned size,
+ unsigned align, bool is_source)
+{
+ DECLARE_ARRAY(struct ra_removed_interval, intervals);
+ intervals_count = intervals_sz = 0;
+ intervals = NULL;
+
+ unsigned removed_full_size = 0;
+ unsigned removed_half_size = 0;
+ unsigned file_size = align == 1 ? MIN2(file->size, RA_HALF_SIZE) : file->size;
+ physreg_t start_reg = 0;
+
+ foreach_interval_rev_safe(interval, file) {
+ /* Check if we can sort the intervals *after* this one and have
+ * enough space leftover to accommodate "size" units.
+ */
+ if (align == 1) {
+ if (interval->physreg_end + removed_half_size <= file_size - size) {
+ start_reg = interval->physreg_end;
+ break;
+ }
} else {
- src_n = scalar_name(ctx, src->def->instr, 0);
+ if (interval->physreg_end + removed_half_size <= file_size -
+ removed_full_size - size) {
+ start_reg = interval->physreg_end;
+ break;
+ }
}
- unsigned reg = ra_get_node_reg(ctx->g, src_n);
+ /* We assume that all frozen intervals are at the start and that we
+ * can avoid popping them.
+ */
+ assert(!interval->frozen);
- /* Check if the src register has been assigned yet: */
- if (reg != NO_REG) {
- if (BITSET_TEST(regs, reg)) {
- return reg;
- }
+ /* Killed sources don't count because they go at the end and can
+ * overlap the register we're trying to add.
+ */
+ if (!interval->is_killed && !is_source) {
+ if (interval->interval.reg->flags & IR3_REG_HALF)
+ removed_half_size += interval->physreg_end - interval->physreg_start;
+ else
+ removed_full_size += interval->physreg_end - interval->physreg_start;
}
- }
- int r = pick_in_range(regs, base + start, base + max_target);
- if (r < 0) {
- /* wrap-around: */
- r = pick_in_range(regs, base, base + start);
+ /* Now that we've done the accounting, pop this off */
+ d("popping interval %u physreg %u\n", interval->interval.reg->name, interval->physreg_start);
+ array_insert(ctx, intervals, ra_pop_interval(ctx, file, interval));
}
- if (r < 0) {
- /* overflow, we need to increase max_target: */
- ra_set_register_target(ctx, ctx->max_target + 1);
- return ra_select_reg_merged(n, regs, data);
+ /* TODO: In addition to skipping registers at the beginning that are
+ * well-packed, we should try to skip registers at the end.
+ */
+
+ qsort(intervals, intervals_count, sizeof(*intervals), removed_interval_cmp);
+
+ physreg_t physreg = start_reg;
+ physreg_t ret_reg = (physreg_t) ~0;
+ for (unsigned i = 0; i < intervals_count; i++) {
+ if (ret_reg == (physreg_t) ~0 &&
+ ((intervals[i].interval->is_killed && !is_source) ||
+ !(intervals[i].interval->interval.reg->flags & IR3_REG_HALF))) {
+ ret_reg = ALIGN(physreg, align);
+ }
+
+ if (ret_reg != (physreg_t) ~0 &&
+ (is_source || !intervals[i].interval->is_killed)) {
+ physreg = MAX2(physreg, ret_reg + size);
+ }
+
+ if (!(intervals[i].interval->interval.reg->flags & IR3_REG_HALF)) {
+ physreg = ALIGN(physreg, 2);
+ }
+
+ if (physreg + intervals[i].size >
+ reg_file_size(file, intervals[i].interval->interval.reg)) {
+ d("ran out of room for interval %u!\n", intervals[i].interval->interval.reg->name);
+ unreachable("reg pressure calculation was wrong!");
+ return 0;
+ }
+
+ d("pushing interval %u physreg %u\n", intervals[i].interval->interval.reg->name, physreg);
+ ra_push_interval(ctx, file, &intervals[i], physreg);
+
+ physreg += intervals[i].size;
}
- if (classp == ctx->set->half_classes[0]) {
- int n = r - base;
- ctx->start_search_reg = (n + 1) % ctx->max_target;
- } else if (classp == ctx->set->classes[0]) {
- int n = (r - base) * 2;
- ctx->start_search_reg = (n + 1) % ctx->max_target;
+ if (ret_reg == (physreg_t) ~0)
+ ret_reg = physreg;
+
+ ret_reg = ALIGN(ret_reg, align);
+ if (ret_reg + size > file_size) {
+ d("ran out of room for the new interval!\n");
+ unreachable("reg pressure calculation was wrong!");
+ return 0;
}
- return r;
+ return ret_reg;
}
static void
-ra_init(struct ir3_ra_ctx *ctx)
+update_affinity(struct ir3_register *reg, physreg_t physreg)
+{
+ if (!reg->merge_set || reg->merge_set->preferred_reg != (physreg_t) ~0)
+ return;
+
+ if (physreg < reg->merge_set_offset)
+ return;
+
+ reg->merge_set->preferred_reg = physreg - reg->merge_set_offset;
+}
+
+/* Try to find free space for a register without shuffling anything. This uses
+ * a round-robin algorithm to reduce false dependencies.
+ */
+static physreg_t
+find_best_gap(struct ra_file *file, unsigned file_size,
+ unsigned size, unsigned align, bool is_source)
{
- unsigned n, base;
+ BITSET_WORD *available = is_source ? file->available_to_evict : file->available;
+
+ unsigned start = ALIGN(file->start, align) % (file_size - size + align);
+ unsigned candidate = start;
+ do {
+ bool is_available = true;
+ for (unsigned i = 0; i < size; i++) {
+ if (!BITSET_TEST(available, candidate + i)) {
+ is_available = false;
+ break;
+ }
+ }
- ir3_clear_mark(ctx->ir);
- n = ir3_count_instructions_ra(ctx->ir);
+ if (is_available) {
+ file->start = (candidate + size) % file_size;
+ return candidate;
+ }
- ctx->instrd = rzalloc_array(NULL, struct ir3_ra_instr_data, n);
+ candidate += align;
+ if (candidate + size > file_size)
+ candidate = 0;
+ } while (candidate != start);
+
+ return (physreg_t) ~0;
+}
- foreach_block (block, &ctx->ir->block_list) {
- ra_block_find_definers(ctx, block);
- }
+static struct ra_file *
+ra_get_file(struct ra_ctx *ctx, struct ir3_register *reg)
+{
+ if (reg->flags & IR3_REG_SHARED)
+ return &ctx->shared;
+ else if (ctx->merged_regs || !(reg->flags & IR3_REG_HALF))
+ return &ctx->full;
+ else
+ return &ctx->half;
+}
- foreach_block (block, &ctx->ir->block_list) {
- ra_block_name_instructions(ctx, block);
+/* This is the main entrypoint for picking a register. Pick a free register
+ * for "reg", shuffling around sources if necessary. In the normal case where
+ * "is_source" is false, this register can overlap with killed sources
+ * (intervals with "is_killed == true"). If "is_source" is true, then
+ * is_killed is ignored and the register returned must not overlap with killed
+ * sources. This must be used for tied registers, because we're actually
+ * allocating the destination and the tied source at the same time.
+ */
+
+static physreg_t
+get_reg(struct ra_ctx *ctx, struct ra_file *file, struct ir3_register *reg,
+ bool is_source)
+{
+ unsigned file_size = reg_file_size(file, reg);
+ if (reg->merge_set && reg->merge_set->preferred_reg != (physreg_t) ~0) {
+ physreg_t preferred_reg =
+ reg->merge_set->preferred_reg + reg->merge_set_offset;
+ if (preferred_reg < file_size &&
+ preferred_reg % reg_elem_size(reg) == 0 &&
+ get_reg_specified(file, reg, preferred_reg, is_source))
+ return preferred_reg;
}
- /* figure out the base register name for each class. The
- * actual ra name is class_base[cls] + instr->name;
+ /* If this register is a subset of a merge set which we have not picked a
+ * register for, first try to allocate enough space for the entire merge
+ * set.
*/
- ctx->class_base[0] = 0;
- for (unsigned i = 1; i <= total_class_count; i++) {
- ctx->class_base[i] = ctx->class_base[i-1] +
- ctx->class_alloc_count[i-1];
+ unsigned size = reg_size(reg);
+ if (reg->merge_set && reg->merge_set->preferred_reg == (physreg_t)~0 &&
+ size < reg->merge_set->size) {
+ physreg_t best_reg =
+ find_best_gap(file, file_size, reg->merge_set->size, reg->merge_set->alignment, is_source);
+ if (best_reg != (physreg_t) ~0u) {
+ best_reg += reg->merge_set_offset;
+ return best_reg;
+ }
}
- /* and vreg names for array elements: */
- base = ctx->class_base[total_class_count];
- foreach_array (arr, &ctx->ir->array_list) {
- arr->base = base;
- ctx->class_alloc_count[total_class_count] += arr->length;
- base += arr->length;
+ /* For ALU and SFU instructions, if the src reg is avail to pick, use it.
+ * Because this doesn't introduce unnecessary dependencies, and it
+ * potentially avoids needing (ss) syncs for write after read hazards for
+ * SFU instructions:
+ */
+ if (is_sfu(reg->instr) || is_alu(reg->instr)) {
+ for (unsigned i = 1; i < reg->instr->regs_count; i++) {
+ struct ir3_register *src = reg->instr->regs[i];
+ if (!ra_reg_is_src(src))
+ continue;
+ if (ra_get_file(ctx, src) == file && reg_size(src) >= size) {
+ struct ra_interval *src_interval =
+ &ctx->intervals[src->def->name];
+ physreg_t src_physreg = ra_interval_get_physreg(src_interval);
+ if (src_physreg % reg_elem_size(reg) == 0 &&
+ src_physreg + size <= file_size &&
+ get_reg_specified(file, reg, src_physreg, is_source))
+ return src_physreg;
+ }
+ }
}
- ctx->alloc_count += ctx->class_alloc_count[total_class_count];
-
- /* Add vreg names for r0.xyz */
- ctx->r0_xyz_nodes = ctx->alloc_count;
- ctx->alloc_count += 3;
- ctx->hr0_xyz_nodes = ctx->alloc_count;
- ctx->alloc_count += 3;
- /* Add vreg name for prefetch-exclusion range: */
- ctx->prefetch_exclude_node = ctx->alloc_count++;
+ physreg_t best_reg =
+ find_best_gap(file, file_size, size, reg_elem_size(reg), is_source);
+ if (best_reg != (physreg_t) ~0u) {
+ return best_reg;
+ }
- if (RA_DEBUG) {
- d("INSTRUCTION VREG NAMES:");
- foreach_block (block, &ctx->ir->block_list) {
- foreach_instr (instr, &block->instr_list) {
- if (!ctx->instrd[instr->ip].defn)
- continue;
- if (!writes_gpr(instr))
- continue;
- di(instr, "%04u", scalar_name(ctx, instr, 0));
+ /* Ok, we couldn't find anything that fits. Here is where we have to start
+ * moving things around to make stuff fit. First try solely evicting
+ * registers in the way.
+ */
+ unsigned best_eviction_count = ~0;
+ for (physreg_t i = 0; i + size <= file_size; i += reg_elem_size(reg)) {
+ unsigned eviction_count;
+ if (try_evict_regs(ctx, file, reg, i, &eviction_count, is_source, true)) {
+ if (eviction_count < best_eviction_count) {
+ best_eviction_count = eviction_count;
+ best_reg = i;
}
}
- d("ARRAY VREG NAMES:");
- foreach_array (arr, &ctx->ir->array_list) {
- d("%04u: arr%u", arr->base, arr->id);
- }
- d("EXTRA VREG NAMES:");
- d("%04u: r0_xyz_nodes", ctx->r0_xyz_nodes);
- d("%04u: hr0_xyz_nodes", ctx->hr0_xyz_nodes);
- d("%04u: prefetch_exclude_node", ctx->prefetch_exclude_node);
+ }
+
+ if (best_eviction_count != ~0) {
+ ASSERTED bool result =
+ try_evict_regs(ctx, file, reg, best_reg, &best_eviction_count, is_source, false);
+ assert(result);
+ return best_reg;
}
- ctx->g = ra_alloc_interference_graph(ctx->set->regs, ctx->alloc_count);
- ralloc_steal(ctx->g, ctx->instrd);
- ctx->def = rzalloc_array(ctx->g, unsigned, ctx->alloc_count);
- ctx->use = rzalloc_array(ctx->g, unsigned, ctx->alloc_count);
-
- /* TODO add selector callback for split (pre-a6xx) register file: */
- if (ctx->v->mergedregs) {
- ra_set_select_reg_callback(ctx->g, ra_select_reg_merged, ctx);
+ /* Use the dumb fallback only if try_evict_regs() fails. */
+ return compress_regs_left(ctx, file, reg_size(reg), reg_elem_size(reg), is_source);
+}
- if (ctx->scalar_pass) {
- ctx->name_to_instr = _mesa_hash_table_create(ctx->g,
- _mesa_hash_int, _mesa_key_int_equal);
- }
+static void
+assign_reg(struct ir3_instruction *instr, struct ir3_register *reg, unsigned num)
+{
+ if (reg->flags & IR3_REG_ARRAY) {
+ reg->array.base = num;
+ if (reg->flags & IR3_REG_RELATIV)
+ reg->array.offset += num;
+ else
+ reg->num = num + reg->array.offset;
+ } else {
+ reg->num = num;
}
}
-/* Map the name back to instruction: */
-static struct ir3_instruction *
-name_to_instr(struct ir3_ra_ctx *ctx, unsigned name)
+static void
+mark_src_killed(struct ra_ctx *ctx, struct ir3_register *src)
{
- ra_assert(ctx, !name_is_array(ctx, name));
- struct hash_entry *entry = _mesa_hash_table_search(ctx->name_to_instr, &name);
- if (entry)
- return entry->data;
- ra_unreachable(ctx, "invalid instr name");
- return NULL;
+ struct ra_interval *interval = &ctx->intervals[src->def->name];
+
+ if (!(src->flags & IR3_REG_FIRST_KILL) || interval->is_killed ||
+ interval->interval.parent || !rb_tree_is_empty(&interval->interval.children))
+ return;
+
+ ra_file_mark_killed(ra_get_file(ctx, src), interval);
}
-static bool
-name_is_array(struct ir3_ra_ctx *ctx, unsigned name)
+static void
+insert_dst(struct ra_ctx *ctx, struct ir3_register *dst)
{
- return name >= ctx->class_base[total_class_count];
+ struct ra_file *file = ra_get_file(ctx, dst);
+ struct ra_interval *interval = &ctx->intervals[dst->name];
+
+ d("insert dst %u physreg %u", dst->name, ra_interval_get_physreg(interval));
+
+ if (!(dst->flags & IR3_REG_UNUSED))
+ ra_file_insert(file, interval);
+
+ assign_reg(dst->instr, dst, ra_interval_get_num(interval));
}
-static struct ir3_array *
-name_to_array(struct ir3_ra_ctx *ctx, unsigned name)
+static void
+allocate_dst_fixed(struct ra_ctx *ctx, struct ir3_register *dst, physreg_t physreg)
{
- ra_assert(ctx, name_is_array(ctx, name));
- foreach_array (arr, &ctx->ir->array_list) {
- if (name < (arr->base + arr->length))
- return arr;
- }
- ra_unreachable(ctx, "invalid array name");
- return NULL;
+ struct ra_interval *interval = &ctx->intervals[dst->name];
+ update_affinity(dst, physreg);
+
+ ra_interval_init(interval, dst);
+ interval->physreg_start = physreg;
+ interval->physreg_end = physreg + reg_size(dst);
}
static void
-ra_destroy(struct ir3_ra_ctx *ctx)
+allocate_dst(struct ra_ctx *ctx, struct ir3_register *dst)
{
- ralloc_free(ctx->g);
+ struct ra_file *file = ra_get_file(ctx, dst);
+
+ struct ir3_register *tied = ra_dst_get_tied_src(ctx->compiler, dst);
+ if (tied) {
+ struct ra_interval *tied_interval = &ctx->intervals[tied->def->name];
+ struct ra_interval *dst_interval = &ctx->intervals[dst->name];
+ physreg_t tied_physreg = ra_interval_get_physreg(tied_interval);
+ if (tied_interval->is_killed) {
+ /* The easy case: the source is killed, so we can just reuse it
+ * for the destination.
+ */
+ allocate_dst_fixed(ctx, dst, ra_interval_get_physreg(tied_interval));
+ } else {
+ /* The source is live-through, so we need to get a free register
+ * (which is free for both the source and destination!), copy the
+ * original source to it, then use that for the source and
+ * destination.
+ */
+ physreg_t physreg = get_reg(ctx, file, dst, true);
+ allocate_dst_fixed(ctx, dst, physreg);
+ array_insert(ctx, ctx->parallel_copies, (struct ra_parallel_copy) {
+ .interval = dst_interval,
+ .src = tied_physreg,
+ });
+ }
+
+ return;
+ }
+
+ /* All the hard work is done by get_reg here. */
+ physreg_t physreg = get_reg(ctx, file, dst, false);
+
+ allocate_dst_fixed(ctx, dst, physreg);
}
static void
-__def(struct ir3_ra_ctx *ctx, struct ir3_ra_block_data *bd, unsigned name,
- struct ir3_instruction *instr)
+assign_src(struct ra_ctx *ctx, struct ir3_instruction *instr, struct ir3_register *src)
{
- ra_assert(ctx, name < ctx->alloc_count);
+ struct ra_interval *interval = &ctx->intervals[src->def->name];
+ struct ra_file *file = ra_get_file(ctx, src);
- /* split/collect do not actually define any real value */
- if ((instr->opc == OPC_META_SPLIT) || (instr->opc == OPC_META_COLLECT))
- return;
+ bool array_rmw = ra_reg_is_array_rmw(src);
- /* defined on first write: */
- if (!ctx->def[name])
- ctx->def[name] = instr->ip;
- ctx->use[name] = MAX2(ctx->use[name], instr->ip);
- BITSET_SET(bd->def, name);
+ struct ir3_register *tied = ra_src_get_tied_dst(ctx->compiler, instr, src);
+ physreg_t physreg;
+ if (tied) {
+ struct ra_interval *tied_interval = &ctx->intervals[tied->name];
+ physreg = ra_interval_get_physreg(tied_interval);
+ } else {
+ physreg = ra_interval_get_physreg(interval);
+ }
+
+ assign_reg(instr, src, ra_physreg_to_num(physreg, src->flags));
+
+ if (src->flags & IR3_REG_FIRST_KILL)
+ ra_file_remove(file, interval);
+
+ /* This source is also a destination. */
+ if (array_rmw) {
+ struct ra_interval *dst_interval = &ctx->intervals[src->name];
+ ra_interval_init(dst_interval, src);
+ dst_interval->physreg_start = physreg;
+ dst_interval->physreg_end = physreg + src->size * reg_elem_size(src);
+ ra_file_insert(file, dst_interval);
+ }
}
+/* Insert a parallel copy instruction before the instruction with the parallel
+ * copy entries we've built up.
+ */
static void
-__use(struct ir3_ra_ctx *ctx, struct ir3_ra_block_data *bd, unsigned name,
- struct ir3_instruction *instr)
+insert_parallel_copy_instr(struct ra_ctx *ctx, struct ir3_instruction *instr)
{
- ra_assert(ctx, name < ctx->alloc_count);
- ctx->use[name] = MAX2(ctx->use[name], instr->ip);
- if (!BITSET_TEST(bd->def, name))
- BITSET_SET(bd->use, name);
+ if (ctx->parallel_copies_count == 0)
+ return;
+
+ struct ir3_instruction *pcopy =
+ ir3_instr_create(instr->block, OPC_META_PARALLEL_COPY, 2 * ctx->parallel_copies_count);
+
+ for (unsigned i = 0; i < ctx->parallel_copies_count; i++) {
+ struct ra_parallel_copy *entry = &ctx->parallel_copies[i];
+ struct ir3_register *reg =
+ ir3_reg_create(pcopy, ra_interval_get_num(entry->interval),
+ entry->interval->interval.reg->flags & ~IR3_REG_SSA);
+ reg->size = entry->interval->interval.reg->size;
+ reg->wrmask = entry->interval->interval.reg->wrmask;
+ }
+
+ for (unsigned i = 0; i < ctx->parallel_copies_count; i++) {
+ struct ra_parallel_copy *entry = &ctx->parallel_copies[i];
+ struct ir3_register *reg =
+ ir3_reg_create(pcopy,
+ ra_physreg_to_num(entry->src, entry->interval->interval.reg->flags),
+ entry->interval->interval.reg->flags & ~(IR3_REG_DEST | IR3_REG_SSA));
+ reg->size = entry->interval->interval.reg->size;
+ reg->wrmask = entry->interval->interval.reg->wrmask;
+ }
+
+ list_del(&pcopy->node);
+ list_addtail(&pcopy->node, &instr->node);
+ ctx->parallel_copies_count = 0;
}
static void
-ra_block_compute_live_ranges(struct ir3_ra_ctx *ctx, struct ir3_block *block)
+handle_normal_instr(struct ra_ctx *ctx, struct ir3_instruction *instr)
{
- struct ir3_ra_block_data *bd;
- unsigned bitset_words = BITSET_WORDS(ctx->alloc_count);
+ /* First, mark sources as going-to-be-killed while allocating the dest. */
+ ra_foreach_src(src, instr) {
+ mark_src_killed(ctx, src);
+ }
-#define def(name, instr) __def(ctx, bd, name, instr)
-#define use(name, instr) __use(ctx, bd, name, instr)
+ /* Allocate the destination. */
+ ra_foreach_dst(dst, instr) {
+ if (ra_reg_is_array_rmw(dst))
+ continue;
+ allocate_dst(ctx, dst);
+ }
- bd = rzalloc(ctx->g, struct ir3_ra_block_data);
+ /* Now handle sources. Go backward so that in case there are multiple
+ * sources with the same def and that def is killed we only remove it at
+ * the end.
+ */
+ ra_foreach_src_rev(src, instr) {
+ assign_src(ctx, instr, src);
+ }
- bd->def = rzalloc_array(bd, BITSET_WORD, bitset_words);
- bd->use = rzalloc_array(bd, BITSET_WORD, bitset_words);
- bd->livein = rzalloc_array(bd, BITSET_WORD, bitset_words);
- bd->liveout = rzalloc_array(bd, BITSET_WORD, bitset_words);
+ /* Now finally insert the destination into the map. */
+ ra_foreach_dst(dst, instr) {
+ if (ra_reg_is_array_rmw(dst))
+ continue;
+ insert_dst(ctx, dst);
+ }
- block->data = bd;
+ insert_parallel_copy_instr(ctx, instr);
+}
- struct ir3_instruction *first_non_input = NULL;
- foreach_instr (instr, &block->instr_list) {
- if (instr->opc != OPC_META_INPUT) {
- first_non_input = instr;
- break;
- }
+static void
+handle_split(struct ra_ctx *ctx, struct ir3_instruction *instr)
+{
+ struct ir3_register *dst = instr->regs[0];
+ struct ir3_register *src = instr->regs[1];
+
+ if (dst->merge_set == NULL || src->def->merge_set != dst->merge_set) {
+ handle_normal_instr(ctx, instr);
+ return;
}
- foreach_instr (instr, &block->instr_list) {
- foreach_def (name, ctx, instr) {
- if (name_is_array(ctx, name)) {
- struct ir3_array *arr = name_to_array(ctx, name);
-
- arr->start_ip = MIN2(arr->start_ip, instr->ip);
- arr->end_ip = MAX2(arr->end_ip, instr->ip);
-
- for (unsigned i = 0; i < arr->length; i++) {
- unsigned name = arr->base + i;
- if(arr->half)
- ra_set_node_class(ctx->g, name, ctx->set->half_classes[0]);
- else
- ra_set_node_class(ctx->g, name, ctx->set->classes[0]);
- }
- } else {
- struct ir3_ra_instr_data *id = &ctx->instrd[instr->ip];
- if (is_shared(instr)) {
- ra_set_node_class(ctx->g, name,
- ctx->set->shared_classes[id->cls - SHARED_OFFSET]);
- } else if (is_half(instr)) {
- ra_set_node_class(ctx->g, name,
- ctx->set->half_classes[id->cls - HALF_OFFSET]);
- } else {
- ra_set_node_class(ctx->g, name,
- ctx->set->classes[id->cls]);
- }
- }
+ struct ra_interval *src_interval = &ctx->intervals[src->def->name];
- def(name, instr);
+ physreg_t physreg = ra_interval_get_physreg(src_interval);
+ assign_src(ctx, instr, src);
- if ((instr->opc == OPC_META_INPUT) && first_non_input)
- use(name, first_non_input);
+ allocate_dst_fixed(ctx, dst, physreg - src->def->merge_set_offset + dst->merge_set_offset);
+ insert_dst(ctx, dst);
+}
- /* Texture instructions with writemasks can be treated as smaller
- * vectors (or just scalars!) to allocate knowing that the
- * masked-out regs won't be written, but we need to make sure that
- * the start of the vector doesn't come before the first register
- * or we'll wrap.
- */
- if (is_tex_or_prefetch(instr)) {
- int writemask_skipped_regs = ffs(instr->regs[0]->wrmask) - 1;
- int r0_xyz = is_half(instr) ?
- ctx->hr0_xyz_nodes : ctx->r0_xyz_nodes;
- for (int i = 0; i < writemask_skipped_regs; i++)
- ra_add_node_interference(ctx->g, name, r0_xyz + i);
- }
+static void
+handle_collect(struct ra_ctx *ctx, struct ir3_instruction *instr)
+{
+ struct ir3_merge_set *dst_set = instr->regs[0]->merge_set;
+ unsigned dst_offset = instr->regs[0]->merge_set_offset;
+
+ if (!dst_set || dst_set->regs_count == 1) {
+ handle_normal_instr(ctx, instr);
+ return;
+ }
- /* Pre-fetched textures have a lower limit for bits to encode dst
- * register, so add additional interference with registers above
- * that limit.
- */
- if (instr->opc == OPC_META_TEX_PREFETCH) {
- ra_add_node_interference(ctx->g, name,
- ctx->prefetch_exclude_node);
- }
- }
+ /* We need to check if any of the sources are contained in an interval
+ * that is at least as large as the vector. In this case, we should put
+ * the vector inside that larger interval. (There should be one
+ * unambiguous place to put it, because values sharing the same merge set
+ * should be allocated together.) This can happen in a case like:
+ *
+ * ssa_1 (wrmask=0xf) = ...
+ * ssa_2 = split ssa_1 off:0
+ * ssa_3 = split ssa_1 off:1
+ * ssa_4 (wrmask=0x3) = collect (kill)ssa_2, (kill)ssa_3
+ * ... = (kill)ssa_1
+ * ... = (kill)ssa_4
+ *
+ * ssa_4 will be coalesced with ssa_1 and needs to be allocated inside it.
+ */
+ physreg_t dst_fixed = (physreg_t) ~0u;
- foreach_use (name, ctx, instr) {
- if (name_is_array(ctx, name)) {
- struct ir3_array *arr = name_to_array(ctx, name);
+ for (unsigned i = 1; i < instr->regs_count; i++) {
+ if (!ra_reg_is_src(instr->regs[i]))
+ continue;
- arr->start_ip = MIN2(arr->start_ip, instr->ip);
- arr->end_ip = MAX2(arr->end_ip, instr->ip);
+ if (instr->regs[i]->flags & IR3_REG_FIRST_KILL) {
+ mark_src_killed(ctx, instr->regs[i]);
+ }
- /* NOTE: arrays are not SSA so unconditionally
- * set use bit:
- */
- BITSET_SET(bd->use, name);
- }
+ struct ir3_register *src = instr->regs[i];
+ struct ra_interval *interval = &ctx->intervals[src->def->name];
- use(name, instr);
+ if (src->def->merge_set != dst_set || interval->is_killed)
+ continue;
+ while (interval->interval.parent != NULL) {
+ interval = ir3_reg_interval_to_ra_interval(interval->interval.parent);
}
-
- foreach_name (name, ctx, instr) {
- /* split/collect instructions have duplicate names
- * as real instructions, so they skip the hashtable:
+ if (reg_size(interval->interval.reg) >= reg_size(instr->regs[0])) {
+ dst_fixed = interval->physreg_start - interval->interval.reg->merge_set_offset + dst_offset;
+ } else {
+ /* For sources whose root interval is smaller than the
+ * destination (i.e. the normal case), we will shuffle them
+ * around after allocating the destination. Mark them killed so
+ * that the destination can be allocated over them, even if they
+ * aren't actually killed.
*/
- if (ctx->name_to_instr && !((instr->opc == OPC_META_SPLIT) ||
- (instr->opc == OPC_META_COLLECT))) {
- /* this is slightly annoying, we can't just use an
- * integer on the stack
- */
- unsigned *key = ralloc(ctx->name_to_instr, unsigned);
- *key = name;
- ra_assert(ctx, !_mesa_hash_table_search(ctx->name_to_instr, key));
- _mesa_hash_table_insert(ctx->name_to_instr, key, instr);
- }
+ ra_file_mark_killed(ra_get_file(ctx, src), interval);
}
}
-}
-
-static bool
-ra_compute_livein_liveout(struct ir3_ra_ctx *ctx)
-{
- unsigned bitset_words = BITSET_WORDS(ctx->alloc_count);
- bool progress = false;
- foreach_block (block, &ctx->ir->block_list) {
- struct ir3_ra_block_data *bd = block->data;
+ if (dst_fixed != (physreg_t) ~0u)
+ allocate_dst_fixed(ctx, instr->regs[0], dst_fixed);
+ else
+ allocate_dst(ctx, instr->regs[0]);
- /* update livein: */
- for (unsigned i = 0; i < bitset_words; i++) {
- /* anything used but not def'd within a block is
- * by definition a live value coming into the block:
- */
- BITSET_WORD new_livein =
- (bd->use[i] | (bd->liveout[i] & ~bd->def[i]));
+ /* Remove the temporary is_killed we added */
+ for (unsigned i = 1; i < instr->regs_count; i++) {
+ if (!ra_reg_is_src(instr->regs[i]))
+ continue;
- if (new_livein & ~bd->livein[i]) {
- bd->livein[i] |= new_livein;
- progress = true;
- }
+ struct ir3_register *src = instr->regs[i];
+ struct ra_interval *interval = &ctx->intervals[src->def->name];
+ while (interval->interval.parent != NULL) {
+ interval = ir3_reg_interval_to_ra_interval(interval->interval.parent);
}
- /* update liveout: */
- for (unsigned j = 0; j < ARRAY_SIZE(block->successors); j++) {
- struct ir3_block *succ = block->successors[j];
- struct ir3_ra_block_data *succ_bd;
+ /* Filter out cases where it actually should be killed */
+ if (interval != &ctx->intervals[src->def->name] ||
+ !(src->flags & IR3_REG_KILL))
+ interval->is_killed = false;
+ }
- if (!succ)
- continue;
- succ_bd = succ->data;
+ ra_foreach_src_rev(src, instr) {
+ assign_src(ctx, instr, src);
+ }
- for (unsigned i = 0; i < bitset_words; i++) {
- /* add anything that is livein in a successor block
- * to our liveout:
- */
- BITSET_WORD new_liveout =
- (succ_bd->livein[i] & ~bd->liveout[i]);
+ /* Note: insert_dst will automatically shuffle around any intervals that
+ * are a child of the collect by making them children of the collect.
+ */
- if (new_liveout) {
- bd->liveout[i] |= new_liveout;
- progress = true;
- }
- }
- }
- }
+ insert_dst(ctx, instr->regs[0]);
- return progress;
+ insert_parallel_copy_instr(ctx, instr);
}
+/* Parallel copies before RA should only be at the end of the block, for
+ * phi's. For these we only need to fill in the sources, and then we fill in
+ * the destinations in the successor block.
+ */
static void
-print_bitset(const char *name, BITSET_WORD *bs, unsigned cnt)
-{
- bool first = true;
- debug_printf("RA: %s:", name);
- for (unsigned i = 0; i < cnt; i++) {
- if (BITSET_TEST(bs, i)) {
- if (!first)
- debug_printf(",");
- debug_printf(" %04u", i);
- first = false;
- }
- }
- debug_printf("\n");
-}
-
-/* size of one component of instruction result, ie. half vs full: */
-static unsigned
-live_size(struct ir3_instruction *instr)
+handle_pcopy(struct ra_ctx *ctx, struct ir3_instruction *instr)
{
- if (is_half(instr)) {
- return 1;
- } else if (is_shared(instr)) {
- /* doesn't count towards footprint */
- return 0;
- } else {
- return 2;
+ ra_foreach_src_rev(src, instr) {
+ assign_src(ctx, instr, src);
}
}
-static unsigned
-name_size(struct ir3_ra_ctx *ctx, unsigned name)
+/* Some inputs may need to be precolored. We need to handle those first, so
+ * that other non-precolored inputs don't accidentally get allocated over
+ * them. Inputs are the very first thing in the shader, so it shouldn't be a
+ * problem to allocate them to a specific physreg.
+ */
+
+static void
+handle_precolored_input(struct ra_ctx *ctx, struct ir3_instruction *instr)
{
- if (name_is_array(ctx, name)) {
- struct ir3_array *arr = name_to_array(ctx, name);
- return arr->half ? 1 : 2;
- } else {
- struct ir3_instruction *instr = name_to_instr(ctx, name);
- /* in scalar pass, each name represents on scalar value,
- * half or full precision
- */
- return live_size(instr);
- }
+ if (instr->regs[0]->num == INVALID_REG)
+ return;
+
+ struct ra_interval *interval = &ctx->intervals[instr->regs[0]->name];
+ physreg_t physreg = ra_reg_get_physreg(instr->regs[0]);
+ allocate_dst_fixed(ctx, instr->regs[0], physreg);
+ insert_dst(ctx, instr->regs[0]);
+ interval->frozen = true;
}
-static unsigned
-ra_calc_block_live_values(struct ir3_ra_ctx *ctx, struct ir3_block *block)
+static void
+handle_input(struct ra_ctx *ctx, struct ir3_instruction *instr)
{
- struct ir3_ra_block_data *bd = block->data;
- unsigned name;
+ if (instr->regs[0]->num != INVALID_REG)
+ return;
- ra_assert(ctx, ctx->name_to_instr);
+ allocate_dst(ctx, instr->regs[0]);
- /* TODO this gets a bit more complicated in non-scalar pass.. but
- * possibly a lowball estimate is fine to start with if we do
- * round-robin in non-scalar pass? Maybe we just want to handle
- * that in a different fxn?
- */
- ra_assert(ctx, ctx->scalar_pass);
+ struct ra_file *file = ra_get_file(ctx, instr->regs[0]);
+ struct ra_interval *interval = &ctx->intervals[instr->regs[0]->name];
+ ra_file_insert(file, interval);
+}
- BITSET_WORD *live =
- rzalloc_array(bd, BITSET_WORD, BITSET_WORDS(ctx->alloc_count));
+static void
+assign_input(struct ra_ctx *ctx, struct ir3_instruction *instr)
+{
+ struct ra_interval *interval = &ctx->intervals[instr->regs[0]->name];
+ struct ra_file *file = ra_get_file(ctx, instr->regs[0]);
- /* Add the live input values: */
- unsigned livein = 0;
- BITSET_FOREACH_SET (name, bd->livein, ctx->alloc_count) {
- livein += name_size(ctx, name);
- BITSET_SET(live, name);
+ if (instr->regs[0]->num == INVALID_REG) {
+ assign_reg(instr, instr->regs[0], ra_interval_get_num(interval));
+ } else {
+ interval->frozen = false;
}
- d("---------------------");
- d("block%u: LIVEIN: %u", block_id(block), livein);
-
- unsigned max = livein;
- int cur_live = max;
-
- /* Now that we know the live inputs to the block, iterate the
- * instructions adjusting the current # of live values as we
- * see their last use:
- */
- foreach_instr (instr, &block->instr_list) {
- if (RA_DEBUG)
- print_bitset("LIVE", live, ctx->alloc_count);
- di(instr, "CALC");
-
- unsigned new_live = 0; /* newly live values */
- unsigned new_dead = 0; /* newly no-longer live values */
- unsigned next_dead = 0; /* newly dead following this instr */
-
- foreach_def (name, ctx, instr) {
- /* NOTE: checking ctx->def filters out things like split/
- * collect which are just redefining existing live names
- * or array writes to already live array elements:
- */
- if (ctx->def[name] != instr->ip)
- continue;
- new_live += live_size(instr);
- d("NEW_LIVE: %u (new_live=%u, use=%u)", name, new_live, ctx->use[name]);
- BITSET_SET(live, name);
- /* There can be cases where this is *also* the last use
- * of a value, for example instructions that write multiple
- * values, only some of which are used. These values are
- * dead *after* (rather than during) this instruction.
- */
- if (ctx->use[name] != instr->ip)
- continue;
- next_dead += live_size(instr);
- d("NEXT_DEAD: %u (next_dead=%u)", name, next_dead);
- BITSET_CLEAR(live, name);
- }
-
- /* To be more resilient against special cases where liverange
- * is extended (like first_non_input), rather than using the
- * foreach_use() iterator, we iterate the current live values
- * instead:
- */
- BITSET_FOREACH_SET (name, live, ctx->alloc_count) {
- /* Is this the last use? */
- if (ctx->use[name] != instr->ip)
- continue;
- new_dead += name_size(ctx, name);
- d("NEW_DEAD: %u (new_dead=%u)", name, new_dead);
- BITSET_CLEAR(live, name);
- }
+ if (instr->regs[0]->flags & IR3_REG_UNUSED)
+ ra_file_remove(file, interval);
- cur_live += new_live;
- cur_live -= new_dead;
+ ra_foreach_src_rev(src, instr)
+ assign_src(ctx, instr, src);
+}
- ra_assert(ctx, cur_live >= 0);
- d("CUR_LIVE: %u", cur_live);
+/* chmask is a bit weird, because it has pre-colored sources due to the need
+ * to pass some registers to the next stage. Fortunately there are only at
+ * most two, and there should be no other live values by the time we get to
+ * this instruction, so we only have to do the minimum and don't need any
+ * fancy fallbacks.
+ *
+ * TODO: Add more complete handling of precolored sources, e.g. for function
+ * argument handling. We'd need a way to mark sources as fixed so that they
+ * don't get moved around when placing other sources in the fallback case, and
+ * a duplication of much of the logic in get_reg(). This also opens another
+ * can of worms, e.g. what if the precolored source is a split of a vector
+ * which is still live -- this breaks our assumption that splits don't incur
+ * any "extra" register requirements and we'd have to break it out of the
+ * parent ra_interval.
+ */
- max = MAX2(max, cur_live);
+static void
+handle_precolored_source(struct ra_ctx *ctx, struct ir3_register *src)
+{
+ struct ra_file *file = ra_get_file(ctx, src);
+ struct ra_interval *interval = &ctx->intervals[src->def->name];
+ physreg_t physreg = ra_reg_get_physreg(src);
- /* account for written values which are not used later,
- * but after updating max (since they are for one cycle
- * live)
- */
- cur_live -= next_dead;
- ra_assert(ctx, cur_live >= 0);
+ if (ra_interval_get_num(interval) == src->num)
+ return;
- if (RA_DEBUG) {
- unsigned cnt = 0;
- BITSET_FOREACH_SET (name, live, ctx->alloc_count) {
- cnt += name_size(ctx, name);
- }
- ra_assert(ctx, cur_live == cnt);
+ /* Try evicting stuff in our way if it isn't free. This won't move
+ * anything unless it overlaps with our precolored physreg, so we don't
+ * have to worry about evicting other precolored sources.
+ */
+ if (!get_reg_specified(file, src, physreg, true)) {
+ unsigned eviction_count;
+ if (!try_evict_regs(ctx, file, src, physreg, &eviction_count, true, false)) {
+ unreachable("failed to evict for precolored source!");
+ return;
}
}
- d("block%u max=%u", block_id(block), max);
+ ra_move_interval(ctx, file, interval, physreg);
+}
- /* the remaining live should match liveout (for extra sanity testing): */
- if (RA_DEBUG) {
- unsigned new_dead = 0;
- BITSET_FOREACH_SET (name, live, ctx->alloc_count) {
- /* Is this the last use? */
- if (ctx->use[name] != block->end_ip)
- continue;
- new_dead += name_size(ctx, name);
- d("NEW_DEAD: %u (new_dead=%u)", name, new_dead);
- BITSET_CLEAR(live, name);
- }
- unsigned liveout = 0;
- BITSET_FOREACH_SET (name, bd->liveout, ctx->alloc_count) {
- liveout += name_size(ctx, name);
- BITSET_CLEAR(live, name);
- }
+static void
+handle_chmask(struct ra_ctx *ctx, struct ir3_instruction *instr)
+{
+ /* Note: we purposely don't mark sources as killed, so that we can reuse
+	 * some of the get_reg() machinery as if the source were a destination.
+	 * Marking it as killed would mean that e.g. get_reg_specified() wouldn't
+	 * work correctly.
+ */
+ ra_foreach_src(src, instr) {
+ assert(src->num != INVALID_REG);
+ handle_precolored_source(ctx, src);
+ }
- if (cur_live != liveout) {
- print_bitset("LEAKED", live, ctx->alloc_count);
- /* TODO there are a few edge cases where live-range extension
- * tells us a value is livein. But not used by the block or
- * liveout for the block. Possibly a bug in the liverange
- * extension. But for now leave the assert disabled:
- ra_assert(ctx, cur_live == liveout);
- */
- }
+ ra_foreach_src(src, instr) {
+ struct ra_file *file = ra_get_file(ctx, src);
+ struct ra_interval *interval = &ctx->intervals[src->def->name];
+ if (src->flags & IR3_REG_FIRST_KILL)
+ ra_file_remove(file, interval);
}
- ralloc_free(live);
+ /* add dummy destination for validation */
+ assign_reg(instr, instr->regs[0], 0);
- return max;
+ insert_parallel_copy_instr(ctx, instr);
}
-static unsigned
-ra_calc_max_live_values(struct ir3_ra_ctx *ctx)
+static physreg_t
+read_register(struct ra_ctx *ctx, struct ir3_block *block, struct ir3_register *def)
{
- unsigned max = 0;
-
- foreach_block (block, &ctx->ir->block_list) {
- unsigned block_live = ra_calc_block_live_values(ctx, block);
- max = MAX2(max, block_live);
+ struct ra_block_state *state = &ctx->blocks[block->index];
+ if (state->renames) {
+ struct hash_entry *entry = _mesa_hash_table_search(state->renames, def);
+ if (entry) {
+ return (physreg_t)(uintptr_t)entry->data;
+ }
}
- return max;
+ return ra_reg_get_physreg(def);
}
static void
-ra_add_interference(struct ir3_ra_ctx *ctx)
+handle_live_in(struct ra_ctx *ctx, struct ir3_register *def)
{
- struct ir3 *ir = ctx->ir;
+ physreg_t physreg = ~0;
+ for (unsigned i = 0; i < ctx->block->predecessors_count; i++) {
+ struct ir3_block *pred = ctx->block->predecessors[i];
+ struct ra_block_state *pred_state = &ctx->blocks[pred->index];
- /* initialize array live ranges: */
- foreach_array (arr, &ir->array_list) {
- arr->start_ip = ~0;
- arr->end_ip = 0;
- }
+ if (!pred_state->visited)
+ continue;
- /* set up the r0.xyz precolor regs. */
- for (int i = 0; i < 3; i++) {
- ra_set_node_reg(ctx->g, ctx->r0_xyz_nodes + i, i);
- ra_set_node_reg(ctx->g, ctx->hr0_xyz_nodes + i,
- ctx->set->first_half_reg + i);
+ physreg = read_register(ctx, pred, def);
+ break;
}
- /* pre-color node that conflict with half/full regs higher than what
- * can be encoded for tex-prefetch:
- */
- ra_set_node_reg(ctx->g, ctx->prefetch_exclude_node,
- ctx->set->prefetch_exclude_reg);
+ assert(physreg != (physreg_t)~0);
+
+ struct ra_interval *interval = &ctx->intervals[def->name];
+ struct ra_file *file = ra_get_file(ctx, def);
+ ra_interval_init(interval, def);
+ interval->physreg_start = physreg;
+ interval->physreg_end = physreg + reg_size(def);
+ ra_file_insert(file, interval);
+}
- /* compute live ranges (use/def) on a block level, also updating
- * block's def/use bitmasks (used below to calculate per-block
- * livein/liveout):
+static void
+handle_live_out(struct ra_ctx *ctx, struct ir3_register *def)
+{
+ /* Skip parallelcopy's which in the original program are only used as phi
+ * arguments. Even though phi arguments are live out, they are only
+ * assigned when the phi is.
*/
- foreach_block (block, &ir->block_list) {
- ra_block_compute_live_ranges(ctx, block);
+ if (def->instr->opc == OPC_META_PARALLEL_COPY)
+ return;
+
+ struct ra_block_state *state = &ctx->blocks[ctx->block->index];
+ struct ra_interval *interval = &ctx->intervals[def->name];
+ physreg_t physreg = ra_interval_get_physreg(interval);
+ if (physreg != ra_reg_get_physreg(def)) {
+ if (!state->renames)
+ state->renames = _mesa_pointer_hash_table_create(ctx);
+ _mesa_hash_table_insert(state->renames, def, (void *)(uintptr_t)physreg);
}
+}
- /* update per-block livein/liveout: */
- while (ra_compute_livein_liveout(ctx)) {}
+static void
+handle_phi(struct ra_ctx *ctx, struct ir3_register *def)
+{
+ struct ra_file *file = ra_get_file(ctx, def);
+ struct ra_interval *interval = &ctx->intervals[def->name];
- if (RA_DEBUG) {
- d("AFTER LIVEIN/OUT:");
- foreach_block (block, &ir->block_list) {
- struct ir3_ra_block_data *bd = block->data;
- d("block%u:", block_id(block));
- print_bitset(" def", bd->def, ctx->alloc_count);
- print_bitset(" use", bd->use, ctx->alloc_count);
- print_bitset(" l/i", bd->livein, ctx->alloc_count);
- print_bitset(" l/o", bd->liveout, ctx->alloc_count);
- }
- foreach_array (arr, &ir->array_list) {
- d("array%u:", arr->id);
- d(" length: %u", arr->length);
- d(" start_ip: %u", arr->start_ip);
- d(" end_ip: %u", arr->end_ip);
- }
+ /* phis are always scalar, so they should already be the smallest possible
+ * size. However they may be coalesced with other live-in values/phi
+ * nodes, so check for that here.
+ */
+ struct ir3_reg_interval *parent_ir3 =
+ ir3_reg_interval_search(&file->reg_ctx.intervals, def->interval_start);
+ physreg_t physreg;
+ if (parent_ir3) {
+ struct ra_interval *parent = ir3_reg_interval_to_ra_interval(parent_ir3);
+ physreg = ra_interval_get_physreg(parent) +
+ (def->interval_start - parent_ir3->reg->interval_start);
+ } else {
+ physreg = get_reg(ctx, file, def, false);
}
- /* extend start/end ranges based on livein/liveout info from cfg: */
- foreach_block (block, &ir->block_list) {
- struct ir3_ra_block_data *bd = block->data;
-
- for (unsigned i = 0; i < ctx->alloc_count; i++) {
- if (BITSET_TEST(bd->livein, i)) {
- ctx->def[i] = MIN2(ctx->def[i], block->start_ip);
- ctx->use[i] = MAX2(ctx->use[i], block->start_ip);
- }
+ allocate_dst_fixed(ctx, def, physreg);
- if (BITSET_TEST(bd->liveout, i)) {
- ctx->def[i] = MIN2(ctx->def[i], block->end_ip);
- ctx->use[i] = MAX2(ctx->use[i], block->end_ip);
- }
- }
+ ra_file_insert(file, interval);
+}
- foreach_array (arr, &ctx->ir->array_list) {
- for (unsigned i = 0; i < arr->length; i++) {
- if (BITSET_TEST(bd->livein, i + arr->base)) {
- arr->start_ip = MIN2(arr->start_ip, block->start_ip);
- }
- if (BITSET_TEST(bd->liveout, i + arr->base)) {
- arr->end_ip = MAX2(arr->end_ip, block->end_ip);
- }
- }
+static void
+assign_phi(struct ra_ctx *ctx, struct ir3_instruction *phi)
+{
+ struct ra_file *file = ra_get_file(ctx, phi->regs[0]);
+ struct ra_interval *interval = &ctx->intervals[phi->regs[0]->name];
+ assert(!interval->interval.parent);
+ unsigned num = ra_interval_get_num(interval);
+ assign_reg(phi, phi->regs[0], num);
+
+ /* Assign the parallelcopy sources of this phi */
+ for (unsigned i = 1; i < phi->regs_count; i++) {
+ if (phi->regs[i]->def) {
+ assign_reg(phi, phi->regs[i], num);
+ assign_reg(phi, phi->regs[i]->def, num);
}
}
- if (ctx->name_to_instr) {
- unsigned max = ra_calc_max_live_values(ctx);
- ra_set_register_target(ctx, max);
- }
-
- for (unsigned i = 0; i < ctx->alloc_count; i++) {
- for (unsigned j = 0; j < ctx->alloc_count; j++) {
- if (intersects(ctx->def[i], ctx->use[i],
- ctx->def[j], ctx->use[j])) {
- ra_add_node_interference(ctx->g, i, j);
- }
- }
- }
+ if (phi->regs[0]->flags & IR3_REG_UNUSED)
+ ra_file_remove(file, interval);
}
-/* NOTE: instr could be NULL for IR3_REG_ARRAY case, for the first
- * array access(es) which do not have any previous access to depend
- * on from scheduling point of view
+/* When we split a live range, we sometimes need to emit fixup code at the end
+ * of a block. For example, something like:
+ *
+ * a = ...
+ * if (...) {
+ * ...
+ * a' = a
+ * b = ... // a evicted to make room for b
+ * ...
+ * }
+ * ... = a
+ *
+ * When we insert the copy to a' in insert_parallel_copy_instr(), this forces
+ * us to insert another copy "a = a'" at the end of the if. Normally this would
+ * also entail adding a phi node, but since we're about to go out of SSA
+ * anyway we just insert an extra move. Note, however, that "b" might be used
+ * in a phi node at the end of the if and share registers with "a", so we
+ * have to be careful to extend any preexisting parallelcopy instruction
+ * instead of creating our own in order to guarantee that they properly get
+ * swapped.
*/
+
static void
-reg_assign(struct ir3_ra_ctx *ctx, struct ir3_register *reg,
- struct ir3_instruction *instr)
+insert_liveout_copy(struct ir3_block *block, physreg_t dst, physreg_t src,
+ struct ir3_register *reg)
{
- struct ir3_ra_instr_data *id;
-
- if (reg->flags & IR3_REG_ARRAY) {
- struct ir3_array *arr =
- ir3_lookup_array(ctx->ir, reg->array.id);
- unsigned name = arr->base + reg->array.offset;
- unsigned r = ra_get_node_reg(ctx->g, name);
- unsigned num = ctx->set->ra_reg_to_gpr[r];
-
- if (reg->flags & IR3_REG_RELATIV) {
- reg->array.base = arr->reg;
- reg->array.offset = num;
- } else {
- reg->num = num;
- reg->flags &= ~IR3_REG_SSA;
- }
+ struct ir3_instruction *old_pcopy = NULL;
+ if (!list_is_empty(&block->instr_list)) {
+ struct ir3_instruction *last =
+ LIST_ENTRY(struct ir3_instruction, block->instr_list.prev, node);
+ if (last->opc == OPC_META_PARALLEL_COPY)
+ old_pcopy = last;
+ }
- reg->flags &= ~IR3_REG_ARRAY;
- } else if ((id = &ctx->instrd[instr->ip]) && id->defn) {
- unsigned first_component = 0;
+ unsigned old_pcopy_regs = old_pcopy ? old_pcopy->regs_count : 0;
+ struct ir3_instruction *pcopy =
+ ir3_instr_create(block, OPC_META_PARALLEL_COPY,
+ 2 + old_pcopy_regs);
- /* Special case for tex instructions, which may use the wrmask
- * to mask off the first component(s). In the scalar pass,
- * this means the masked off component(s) are not def'd/use'd,
- * so we get a bogus value when we ask the register_allocate
- * algo to get the assigned reg for the unused/untouched
- * component. So we need to consider the first used component:
- */
- if (ctx->scalar_pass && is_tex_or_prefetch(id->defn)) {
- unsigned n = ffs(id->defn->regs[0]->wrmask);
- ra_assert(ctx, n > 0);
- first_component = n - 1;
- }
+ for (unsigned i = 0; i < old_pcopy_regs / 2; i++) {
+ pcopy->regs[pcopy->regs_count++] = old_pcopy->regs[i];
+ }
- unsigned name = scalar_name(ctx, id->defn, first_component);
- unsigned r = ra_get_node_reg(ctx->g, name);
- unsigned num = ctx->set->ra_reg_to_gpr[r] + id->off;
+ struct ir3_register *dst_reg =
+ ir3_reg_create(pcopy, ra_physreg_to_num(dst, reg->flags), reg->flags);
+ dst_reg->wrmask = reg->wrmask;
+ dst_reg->size = reg->size;
- ra_assert(ctx, !(reg->flags & IR3_REG_RELATIV));
+ for (unsigned i = old_pcopy_regs / 2; i < old_pcopy_regs; i++) {
+ pcopy->regs[pcopy->regs_count++] = old_pcopy->regs[i];
+ }
- ra_assert(ctx, num >= first_component);
+ struct ir3_register *src_reg =
+ ir3_reg_create(pcopy, ra_physreg_to_num(src, reg->flags),
+ reg->flags & ~IR3_REG_DEST);
+ src_reg->wrmask = reg->wrmask;
+ src_reg->size = reg->size;
- if (is_shared(id->defn))
- num += FIRST_SHARED_REG;
+ if (old_pcopy)
+ list_del(&old_pcopy->node);
+}
- reg->num = num - first_component;
+static void
+insert_live_in_move(struct ra_ctx *ctx, struct ra_interval *interval)
+{
+ physreg_t physreg = ra_interval_get_physreg(interval);
+
+ for (unsigned i = 0; i < ctx->block->predecessors_count; i++) {
+ struct ir3_block *pred = ctx->block->predecessors[i];
+ struct ra_block_state *pred_state = &ctx->blocks[pred->index];
- reg->flags &= ~IR3_REG_SSA;
+ if (!pred_state->visited)
+ continue;
- if (is_half(id->defn))
- reg->flags |= IR3_REG_HALF;
+ physreg_t pred_reg = read_register(ctx, pred, interval->interval.reg);
+ if (pred_reg != physreg) {
+ insert_liveout_copy(pred, physreg, pred_reg, interval->interval.reg);
+ }
}
}
-/* helper to determine which regs to assign in which pass: */
-static bool
-should_assign(struct ir3_ra_ctx *ctx, struct ir3_instruction *instr)
+static void
+insert_file_live_in_moves(struct ra_ctx *ctx, struct ra_file *file)
{
- /* Array regs are precolored completely separately, and we need to keep
- * their array-ness until the end to be able to compute the array reg's
- * live interval in the scalar pass.
- */
- if (instr->regs[0]->flags & IR3_REG_ARRAY)
- return ctx->scalar_pass;
-
- if ((instr->opc == OPC_META_SPLIT) &&
- (util_bitcount(instr->regs[1]->wrmask) > 1))
- return !ctx->scalar_pass;
- if ((instr->opc == OPC_META_COLLECT) &&
- (util_bitcount(instr->regs[0]->wrmask) > 1))
- return !ctx->scalar_pass;
- return ctx->scalar_pass;
+ BITSET_WORD *live_in = ctx->live->live_in[ctx->block->index];
+ rb_tree_foreach(struct ra_interval, interval, &file->physreg_intervals, physreg_node) {
+ /* Skip phi nodes. This needs to happen after phi nodes are allocated,
+ * because we may have to move live-ins around to make space for phi
+ * nodes, but we shouldn't be handling phi nodes here.
+ */
+ if (BITSET_TEST(live_in, interval->interval.reg->name))
+ insert_live_in_move(ctx, interval);
+ }
}
static void
-ra_block_alloc(struct ir3_ra_ctx *ctx, struct ir3_block *block)
+insert_entry_regs(struct ra_block_state *state, struct ra_file *file)
{
- foreach_instr (instr, &block->instr_list) {
-
- if (writes_gpr(instr)) {
- if (should_assign(ctx, instr)) {
- reg_assign(ctx, instr->regs[0], instr);
- }
- }
-
- foreach_src_n (reg, n, instr) {
- struct ir3_instruction *src = reg->def ? reg->def->instr : NULL;
-
- if (src && should_assign(ctx, instr))
- reg_assign(ctx, src->regs[0], src);
-
- /* Note: reg->def could be null for IR3_REG_ARRAY */
- if (((reg->flags & IR3_REG_ARRAY) && ctx->scalar_pass) ||
- (src && should_assign(ctx, src))) {
- reg_assign(ctx, instr->regs[n+1], src);
- }
- }
- }
-
- /* We need to pre-color outputs for the scalar pass in
- * ra_precolor_assigned(), so we need to actually assign
- * them in the first pass:
- */
- if (!ctx->scalar_pass) {
- foreach_input (in, ctx->ir) {
- reg_assign(ctx, in->regs[0], in);
- }
+ rb_tree_foreach(struct ra_interval, interval, &file->physreg_intervals, physreg_node) {
+ _mesa_hash_table_insert(state->entry_regs, interval->interval.reg,
+ (void *)(uintptr_t)interval->physreg_start);
}
}
static void
-assign_arr_base(struct ir3_ra_ctx *ctx, struct ir3_array *arr,
-		struct ir3_instruction **precolor, unsigned nprecolor)
+insert_live_in_moves(struct ra_ctx *ctx)
{
-	/* In the mergedregs case, we convert full precision arrays
-	 * to their effective half-precision base, and find conflicts
-	 * amongst all other arrays/inputs.
-	 *
-	 * In the splitregs case (halfreg file and fullreg file do
-	 * not conflict), we ignore arrays and other pre-colors that
-	 * are not the same precision.
-	 */
-	bool mergedregs = ctx->v->mergedregs;
-	unsigned base = 0;
+   /* Emit live-in fixup moves for each of the three register files. */
+   insert_file_live_in_moves(ctx, &ctx->full);
+   insert_file_live_in_moves(ctx, &ctx->half);
+   insert_file_live_in_moves(ctx, &ctx->shared);
-	/* figure out what else we conflict with which has already
-	 * been assigned:
+   /* If not all predecessors are visited, insert live-in regs so that
+    * insert_live_out_moves() will work.
    */
-retry:
-	foreach_array (arr2, &ctx->ir->array_list) {
-		if (arr2 == arr)
+   bool all_preds_visited = true;
+   for (unsigned i = 0; i < ctx->block->predecessors_count; i++) {
+      if (!ctx->blocks[ctx->block->predecessors[i]->index].visited) {
+         all_preds_visited = false;
         break;
-		ra_assert(ctx, arr2->start_ip <= arr2->end_ip);
-
-		unsigned base2 = arr2->reg;
-		unsigned len2 = arr2->length;
-		unsigned len = arr->length;
-
-		if (mergedregs) {
-			/* convert into half-reg space: */
-			if (!arr2->half) {
-				base2 *= 2;
-				len2 *= 2;
-			}
-			if (!arr->half) {
-				len *= 2;
-			}
-		} else if (arr2->half != arr->half) {
-			/* for split-register-file mode, we only conflict with
-			 * other arrays of same precision:
-			 */
-			continue;
-		}
-
-		/* if it intersects with liverange AND register range.. */
-		if (intersects(arr->start_ip, arr->end_ip,
-				arr2->start_ip, arr2->end_ip) &&
-				intersects(base, base + len,
-					base2, base2 + len2)) {
-			base = MAX2(base, base2 + len2);
-			goto retry;
      }
   }
-	/* also need to not conflict with any pre-assigned inputs: */
-	for (unsigned i = 0; i < nprecolor; i++) {
-		struct ir3_instruction *instr = precolor[i];
+   if (!all_preds_visited) {
+      /* Publish this block's entry register assignments so that the
+       * unvisited predecessors can match them when they are processed.
+       */
+      struct ra_block_state *state = &ctx->blocks[ctx->block->index];
+      state->entry_regs = _mesa_pointer_hash_table_create(ctx);
+
+      insert_entry_regs(state, &ctx->full);
+      insert_entry_regs(state, &ctx->half);
+      insert_entry_regs(state, &ctx->shared);
+   }
+}
-		if (!instr || (instr->flags & IR3_INSTR_UNUSED))
+static void
+insert_live_out_move(struct ra_ctx *ctx, struct ra_interval *interval)
+{
+   /* A block has at most two successors. */
+   for (unsigned i = 0; i < 2; i++) {
+      if (!ctx->block->successors[i])
         continue;
-		struct ir3_ra_instr_data *id = &ctx->instrd[instr->ip];
+      struct ir3_block *succ = ctx->block->successors[i];
+      struct ra_block_state *succ_state = &ctx->blocks[succ->index];
-		/* only consider the first component: */
-		if (id->off > 0)
+      if (!succ_state->visited)
         continue;
-		unsigned name = ra_name(ctx, id);
-		unsigned regid = instr->regs[0]->num;
-		unsigned reglen = class_sizes[id->cls];
-		unsigned len = arr->length;
-
-		if (mergedregs) {
-			/* convert into half-reg space: */
-			if (!is_half(instr)) {
-				regid *= 2;
-				reglen *= 2;
-			}
-			if (!arr->half) {
-				len *= 2;
-			}
-		} else if (is_half(instr) != arr->half) {
-			/* for split-register-file mode, we only conflict with
-			 * other arrays of same precision:
-			 */
+      struct hash_entry *entry =
+         _mesa_hash_table_search(succ_state->entry_regs, interval->interval.reg);
+      if (!entry)
         continue;
-		}
-		/* Check if array intersects with liverange AND register
-		 * range of the input:
-		 */
-		if (intersects(arr->start_ip, arr->end_ip,
-				ctx->def[name], ctx->use[name]) &&
-			intersects(base, base + len,
-				regid, regid + reglen)) {
-			base = MAX2(base, regid + reglen);
-			goto retry;
+      /* Copy the value to where the already-visited successor expects it,
+       * if that differs from where it currently lives.
+       */
+      physreg_t new_reg = (physreg_t)(uintptr_t)entry->data;
+      if (new_reg != interval->physreg_start) {
+         insert_liveout_copy(ctx->block, new_reg, interval->physreg_start,
+                             interval->interval.reg);
      }
   }
+}
-	/* convert back from half-reg space to fullreg space: */
-	if (mergedregs && !arr->half) {
-		base = DIV_ROUND_UP(base, 2);
+static void
+insert_file_live_out_moves(struct ra_ctx *ctx, struct ra_file *file)
+{
+   /* Emit live-out fixups for every interval still present in this file. */
+   rb_tree_foreach(struct ra_interval, interval, &file->physreg_intervals, physreg_node) {
+      insert_live_out_move(ctx, interval);
   }
+}
-	arr->reg = base;
+/* Emit live-out fixup moves for all three register files. */
+static void
+insert_live_out_moves(struct ra_ctx *ctx)
+{
+   insert_file_live_out_moves(ctx, &ctx->full);
+   insert_file_live_out_moves(ctx, &ctx->half);
+   insert_file_live_out_moves(ctx, &ctx->shared);
}
-/* handle pre-colored registers. This includes "arrays" (which could be of
- * length 1, used for phi webs lowered to registers in nir), as well as
- * special shader input values that need to be pinned to certain registers.
- */
static void
-ra_precolor(struct ir3_ra_ctx *ctx, struct ir3_instruction **precolor, unsigned nprecolor)
+handle_block(struct ra_ctx *ctx, struct ir3_block *block)
{
-	for (unsigned i = 0; i < nprecolor; i++) {
-		if (precolor[i] && !(precolor[i]->flags & IR3_INSTR_UNUSED)) {
-			struct ir3_instruction *instr = precolor[i];
+   ctx->block = block;
+
+   /* Reset the register files from the last block */
+   ra_file_init(&ctx->full);
+   ra_file_init(&ctx->half);
+   ra_file_init(&ctx->shared);
+
+   /* Handle live-ins, phis, and input meta-instructions. These all appear
+    * live at the beginning of the block, and interfere with each other and
+    * therefore need to be allocated "in parallel". This means that we
+    * have to allocate all of them, inserting them into the file, and then
+    * delay updating the IR until all of them are allocated.
+    *
+    * Handle precolored inputs first, because we need to make sure that other
+    * inputs don't overwrite them. We shouldn't have both live-ins/phi nodes
+    * and inputs at the same time, because the first block doesn't have
+    * predecessors. Therefore handle_live_in doesn't have to worry about
+    * them.
+    */
-	if (instr->regs[0]->num == INVALID_REG)
-		continue;
+   foreach_instr (instr, &block->instr_list) {
+      if (instr->opc == OPC_META_INPUT)
+         handle_precolored_input(ctx, instr);
+      else
+         break;
+   }
-	struct ir3_ra_instr_data *id = &ctx->instrd[instr->ip];
-
-	ra_assert(ctx, !(instr->regs[0]->flags & (IR3_REG_HALF | IR3_REG_SHARED)));
-
-	/* 'base' is in scalar (class 0) but we need to map that
-	 * the conflicting register of the appropriate class (ie.
-	 * input could be vec2/vec3/etc)
-	 *
-	 * Note that the higher class (larger than scalar) regs
-	 * are setup to conflict with others in the same class,
-	 * so for example, R1 (scalar) is also the first component
-	 * of D1 (vec2/double):
-	 *
-	 *    Single (base) |  Double
-	 *    --------------+---------------
-	 *       R0         |  D0
-	 *       R1         |  D0 D1
-	 *       R2         |     D1 D2
-	 *       R3         |        D2
-	 *           .. and so on..
-	 */
-	unsigned regid = instr->regs[0]->num;
-	ra_assert(ctx, regid >= id->off);
-	regid -= id->off;
+   /* Allocate intervals for everything live into this block. */
+   unsigned name;
+   BITSET_FOREACH_SET(name, ctx->live->live_in[block->index],
+                      ctx->live->definitions_count) {
+      struct ir3_register *reg = ctx->live->definitions[name];
+      handle_live_in(ctx, reg);
+   }
-	unsigned reg = ctx->set->gpr_to_ra_reg[id->cls][regid];
-	unsigned name = ra_name(ctx, id);
-	ra_set_node_reg(ctx->g, name, reg);
-	}
+   foreach_instr (instr, &block->instr_list) {
+      if (instr->opc == OPC_META_PHI)
+         handle_phi(ctx, instr->regs[0]);
+      else if (instr->opc == OPC_META_INPUT || instr->opc == OPC_META_TEX_PREFETCH)
+         handle_input(ctx, instr);
+      else
+         break;
   }
-	/*
-	 * Pre-assign array elements:
+   /* After this point, every live-in/phi/input has an interval assigned to
+    * it. We delay actually assigning values until everything has been
+    * allocated, so we can simply ignore any parallel copy entries created
+    * when shuffling them around.
    */
-	foreach_array (arr, &ctx->ir->array_list) {
-
-		if (arr->end_ip == 0)
-			continue;
-
-		if (!ctx->scalar_pass)
-			assign_arr_base(ctx, arr, precolor, nprecolor);
+   ctx->parallel_copies_count = 0;
-	for (unsigned i = 0; i < arr->length; i++) {
-		unsigned cls = arr->half ? HALF_OFFSET : 0;
+   insert_live_in_moves(ctx);
-		ra_set_node_reg(ctx->g,
-				arr->base + i, /* vreg name */
-				ctx->set->gpr_to_ra_reg[cls][arr->reg + i]);
-	}
+   if (RA_DEBUG) {
+      printf("after live-in block %u:\n", block->index);
+      ra_ctx_dump(ctx);
   }
-	if (ir3_shader_debug & IR3_DBG_OPTMSGS) {
-		foreach_array (arr, &ctx->ir->array_list) {
-			unsigned first = arr->reg;
-			unsigned last  = arr->reg + arr->length - 1;
-			debug_printf("arr[%d] at r%d.%c->r%d.%c\n", arr->id,
-					(first >> 2), "xyzw"[first & 0x3],
-					(last >> 2), "xyzw"[last & 0x3]);
+   /* Now we're done with processing live-ins, and can handle the body of the
+    * block.
+    */
+   foreach_instr (instr, &block->instr_list) {
+      if (RA_DEBUG) {
+         printf("processing: ");
+         ir3_print_instr(instr);
      }
+
+      /* Dispatch on opcode: meta-instructions get special handling, anything
+       * else goes through the generic path.
+       */
+      if (instr->opc == OPC_META_PHI)
+         assign_phi(ctx, instr);
+      else if (instr->opc == OPC_META_INPUT || instr->opc == OPC_META_TEX_PREFETCH)
+         assign_input(ctx, instr);
+      else if (instr->opc == OPC_META_SPLIT)
+         handle_split(ctx, instr);
+      else if (instr->opc == OPC_META_COLLECT)
+         handle_collect(ctx, instr);
+      else if (instr->opc == OPC_META_PARALLEL_COPY)
+         handle_pcopy(ctx, instr);
+      else if (instr->opc == OPC_CHMASK)
+         handle_chmask(ctx, instr);
+      else
+         handle_normal_instr(ctx, instr);
+
+      if (RA_DEBUG)
+         ra_ctx_dump(ctx);
   }
-}
-static void
-precolor(struct ir3_ra_ctx *ctx, struct ir3_instruction *instr)
-{
-	struct ir3_ra_instr_data *id = &ctx->instrd[instr->ip];
-	unsigned n = dest_regs(instr);
-	for (unsigned i = 0; i < n; i++) {
-		/* tex instructions actually have a wrmask, and
-		 * don't touch masked out components.  So we
-		 * shouldn't precolor them::
-		 */
-		if (is_tex_or_prefetch(instr) &&
-				!(instr->regs[0]->wrmask & (1 << i)))
-			continue;
+   insert_live_out_moves(ctx);
-		unsigned name = scalar_name(ctx, instr, i);
-		unsigned regid = instr->regs[0]->num + i;
+   BITSET_FOREACH_SET(name, ctx->live->live_out[block->index],
+                      ctx->live->definitions_count) {
+      struct ir3_register *reg = ctx->live->definitions[name];
+      handle_live_out(ctx, reg);
+   }
-	if (instr->regs[0]->flags & IR3_REG_SHARED)
-		regid -= FIRST_SHARED_REG;
+   ctx->blocks[block->index].visited = true;
-	unsigned vreg = ctx->set->gpr_to_ra_reg[id->cls][regid];
-	ra_set_node_reg(ctx->g, name, vreg);
+   /* Recurse into this block's children in the dominator tree. */
+   for (unsigned i = 0; i < block->dom_children_count; i++) {
+      handle_block(ctx, block->dom_children[i]);
   }
}
-/* pre-color non-scalar registers based on the registers assigned in previous
- * pass.  Do this by looking actually at the fanout instructions.
- */
-static void
-ra_precolor_assigned(struct ir3_ra_ctx *ctx)
+/* Pick a target size (in half-regs) for the full register file, given the
+ * maximum full-register pressure (also in half-regs).
+ */
+static unsigned
+calc_target_full_pressure(struct ir3_shader_variant *v, unsigned pressure)
{
-	ra_assert(ctx, ctx->scalar_pass);
+   /* Registers are allocated in units of vec4, so switch from units of
+    * half-regs to vec4.
+    */
+   unsigned reg_count = DIV_ROUND_UP(pressure, 2 * 4);
+
+   bool double_threadsize = ir3_should_double_threadsize(v, reg_count);
+
+   unsigned target = reg_count;
+   unsigned reg_independent_max_waves =
+      ir3_get_reg_independent_max_waves(v, double_threadsize);
+   unsigned reg_dependent_max_waves =
+      ir3_get_reg_dependent_max_waves(v->shader->compiler, reg_count,
+                                      double_threadsize);
+   unsigned target_waves =
+      MIN2(reg_independent_max_waves, reg_dependent_max_waves);
+
+   /* Grow the target as far as possible without reducing the achievable
+    * wave count or flipping the double-threadsize decision.
+    */
+   while (target <= RA_FULL_SIZE / (2 * 4) &&
+          ir3_should_double_threadsize(v, target) == double_threadsize &&
+          ir3_get_reg_dependent_max_waves(v->shader->compiler, target,
+                                          double_threadsize) >= target_waves)
+      target++;
+
+   /* Convert back from vec4 units to half-regs. */
+   return (target - 1) * 2 * 4;
+}
-	foreach_block (block, &ctx->ir->block_list) {
-		foreach_instr (instr, &block->instr_list) {
+/* Top-level entry point: run the SSA-based register allocator over the
+ * whole shader. Returns 0 on success, nonzero if the shader cannot fit in
+ * the register files.
+ */
+int
+ir3_ra(struct ir3_shader_variant *v)
+{
+   ir3_calc_dominance(v->ir);
-	if (!writes_gpr(instr))
-		continue;
+   ir3_create_parallel_copies(v->ir);
-	if (should_assign(ctx, instr))
-		continue;
+   struct ir3_liveness *live = ir3_calc_liveness(v);
-	precolor(ctx, instr);
+   ir3_debug_print(v->ir, "AFTER: create_parallel_copies");
-	foreach_src (src, instr) {
-		if (!src->def)
-			continue;
-		precolor(ctx, src->def->instr);
-	}
-	}
-	}
-}
+   ir3_merge_regs(live, v->ir);
-static int
-ra_alloc(struct ir3_ra_ctx *ctx)
-{
-	if (!ra_allocate(ctx->g))
-		return -1;
+   struct ir3_pressure max_pressure;
+   ir3_calc_pressure(v, live, &max_pressure);
+   d("max pressure:");
+   d("\tfull: %u", max_pressure.full);
+   d("\thalf: %u", max_pressure.half);
+   d("\tshared: %u", max_pressure.shared);
-	foreach_block (block, &ctx->ir->block_list) {
-		ra_block_alloc(ctx, block);
+   /* With merged registers, half-regs share the full file. */
+   if (v->mergedregs) {
+      max_pressure.full += max_pressure.half;
+      max_pressure.half = 0;
   }
-	return 0;
-}
-
-/* if we end up with split/collect instructions with non-matching src
- * and dest regs, that means something has gone wrong.  Which makes it
- * a pretty good sanity check.
- */
-static void
-ra_sanity_check(struct ir3 *ir)
-{
-	foreach_block (block, &ir->block_list) {
-		foreach_instr (instr, &block->instr_list) {
-			if (instr->opc == OPC_META_SPLIT) {
-				struct ir3_register *dst = instr->regs[0];
-				struct ir3_register *src = instr->regs[1];
-				debug_assert(dst->num == (src->num + instr->split.off));
-			} else if (instr->opc == OPC_META_COLLECT) {
-				struct ir3_register *dst = instr->regs[0];
-
-				foreach_src_n (src, n, instr) {
-					debug_assert(dst->num == (src->num - n));
-				}
-			}
-		}
-	}
+   /* The allocator has no spilling fallback here: report failure to the
+    * caller when the pressure can never fit. Free the liveness info first
+    * so this early-exit path doesn't leak it.
+    */
+   if (max_pressure.full > RA_FULL_SIZE ||
+       max_pressure.half > RA_HALF_SIZE ||
+       max_pressure.shared > RA_SHARED_SIZE) {
+      d("max pressure exceeded!");
+      ralloc_free(live);
+      return 1;
   }
-}
-
-static int
-ir3_ra_pass(struct ir3_shader_variant *v, struct ir3_instruction **precolor,
-		unsigned nprecolor, bool scalar_pass)
-{
-	struct ir3_ra_ctx ctx = {
-			.v = v,
-			.ir = v->ir,
-			.set = v->mergedregs ?
-				v->ir->compiler->mergedregs_set : v->ir->compiler->set,
-			.scalar_pass = scalar_pass,
-	};
-	int ret;
-	ret = setjmp(ctx.jmp_env);
-	if (ret)
-		goto fail;
+   struct ra_ctx *ctx = rzalloc(NULL, struct ra_ctx);
-	ra_init(&ctx);
-	ra_add_interference(&ctx);
-	ra_precolor(&ctx, precolor, nprecolor);
-	if (scalar_pass)
-		ra_precolor_assigned(&ctx);
-	ret = ra_alloc(&ctx);
+   ctx->merged_regs = v->mergedregs;
+   ctx->compiler = v->shader->compiler;
+   ctx->stage = v->type;
+   ctx->live = live;
+   ctx->intervals = rzalloc_array(ctx, struct ra_interval, live->definitions_count);
+   ctx->blocks = rzalloc_array(ctx, struct ra_block_state, live->block_count);
-fail:
-	ra_destroy(&ctx);
+   ctx->full.size = calc_target_full_pressure(v, max_pressure.full);
+   d("full size: %u", ctx->full.size);
+
+   if (!v->mergedregs)
+      ctx->half.size = RA_HALF_SIZE;
-	return ret;
-}
+   ctx->shared.size = RA_SHARED_SIZE;
-int
-ir3_ra(struct ir3_shader_variant *v, struct ir3_instruction **precolor,
-		unsigned nprecolor)
-{
-	int ret;
+   /* Walk the dominator tree starting from the entry block. */
+   handle_block(ctx, ir3_start_block(v->ir));
-	/* First pass, assign the vecN (non-scalar) registers: */
-	ret = ir3_ra_pass(v, precolor, nprecolor, false);
-	if (ret)
-		return ret;
+   /* Strip array-ness and SSA-ness at the end, because various helpers still
+    * need to work even on definitions that have already been assigned. For
+    * example, we need to preserve array-ness so that array live-ins have the
+    * right size.
+    */
+   foreach_block (block, &v->ir->block_list) {
+      foreach_instr (instr, &block->instr_list) {
+         for (unsigned i = 0; i < instr->regs_count; i++) {
+            instr->regs[i]->flags &= ~IR3_REG_SSA;
-	ir3_debug_print(v->ir, "AFTER: ir3_ra (1st pass)");
+            /* Parallel copies of array registers copy the whole register,
+             * and we need some way to let the parallel copy code know
+             * that this was an array whose size is determined by
+             * reg->size. So keep the array flag on those.
+             */
+            if (!is_meta(instr))
+               instr->regs[i]->flags &= ~IR3_REG_ARRAY;
+         }
+      }
+   }
-	/* Second pass, assign the scalar registers: */
-	ret = ir3_ra_pass(v, precolor, nprecolor, true);
-	if (ret)
-		return ret;
+   ir3_debug_print(v->ir, "AFTER: register allocation");
-	ir3_debug_print(v->ir, "AFTER: ir3_ra (2st pass)");
+   ir3_lower_copies(v);
-#ifdef DEBUG
-#	define SANITY_CHECK DEBUG
-#else
-#	define SANITY_CHECK 0
-#endif
-	if (SANITY_CHECK)
-		ra_sanity_check(v->ir);
+   ir3_debug_print(v->ir, "AFTER: ir3_lower_copies");
-	return ret;
+   ralloc_free(ctx);
+   ralloc_free(live);
+   return 0;
}
+