BITSET_WORD *conflicts;
struct util_dynarray conflict_list;
- /* Number of conflicts that must be allocated to physical registers.
- */
- unsigned phys_conflicts;
-
- unsigned node_conflicts;
-
- /* Number of conflicts that can be allocated to either. */
- unsigned total_conflicts;
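+ /* Current number of conflicting registers; decremented as neighbors are
+ * simplified and pushed onto the stack. */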
+ unsigned num_conflicts;
int assigned_color;
};
struct regalloc_ctx {
- unsigned bitset_words, num_nodes_and_regs;
+ unsigned bitset_words;
struct reg_info *registers;
/* Reusable scratch liveness array */
/* Liveness analysis */
-static void propagate_liveness_instr(gpir_node *node, BITSET_WORD *live,
- gpir_compiler *comp)
+static void propagate_liveness_node(gpir_node *node, BITSET_WORD *live,
+ gpir_compiler *comp)
{
/* KILL */
if (node->type == gpir_node_type_store) {
memcpy(ctx->live, block->live_out, ctx->bitset_words * sizeof(BITSET_WORD));
list_for_each_entry_rev(gpir_node, node, &block->node_list, list) {
- propagate_liveness_instr(node, ctx->live, block->comp);
+ propagate_liveness_node(node, ctx->live, block->comp);
}
bool changed = false;
BITSET_SET(a->conflicts, j);
BITSET_SET(b->conflicts, i);
- a->total_conflicts++;
- b->total_conflicts++;
- if (j < ctx->comp->cur_reg)
- a->phys_conflicts++;
- else
- a->node_conflicts++;
-
- if (i < ctx->comp->cur_reg)
- b->phys_conflicts++;
- else
- b->node_conflicts++;
-
+ a->num_conflicts++;
+ b->num_conflicts++;
util_dynarray_append(&a->conflict_list, unsigned, j);
util_dynarray_append(&b->conflict_list, unsigned, i);
}
*/
static void add_all_interferences(struct regalloc_ctx *ctx,
unsigned i,
- BITSET_WORD *live_nodes,
BITSET_WORD *live_regs)
{
- int live_node;
- BITSET_FOREACH_SET(live_node, live_nodes, ctx->comp->cur_index) {
- add_interference(ctx, i,
- live_node + ctx->comp->cur_reg);
- }
-
int live_reg;
- BITSET_FOREACH_SET(live_reg, ctx->live, ctx->comp->cur_index) {
+ BITSET_FOREACH_SET(live_reg, live_regs, ctx->comp->cur_reg) {
add_interference(ctx, i, live_reg);
}
}
static void print_liveness(struct regalloc_ctx *ctx,
- BITSET_WORD *live_reg, BITSET_WORD *live_val)
+ BITSET_WORD *live_reg)
{
if (!(lima_debug & LIMA_DEBUG_GP))
return;
BITSET_FOREACH_SET(live_idx, live_reg, ctx->comp->cur_reg) {
printf("reg%d ", live_idx);
}
- BITSET_FOREACH_SET(live_idx, live_val, ctx->comp->cur_index) {
- printf("%d ", live_idx);
- }
printf("\n");
}
static void calc_interference(struct regalloc_ctx *ctx)
{
- BITSET_WORD *live_nodes =
- rzalloc_array(ctx->mem_ctx, BITSET_WORD, ctx->comp->cur_index);
-
list_for_each_entry(gpir_block, block, &ctx->comp->block_list, list) {
/* Initialize liveness at the end of the block, but exclude values that
* definitely aren't defined by the end. This helps out with
list_for_each_entry_rev(gpir_node, node, &block->node_list, list) {
gpir_debug("processing node %d\n", node->index);
- print_liveness(ctx, ctx->live, live_nodes);
- if (node->type != gpir_node_type_store &&
- node->type != gpir_node_type_branch) {
- add_all_interferences(ctx, node->index + ctx->comp->cur_reg,
- live_nodes, ctx->live);
-
- /* KILL */
- BITSET_CLEAR(live_nodes, node->index);
- } else if (node->op == gpir_op_store_reg) {
+ print_liveness(ctx, ctx->live);
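+ /* Only physical registers interfere here; value registers are handled
+ * separately, by regalloc_value_regs() below. */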
+ if (node->op == gpir_op_store_reg) {
gpir_store_node *store = gpir_node_to_store(node);
- add_all_interferences(ctx, store->reg->index,
- live_nodes, ctx->live);
+ add_all_interferences(ctx, store->reg->index, ctx->live);
/* KILL */
BITSET_CLEAR(ctx->live, store->reg->index);
- }
-
- /* GEN */
- if (node->type == gpir_node_type_store) {
- gpir_store_node *store = gpir_node_to_store(node);
- BITSET_SET(live_nodes, store->child->index);
- } else if (node->type == gpir_node_type_alu) {
- gpir_alu_node *alu = gpir_node_to_alu(node);
- for (int i = 0; i < alu->num_child; i++)
- BITSET_SET(live_nodes, alu->children[i]->index);
- } else if (node->type == gpir_node_type_branch) {
- gpir_branch_node *branch = gpir_node_to_branch(node);
- BITSET_SET(live_nodes, branch->cond->index);
} else if (node->op == gpir_op_load_reg) {
+ /* GEN */
gpir_load_node *load = gpir_node_to_load(node);
BITSET_SET(ctx->live, load->reg->index);
}
static bool can_simplify(struct regalloc_ctx *ctx, unsigned i)
{
struct reg_info *info = &ctx->registers[i];
- if (i < ctx->comp->cur_reg) {
- /* Physical regs. */
- return info->phys_conflicts + info->node_conflicts < GPIR_PHYSICAL_REG_NUM;
- } else {
- /* Nodes: if we manage to allocate all of its conflicting physical
- * registers, they will take up at most GPIR_PHYSICAL_REG_NUM colors, so
- * we can ignore any more than that.
- */
- return MIN2(info->phys_conflicts, GPIR_PHYSICAL_REG_NUM) +
- info->node_conflicts < GPIR_PHYSICAL_REG_NUM + GPIR_VALUE_REG_NUM;
- }
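+ /* A register with fewer conflicts than available colors can always be
+ * colored, no matter how its neighbors end up (Kempe's simplification
+ * criterion).
+ */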
+ return info->num_conflicts < GPIR_PHYSICAL_REG_NUM;
}
static void push_stack(struct regalloc_ctx *ctx, unsigned i)
{
ctx->stack[ctx->stack_size++] = i;
- if (i < ctx->comp->cur_reg)
- gpir_debug("pushing reg%u\n", i);
- else
- gpir_debug("pushing %d\n", i - ctx->comp->cur_reg);
+ gpir_debug("pushing reg%u\n", i);
struct reg_info *info = &ctx->registers[i];
assert(info->visited);
util_dynarray_foreach(&info->conflict_list, unsigned, conflict) {
struct reg_info *conflict_info = &ctx->registers[*conflict];
- if (i < ctx->comp->cur_reg) {
- assert(conflict_info->phys_conflicts > 0);
- conflict_info->phys_conflicts--;
- } else {
- assert(conflict_info->node_conflicts > 0);
- conflict_info->node_conflicts--;
- }
+ assert(conflict_info->num_conflicts > 0);
+ conflict_info->num_conflicts--;
if (!ctx->registers[*conflict].visited && can_simplify(ctx, *conflict)) {
ctx->worklist[ctx->worklist_end++] = *conflict;
ctx->registers[*conflict].visited = true;
ctx->stack_size = 0;
/* Step 1: find the initially simplifiable registers */
- for (int i = 0; i < ctx->comp->cur_reg + ctx->comp->cur_index; i++) {
+ for (int i = 0; i < ctx->comp->cur_reg; i++) {
if (can_simplify(ctx, i)) {
ctx->worklist[ctx->worklist_end++] = i;
ctx->registers[i].visited = true;
push_stack(ctx, ctx->worklist[ctx->worklist_start++]);
}
- if (ctx->stack_size < ctx->num_nodes_and_regs) {
+ if (ctx->stack_size < ctx->comp->cur_reg) {
/* If there are still unsimplifiable nodes left, we need to
* optimistically push a node onto the stack. Choose the one with
* the smallest number of current neighbors, since that's the most
*/
unsigned min_conflicts = UINT_MAX;
unsigned best_reg = 0;
- for (unsigned reg = 0; reg < ctx->num_nodes_and_regs; reg++) {
+ for (int reg = 0; reg < ctx->comp->cur_reg; reg++) {
struct reg_info *info = &ctx->registers[reg];
if (info->visited)
continue;
- if (info->phys_conflicts + info->node_conflicts < min_conflicts) {
+ if (info->num_conflicts < min_conflicts) {
best_reg = reg;
- min_conflicts = info->phys_conflicts + info->node_conflicts;
+ min_conflicts = info->num_conflicts;
}
}
gpir_debug("optimistic triggered\n");
}
/* Step 4: pop off the stack and assign colors */
- for (int i = ctx->num_nodes_and_regs - 1; i >= 0; i--) {
+ for (int i = ctx->comp->cur_reg - 1; i >= 0; i--) {
unsigned idx = ctx->stack[i];
struct reg_info *reg = &ctx->registers[idx];
- unsigned num_available_regs;
- if (idx < ctx->comp->cur_reg) {
- num_available_regs = GPIR_PHYSICAL_REG_NUM;
- } else {
- num_available_regs = GPIR_VALUE_REG_NUM + GPIR_PHYSICAL_REG_NUM;
- }
-
bool found = false;
- unsigned start = i % num_available_regs;
- for (unsigned j = 0; j < num_available_regs; j++) {
- unsigned candidate = (j + start) % num_available_regs;
+ unsigned start = i % GPIR_PHYSICAL_REG_NUM;
+ for (unsigned j = 0; j < GPIR_PHYSICAL_REG_NUM; j++) {
+ unsigned candidate = (j + start) % GPIR_PHYSICAL_REG_NUM;
bool available = true;
util_dynarray_foreach(®->conflict_list, unsigned, conflict_idx) {
struct reg_info *conflict = &ctx->registers[*conflict_idx];
{
list_for_each_entry(gpir_block, block, &ctx->comp->block_list, list) {
list_for_each_entry(gpir_node, node, &block->node_list, list) {
- if (node->index >= 0) {
- node->value_reg =
- ctx->registers[ctx->comp->cur_reg + node->index].assigned_color;
- }
-
if (node->op == gpir_op_load_reg) {
gpir_load_node *load = gpir_node_to_load(node);
unsigned color = ctx->registers[load->reg->index].assigned_color;
}
}
+/* Value register allocation */
+
+/* Define a special token for when the register is occupied by a preallocated
+ * physical register (i.e. load_reg/store_reg). Normally entries in the "live"
+ * array point to the definition of the value, but there may be multiple
+ * definitions in this case, and they will certainly come from other basic
+ * blocks, so it doesn't make sense to do that here.
+ */
+static gpir_node __physreg_live;
+#define PHYSREG_LIVE (&__physreg_live)
+
+struct value_regalloc_ctx {
+ gpir_node *last_written[GPIR_VALUE_REG_NUM + GPIR_PHYSICAL_REG_NUM];
+ gpir_node *complex1_last_written[GPIR_VALUE_REG_NUM + GPIR_PHYSICAL_REG_NUM];
+ gpir_node *live[GPIR_VALUE_REG_NUM + GPIR_PHYSICAL_REG_NUM];
+ gpir_node *last_complex1;
+ unsigned alloc_start;
+};
+
+static unsigned find_free_value_reg(struct value_regalloc_ctx *ctx)
+{
+ /* Implement round-robin allocation */
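+ /* Rotating the starting point avoids immediately reusing a just-freed
+ * register, which would likely create more fake write-after-read
+ * dependencies in add_fake_dep() below.
+ */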
+ unsigned reg_offset = ctx->alloc_start++;
+ if (ctx->alloc_start == GPIR_PHYSICAL_REG_NUM + GPIR_VALUE_REG_NUM)
+ ctx->alloc_start = 0;
+
+ unsigned reg = UINT_MAX;
+ for (unsigned reg_base = 0;
+ reg_base < GPIR_PHYSICAL_REG_NUM + GPIR_VALUE_REG_NUM;
+ reg_base++) {
+ unsigned cur_reg = (reg_base + reg_offset) % (GPIR_PHYSICAL_REG_NUM + GPIR_VALUE_REG_NUM);
+ if (!ctx->live[cur_reg]) {
+ reg = cur_reg;
+ break;
+ }
+ }
+
+ return reg;
+}
+
+static void add_fake_dep(gpir_node *node, gpir_node *src,
+ struct value_regalloc_ctx *ctx)
+{
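+ /* Since we walk the block in reverse, last_written[reg] is the next
+ * write of reg in program order. Make that write depend on this read so
+ * the scheduler keeps it after the read (write-after-read).
+ */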
+ assert(src->value_reg >= 0);
+ if (ctx->last_written[src->value_reg] &&
+ ctx->last_written[src->value_reg] != node) {
+ gpir_node_add_dep(ctx->last_written[src->value_reg], node,
+ GPIR_DEP_WRITE_AFTER_READ);
+ }
+
+ /* For a sequence of schedule_first nodes right before a complex1
+ * node, add any extra fake dependencies necessary so that the
+ * schedule_first nodes can be scheduled right after the complex1 is
+ * scheduled. We have to save the last_written before complex1 happens to
+ * avoid adding dependencies to children of the complex1 node, which would
+ * create a cycle.
+ */
+
+ if (gpir_op_infos[node->op].schedule_first &&
+ ctx->last_complex1 &&
+ ctx->complex1_last_written[src->value_reg]) {
+ gpir_node_add_dep(ctx->complex1_last_written[src->value_reg],
+ ctx->last_complex1,
+ GPIR_DEP_WRITE_AFTER_READ);
+ }
+}
+
+static bool handle_value_read(gpir_node *node, gpir_node *src,
+ struct value_regalloc_ctx *ctx)
+{
+ /* Allocate a register for src if it doesn't have one yet. We walk the
+ * block in reverse, so this is src's last use in program order. */
+ if (src->value_reg < 0) {
+ unsigned reg = find_free_value_reg(ctx);
+ if (reg == UINT_MAX)
+ return false;
+
+ src->value_reg = reg;
+ ctx->live[reg] = src;
+ }
+
+ /* Add any fake dependencies. Note: this is the actual result of value
+ * register allocation. We throw away node->value_reg afterwards, since
+ * it's really the fake dependencies which constrain the post-RA scheduler
+ * enough to make sure it never needs to spill to temporaries.
+ */
+ add_fake_dep(node, src, ctx);
+
+ return true;
+}
+
+static bool handle_reg_read(gpir_load_node *load,
+ struct value_regalloc_ctx *ctx)
+{
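+ /* Physical registers are tracked per component: register index * 4 +
+ * component selects the slot in the shared live/last_written arrays.
+ */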
+ unsigned idx = load->index * 4 + load->component;
+ if (!ctx->live[idx]) {
+ ctx->live[idx] = PHYSREG_LIVE;
+ } else if (ctx->live[idx] != PHYSREG_LIVE) {
+ /* This slot is occupied by some other value register, so we need to
+ * evict it. This effectively splits the live range of the value
+ * register. NB: since we create fake dependencies on the fly, and the
+ * fake dependencies are the only output of this pass, we don't actually
+ * have to record where the split happened or that this value was
+ * assigned to two different registers. Any actual live range splitting
+ * happens in the post-RA scheduler, which moves the value to and from
+ * the register file. This will just cause some reads of the value
+ * register to have different fake dependencies.
+ */
+ unsigned new_reg = find_free_value_reg(ctx);
+ if (new_reg == UINT_MAX)
+ return false;
+ ctx->live[new_reg] = ctx->live[idx];
+ ctx->live[new_reg]->value_reg = new_reg;
+ ctx->live[idx] = PHYSREG_LIVE;
+ }
+
+ if (ctx->last_written[idx]) {
+ gpir_node_add_dep(ctx->last_written[idx], &load->node,
+ GPIR_DEP_WRITE_AFTER_READ);
+ }
+
+ return true;
+}
+
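+/* A store_reg writes a physical register. Walking in reverse, the previous
+ * contents are dead above this point, so the slot is freed; the store is
+ * recorded so that reads above it add write-after-read dependencies.
+ */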
+static void handle_reg_write(gpir_store_node *store,
+ struct value_regalloc_ctx *ctx)
+{
+ unsigned idx = store->index * 4 + store->component;
+ store->node.value_reg = idx;
+ ctx->last_written[idx] = &store->node;
+ ctx->live[idx] = NULL;
+}
+
+static void handle_value_write(gpir_node *node,
+ struct value_regalloc_ctx *ctx)
+{
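+ /* Walking in reverse, a definition means the value is not live above
+ * this point, so its register becomes free again.
+ */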
+ ctx->last_written[node->value_reg] = node;
+ ctx->live[node->value_reg] = NULL;
+}
+
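+/* Walk the block backwards, allocating a register for each value at its
+ * last use and freeing it at its definition. The value_reg assignments are
+ * thrown away afterwards; the fake dependencies created along the way are
+ * the real output of this pass.
+ */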
+static bool regalloc_value_regs(gpir_block *block)
+{
+ struct value_regalloc_ctx ctx = { { 0 } };
+
+ list_for_each_entry(gpir_node, node, &block->node_list, list) {
+ node->value_reg = -1;
+ }
+
+ list_for_each_entry_rev(gpir_node, node, &block->node_list, list) {
+ if (node->op == gpir_op_complex1) {
+ ctx.last_complex1 = node;
+ memcpy(ctx.complex1_last_written, ctx.last_written,
+ sizeof(ctx.complex1_last_written));
+ }
+
+ if (node->type != gpir_node_type_store &&
+ node->type != gpir_node_type_branch) {
+ handle_value_write(node, &ctx);
+ } else if (node->op == gpir_op_store_reg) {
+ handle_reg_write(gpir_node_to_store(node), &ctx);
+ }
+
+ if (node->type == gpir_node_type_store) {
+ gpir_store_node *store = gpir_node_to_store(node);
+ if (!handle_value_read(&store->node, store->child, &ctx))
+ return false;
+ } else if (node->type == gpir_node_type_alu) {
+ gpir_alu_node *alu = gpir_node_to_alu(node);
+ for (int i = 0; i < alu->num_child; i++) {
+ if (!handle_value_read(&alu->node, alu->children[i], &ctx))
+ return false;
+ }
+ } else if (node->type == gpir_node_type_branch) {
+ /* Only physical registers are live across blocks, so the 11 value
+ * registers are always free at the end of a block and allocating for
+ * the branch condition always succeeds.
+ */
+ gpir_branch_node *branch = gpir_node_to_branch(node);
+ ASSERTED bool result = handle_value_read(&branch->node,
+ branch->cond, &ctx);
+ assert(result);
+ } else if (node->op == gpir_op_load_reg) {
+ gpir_load_node *load = gpir_node_to_load(node);
+ if (!handle_reg_read(load, &ctx))
+ return false;
+ }
+ }
+
+ return true;
+}
+
static void regalloc_print_result(gpir_compiler *comp)
{
if (!(lima_debug & LIMA_DEBUG_GP))
struct regalloc_ctx ctx;
ctx.mem_ctx = ralloc_context(NULL);
- ctx.num_nodes_and_regs = comp->cur_reg + comp->cur_index;
- ctx.bitset_words = BITSET_WORDS(ctx.num_nodes_and_regs);
+ ctx.bitset_words = BITSET_WORDS(comp->cur_reg);
ctx.live = ralloc_array(ctx.mem_ctx, BITSET_WORD, ctx.bitset_words);
- ctx.worklist = ralloc_array(ctx.mem_ctx, unsigned, ctx.num_nodes_and_regs);
- ctx.stack = ralloc_array(ctx.mem_ctx, unsigned, ctx.num_nodes_and_regs);
+ ctx.worklist = ralloc_array(ctx.mem_ctx, unsigned, comp->cur_reg);
+ ctx.stack = ralloc_array(ctx.mem_ctx, unsigned, comp->cur_reg);
ctx.comp = comp;
- ctx.registers = rzalloc_array(ctx.mem_ctx, struct reg_info, ctx.num_nodes_and_regs);
- for (unsigned i = 0; i < ctx.num_nodes_and_regs; i++) {
+ ctx.registers = rzalloc_array(ctx.mem_ctx, struct reg_info, comp->cur_reg);
+ for (int i = 0; i < comp->cur_reg; i++) {
ctx.registers[i].conflicts = rzalloc_array(ctx.mem_ctx, BITSET_WORD,
ctx.bitset_words);
util_dynarray_init(&ctx.registers[i].conflict_list, ctx.mem_ctx);
}
assign_regs(&ctx);
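+ /* Allocate value registers and insert fake dependencies per block */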
+ list_for_each_entry(gpir_block, block, &comp->block_list, list) {
+ if (!regalloc_value_regs(block))
+ return false;
+ }
+
regalloc_print_result(comp);
ralloc_free(ctx.mem_ctx);
return true;