#include "compiler.h"
#include "util/u_memory.h"
-/* A simple liveness-based dead code elimination pass. In 'soft' mode, dead
- * instructions are kept but write to null, which is required for correct
- * operation post-schedule pass (where dead instructions correspond to
- * instructions whose destinations are consumed immediately as a passthrough
- * register. If the destinations are not garbage collected, impossible register
- * encodings will result.)
- */
+/* A simple liveness-based dead code elimination pass. */
void
-bi_opt_dead_code_eliminate(bi_context *ctx, bool soft)
+bi_opt_dead_code_eliminate(bi_context *ctx)
{
unsigned temp_count = bi_max_temp(ctx);
all_null &= bi_is_null(ins->dest[d]);
}
- if (all_null && !soft && !bi_side_effects(ins->op))
+ if (all_null && !bi_side_effects(ins->op))
bi_remove_instruction(ins);
else
bi_liveness_ins_update(live, ins, temp_count);
block->base.live_in = live;
}
}
+
+/* Post-RA liveness-based dead code analysis to clean up results of bundling */
+
+static uint64_t
+bi_postra_liveness_ins(uint64_t live, bi_instr *ins)
+{
+ bi_foreach_dest(ins, d) {
+ if (ins->dest[d].type == BI_INDEX_REGISTER) {
+ unsigned nr = bi_count_write_registers(ins, d);
+ unsigned reg = ins->dest[d].value;
+ live &= ~(BITFIELD64_MASK(nr) << reg);
+ }
+ }
+
+ bi_foreach_src(ins, s) {
+ if (ins->src[s].type == BI_INDEX_REGISTER) {
+ unsigned nr = bi_count_read_registers(ins, s);
+ unsigned reg = ins->src[s].value;
+ live |= (BITFIELD64_MASK(nr) << reg);
+ }
+ }
+
+ return live;
+}
+
+static bool
+bi_postra_liveness_block(bi_block *blk)
+{
+ pan_foreach_successor((&blk->base), _succ) {
+ bi_block *succ = (bi_block *) _succ;
+ blk->reg_live_out |= succ->reg_live_in;
+ }
+
+ uint64_t live = blk->reg_live_out;
+
+ bi_foreach_instr_in_block_rev(blk, ins)
+ live = bi_postra_liveness_ins(live, ins);
+
+ bool progress = blk->reg_live_in != live;
+ blk->reg_live_in = live;
+ return progress;
+}
+
+/* Globally, liveness analysis uses a fixed-point algorithm based on a
+ * worklist. We initialize a work list with the exit block. We iterate the work
+ * list to compute live_in from live_out for each block on the work list,
+ * adding the predecessors of the block to the work list if we made progress.
+ */
+
+static void
+bi_postra_liveness(bi_context *ctx)
+{
+ struct set *work_list = _mesa_set_create(NULL,
+ _mesa_hash_pointer,
+ _mesa_key_pointer_equal);
+
+ struct set *visited = _mesa_set_create(NULL,
+ _mesa_hash_pointer,
+ _mesa_key_pointer_equal);
+
+ struct set_entry *cur;
+ cur = _mesa_set_add(work_list, pan_exit_block(&ctx->blocks));
+
+ do {
+ bi_block *blk = (struct bi_block *) cur->key;
+ _mesa_set_remove(work_list, cur);
+
+ /* Update its liveness information */
+ bool progress = bi_postra_liveness_block(blk);
+
+ /* If we made progress, we need to process the predecessors */
+
+ if (progress || !_mesa_set_search(visited, blk)) {
+ pan_foreach_predecessor((&blk->base), pred)
+ _mesa_set_add(work_list, pred);
+ }
+
+ _mesa_set_add(visited, blk);
+ } while((cur = _mesa_set_next_entry(work_list, NULL)) != NULL);
+
+ _mesa_set_destroy(visited, NULL);
+ _mesa_set_destroy(work_list, NULL);
+}
+
+void
+bi_opt_dce_post_ra(bi_context *ctx)
+{
+ bi_postra_liveness(ctx);
+
+ bi_foreach_block_rev(ctx, _block) {
+ bi_block *block = (bi_block *) _block;
+ uint64_t live = block->reg_live_out;
+
+ bi_foreach_instr_in_block_rev(block, ins) {
+ bi_foreach_dest(ins, d) {
+ if (ins->dest[d].type != BI_INDEX_REGISTER)
+ continue;
+
+ unsigned nr = bi_count_write_registers(ins, d);
+ unsigned reg = ins->dest[d].value;
+ uint64_t mask = (BITFIELD64_MASK(nr) << reg);
+ bool cullable = (ins->op != BI_OPCODE_BLEND);
+
+ if (!(live & mask) && cullable)
+ ins->dest[d] = bi_null();
+ }
+
+ live = bi_postra_liveness_ins(live, ins);
+ }
+ }
+}
#include "panfrost/util/lcra.h"
#include "util/u_memory.h"
-/* A clause may contain 1 message-passing instruction. No subsequent
- * instruction in the clause may access its registers due to data races.
- * Scheduling ensures this is possible but RA needs to preserve this. The
- * simplest solution is forcing accessed registers live in _all_ words at the
- * end (and consequently throughout) the clause, addressing corner cases where
- * a single component is masked out */
-
-static void
-bi_mark_msg_live(bi_block *block, bi_clause *clause, unsigned node_count, uint16_t *live)
-{
- bi_foreach_instr_in_clause(block, clause, ins) {
- if (!bi_opcode_props[ins->op].message) continue;
-
- bi_foreach_dest(ins, d) {
- unsigned node = bi_get_node(ins->dest[d]);
- if (node < node_count)
- live[node] |= bi_writemask(ins, d);
- }
-
- bi_foreach_src(ins, s) {
- unsigned node = bi_get_node(ins->src[s]);
- if (node < node_count) {
- unsigned count = bi_count_read_registers(ins, s);
- unsigned rmask = (1 << (4 * count)) - 1;
- live[node] |= (rmask << (4 * ins->src[s].offset));
- }
- }
-
- break;
- }
-}
-
static void
-bi_mark_interference(bi_block *block, bi_clause *clause, struct lcra_state *l, uint16_t *live, unsigned node_count, bool is_blend)
+bi_mark_interference(bi_block *block, struct lcra_state *l, uint16_t *live, unsigned node_count, bool is_blend)
{
- bi_foreach_instr_in_clause_rev(block, clause, ins) {
+ bi_foreach_instr_in_block_rev(block, ins) {
/* Mark all registers live after the instruction as
* interfering with the destination */
bi_block *blk = (bi_block *) _blk;
uint16_t *live = mem_dup(_blk->live_out, node_count * sizeof(uint16_t));
- bi_foreach_clause_in_block_rev(blk, clause) {
- bi_mark_msg_live(blk, clause, node_count, live);
- bi_mark_interference(blk, clause, l, live, node_count,
- ctx->inputs->is_blend);
- }
+ bi_mark_interference(blk, l, live, node_count,
+ ctx->inputs->is_blend);
free(live);
}
static void
bi_spill_dest(bi_builder *b, bi_index index, bi_index temp, uint32_t offset,
- bi_clause *clause, bi_block *block, unsigned channels)
+ bi_instr *instr, bi_block *block, unsigned channels)
{
- b->cursor = bi_after_clause(clause);
-
- /* setup FAU as [offset][0] */
- bi_instr *st = bi_store(b, channels * 32, temp,
- bi_passthrough(BIFROST_SRC_FAU_LO),
- bi_passthrough(BIFROST_SRC_FAU_HI),
+ b->cursor = bi_after_instr(instr);
+ bi_store(b, channels * 32, temp, bi_imm_u32(offset), bi_zero(),
BI_SEG_TL);
- bi_clause *singleton = bi_singleton(b->shader, st, block, 0, (1 << 0),
- offset, true);
-
- list_add(&singleton->link, &clause->link);
b->shader->spills++;
}
static void
bi_fill_src(bi_builder *b, bi_index index, bi_index temp, uint32_t offset,
- bi_clause *clause, bi_block *block, unsigned channels)
+ bi_instr *instr, bi_block *block, unsigned channels)
{
- b->cursor = bi_before_clause(clause);
+ b->cursor = bi_before_instr(instr);
bi_instr *ld = bi_load_to(b, channels * 32, temp,
- bi_passthrough(BIFROST_SRC_FAU_LO),
- bi_passthrough(BIFROST_SRC_FAU_HI),
- BI_SEG_TL);
+ bi_imm_u32(offset), bi_zero(), BI_SEG_TL);
ld->no_spill = true;
- bi_clause *singleton = bi_singleton(b->shader, ld, block, 0,
- (1 << 0), offset, true);
-
- list_addtail(&singleton->link, &clause->link);
b->shader->fills++;
}
static unsigned
-bi_clause_mark_spill(bi_context *ctx, bi_block *block,
- bi_clause *clause, bi_index index, bi_index *temp)
+bi_instr_mark_spill(bi_context *ctx, bi_block *block,
+ bi_instr *ins, bi_index index, bi_index *temp)
{
unsigned channels = 0;
- bi_foreach_instr_in_clause(block, clause, ins) {
- bi_foreach_dest(ins, d) {
- if (!bi_is_equiv(ins->dest[d], index)) continue;
- if (bi_is_null(*temp)) *temp = bi_temp_reg(ctx);
- ins->no_spill = true;
+ bi_foreach_dest(ins, d) {
+ if (!bi_is_equiv(ins->dest[d], index)) continue;
+ if (bi_is_null(*temp)) *temp = bi_temp_reg(ctx);
+ ins->no_spill = true;
- unsigned offset = ins->dest[d].offset;
- ins->dest[d] = bi_replace_index(ins->dest[d], *temp);
- ins->dest[d].offset = offset;
+ unsigned offset = ins->dest[d].offset;
+ ins->dest[d] = bi_replace_index(ins->dest[d], *temp);
+ ins->dest[d].offset = offset;
- unsigned newc = util_last_bit(bi_writemask(ins, d)) >> 2;
- channels = MAX2(channels, newc);
- }
+ unsigned newc = util_last_bit(bi_writemask(ins, d)) >> 2;
+ channels = MAX2(channels, newc);
}
return channels;
}
static bool
-bi_clause_mark_fill(bi_context *ctx, bi_block *block, bi_clause *clause,
+bi_instr_mark_fill(bi_context *ctx, bi_block *block, bi_instr *ins,
bi_index index, bi_index *temp)
{
- bool fills = false;
-
- bi_foreach_instr_in_clause(block, clause, ins) {
- if (!bi_has_arg(ins, index)) continue;
- if (bi_is_null(*temp)) *temp = bi_temp_reg(ctx);
- bi_rewrite_index_src_single(ins, index, *temp);
- fills = true;
- }
-
- return fills;
+ if (!bi_has_arg(ins, index)) return false;
+ if (bi_is_null(*temp)) *temp = bi_temp_reg(ctx);
+ bi_rewrite_index_src_single(ins, index, *temp);
+ return true;
}
/* Once we've chosen a spill node, spill it. Precondition: node is a valid
/* Spill after every store, fill before every load */
bi_foreach_block(ctx, _block) {
bi_block *block = (bi_block *) _block;
- bi_foreach_clause_in_block_safe(block, clause) {
- bi_index tmp = bi_null();
-
- unsigned local_channels = bi_clause_mark_spill(ctx,
- block, clause, index, &tmp);
+ bi_foreach_instr_in_block_safe(block, instr) {
+ bi_index tmp = bi_null();
+ unsigned local_channels = bi_instr_mark_spill(ctx,
+ block, instr, index, &tmp);
channels = MAX2(channels, local_channels);
if (local_channels) {
bi_spill_dest(&_b, index, tmp, offset,
- clause, block, channels);
+ instr, block, channels);
}
/* For SSA form, if we write/spill, there was no prior
* garbage */
bool should_fill = !local_channels || index.reg;
- should_fill &= bi_clause_mark_fill(ctx, block, clause,
+ should_fill &= bi_instr_mark_fill(ctx, block, instr,
index, &tmp);
if (should_fill) {
- bi_fill_src(&_b, index, tmp, offset, clause,
+ bi_fill_src(&_b, index, tmp, offset, instr,
block, channels);
}
}
/* Has a message-passing instruction already been assigned? */
bool message;
- /* Indices already read, this needs to be tracked to avoid hazards
+ /* Indices already accessed; this must be tracked to avoid hazards
* around message-passing instructions */
- unsigned read_count;
- bi_index reads[BI_MAX_SRCS * 16];
+ unsigned access_count;
+ bi_index accesses[(BI_MAX_SRCS + 1) * 16];
unsigned tuple_count;
struct bi_const_state consts[8];
bi_instr *pinstr = tuple->add;
bi_builder b = bi_init_builder(ctx, bi_before_instr(pinstr));
- bi_instr *fma = bi_seg_add_to(&b, bi_word(pinstr->dest[0], 0),
- pinstr->src[0], pinstr->preserve_null, pinstr->seg);
+ bi_instr *fma = bi_seg_add_to(&b, pinstr->dest[0], pinstr->src[0],
+ pinstr->preserve_null, pinstr->seg);
pinstr->op = BI_OPCODE_SEG_ADD;
- pinstr->dest[0] = bi_word(pinstr->dest[0], 1);
pinstr->src[0] = pinstr->src[1];
pinstr->src[1] = bi_null();
+ assert(pinstr->dest[0].type == BI_INDEX_REGISTER);
+ pinstr->dest[0].value += 1;
+
return fma;
}
return (count == 1);
}
-/* Insert a clause wrapping a single instruction */
-
-bi_clause *
-bi_singleton(void *memctx, bi_instr *ins,
- bi_block *block,
- unsigned scoreboard_id,
- unsigned dependencies,
- uint64_t combined_constant,
- bool osrb)
-{
- bi_clause *u = rzalloc(memctx, bi_clause);
- u->tuple_count = 1;
-
- ASSERTED bool can_fma = bi_opcode_props[ins->op].fma;
- bool can_add = bi_opcode_props[ins->op].add;
- assert(can_fma || can_add);
-
- if (can_add)
- u->tuples[0].add = ins;
- else
- u->tuples[0].fma = ins;
-
- u->scoreboard_id = scoreboard_id;
- u->staging_barrier = osrb;
- u->dependencies = dependencies;
-
- if (ins->op == BI_OPCODE_ATEST)
- u->dependencies |= (1 << 6);
-
- if (ins->op == BI_OPCODE_BLEND)
- u->dependencies |= (1 << 6) | (1 << 7);
-
- /* Let's be optimistic, we'll fix up later */
- u->flow_control = BIFROST_FLOW_NBTB;
-
- assert(!ins->branch_target);
-
- if (combined_constant) {
- /* Clause in 64-bit, above in 32-bit */
- u->constant_count = 1;
- u->constants[0] = combined_constant;
- u->tuples[0].fau_idx = bi_constant_field(0) |
- (combined_constant & 0xF);
- }
-
- u->next_clause_prefetch = (ins->op != BI_OPCODE_JUMP);
- u->message_type = bi_message_type_for_instr(ins);
- u->message = u->message_type ? ins : NULL;
- u->block = block;
-
- return u;
-}
-
/* Scheduler predicates */
ASSERTED static bool
{
bi_foreach_src(add, s) {
bi_index src = add->src[s];
- unsigned count = bi_count_read_registers(add, s);
- if (!bi_is_equiv(fma, src))
+ if (src.type != BI_INDEX_REGISTER)
continue;
- /* fma \in [src, src + src_count) */
- if (!(fma.offset >= src.offset && fma.offset < src.offset + count))
- continue;
+ unsigned count = bi_count_read_registers(add, s);
+ bool read = false;
+
+ for (unsigned d = 0; d < count; ++d)
+ read |= bi_is_equiv(fma, bi_register(src.value + d));
- if (!bi_reads_t(add, s))
+ if (read && !bi_reads_t(add, s))
return true;
}
/* Message-passing instructions are not guaranteed write within the
* same clause (most likely they will not), so if a later instruction
- * in the clause reads from the destination, the message-passing
+ * in the clause accesses the destination, the message-passing
* instruction can't be scheduled */
- if (bi_opcode_props[instr->op].sr_write) {
- for (unsigned i = 0; i < clause->read_count; ++i) {
- if (bi_is_equiv(instr->dest[0], clause->reads[i]))
- return false;
+ if (bi_opcode_props[instr->op].sr_write && !bi_is_null(instr->dest[0])) {
+ unsigned nr = bi_count_write_registers(instr, 0);
+ assert(instr->dest[0].type == BI_INDEX_REGISTER);
+ unsigned reg = instr->dest[0].value;
+
+ for (unsigned i = 0; i < clause->access_count; ++i) {
+ bi_index idx = clause->accesses[i];
+ for (unsigned d = 0; d < nr; ++d) {
+ if (bi_is_equiv(bi_register(reg + d), idx))
+ return false;
+ }
+ }
+ }
+
+ if (bi_opcode_props[instr->op].sr_read && !bi_is_null(instr->src[0])) {
+ unsigned nr = bi_count_read_registers(instr, 0);
+ assert(instr->src[0].type == BI_INDEX_REGISTER);
+ unsigned reg = instr->src[0].value;
+
+ for (unsigned i = 0; i < clause->access_count; ++i) {
+ bi_index idx = clause->accesses[i];
+ for (unsigned d = 0; d < nr; ++d) {
+ if (bi_is_equiv(bi_register(reg + d), idx))
+ return false;
+ }
}
}
signed cost = bi_instr_cost(instr);
- if (cost <= best_cost) {
+ if (cost < best_cost) {
best_idx = i;
best_cost = cost;
}
bi_update_fau(clause, tuple, instr, fma, true);
/* TODO: maybe opt a bit? or maybe doesn't matter */
- assert(clause->read_count + BI_MAX_SRCS <= ARRAY_SIZE(clause->reads));
- memcpy(clause->reads + clause->read_count, instr->src, sizeof(instr->src));
- clause->read_count += BI_MAX_SRCS;
+ assert(clause->access_count + BI_MAX_SRCS + 1 <= ARRAY_SIZE(clause->accesses));
+ memcpy(clause->accesses + clause->access_count, instr->src, sizeof(instr->src));
+ clause->access_count += BI_MAX_SRCS;
+ clause->accesses[clause->access_count++] = instr->dest[0];
if (bi_writes_reg(instr))
tuple->reg.nr_writes++;
/* Update state to reflect taking the instruction */
bi_instr *instr = st.instructions[idx];
+
BITSET_CLEAR(st.worklist, idx);
bi_update_worklist(st, idx);
bi_pop_instr(clause, tuple, instr, fma);
return true;
}
-static void
-bi_lower_fau(bi_context *ctx, bi_block *block)
+void
+bi_lower_fau(bi_context *ctx)
{
- bi_builder b = bi_init_builder(ctx, bi_after_block(ctx->current_block));
-
- bi_foreach_instr_in_block_safe(block, _ins) {
- bi_instr *ins = (bi_instr *) _ins;
+ bi_foreach_instr_global_safe(ctx, ins) {
+ bi_builder b = bi_init_builder(ctx, bi_before_instr(ins));
uint32_t constants[2];
unsigned cwords = 0;
bi_foreach_src(ins, s) {
if (bi_check_fau_src(ins, s, constants, &cwords, &fau)) continue;
- b.cursor = bi_before_instr(ins);
bi_index copy = bi_mov_i32(&b, ins->src[s]);
ins->src[s] = bi_replace_index(ins->src[s], copy);
}
{
bi_foreach_block(ctx, block) {
bi_block *bblock = (bi_block *) block;
- bi_lower_fau(ctx, bblock);
bi_schedule_block(ctx, bblock);
}
- bi_opt_dead_code_eliminate(ctx, true);
+ bi_opt_dce_post_ra(ctx);
}
#ifndef NDEBUG
bi_opt_push_ubo(ctx);
bi_opt_constant_fold(ctx);
bi_opt_copy_prop(ctx);
- bi_opt_dead_code_eliminate(ctx, false);
+ bi_opt_dead_code_eliminate(ctx);
bi_foreach_block(ctx, _block) {
bi_block *block = (bi_block *) _block;
if (bifrost_debug & BIFROST_DBG_SHADERS && !skip_internal)
bi_print_shader(ctx, stdout);
+ bi_lower_fau(ctx);
+ bi_register_allocate(ctx);
+ if (bifrost_debug & BIFROST_DBG_SHADERS && !skip_internal)
+ bi_print_shader(ctx, stdout);
bi_schedule(ctx);
bi_assign_scoreboard(ctx);
- bi_register_allocate(ctx);
if (bifrost_debug & BIFROST_DBG_SHADERS && !skip_internal)
bi_print_shader(ctx, stdout);
/* If true, uses clauses; if false, uses instructions */
bool scheduled;
struct list_head clauses; /* list of bi_clause */
+
+ /* Post-RA liveness */
+ uint64_t reg_live_in, reg_live_out;
} bi_block;
typedef struct {
/* BIR passes */
void bi_opt_copy_prop(bi_context *ctx);
-void bi_opt_dead_code_eliminate(bi_context *ctx, bool soft);
+void bi_opt_dead_code_eliminate(bi_context *ctx);
+void bi_opt_dce_post_ra(bi_context *ctx);
void bi_opt_push_ubo(bi_context *ctx);
void bi_opt_constant_fold(bi_context *ctx);
void bi_lower_swizzle(bi_context *ctx);
+void bi_lower_fau(bi_context *ctx);
void bi_schedule(bi_context *ctx);
void bi_assign_scoreboard(bi_context *ctx);
void bi_register_allocate(bi_context *ctx);
int bi_test_packing(void);
int bi_test_packing_formats(void);
-bi_clause *
-bi_singleton(void *memctx, bi_instr *ins,
- bi_block *block,
- unsigned scoreboard_id,
- unsigned dependencies,
- uint64_t combined_constant,
- bool osrb);
-
/* Liveness */
void bi_compute_liveness(bi_context *ctx);