From c81c022e666d13ff5a38895295f068f1469a4b62 Mon Sep 17 00:00:00 2001
From: Alyssa Rosenzweig
Date: Thu, 23 Dec 2021 11:09:42 -0500
Subject: [PATCH] pan/bi: Implement basic scoreboarding pass

Extend our existing bi_scoreboard infrastructure with a simple data flow
analysis pass that calculates which dependency slots need waiting. We still lack
a heuristic for selecting dependency slots.

Signed-off-by: Alyssa Rosenzweig
Part-of:
---
 src/panfrost/bifrost/bi_pack.c       |   6 +
 src/panfrost/bifrost/bi_schedule.c   |   3 -
 src/panfrost/bifrost/bi_scoreboard.c | 284 +++++++++++++++++++++++++++++++----
 3 files changed, 264 insertions(+), 29 deletions(-)

diff --git a/src/panfrost/bifrost/bi_pack.c b/src/panfrost/bifrost/bi_pack.c
index c5dbdeb..58a8cb8 100644
--- a/src/panfrost/bifrost/bi_pack.c
+++ b/src/panfrost/bifrost/bi_pack.c
@@ -36,6 +36,12 @@ bi_pack_header(bi_clause *clause, bi_clause *next_1, bi_clause *next_2)
         unsigned dependency_wait = next_1 ? next_1->dependencies : 0;
         dependency_wait |= next_2 ? next_2->dependencies : 0;
 
+        /* Signal barriers (slot #7) immediately. This is not optimal but good
+         * enough. Doing better requires extending the IR and scheduler.
+         */
+        if (clause->message_type == BIFROST_MESSAGE_BARRIER)
+                dependency_wait |= BITFIELD_BIT(7);
+
         bool staging_barrier = next_1 ? next_1->staging_barrier : false;
         staging_barrier |= next_2 ? next_2->staging_barrier : 0;
 
diff --git a/src/panfrost/bifrost/bi_schedule.c b/src/panfrost/bifrost/bi_schedule.c
index 33e3211..c0901eb 100644
--- a/src/panfrost/bifrost/bi_schedule.c
+++ b/src/panfrost/bifrost/bi_schedule.c
@@ -1860,9 +1860,6 @@ bi_schedule_clause(bi_context *ctx, bi_block *block, struct bi_worklist st, uint
         clause->next_clause_prefetch = !last || (last->op != BI_OPCODE_JUMP);
         clause->block = block;
 
-        /* TODO: scoreboard assignment post-sched */
-        clause->dependencies |= (1 << 0);
-
         /* We emit in reverse and emitted to the back of the tuples array, so
          * move it up front for easy indexing */
         memmove(clause->tuples,
diff --git a/src/panfrost/bifrost/bi_scoreboard.c b/src/panfrost/bifrost/bi_scoreboard.c
index 484f8a7..dd5054d 100644
--- a/src/panfrost/bifrost/bi_scoreboard.c
+++ b/src/panfrost/bifrost/bi_scoreboard.c
@@ -54,55 +54,287 @@
  */
 
 #define BI_NUM_GENERAL_SLOTS 6
+#define BI_NUM_SLOTS 8
+#define BI_NUM_REGISTERS 64
+#define BI_SLOT_SERIAL 0 /* arbitrary */
 
-/* A model for the state of the scoreboard */
+/*
+ * Due to the crude scoreboarding we do, we need to serialize varying loads and
+ * memory access. Identify these instructions here.
+ */
+static bool
+bi_should_serialize(bi_instr *I)
+{
+        /* Although nominally on the attribute unit, image loads have the same
+         * coherency requirements as general memory loads. Serialize them for
+         * now until we can do something more clever.
+         */
+        if (I->op == BI_OPCODE_LD_ATTR_TEX)
+                return true;
 
-struct bi_scoreboard_state {
-        /* TODO: what do we track here for a heuristic? */
-};
+        switch (bi_opcode_props[I->op].message) {
+        case BIFROST_MESSAGE_VARYING:
+        case BIFROST_MESSAGE_LOAD:
+        case BIFROST_MESSAGE_STORE:
+        case BIFROST_MESSAGE_ATOMIC:
+                return true;
+        default:
+                return false;
+        }
+}
 
 /* Given a scoreboard model, choose a slot for a clause wrapping a given
  * message passing instruction. No side effects. */
 
 static unsigned
-bi_choose_scoreboard_slot(struct bi_scoreboard_state *st, bi_instr *message)
+bi_choose_scoreboard_slot(bi_instr *message)
 {
-        /* A clause that does not produce a message must use slot #0 */
-        if (!message)
-                return 0;
-
-        switch (message->op) {
         /* ATEST, ZS_EMIT must be issued with slot #0 */
-        case BI_OPCODE_ATEST:
-        case BI_OPCODE_ZS_EMIT:
+        if (message->op == BI_OPCODE_ATEST || message->op == BI_OPCODE_ZS_EMIT)
                 return 0;
 
         /* BARRIER must be issued with slot #7 */
-        case BI_OPCODE_BARRIER:
+        if (message->op == BI_OPCODE_BARRIER)
                 return 7;
 
-        default:
-                break;
-        }
+        /* For now, keep serialization easy: use a single shared slot */
+        if (bi_should_serialize(message))
+                return BI_SLOT_SERIAL;
 
-        /* TODO: Use a heuristic */
         return 0;
 }
 
+/*
+ * Returns a bitmask of the registers read by I, with bit N standing for
+ * register N. If staging_only is set, the mask is empty unless the opcode is
+ * flagged sr_read, and the walk over the sources stops after the first one.
+ */
+static uint64_t
+bi_read_mask(bi_instr *I, bool staging_only)
+{
+        uint64_t mask = 0;
+
+        if (staging_only && !bi_opcode_props[I->op].sr_read)
+                return mask;
+
+        bi_foreach_src(I, s) {
+                if (I->src[s].type == BI_INDEX_REGISTER) {
+                        unsigned reg = I->src[s].value;
+                        unsigned count = bi_count_read_registers(I, s);
+
+                        mask |= (BITFIELD64_MASK(count) << reg);
+                }
+
+                if (staging_only)
+                        break;
+        }
+
+        return mask;
+}
+
+/*
+ * Returns a bitmask of the registers written by I. Null destinations are
+ * skipped; any other destination is asserted to be a register.
+ */
+static uint64_t
+bi_write_mask(bi_instr *I)
+{
+        uint64_t mask = 0;
+
+        bi_foreach_dest(I, d) {
+                if (bi_is_null(I->dest[d])) continue;
+
+                assert(I->dest[d].type == BI_INDEX_REGISTER);
+
+                unsigned reg = I->dest[d].value;
+                unsigned count = bi_count_write_registers(I, d);
+
+                mask |= (BITFIELD64_MASK(count) << reg);
+        }
+
+        /* Instructions like AXCHG.i32 unconditionally both read and write
+         * staging registers. Even if we discard the result, the write still
+         * happens logically and needs to be included in our calculations.
+         * Obscurely, ATOM_CX is sr_write but can ignore the staging register in
+         * certain circumstances; this does not require consideration.
+         */
+        if (bi_opcode_props[I->op].sr_write && bi_is_null(I->dest[0]) &&
+            !bi_is_null(I->src[0])) {
+
+                unsigned reg = I->src[0].value;
+                unsigned count = bi_count_write_registers(I, 0);
+
+                mask |= (BITFIELD64_MASK(count) << reg);
+        }
+
+        return mask;
+}
+
+/* Update the scoreboard model to assign an instruction to a given slot */
+
+static void
+bi_push_clause(struct bi_scoreboard_state *st, bi_clause *clause)
+{
+        bi_instr *I = clause->message;
+        unsigned slot = clause->scoreboard_id;
+
+        if (!I)
+                return;
+
+        st->read[slot] |= bi_read_mask(I, true);
+
+        if (bi_opcode_props[I->op].sr_write)
+                st->write[slot] |= bi_write_mask(I);
+}
+
+/* Adds a dependency on each slot writing any specified register */
+
+static void
+bi_depend_on_writers(bi_clause *clause, struct bi_scoreboard_state *st, uint64_t regmask)
+{
+        for (unsigned slot = 0; slot < ARRAY_SIZE(st->write); ++slot) {
+                if (!(st->write[slot] & regmask))
+                        continue;
+
+                st->write[slot] = 0;
+                st->read[slot] = 0;
+
+                clause->dependencies |= BITFIELD_BIT(slot);
+        }
+}
+
+/* Sets the staging barrier if any slot is still reading a specified register,
+ * clearing the pending reads of each such slot */
+
+static void
+bi_set_staging_barrier(bi_clause *clause, struct bi_scoreboard_state *st, uint64_t regmask)
+{
+        for (unsigned slot = 0; slot < ARRAY_SIZE(st->read); ++slot) {
+                if (!(st->read[slot] & regmask))
+                        continue;
+
+                st->read[slot] = 0;
+                clause->staging_barrier = true;
+        }
+}
+
+/* Sets the dependencies for a given clause, updating the model */
+
+static void
+bi_set_dependencies(bi_block *block, bi_clause *clause, struct bi_scoreboard_state *st)
+{
+        bi_foreach_instr_in_clause(block, clause, I) {
+                uint64_t read = bi_read_mask(I, false);
+                uint64_t written = bi_write_mask(I);
+
+                /* Read-after-write; write-after-write */
+                bi_depend_on_writers(clause, st, read | written);
+
+                /* Write-after-read */
+                bi_set_staging_barrier(clause, st, written);
+        }
+
+        /* LD_VAR instructions must be serialized per-quad. Just always depend
+         * on any LD_VAR instructions. This isn't optimal, but doing better
+         * requires divergence-aware data flow analysis.
+         *
+         * Similarly, memory loads/stores need to be synchronized. For now,
+         * force them to be serialized. This is not optimal.
+         */
+        if (clause->message && bi_should_serialize(clause->message))
+                clause->dependencies |= BITFIELD_BIT(BI_SLOT_SERIAL);
+
+        /* Barriers must wait on all slots to flush existing work. It might be
+         * possible to skip this with more information about the barrier. For
+         * now, be conservative.
+         */
+        if (clause->message && clause->message->op == BI_OPCODE_BARRIER)
+                clause->dependencies |= BITFIELD_MASK(BI_NUM_GENERAL_SLOTS);
+}
+
+/*
+ * Runs one data flow step on a block: merges the predecessors' outgoing state
+ * into scoreboard_in, walks the clauses to set their dependencies, and stores
+ * the result in scoreboard_out. Returns whether scoreboard_out changed.
+ */
+static bool
+scoreboard_block_update(bi_block *blk)
+{
+        /* pending_in[s] = sum { p in pred[s] } ( pending_out[p] ) */
+        bi_foreach_predecessor(blk, pred) {
+                for (unsigned i = 0; i < BI_NUM_SLOTS; ++i) {
+                        blk->scoreboard_in.read[i] |= pred->scoreboard_out.read[i];
+                        blk->scoreboard_in.write[i] |= pred->scoreboard_out.write[i];
+                }
+        }
+
+        struct bi_scoreboard_state state = blk->scoreboard_in;
+
+        /* Assign locally */
+
+        bi_foreach_clause_in_block(blk, clause) {
+                bi_set_dependencies(blk, clause, &state);
+                bi_push_clause(&state, clause);
+        }
+
+        /* To figure out progress, diff scoreboard_out. A single memcmp covers
+         * every slot, so no per-slot loop is needed.
+         */
+        bool progress = memcmp(&state, &blk->scoreboard_out, sizeof(state)) != 0;
+
+        blk->scoreboard_out = state;
+
+        return progress;
+}
+
+/*
+ * Assigns a scoreboard slot to every clause that sends a message, then runs
+ * the forward data flow analysis from the start block, driven by a work list,
+ * to calculate the dependencies of each clause.
+ */
 void
 bi_assign_scoreboard(bi_context *ctx)
 {
-        struct bi_scoreboard_state st = {};
-
-        /* Assign slots */
+        /* First, assign slots. */
         bi_foreach_block(ctx, block) {
                 bi_foreach_clause_in_block(block, clause) {
-                        unsigned slot = bi_choose_scoreboard_slot(&st, clause->message);
-                        clause->scoreboard_id = slot;
-
-                        bi_clause *next = bi_next_clause(ctx, block, clause);
-                        if (next)
-                                next->dependencies |= (1 << slot);
+                        if (clause->message) {
+                                unsigned slot = bi_choose_scoreboard_slot(clause->message);
+                                clause->scoreboard_id = slot;
+                        }
                 }
         }
+
+        /* Next, perform forward data flow analysis to calculate dependencies */
+        /* Set of bi_block */
+        struct set *work_list = _mesa_set_create(NULL,
+                        _mesa_hash_pointer,
+                        _mesa_key_pointer_equal);
+
+        struct set *visited = _mesa_set_create(NULL,
+                        _mesa_hash_pointer,
+                        _mesa_key_pointer_equal);
+
+        /* Initialize the work list with the first block */
+        struct set_entry *cur;
+
+        cur = _mesa_set_add(work_list, bi_start_block(&ctx->blocks));
+
+        /* Iterate the work list */
+        do {
+                bi_block *blk = (struct bi_block *) cur->key;
+                _mesa_set_remove(work_list, cur);
+
+                bool progress = scoreboard_block_update(blk);
+
+                if (progress || !_mesa_set_search(visited, blk)) {
+                        bi_foreach_successor(blk, succ)
+                                _mesa_set_add(work_list, succ);
+                }
+
+                _mesa_set_add(visited, blk);
+        } while((cur = _mesa_set_next_entry(work_list, NULL)) != NULL);
+
+        _mesa_set_destroy(visited, NULL);
+        _mesa_set_destroy(work_list, NULL);
 }
-- 
2.7.4