From 599662205020df1bc919d48a9809251dbd366714 Mon Sep 17 00:00:00 2001 From: Alyssa Rosenzweig Date: Wed, 20 Jan 2021 21:02:12 -0500 Subject: [PATCH] pan/bi: Calculate dependency graph when bundling MIME-Version: 1.0 Content-Type: text/plain; charset=utf8 Content-Transfer-Encoding: 8bit Code is ported from Midgard, modified to be scalar, post-RA, and to put the arrays on the worklist instead of the instruction to save memory. This enables out-of-order scheduling. total tuples in shared programs: 128691 -> 125304 (-2.63%) tuples in affected programs: 114091 -> 110704 (-2.97%) helped: 844 HURT: 377 helped stats (abs) min: 1.0 max: 150.0 x̄: 4.88 x̃: 3 helped stats (rel) min: 0.30% max: 26.42% x̄: 5.56% x̃: 4.35% HURT stats (abs) min: 1.0 max: 8.0 x̄: 1.94 x̃: 1 HURT stats (rel) min: 0.20% max: 33.33% x̄: 6.84% x̃: 3.23% 95% mean confidence interval for tuples value: -3.16 -2.38 95% mean confidence interval for tuples %-change: -2.19% -1.27% Tuples are helped. total clauses in shared programs: 27579 -> 26059 (-5.51%) clauses in affected programs: 20606 -> 19086 (-7.38%) helped: 941 HURT: 39 helped stats (abs) min: 1.0 max: 21.0 x̄: 1.66 x̃: 1 helped stats (rel) min: 0.69% max: 44.44% x̄: 10.48% x̃: 9.09% HURT stats (abs) min: 1.0 max: 2.0 x̄: 1.15 x̃: 1 HURT stats (rel) min: 1.89% max: 10.00% x̄: 4.73% x̃: 4.55% 95% mean confidence interval for clauses value: -1.63 -1.47 95% mean confidence interval for clauses %-change: -10.27% -9.48% Clauses are helped. total cycles in shared programs: 12262.54 -> 12154.79 (-0.88%) cycles in affected programs: 2210.54 -> 2102.79 (-4.87%) helped: 374 HURT: 56 helped stats (abs) min: 0.041665999999999315 max: 6.25 x̄: 0.30 x̃: 0 helped stats (rel) min: 0.42% max: 26.00% x̄: 6.90% x̃: 7.14% HURT stats (abs) min: 0.041665999999999315 max: 0.5833319999999986 x̄: 0.11 x̃: 0 HURT stats (rel) min: 0.16% max: 100.00% x̄: 55.17% x̃: 50.00% 95% mean confidence interval for cycles value: -0.29 -0.21 95% mean confidence interval for cycles %-change: -1.37% 3.73% Inconclusive result (%-change mean confidence interval includes 0). total arith in shared programs: 4852.29 -> 4658.13 (-4.00%) arith in affected programs: 4525.17 -> 4331 (-4.29%) helped: 1112 HURT: 166 helped stats (abs) min: 0.041665999999999315 max: 6.25 x̄: 0.19 x̃: 0 helped stats (rel) min: 0.42% max: 33.33% x̄: 6.59% x̃: 5.36% HURT stats (abs) min: 0.041665999999999315 max: 0.5833319999999986 x̄: 0.07 x̃: 0 HURT stats (rel) min: 0.16% max: 100.00% x̄: 25.05% x̃: 2.40% 95% mean confidence interval for arith value: -0.17 -0.14 95% mean confidence interval for arith %-change: -3.44% -1.51% Arith are helped. total quadwords in shared programs: 117141 -> 111394 (-4.91%) quadwords in affected programs: 104390 -> 98643 (-5.51%) helped: 1245 HURT: 76 helped stats (abs) min: 1.0 max: 69.0 x̄: 4.74 x̃: 4 helped stats (rel) min: 0.28% max: 35.00% x̄: 7.88% x̃: 6.45% HURT stats (abs) min: 1.0 max: 8.0 x̄: 2.01 x̃: 1 HURT stats (rel) min: 0.20% max: 10.00% x̄: 3.52% x̃: 4.25% 95% mean confidence interval for quadwords value: -4.61 -4.09 95% mean confidence interval for quadwords %-change: -7.56% -6.88% Quadwords are helped. Signed-off-by: Alyssa Rosenzweig Part-of: --- src/panfrost/bifrost/bi_schedule.c | 181 +++++++++++++++++++++++++++++++-- src/panfrost/bifrost/bifrost.h | 1 + src/panfrost/bifrost/bifrost_compile.c | 3 +- 3 files changed, 177 insertions(+), 8 deletions(-) diff --git a/src/panfrost/bifrost/bi_schedule.c b/src/panfrost/bifrost/bi_schedule.c index e409917..468a986 100644 --- a/src/panfrost/bifrost/bi_schedule.c +++ b/src/panfrost/bifrost/bi_schedule.c @@ -38,6 +38,13 @@ struct bi_worklist { /* Bitset of instructions in the block ready for scheduling */ BITSET_WORD *worklist; + + /* The backwards dependency graph. nr_dependencies is the number of + * unscheduled instructions that must still be scheduled after (before) + * this instruction. dependents are which instructions need to be + * scheduled before (after) this instruction. */ + unsigned *dep_counts; + BITSET_WORD **dependents; }; /* State of a single tuple and clause under construction */ @@ -149,6 +156,138 @@ bi_supports_dtsel(bi_instr *ins) } } +/* Adds an edge to the dependency graph */ + +static void +bi_push_dependency(unsigned parent, unsigned child, + BITSET_WORD **dependents, unsigned *dep_counts) +{ + if (!BITSET_TEST(dependents[parent], child)) { + BITSET_SET(dependents[parent], child); + dep_counts[child]++; + } +} + +static void +add_dependency(struct util_dynarray *table, unsigned index, unsigned child, + BITSET_WORD **dependents, unsigned *dep_counts) +{ + assert(index < 64); + util_dynarray_foreach(table + index, unsigned, parent) + bi_push_dependency(*parent, child, dependents, dep_counts); +} + +static void +mark_access(struct util_dynarray *table, unsigned index, unsigned parent) +{ + assert(index < 64); + util_dynarray_append(&table[index], unsigned, parent); +} + +static bool +bi_is_sched_barrier(bi_instr *I) +{ + switch (I->op) { + case BI_OPCODE_BARRIER: + case BI_OPCODE_DISCARD_F32: + return true; + default: + return false; + } +} + +static void +bi_create_dependency_graph(struct bi_worklist st, bool inorder) +{ + struct util_dynarray last_read[64], last_write[64]; + + for (unsigned i = 0; i < 64; ++i) { + util_dynarray_init(&last_read[i], NULL); + util_dynarray_init(&last_write[i], NULL); + } + + /* Initialize dependency graph */ + for (unsigned i = 0; i < st.count; ++i) { + st.dependents[i] = + calloc(BITSET_WORDS(st.count), sizeof(BITSET_WORD)); + + st.dep_counts[i] = 0; + } + + unsigned prev_msg = ~0; + + /* Populate dependency graph */ + for (signed i = st.count - 1; i >= 0; --i) { + bi_instr *ins = st.instructions[i]; + + bi_foreach_src(ins, s) { + if (ins->src[s].type != BI_INDEX_REGISTER) continue; + unsigned count = bi_count_read_registers(ins, s); + + for (unsigned c = 0; c < count; ++c) + add_dependency(last_write, ins->src[s].value + c, i, st.dependents, st.dep_counts); + } + + /* Keep message-passing ops in order. (This pass only cares + * about bundling; reordering of message-passing instructions + * happens during earlier scheduling.) */ + + if (bi_message_type_for_instr(ins)) { + if (prev_msg != ~0) + bi_push_dependency(prev_msg, i, st.dependents, st.dep_counts); + + prev_msg = i; + } + + /* Handle schedule barriers, adding All the deps */ + if (inorder || bi_is_sched_barrier(ins)) { + for (unsigned j = 0; j < st.count; ++j) { + if (i == j) continue; + + bi_push_dependency(MAX2(i, j), MIN2(i, j), + st.dependents, st.dep_counts); + } + } + + bi_foreach_dest(ins, d) { + if (ins->dest[d].type != BI_INDEX_REGISTER) continue; + unsigned dest = ins->dest[d].value; + + unsigned count = bi_count_write_registers(ins, d); + + for (unsigned c = 0; c < count; ++c) { + add_dependency(last_read, dest + c, i, st.dependents, st.dep_counts); + add_dependency(last_write, dest + c, i, st.dependents, st.dep_counts); + mark_access(last_write, dest + c, i); + } + } + + bi_foreach_src(ins, s) { + if (ins->src[s].type != BI_INDEX_REGISTER) continue; + + unsigned count = bi_count_read_registers(ins, s); + + for (unsigned c = 0; c < count; ++c) + mark_access(last_read, ins->src[s].value + c, i); + } + } + + /* If there is a branch, all instructions depend on it, as interblock + * execution must be purely in-order */ + + bi_instr *last = st.instructions[st.count - 1]; + if (last->branch_target || last->op == BI_OPCODE_JUMP) { + for (signed i = st.count - 2; i >= 0; --i) + bi_push_dependency(st.count - 1, i, st.dependents, st.dep_counts); + } + + /* Free the intermediate structures */ + for (unsigned i = 0; i < 64; ++i) { + util_dynarray_fini(&last_read[i]); + util_dynarray_fini(&last_write[i]); + } +} + /* Scheduler pseudoinstruction lowerings to enable instruction pairings. * Currently only support CUBEFACE -> *CUBEFACE1/+CUBEFACE2 */ @@ -275,14 +414,23 @@ bi_flatten_block(bi_block *block, unsigned *len) */ static struct bi_worklist -bi_initialize_worklist(bi_block *block) +bi_initialize_worklist(bi_block *block, bool inorder) { struct bi_worklist st = { }; st.instructions = bi_flatten_block(block, &st.count); - if (st.count) { - st.worklist = calloc(BITSET_WORDS(st.count), sizeof(BITSET_WORD)); - BITSET_SET(st.worklist, st.count - 1); + if (!st.count) + return st; + + st.dependents = calloc(st.count, sizeof(st.dependents[0])); + st.dep_counts = calloc(st.count, sizeof(st.dep_counts[0])); + + bi_create_dependency_graph(st, inorder); + st.worklist = calloc(BITSET_WORDS(st.count), sizeof(BITSET_WORD)); + + for (unsigned i = 0; i < st.count; ++i) { + if (st.dep_counts[i] == 0) + BITSET_SET(st.worklist, i); } return st; @@ -291,6 +439,8 @@ bi_initialize_worklist(bi_block *block) static void bi_free_worklist(struct bi_worklist st) { + free(st.dep_counts); + free(st.dependents); free(st.instructions); free(st.worklist); } @@ -298,8 +448,24 @@ bi_free_worklist(struct bi_worklist st) static void bi_update_worklist(struct bi_worklist st, unsigned idx) { - if (idx >= 1) - BITSET_SET(st.worklist, idx - 1); + assert(st.dep_counts[idx] == 0); + + if (!st.dependents[idx]) + return; + + /* Iterate each dependent to remove one dependency (`done`), + * adding dependents to the worklist where possible. */ + + unsigned i; + BITSET_FOREACH_SET(i, st.dependents[idx], st.count) { + assert(st.dep_counts[i] != 0); + unsigned new_deps = --st.dep_counts[i]; + + if (new_deps == 0) + BITSET_SET(st.worklist, i); + } + + free(st.dependents[idx]); } /* To work out the back-to-back flag, we need to detect branches and @@ -1568,7 +1734,8 @@ bi_schedule_block(bi_context *ctx, bi_block *block) list_inithead(&block->clauses); /* Copy list to dynamic array */ - struct bi_worklist st = bi_initialize_worklist(block); + struct bi_worklist st = bi_initialize_worklist(block, + bifrost_debug & BIFROST_DBG_INORDER); if (!st.count) { bi_free_worklist(st); diff --git a/src/panfrost/bifrost/bifrost.h b/src/panfrost/bifrost/bifrost.h index be3a2d0..436404f 100644 --- a/src/panfrost/bifrost/bifrost.h +++ b/src/panfrost/bifrost/bifrost.h @@ -36,6 +36,7 @@ #define BIFROST_DBG_VERBOSE 0x0008 #define BIFROST_DBG_INTERNAL 0x0010 #define BIFROST_DBG_NOSCHED 0x0020 +#define BIFROST_DBG_INORDER 0x0040 extern int bifrost_debug; diff --git a/src/panfrost/bifrost/bifrost_compile.c b/src/panfrost/bifrost/bifrost_compile.c index 3b57794..594d02d 100644 --- a/src/panfrost/bifrost/bifrost_compile.c +++ b/src/panfrost/bifrost/bifrost_compile.c @@ -43,7 +43,8 @@ static const struct debug_named_value bifrost_debug_options[] = { {"shaderdb", BIFROST_DBG_SHADERDB, "Print statistics"}, {"verbose", BIFROST_DBG_VERBOSE, "Disassemble verbosely"}, {"internal", BIFROST_DBG_INTERNAL, "Dump even internal shaders"}, - {"nosched", BIFROST_DBG_NOSCHED, "Force trivial scheduling"}, + {"nosched", BIFROST_DBG_NOSCHED, "Force trivial bundling"}, + {"inorder", BIFROST_DBG_INORDER, "Force in-order bundling"}, DEBUG_NAMED_VALUE_END }; -- 2.7.4