From 8dfc6ee317dd01016a26f1fb21829b89277e59d7 Mon Sep 17 00:00:00 2001 From: Eric Anholt Date: Fri, 20 Jul 2018 12:05:57 -0700 Subject: [PATCH] v3d: Rotate through registers to improve post-RA scheduling options. Similarly to VC4's implementation, by not picking r0 immediately upon freeing it, we give the scheduler more of a chance to fit later writes in earlier. I'm not clear on whether there's any real cost to picking phys over accumulators, so keep that behavior for now. shader-db: total instructions in shared programs: 96831 -> 95669 (-1.20%) instructions in affected programs: 77254 -> 76092 (-1.50%) --- src/broadcom/compiler/vir_register_allocate.c | 45 +++++++++++++++++++++++++++ 1 file changed, 45 insertions(+) diff --git a/src/broadcom/compiler/vir_register_allocate.c b/src/broadcom/compiler/vir_register_allocate.c index 4ec5f23..aa5e213 100644 --- a/src/broadcom/compiler/vir_register_allocate.c +++ b/src/broadcom/compiler/vir_register_allocate.c @@ -238,6 +238,43 @@ v3d_spill_reg(struct v3d_compile *c, int spill_temp) BITSET_CLEAR(c->spillable, i); } +struct v3d_ra_select_callback_data { + uint32_t next_acc; + uint32_t next_phys; +}; + +static unsigned int +v3d_ra_select_callback(struct ra_graph *g, BITSET_WORD *regs, void *data) +{ + struct v3d_ra_select_callback_data *v3d_ra = data; + + /* Choose an accumulator if possible (I think it's lower power than + * phys regs), but round-robin through them to give post-RA + * instruction scheduling more options. 
+ */ + for (int i = 0; i < ACC_COUNT; i++) { + int acc_off = (v3d_ra->next_acc + i) % ACC_COUNT; + int acc = ACC_INDEX + acc_off; + + if (BITSET_TEST(regs, acc)) { + v3d_ra->next_acc = acc_off + 1; + return acc; + } + } + + for (int i = 0; i < PHYS_COUNT; i++) { + int phys_off = (v3d_ra->next_phys + i) % PHYS_COUNT; + int phys = PHYS_INDEX + phys_off; + + if (BITSET_TEST(regs, phys)) { + v3d_ra->next_phys = phys_off + 1; + return phys; + } + } + + unreachable("RA must pass us at least one possible reg."); +} + bool vir_init_reg_sets(struct v3d_compiler *compiler) { @@ -309,6 +346,13 @@ v3d_register_allocate(struct v3d_compile *c, bool *spilled) struct qpu_reg *temp_registers = calloc(c->num_temps, sizeof(*temp_registers)); int acc_nodes[ACC_COUNT]; + struct v3d_ra_select_callback_data callback_data = { + .next_acc = 0, + /* Start at RF3, to try to keep the TLB writes from using + * RF0-2. + */ + .next_phys = 3, + }; *spilled = false; @@ -328,6 +372,7 @@ v3d_register_allocate(struct v3d_compile *c, bool *spilled) struct ra_graph *g = ra_alloc_interference_graph(c->compiler->regs, c->num_temps + ARRAY_SIZE(acc_nodes)); + ra_set_select_reg_callback(g, v3d_ra_select_callback, &callback_data); /* Make some fixed nodes for the accumulators, which we will need to * interfere with when ops have implied r3/r4 writes or for the thread -- 2.7.4