From 023f27fada5abea779757b441eda9772d255ac94 Mon Sep 17 00:00:00 2001 From: Alyssa Rosenzweig Date: Wed, 26 Oct 2022 11:23:51 -0400 Subject: [PATCH] agx: Coalesce collects when possible MIME-Version: 1.0 Content-Type: text/plain; charset=utf8 Content-Transfer-Encoding: 8bit Track collects and use them as affinities when choosing registers. On glmark2: total instructions in shared programs: 5498 -> 5388 (-2.00%) instructions in affected programs: 2748 -> 2638 (-4.00%) helped: 31 HURT: 0 helped stats (abs) min: 1.0 max: 12.0 x̄: 3.55 x̃: 3 helped stats (rel) min: 0.09% max: 57.14% x̄: 10.58% x̃: 5.97% 95% mean confidence interval for instructions value: -4.61 -2.49 95% mean confidence interval for instructions %-change: -15.16% -6.00% Instructions are helped. total bytes in shared programs: 37280 -> 36620 (-1.77%) bytes in affected programs: 18880 -> 18220 (-3.50%) helped: 31 HURT: 0 helped stats (abs) min: 6.0 max: 72.0 x̄: 21.29 x̃: 18 helped stats (rel) min: 0.07% max: 48.98% x̄: 9.16% x̃: 5.17% 95% mean confidence interval for bytes value: -27.64 -14.94 95% mean confidence interval for bytes %-change: -13.03% -5.29% Bytes are helped. total halfregs in shared programs: 1267 -> 1279 (0.95%) halfregs in affected programs: 37 -> 49 (32.43%) helped: 0 HURT: 9 HURT stats (abs) min: 1.0 max: 2.0 x̄: 1.33 x̃: 1 HURT stats (rel) min: 16.67% max: 66.67% x̄: 35.58% x̃: 28.57% 95% mean confidence interval for halfregs value: 0.95 1.72 95% mean confidence interval for halfregs %-change: 21.50% 49.67% Halfregs are HURT. 
Signed-off-by: Alyssa Rosenzweig
Part-of:
---
 src/asahi/compiler/agx_register_allocate.c | 118 +++++++++++++++++++++++++++--
 1 file changed, 113 insertions(+), 5 deletions(-)

diff --git a/src/asahi/compiler/agx_register_allocate.c b/src/asahi/compiler/agx_register_allocate.c
index 4269b85..c9cc320 100644
--- a/src/asahi/compiler/agx_register_allocate.c
+++ b/src/asahi/compiler/agx_register_allocate.c
@@ -34,6 +34,9 @@ struct ra_ctx {
    BITSET_WORD *visited;
    BITSET_WORD *used_regs;
 
+   /* For affinities */
+   agx_instr **src_to_collect;
+
    /* Maximum number of registers that RA is allowed to use */
    unsigned bound;
 };
@@ -139,6 +142,105 @@ assign_regs(struct ra_ctx *rctx, agx_index v, unsigned reg)
    BITSET_SET_RANGE(rctx->used_regs, reg, end);
 }
 
+/*
+ * Return the register at which the destination of `collect` would have to
+ * start for source `src` to already sit in its slot, given the register
+ * recorded for that source in ssa_to_reg. Returns ~0 if that start would lie
+ * below register 0.
+ */
+static unsigned
+affinity_base_of_collect(struct ra_ctx *rctx, agx_instr *collect, unsigned src)
+{
+   unsigned src_reg = rctx->ssa_to_reg[collect->src[src].value];
+   unsigned src_offset = src * agx_size_align_16(collect->src[src].size);
+
+   if (src_reg >= src_offset)
+      return src_reg - src_offset;
+   else
+      return ~0;
+}
+
+/*
+ * Choose the base register for destination `d` of `I`. Registers that line a
+ * collect up with its sources are preferred; otherwise any free range found
+ * by find_regs() is used.
+ */
+static unsigned
+pick_regs(struct ra_ctx *rctx, agx_instr *I, unsigned d)
+{
+   agx_index idx = I->dest[d];
+   assert(idx.type == AGX_INDEX_NORMAL);
+
+   unsigned count = agx_write_registers(I, d);
+   unsigned align = agx_size_align_16(idx.size);
+
+   /* Try to allocate collects compatibly with their sources */
+   if (I->op == AGX_OPCODE_COLLECT) {
+      agx_foreach_ssa_src(I, s) {
+         assert(BITSET_TEST(rctx->visited, I->src[s].value) &&
+                "registers assigned in an order compatible with dominance "
+                "and this is not a phi node, so we have assigned a register");
+
+         unsigned base = affinity_base_of_collect(rctx, I, s);
+         if (base >= rctx->bound || (base + count) > rctx->bound)
+            continue;
+
+         /* The base comes from a source's register rather than from `align`,
+          * so it may not satisfy the destination's alignment. Skip it then.
+          */
+         if (base % align)
+            continue;
+
+         if (!BITSET_TEST_RANGE(rctx->used_regs, base, base + count - 1))
+            return base;
+      }
+   }
+
+   /* Try to allocate sources of collects contiguously */
+   if (rctx->src_to_collect[idx.value] != NULL) {
+      agx_instr *collect = rctx->src_to_collect[idx.value];
+
+      assert(count == align && "collect sources are scalar");
+
+      /* Find our offset in the collect. If our source is repeated in the
+       * collect, this may not be unique. We arbitrarily choose the first.
+       */
+      unsigned our_source = ~0;
+      agx_foreach_ssa_src(collect, s) {
+         if (agx_is_equiv(collect->src[s], idx)) {
+            our_source = s;
+            break;
+         }
+      }
+
+      assert(our_source < collect->nr_srcs && "source must be in the collect");
+
+      /* See if we can allocate compatibly with any source of the collect */
+      agx_foreach_ssa_src(collect, s) {
+         if (!BITSET_TEST(rctx->visited, collect->src[s].value))
+            continue;
+
+         /* Determine where the collect should start relative to the source */
+         unsigned base = affinity_base_of_collect(rctx, collect, s);
+         if (base >= rctx->bound)
+            continue;
+
+         unsigned our_reg = base + (our_source * align);
+
+         /* Don't allocate past the end of the register file */
+         if ((our_reg + align) > rctx->bound)
+            continue;
+
+         /* If those registers are free, then choose them */
+         if (!BITSET_TEST_RANGE(rctx->used_regs, our_reg, our_reg + align - 1))
+            return our_reg;
+      }
+   }
+
+   /* Default to any contiguous sequence of registers */
+   return find_regs(rctx->used_regs, count, align, rctx->bound);
+}
+
 /** Assign registers to SSA values in a block. */
 
 static void
@@ -201,11 +303,7 @@ agx_ra_assign_local(struct ra_ctx *rctx)
        * because of the SSA form.
        */
       agx_foreach_ssa_dest(I, d) {
-         unsigned count = agx_write_registers(I, d);
-         unsigned align = agx_size_align_16(I->dest[d].size);
-
-         assign_regs(rctx, I->dest[d],
-                     find_regs(used_regs, count, align, rctx->bound));
+         assign_regs(rctx, I->dest[d], pick_regs(rctx, I, d));
       }
    }
 
@@ -297,9 +395,17 @@ agx_ra(agx_context *ctx)
    agx_compute_liveness(ctx);
    uint8_t *ssa_to_reg = calloc(ctx->alloc, sizeof(uint8_t));
    uint8_t *ncomps = calloc(ctx->alloc, sizeof(uint8_t));
+   agx_instr **src_to_collect = calloc(ctx->alloc, sizeof(agx_instr *));
    BITSET_WORD *visited = calloc(BITSET_WORDS(ctx->alloc), sizeof(BITSET_WORD));
 
    agx_foreach_instr_global(ctx, I) {
+      /* Record collects so we can coalesce when assigning */
+      if (I->op == AGX_OPCODE_COLLECT) {
+         agx_foreach_ssa_src(I, s) {
+            src_to_collect[I->src[s].value] = I;
+         }
+      }
+
       agx_foreach_ssa_dest(I, d) {
          unsigned v = I->dest[d].value;
          assert(ncomps[v] == 0 && "broken SSA");
@@ -315,6 +421,7 @@ agx_ra(agx_context *ctx)
          .shader = ctx,
          .block = block,
          .ssa_to_reg = ssa_to_reg,
+         .src_to_collect = src_to_collect,
          .ncomps = ncomps,
          .visited = visited,
          .bound = AGX_NUM_REGS
@@ -436,6 +543,7 @@ agx_ra(agx_context *ctx)
       }
    }
 
+   free(src_to_collect);
    free(ssa_to_reg);
    free(ncomps);
    free(visited);
-- 
2.7.4