From 0ffcb19b9d9fbe902224542047c389a661fbf816 Mon Sep 17 00:00:00 2001 From: Connor Abbott Date: Fri, 19 Feb 2021 12:33:49 +0100 Subject: [PATCH] ir3: Rewrite register allocation Switch to the new SSA-based register allocator. Part-of: --- src/freedreno/ci/traces-freedreno.yml | 6 +- src/freedreno/ir3/ir3.h | 35 +- src/freedreno/ir3/ir3_a6xx.c | 96 +- src/freedreno/ir3/ir3_compiler.c | 2 - src/freedreno/ir3/ir3_compiler.h | 2 - src/freedreno/ir3/ir3_compiler_nir.c | 53 +- src/freedreno/ir3/ir3_context.c | 2 +- src/freedreno/ir3/ir3_group.c | 187 -- src/freedreno/ir3/ir3_liveness.c | 184 ++ src/freedreno/ir3/ir3_lower_parallelcopy.c | 514 ++++ src/freedreno/ir3/ir3_merge_regs.c | 574 ++++ src/freedreno/ir3/ir3_postsched.c | 17 +- src/freedreno/ir3/ir3_print.c | 10 + src/freedreno/ir3/ir3_ra.c | 2844 +++++++++++--------- src/freedreno/ir3/ir3_ra.h | 511 ++-- src/freedreno/ir3/ir3_ra_regset.c | 249 -- src/freedreno/ir3/ir3_spill.c | 362 +++ src/freedreno/ir3/meson.build | 6 +- .../freedreno/ci/piglit-freedreno-a530-fails.txt | 6 +- .../freedreno/ci/piglit-freedreno-a630-fails.txt | 5 - 20 files changed, 3551 insertions(+), 2114 deletions(-) delete mode 100644 src/freedreno/ir3/ir3_group.c create mode 100644 src/freedreno/ir3/ir3_liveness.c create mode 100644 src/freedreno/ir3/ir3_lower_parallelcopy.c create mode 100644 src/freedreno/ir3/ir3_merge_regs.c delete mode 100644 src/freedreno/ir3/ir3_ra_regset.c create mode 100644 src/freedreno/ir3/ir3_spill.c diff --git a/src/freedreno/ci/traces-freedreno.yml b/src/freedreno/ci/traces-freedreno.yml index 05e3e3d..dd1376d 100644 --- a/src/freedreno/ci/traces-freedreno.yml +++ b/src/freedreno/ci/traces-freedreno.yml @@ -435,11 +435,11 @@ traces: - path: minetest/minetest.trace expectations: - device: freedreno-a306 - checksum: 9227cc8d4e6445f2323438340f2a5d9b + checksum: daedbc987cc1b1f934364ce6b633bc54 - device: freedreno-a530 - checksum: d0d655d81fabeb4087bf7c4837301f2a + checksum: 0054f0ba67ace5d2defe17b74b5364e9 - device: freedreno-a630 - checksum: c7349124612a8760ddd825b903561ec4 + checksum: b47f8151d4310d87070deea9059d001b - path: neverball/neverball.trace expectations: # Skipped since it's long on a530. diff --git a/src/freedreno/ir3/ir3.h b/src/freedreno/ir3/ir3.h index 6cb3895..c2733a0 100644 --- a/src/freedreno/ir3/ir3.h +++ b/src/freedreno/ir3/ir3.h @@ -83,6 +83,17 @@ struct ir3_info { uint16_t instrs_per_cat[8]; }; +struct ir3_merge_set { + uint16_t preferred_reg; + uint16_t size; + uint16_t alignment; + + unsigned interval_start; + + unsigned regs_count; + struct ir3_register **regs; +}; + struct ir3_register { enum { IR3_REG_CONST = 0x001, @@ -119,6 +130,9 @@ struct ir3_register { IR3_REG_ARRAY = 0x8000, IR3_REG_DEST = 0x10000, + IR3_REG_KILL = 0x20000, + IR3_REG_FIRST_KILL = 0x40000, + IR3_REG_UNUSED = 0x80000, } flags; /* used for cat5 instructions, but also for internal/IR level @@ -142,6 +156,7 @@ struct ir3_register { * rN.x becomes: (N << 2) | x */ uint16_t num; + uint16_t name; union { /* immediate: */ int32_t iim_val; @@ -169,6 +184,10 @@ struct ir3_register { * back to a previous instruction that we depend on). */ struct ir3_register *def; + + unsigned merge_set_offset; + struct ir3_merge_set *merge_set; + unsigned interval_start, interval_end; }; /* @@ -178,12 +197,12 @@ struct ir3_register { unsigned name ## _count, name ## _sz; \ type * name; -#define array_insert(ctx, arr, val) do { \ +#define array_insert(ctx, arr, ...) 
do { \ if (arr ## _count == arr ## _sz) { \ arr ## _sz = MAX2(2 * arr ## _sz, 16); \ arr = reralloc_size(ctx, arr, arr ## _sz * sizeof(arr[0])); \ } \ - arr[arr ##_count++] = val; \ + arr[arr ##_count++] = __VA_ARGS__; \ } while (0) struct ir3_instruction { @@ -696,12 +715,12 @@ bool ir3_valid_flags(struct ir3_instruction *instr, unsigned n, unsigned flags); set_foreach ((__instr)->uses, __entry) \ if ((__use = (void *)__entry->key)) -static inline uint32_t reg_num(struct ir3_register *reg) +static inline uint32_t reg_num(const struct ir3_register *reg) { return reg->num >> 2; } -static inline uint32_t reg_comp(struct ir3_register *reg) +static inline uint32_t reg_comp(const struct ir3_register *reg) { return reg->num & 0x3; } @@ -1479,9 +1498,6 @@ bool ir3_cp_postsched(struct ir3 *ir); /* Make arrays SSA */ bool ir3_array_to_ssa(struct ir3 *ir); -/* group neighbors and insert mov's to resolve conflicts: */ -bool ir3_group(struct ir3 *ir); - /* scheduling: */ bool ir3_sched_add_deps(struct ir3 *ir); int ir3_sched(struct ir3 *ir); @@ -1489,11 +1505,8 @@ int ir3_sched(struct ir3 *ir); struct ir3_context; bool ir3_postsched(struct ir3 *ir, struct ir3_shader_variant *v); -bool ir3_a6xx_fixup_atomic_dests(struct ir3 *ir, struct ir3_shader_variant *so); - /* register assignment: */ -struct ir3_ra_reg_set * ir3_ra_alloc_reg_set(struct ir3_compiler *compiler, bool mergedregs); -int ir3_ra(struct ir3_shader_variant *v, struct ir3_instruction **precolor, unsigned nprecolor); +int ir3_ra(struct ir3_shader_variant *v); /* legalize: */ bool ir3_legalize(struct ir3 *ir, struct ir3_shader_variant *so, int *max_bary); diff --git a/src/freedreno/ir3/ir3_a6xx.c b/src/freedreno/ir3/ir3_a6xx.c index 8481e5b..1ce1a81 100644 --- a/src/freedreno/ir3/ir3_a6xx.c +++ b/src/freedreno/ir3/ir3_a6xx.c @@ -125,9 +125,9 @@ emit_intrinsic_atomic_ssbo(struct ir3_context *ctx, nir_intrinsic_instr *intr) * src1.z - is 'data' for cmpxchg * * The combining src and dest kinda doesn't work out so well with how - * scheduling and RA work. So for now we create a dummy src2.x, and - * then in a later fixup path, insert an extra MOV out of src1.x. - * See ir3_a6xx_fixup_atomic_dests(). + * scheduling and RA work. So we create a dummy src2 which is tied to the + * destination in RA (i.e. must be allocated to the same vec2/vec3 + * register) and then immediately extract the first component. * * Note that nir already multiplies the offset by four */ @@ -193,7 +193,10 @@ emit_intrinsic_atomic_ssbo(struct ir3_context *ctx, nir_intrinsic_instr *intr) /* even if nothing consume the result, we can't DCE the instruction: */ array_insert(b, b->keeps, atomic); - return atomic; + atomic->regs[0]->wrmask = src1->regs[0]->wrmask; + struct ir3_instruction *split; + ir3_split_dest(b, &split, atomic, 0, 1); + return split; } /* src[] = { deref, coord, sample_index }. const_index[] = {} */ @@ -270,9 +273,9 @@ emit_intrinsic_atomic_image(struct ir3_context *ctx, nir_intrinsic_instr *intr) * src1.z - is 'value' for cmpxchg * * The combining src and dest kinda doesn't work out so well with how - * scheduling and RA work. So for now we create a dummy src2.x, and - * then in a later fixup path, insert an extra MOV out of src1.x. - * See ir3_a6xx_fixup_atomic_dests(). + * scheduling and RA work. So we create a dummy src2 which is tied to the + * destination in RA (i.e. must be allocated to the same vec2/vec3 + * register) and then immediately extract the first component. 
*/ dummy = create_immed(b, 0); src0 = ir3_create_collect(ctx, coords, ncoords); @@ -341,7 +344,10 @@ emit_intrinsic_atomic_image(struct ir3_context *ctx, nir_intrinsic_instr *intr) /* even if nothing consume the result, we can't DCE the instruction: */ array_insert(b, b->keeps, atomic); - return atomic; + atomic->regs[0]->wrmask = src1->regs[0]->wrmask; + struct ir3_instruction *split; + ir3_split_dest(b, &split, atomic, 0, 1); + return split; } static void @@ -373,77 +379,3 @@ const struct ir3_context_funcs ir3_a6xx_funcs = { .emit_intrinsic_image_size = emit_intrinsic_image_size, }; -/* - * Special pass to run after instruction scheduling to insert an - * extra mov from src1.x to dst. This way the other compiler passes - * can ignore this quirk of the new instruction encoding. - * - * This should run after RA. - */ - -static struct ir3_instruction * -get_atomic_dest_mov(struct ir3_instruction *atomic) -{ - struct ir3_instruction *mov; - - /* if we've already created the mov-out, then re-use it: */ - if (atomic->data) - return atomic->data; - - /* We are already out of SSA here, so we can't use the nice builders: */ - mov = ir3_instr_create(atomic->block, OPC_MOV, 2); - ir3_reg_create(mov, 0, 0); /* dst */ - ir3_reg_create(mov, 0, 0); /* src */ - - mov->cat1.src_type = TYPE_U32; - mov->cat1.dst_type = TYPE_U32; - - /* extract back out the 'dummy' which serves as stand-in for dest: */ - struct ir3_instruction *src = atomic->regs[3]->instr; - debug_assert(src->opc == OPC_META_COLLECT); - - *mov->regs[0] = *atomic->regs[0]; - *mov->regs[1] = *src->regs[1]->instr->regs[0]; - - mov->flags |= IR3_INSTR_SY; - - /* it will have already been appended to the end of the block, which - * isn't where we want it, so fix-up the location: - */ - ir3_instr_move_after(mov, atomic); - - return atomic->data = mov; -} - -bool -ir3_a6xx_fixup_atomic_dests(struct ir3 *ir, struct ir3_shader_variant *so) -{ - bool progress = false; - - if (ir3_shader_nibo(so) == 0 && !so->bindless_ibo) - return false; - - foreach_block (block, &ir->block_list) { - foreach_instr (instr, &block->instr_list) { - instr->data = NULL; - } - } - - foreach_block (block, &ir->block_list) { - foreach_instr_safe (instr, &block->instr_list) { - foreach_src (reg, instr) { - struct ir3_instruction *src = reg->instr; - - if (!src) - continue; - - if (is_atomic(src->opc) && (src->flags & IR3_INSTR_G)) { - reg->instr = get_atomic_dest_mov(src); - progress = true; - } - } - } - } - - return progress; -} diff --git a/src/freedreno/ir3/ir3_compiler.c b/src/freedreno/ir3/ir3_compiler.c index 41847e1..99f895e 100644 --- a/src/freedreno/ir3/ir3_compiler.c +++ b/src/freedreno/ir3/ir3_compiler.c @@ -78,7 +78,6 @@ ir3_compiler_create(struct fd_device *dev, uint32_t gpu_id, bool robust_ubo_acce compiler->dev = dev; compiler->gpu_id = gpu_id; compiler->robust_ubo_access = robust_ubo_access; - compiler->set = ir3_ra_alloc_reg_set(compiler, false); /* All known GPU's have 32k local memory (aka shared) */ compiler->local_mem_size = 32 * 1024; @@ -88,7 +87,6 @@ ir3_compiler_create(struct fd_device *dev, uint32_t gpu_id, bool robust_ubo_acce compiler->max_waves = 16; if (compiler->gpu_id >= 600) { - compiler->mergedregs_set = ir3_ra_alloc_reg_set(compiler, true); compiler->samgq_workaround = true; /* a6xx split the pipeline state into geometry and fragment state, in * order to let the VS run ahead of the FS. 
As a result there are now diff --git a/src/freedreno/ir3/ir3_compiler.h b/src/freedreno/ir3/ir3_compiler.h index 2366bf6..ba6737b 100644 --- a/src/freedreno/ir3/ir3_compiler.h +++ b/src/freedreno/ir3/ir3_compiler.h @@ -38,8 +38,6 @@ struct ir3_shader; struct ir3_compiler { struct fd_device *dev; uint32_t gpu_id; - struct ir3_ra_reg_set *set; - struct ir3_ra_reg_set *mergedregs_set; uint32_t shader_count; struct disk_cache *disk_cache; diff --git a/src/freedreno/ir3/ir3_compiler_nir.c b/src/freedreno/ir3/ir3_compiler_nir.c index f63a3ce..d95fe12 100644 --- a/src/freedreno/ir3/ir3_compiler_nir.c +++ b/src/freedreno/ir3/ir3_compiler_nir.c @@ -3811,13 +3811,17 @@ ir3_compile_shader_nir(struct ir3_compiler *compiler, /* Vertex shaders in a tessellation or geometry pipeline treat END as a * NOP and has an epilogue that writes the VS outputs to local storage, to * be read by the HS. Then it resets execution mask (chmask) and chains - * to the next shader (chsh). + * to the next shader (chsh). There are also a few output values which we + * must send to the next stage via registers, and in order for both stages + * to agree on the register used we must force these to be in specific + * registers. */ if ((so->type == MESA_SHADER_VERTEX && (so->key.has_gs || so->key.tessellation)) || (so->type == MESA_SHADER_TESS_EVAL && so->key.has_gs)) { struct ir3_instruction *outputs[3]; unsigned outidxs[3]; + unsigned regids[3]; unsigned outputs_count = 0; if (ctx->primitive_id) { @@ -3828,6 +3832,7 @@ ir3_compile_shader_nir(struct ir3_compiler *compiler, ir3_create_collect(ctx, &ctx->primitive_id, 1); outputs[outputs_count] = out; outidxs[outputs_count] = n; + regids[outputs_count] = regid(0, 1); outputs_count++; } @@ -3838,6 +3843,7 @@ ir3_compile_shader_nir(struct ir3_compiler *compiler, ir3_create_collect(ctx, &ctx->gs_header, 1); outputs[outputs_count] = out; outidxs[outputs_count] = n; + regids[outputs_count] = regid(0, 0); outputs_count++; } @@ -3848,6 +3854,7 @@ ir3_compile_shader_nir(struct ir3_compiler *compiler, ir3_create_collect(ctx, &ctx->tcs_header, 1); outputs[outputs_count] = out; outidxs[outputs_count] = n; + regids[outputs_count] = regid(0, 0); outputs_count++; } @@ -3858,7 +3865,7 @@ ir3_compile_shader_nir(struct ir3_compiler *compiler, __ssa_dst(chmask); for (unsigned i = 0; i < outputs_count; i++) - __ssa_src(chmask, outputs[i], 0); + __ssa_src(chmask, outputs[i], 0)->num = regids[i]; chmask->end.outidxs = ralloc_array(chmask, unsigned, outputs_count); memcpy(chmask->end.outidxs, outidxs, sizeof(unsigned) * outputs_count); @@ -3959,6 +3966,8 @@ ir3_compile_shader_nir(struct ir3_compiler *compiler, ir3_debug_print(ir, "AFTER: nir->ir3"); ir3_validate(ir); + IR3_PASS(ir, ir3_array_to_ssa); + do { progress = false; @@ -3980,11 +3989,6 @@ ir3_compile_shader_nir(struct ir3_compiler *compiler, IR3_PASS(ir, ir3_sched_add_deps); - /* Group left/right neighbors, inserting mov's where needed to - * solve conflicts: - */ - IR3_PASS(ir, ir3_group); - /* At this point, all the dead code should be long gone: */ assert(!IR3_PASS(ir, ir3_dce, so)); @@ -4012,20 +4016,12 @@ ir3_compile_shader_nir(struct ir3_compiler *compiler, so->binning_pass; if (pre_assign_inputs) { - for (unsigned i = 0; i < ctx->ninputs; i++) { - struct ir3_instruction *instr = ctx->inputs[i]; + foreach_input (in, ir) { + assert(in->opc == OPC_META_INPUT); + unsigned inidx = in->input.inidx; - if (!instr) - continue; - - unsigned n = i / 4; - unsigned c = i % 4; - unsigned regid = so->nonbinning->inputs[n].regid + c; - - 
instr->regs[0]->num = regid; + in->regs[0]->num = so->nonbinning->inputs[inidx].regid; } - - ret = ir3_ra(so, ctx->inputs, ctx->ninputs); } else if (ctx->tcs_header) { /* We need to have these values in the same registers between VS and TCS * since the VS chains to TCS and doesn't get the sysvals redelivered. @@ -4033,8 +4029,6 @@ ir3_compile_shader_nir(struct ir3_compiler *compiler, ctx->tcs_header->regs[0]->num = regid(0, 0); ctx->primitive_id->regs[0]->num = regid(0, 1); - struct ir3_instruction *precolor[] = { ctx->tcs_header, ctx->primitive_id }; - ret = ir3_ra(so, precolor, ARRAY_SIZE(precolor)); } else if (ctx->gs_header) { /* We need to have these values in the same registers between producer * (VS or DS) and GS since the producer chains to GS and doesn't get @@ -4043,29 +4037,22 @@ ir3_compile_shader_nir(struct ir3_compiler *compiler, ctx->gs_header->regs[0]->num = regid(0, 0); ctx->primitive_id->regs[0]->num = regid(0, 1); - struct ir3_instruction *precolor[] = { ctx->gs_header, ctx->primitive_id }; - ret = ir3_ra(so, precolor, ARRAY_SIZE(precolor)); } else if (so->num_sampler_prefetch) { assert(so->type == MESA_SHADER_FRAGMENT); - struct ir3_instruction *precolor[2]; int idx = 0; foreach_input (instr, ir) { if (instr->input.sysval != SYSTEM_VALUE_BARYCENTRIC_PERSP_PIXEL) continue; - assert(idx < ARRAY_SIZE(precolor)); - - precolor[idx] = instr; + assert(idx < 2); instr->regs[0]->num = idx; - idx++; } - ret = ir3_ra(so, precolor, idx); - } else { - ret = ir3_ra(so, NULL, 0); } + ret = ir3_ra(so); + if (ret) { DBG("RA failed!"); goto out; @@ -4073,10 +4060,6 @@ ir3_compile_shader_nir(struct ir3_compiler *compiler, IR3_PASS(ir, ir3_postsched, so); - if (compiler->gpu_id >= 600) { - IR3_PASS(ir, ir3_a6xx_fixup_atomic_dests, so); - } - if (so->type == MESA_SHADER_FRAGMENT) pack_inlocs(ctx); diff --git a/src/freedreno/ir3/ir3_context.c b/src/freedreno/ir3/ir3_context.c index 59cc14e..0e58d3c 100644 --- a/src/freedreno/ir3/ir3_context.c +++ b/src/freedreno/ir3/ir3_context.c @@ -111,7 +111,7 @@ ir3_context_init(struct ir3_compiler *compiler, if ((so->type == MESA_SHADER_FRAGMENT) && (compiler->gpu_id >= 600)) NIR_PASS_V(ctx->s, ir3_nir_lower_tex_prefetch); - NIR_PASS_V(ctx->s, nir_convert_from_ssa, true); + NIR_PASS(progress, ctx->s, nir_lower_phis_to_scalar, true); /* Super crude heuristic to limit # of tex prefetch in small * shaders. This completely ignores loops.. but that's really diff --git a/src/freedreno/ir3/ir3_group.c b/src/freedreno/ir3/ir3_group.c deleted file mode 100644 index 6efaf29..0000000 --- a/src/freedreno/ir3/ir3_group.c +++ /dev/null @@ -1,187 +0,0 @@ -/* - * Copyright (C) 2014 Rob Clark - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the "Software"), - * to deal in the Software without restriction, including without limitation - * the rights to use, copy, modify, merge, publish, distribute, sublicense, - * and/or sell copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice (including the next - * paragraph) shall be included in all copies or substantial portions of the - * Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL - * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - * - * Authors: - * Rob Clark - */ - -#include "ir3.h" - -/* - * Find/group instruction neighbors: - */ - -static void -insert_mov(struct ir3_instruction *collect, int idx) -{ - struct ir3_instruction *src = ssa(collect->regs[idx+1]); - struct ir3_instruction *mov = ir3_MOV(src->block, src, - (collect->regs[idx+1]->flags & IR3_REG_HALF) ? TYPE_U16 : TYPE_U32); - - collect->regs[idx+1]->def = mov->regs[0]; - - /* if collect and src are in the same block, move the inserted mov - * to just before the collect to avoid a use-before-def. Otherwise - * it should be safe to leave at the end of the block it is in: - */ - if (src->block == collect->block) { - ir3_instr_move_before(mov, collect); - } -} - -/* verify that cur != instr, but cur is also not in instr's neighbor-list: */ -static bool -in_neighbor_list(struct ir3_instruction *instr, struct ir3_instruction *cur, int pos) -{ - int idx = 0; - - if (!instr) - return false; - - if (instr == cur) - return true; - - for (instr = ir3_neighbor_first(instr); instr; instr = instr->cp.right) - if ((idx++ != pos) && (instr == cur)) - return true; - - return false; -} - -static void -group_collect(struct ir3_instruction *collect) -{ - struct ir3_register **regs = &collect->regs[1]; - unsigned n = collect->regs_count - 1; - - /* first pass, figure out what has conflicts and needs a mov - * inserted. Do this up front, before starting to setup - * left/right neighbor pointers. Trying to do it in a single - * pass could result in a situation where we can't even setup - * the mov's right neighbor ptr if the next instr also needs - * a mov. - */ -restart: - for (unsigned i = 0; i < n; i++) { - struct ir3_instruction *instr = ssa(regs[i]); - if (instr) { - struct ir3_instruction *left = (i > 0) ? ssa(regs[i - 1]) : NULL; - struct ir3_instruction *right = (i < (n-1)) ? ssa(regs[i + 1]) : NULL; - bool conflict; - - /* check for left/right neighbor conflicts: */ - conflict = conflicts(instr->cp.left, left) || - conflicts(instr->cp.right, right); - - /* Mixing array elements and higher register classes - * (ie. groups) doesn't really work out in RA. See: - * - * https://trello.com/c/DqeDkeVf/156-bug-with-stk-70frag - */ - if (instr->regs[0]->flags & IR3_REG_ARRAY) - conflict = true; - - /* we also can't have an instr twice in the group: */ - for (unsigned j = i + 1; (j < n) && !conflict; j++) - if (in_neighbor_list(ssa(regs[j]), instr, i)) - conflict = true; - - if (conflict) { - insert_mov(collect, i); - /* inserting the mov may have caused a conflict - * against the previous: - */ - goto restart; - } - } - } - - /* second pass, now that we've inserted mov's, fixup left/right - * neighbors. This is guaranteed to succeed, since by definition - * the newly inserted mov's cannot conflict with anything. - */ - for (unsigned i = 0; i < n; i++) { - struct ir3_instruction *instr = ssa(regs[i]); - if (instr) { - struct ir3_instruction *left = (i > 0) ? ssa(regs[i - 1]) : NULL; - struct ir3_instruction *right = (i < (n-1)) ? 
ssa(regs[i + 1]) : NULL; - - debug_assert(!conflicts(instr->cp.left, left)); - if (left) { - instr->cp.left_cnt++; - instr->cp.left = left; - } - - debug_assert(!conflicts(instr->cp.right, right)); - if (right) { - instr->cp.right_cnt++; - instr->cp.right = right; - } - } - } -} - -static bool -instr_find_neighbors(struct ir3_instruction *instr) -{ - bool progress = false; - - if (ir3_instr_check_mark(instr)) - return false; - - if (instr->opc == OPC_META_COLLECT) { - group_collect(instr); - progress = true; - } - - foreach_ssa_src (src, instr) - progress |= instr_find_neighbors(src); - - return progress; -} - -static bool -find_neighbors(struct ir3 *ir) -{ - bool progress = false; - unsigned i; - - foreach_block (block, &ir->block_list) { - for (i = 0; i < block->keeps_count; i++) { - struct ir3_instruction *instr = block->keeps[i]; - progress |= instr_find_neighbors(instr); - } - - /* We also need to account for if-condition: */ - if (block->condition) - progress |= instr_find_neighbors(block->condition); - } - - return progress; -} - -bool -ir3_group(struct ir3 *ir) -{ - ir3_clear_mark(ir); - return find_neighbors(ir); -} diff --git a/src/freedreno/ir3/ir3_liveness.c b/src/freedreno/ir3/ir3_liveness.c new file mode 100644 index 0000000..02abb0f --- /dev/null +++ b/src/freedreno/ir3/ir3_liveness.c @@ -0,0 +1,184 @@ +/* + * Copyright (C) 2021 Valve Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "ir3_ra.h" +#include "ir3_shader.h" +#include "ralloc.h" + +/* A note on how phi node uses are handled: + * + * - Phi node sources are considered to happen after the end of the + * predecessor block, so the live_out for that block contains phi sources. + * - On the other hand, phi destinations are considered to happen at the start + * of the block, so that live_in does *not* contain phi destinations. This + * is mainly because phi destinations and live-through values have to be + * treated very differently by RA at the beginning of a block. 
+ */ + +static bool +compute_block_liveness(struct ir3_liveness *live, struct ir3_block *block, + BITSET_WORD *tmp_live, unsigned bitset_words) +{ + memcpy(tmp_live, live->live_out[block->index], bitset_words * + sizeof(BITSET_WORD)); + + /* Process instructions */ + foreach_instr_rev (instr, &block->instr_list) { + ra_foreach_dst(dst, instr) { + if (BITSET_TEST(tmp_live, dst->name)) + dst->flags &= ~IR3_REG_UNUSED; + else + dst->flags |= IR3_REG_UNUSED; + BITSET_CLEAR(tmp_live, dst->name); + } + + /* Phi node uses occur after the predecessor block */ + if (instr->opc != OPC_META_PHI) { + ra_foreach_src(src, instr) { + if (BITSET_TEST(tmp_live, src->def->name)) + src->flags &= ~IR3_REG_KILL; + else + src->flags |= IR3_REG_KILL; + } + + ra_foreach_src(src, instr) { + if (BITSET_TEST(tmp_live, src->def->name)) + src->flags &= ~IR3_REG_FIRST_KILL; + else + src->flags |= IR3_REG_FIRST_KILL; + BITSET_SET(tmp_live, src->def->name); + } + } + } + + memcpy(live->live_in[block->index], tmp_live, + bitset_words * sizeof(BITSET_WORD)); + + bool progress = false; + for (unsigned i = 0; i < block->predecessors_count; i++) { + const struct ir3_block *pred = block->predecessors[i]; + for (unsigned j = 0; j < bitset_words; j++) { + if (tmp_live[j] & ~live->live_out[pred->index][j]) + progress = true; + live->live_out[pred->index][j] |= tmp_live[j]; + } + + /* Process phi sources. */ + foreach_instr (phi, &block->instr_list) { + if (phi->opc != OPC_META_PHI) + break; + if (!phi->regs[1 + i]->def) + continue; + unsigned name = phi->regs[1 + i]->def->name; + if (!BITSET_TEST(live->live_out[pred->index], name)) { + progress = true; + BITSET_SET(live->live_out[pred->index], name); + } + } + } + + return progress; +} + +struct ir3_liveness *ir3_calc_liveness(struct ir3_shader_variant *v) +{ + struct ir3_liveness *live = rzalloc(NULL, struct ir3_liveness); + + /* Reserve name 0 to mean "doesn't have a name yet" to make the debug + * output nicer. + */ + array_insert(live, live->definitions, NULL); + + /* Build definition <-> name mapping */ + unsigned block_count = 0; + foreach_block (block, &v->ir->block_list) { + block->index = block_count++; + foreach_instr (instr, &block->instr_list) { + ra_foreach_dst(dst, instr) { + dst->name = live->definitions_count; + array_insert(live, live->definitions, dst); + } + } + } + + live->block_count = block_count; + + unsigned bitset_words = BITSET_WORDS(live->definitions_count); + BITSET_WORD *tmp_live = ralloc_array(live, BITSET_WORD, bitset_words); + live->live_in = ralloc_array(live, BITSET_WORD *, block_count); + live->live_out = ralloc_array(live, BITSET_WORD *, block_count); + unsigned i = 0; + foreach_block (block, &v->ir->block_list) { + block->index = i++; + live->live_in[block->index] = rzalloc_array(live, BITSET_WORD, bitset_words); + live->live_out[block->index] = rzalloc_array(live, BITSET_WORD, bitset_words); + } + + bool progress = true; + while (progress) { + progress = false; + foreach_block_rev (block, &v->ir->block_list) { + progress |= + compute_block_liveness(live, block, tmp_live, bitset_words); + } + } + + return live; +} + +/* Return true if "def" is live after "instr". It's assumed that "def" + * dominates "instr". + */ +bool +ir3_def_live_after(struct ir3_liveness *live, struct ir3_register *def, + struct ir3_instruction *instr) +{ + /* If it's live out then it's definitely live at the instruction. 
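The fixpoint loop in ir3_calc_liveness above is the standard backward dataflow formulation, with the per-register KILL/FIRST_KILL/UNUSED flags filled in along the way. A minimal standalone sketch of that fixpoint, assuming a toy CFG where each block only records which values it defines and uses; toy_block, toy_liveness, compute_liveness and the size limits are invented for illustration, and phi handling is omitted entirely:

#include <stdbool.h>
#include <string.h>

#define MAX_BLOCKS 8
#define MAX_VALUES 64

struct toy_block {
   bool def[MAX_VALUES], use[MAX_VALUES];
   int successors[2], num_successors;
};

struct toy_liveness {
   bool live_in[MAX_BLOCKS][MAX_VALUES];
   bool live_out[MAX_BLOCKS][MAX_VALUES];
};

void
compute_liveness(const struct toy_block *blocks, int num_blocks,
                 struct toy_liveness *live)
{
   memset(live, 0, sizeof(*live));

   bool progress = true;
   while (progress) {
      progress = false;
      /* Walk blocks backwards so the sets converge quickly when the blocks
       * are laid out roughly in program order.
       */
      for (int b = num_blocks - 1; b >= 0; b--) {
         /* live_out[b] = union of the successors' live_in */
         for (int s = 0; s < blocks[b].num_successors; s++) {
            int succ = blocks[b].successors[s];
            for (int v = 0; v < MAX_VALUES; v++) {
               if (live->live_in[succ][v] && !live->live_out[b][v]) {
                  live->live_out[b][v] = true;
                  progress = true;
               }
            }
         }
         /* live_in[b] = use[b] | (live_out[b] & ~def[b]) */
         for (int v = 0; v < MAX_VALUES; v++) {
            bool in = blocks[b].use[v] ||
                      (live->live_out[b][v] && !blocks[b].def[v]);
            if (in && !live->live_in[b][v]) {
               live->live_in[b][v] = true;
               progress = true;
            }
         }
      }
   }
}

The real pass differs mainly in that it works per instruction rather than per block-level def/use set, and in that phi sources are added to the predecessor's live_out instead of this block's live_in, as explained in the comment above.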
*/ + if (BITSET_TEST(live->live_out[instr->block->index], def->name)) + return true; + + /* If it's not live in and not defined in the same block then the live + * range can't extend to the instruction. + */ + if (def->instr->block != instr->block && + !BITSET_TEST(live->live_in[instr->block->index], def->name)) + return false; + + /* Ok, now comes the tricky case, where "def" is killed somewhere in + * "instr"'s block and we have to check if it's before or after. + */ + foreach_instr_rev (test_instr, &instr->block->instr_list) { + if (test_instr == instr) + break; + + for (unsigned i = 0; i < test_instr->regs_count; i++) { + if (test_instr->regs[i]->flags & IR3_REG_DEST) + continue; + if (test_instr->regs[i]->def == def) + return true; + } + } + + return false; +} + diff --git a/src/freedreno/ir3/ir3_lower_parallelcopy.c b/src/freedreno/ir3/ir3_lower_parallelcopy.c new file mode 100644 index 0000000..68f1fef --- /dev/null +++ b/src/freedreno/ir3/ir3_lower_parallelcopy.c @@ -0,0 +1,514 @@ +/* + * Copyright (C) 2021 Valve Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "ir3_ra.h" +#include "ir3_shader.h" + +struct copy_src { + unsigned flags; + union { + uint32_t imm; + physreg_t reg; + unsigned const_num; + }; +}; + +struct copy_entry { + physreg_t dst; + unsigned flags; + bool done; + + struct copy_src src; +}; + +static unsigned +copy_entry_size(const struct copy_entry *entry) +{ + return (entry->flags & IR3_REG_HALF) ? 
1 : 2; +} + +static struct copy_src +get_copy_src(const struct ir3_register *reg, unsigned offset) +{ + if (reg->flags & IR3_REG_IMMED) { + return (struct copy_src) { + .flags = IR3_REG_IMMED, + .imm = reg->uim_val, + }; + } else if (reg->flags & IR3_REG_CONST) { + return (struct copy_src) { + .flags = IR3_REG_CONST, + .const_num = reg->num, + }; + } else { + return (struct copy_src) { + .flags = 0, + .reg = ra_reg_get_physreg(reg) + offset, + }; + } +} + +static void +do_xor(struct ir3_instruction *instr, unsigned dst_num, unsigned src1_num, unsigned src2_num, unsigned flags) +{ + struct ir3_instruction *xor = ir3_instr_create(instr->block, OPC_XOR_B, 3); + struct ir3_register *dst = ir3_reg_create(xor, dst_num, flags | IR3_REG_DEST); + dst->wrmask = 1; + struct ir3_register *src1 = ir3_reg_create(xor, src1_num, flags); + src1->wrmask = 1; + struct ir3_register *src2 = ir3_reg_create(xor, src2_num, flags); + src2->wrmask = 1; + + ir3_instr_move_before(xor, instr); +} + +static void +do_swap(struct ir3_instruction *instr, const struct copy_entry *entry) +{ + assert(!entry->src.flags); + /* TODO implement shared swaps */ + assert(!(entry->flags & IR3_REG_SHARED)); + + if (entry->flags & IR3_REG_HALF) { + /* We currently make sure to never emit parallel copies where the + * source/destination is a half-reg above the range accessable to half + * registers. However, when a full-reg source overlaps a half-reg + * destination or vice versa, it can be very, very complicated to come + * up with a series of "legal" swaps and copies to resolve the + * parallel copy. So here we provide a fallback to implement the + * "illegal" swap instead. This may also be useful for implementing + * "spilling" half-regs to the inaccessable space. + */ + if (entry->src.reg >= RA_HALF_SIZE) { + /* Choose a temporary that doesn't overlap src or dst */ + physreg_t tmp = entry->dst < 2 ? 2 : 0; + + /* Swap src and the temporary */ + do_swap(instr, &(struct copy_entry) { + .src = { .reg = entry->src.reg & ~1u }, + .dst = tmp, + .flags = entry->flags & ~IR3_REG_HALF, + }); + + /* Do the original swap with src replaced with tmp */ + do_swap(instr, &(struct copy_entry) { + .src = { .reg = tmp + (entry->src.reg & 1) }, + .dst = entry->dst, + .flags = entry->flags, + }); + + /* Swap src and the temporary back */ + do_swap(instr, &(struct copy_entry) { + .src = { .reg = entry->src.reg & ~1u }, + .dst = tmp, + .flags = entry->flags & ~IR3_REG_HALF, + }); + return; + } + + /* If dst is not addressable, we only need to swap the arguments and + * let the case above handle it. + */ + if (entry->dst >= RA_HALF_SIZE) { + do_swap(instr, &(struct copy_entry) { + .src = { .reg = entry->dst }, + .dst = entry->src.reg, + .flags = entry->flags, + }); + return; + } + } + + unsigned src_num = ra_physreg_to_num(entry->src.reg, entry->flags); + unsigned dst_num = ra_physreg_to_num(entry->dst, entry->flags); + + do_xor(instr, dst_num, dst_num, src_num, entry->flags); + do_xor(instr, src_num, src_num, dst_num, entry->flags); + do_xor(instr, dst_num, dst_num, src_num, entry->flags); +} + +static void +do_copy(struct ir3_instruction *instr, const struct copy_entry *entry) +{ + /* TODO implement shared copies */ + assert(!(entry->flags & IR3_REG_SHARED)); + + if (entry->flags & IR3_REG_HALF) { + /* See do_swap() for why this is here. */ + if (entry->dst >= RA_HALF_SIZE) { + /* TODO: is there a hw instruction we can use for this case? */ + physreg_t tmp = !entry->src.flags && entry->src.reg < 2 ? 
2 : 0; + + do_swap(instr, &(struct copy_entry) { + .src = { .reg = entry->dst & ~1u }, + .dst = tmp, + .flags = entry->flags & ~IR3_REG_HALF, + }); + + do_copy(instr, &(struct copy_entry) { + .src = entry->src, + .dst = tmp + (entry->dst & 1), + .flags = entry->flags, + }); + + do_swap(instr, &(struct copy_entry) { + .src = { .reg = entry->dst & ~1u }, + .dst = tmp, + .flags = entry->flags & ~IR3_REG_HALF, + }); + return; + } + + if (!entry->src.flags && entry->src.reg >= RA_HALF_SIZE) { + unsigned src_num = + ra_physreg_to_num(entry->src.reg & ~1u, entry->flags & ~IR3_REG_HALF); + unsigned dst_num = ra_physreg_to_num(entry->dst, entry->flags); + + if (entry->src.reg % 2 == 0) { + /* cov.u32u16 dst, src */ + struct ir3_instruction *cov = ir3_instr_create(instr->block, OPC_MOV, 2); + ir3_reg_create(cov, dst_num, entry->flags | IR3_REG_DEST)->wrmask = 1; + ir3_reg_create(cov, src_num, entry->flags & ~IR3_REG_HALF)->wrmask = 1; + cov->cat1.dst_type = TYPE_U16; + cov->cat1.src_type = TYPE_U32; + ir3_instr_move_before(cov, instr); + } else { + /* shr.b dst, src, h(16) */ + struct ir3_instruction *shr = ir3_instr_create(instr->block, OPC_SHR_B, 3); + ir3_reg_create(shr, dst_num, entry->flags | IR3_REG_DEST)->wrmask = 1; + ir3_reg_create(shr, src_num, entry->flags & ~IR3_REG_HALF)->wrmask = 1; + ir3_reg_create(shr, 0, entry->flags | IR3_REG_IMMED)->uim_val = 16; + ir3_instr_move_before(shr, instr); + } + return; + } + } + + unsigned src_num = ra_physreg_to_num(entry->src.reg, entry->flags); + unsigned dst_num = ra_physreg_to_num(entry->dst, entry->flags); + + struct ir3_instruction *mov = ir3_instr_create(instr->block, OPC_MOV, 2); + ir3_reg_create(mov, dst_num, entry->flags | IR3_REG_DEST)->wrmask = 1; + ir3_reg_create(mov, src_num, entry->flags | entry->src.flags)->wrmask = 1; + mov->cat1.dst_type = (entry->flags & IR3_REG_HALF) ? TYPE_U16 : TYPE_U32; + mov->cat1.src_type = (entry->flags & IR3_REG_HALF) ? TYPE_U16 : TYPE_U32; + if (entry->src.flags & IR3_REG_IMMED) + mov->regs[1]->uim_val = entry->src.imm; + else if (entry->src.flags & IR3_REG_CONST) + mov->regs[1]->num = entry->src.const_num; + ir3_instr_move_before(mov, instr); +} + +struct copy_ctx { + /* For each physreg, the number of pending copy entries that use it as a + * source. Once this drops to zero, then the physreg is unblocked and can + * be moved to. + */ + unsigned physreg_use_count[RA_MAX_FILE_SIZE]; + + /* For each physreg, the pending copy_entry that uses it as a dest. 
*/ + struct copy_entry *physreg_dst[RA_MAX_FILE_SIZE]; + + struct copy_entry entries[RA_MAX_FILE_SIZE]; + unsigned entry_count; +}; + +static bool +entry_blocked(struct copy_entry *entry, struct copy_ctx *ctx) +{ + for (unsigned i = 0; i < copy_entry_size(entry); i++) { + if (ctx->physreg_use_count[entry->dst + i] != 0) + return true; + } + + return false; +} + +static void +split_32bit_copy(struct copy_ctx *ctx, struct copy_entry *entry) +{ + assert(!entry->done); + assert(!(entry->flags & (IR3_REG_IMMED | IR3_REG_CONST))); + assert(copy_entry_size(entry) == 2); + struct copy_entry *new_entry = &ctx->entries[ctx->entry_count++]; + + new_entry->dst = entry->dst + 1; + new_entry->src.flags = entry->src.flags; + new_entry->src.reg = entry->src.reg + 1; + new_entry->done = false; + entry->flags |= IR3_REG_HALF; + new_entry->flags = entry->flags; + ctx->physreg_dst[entry->dst + 1] = new_entry; +} + +static void +_handle_copies(struct ir3_instruction *instr, struct copy_ctx *ctx) +{ + /* Set up the bookkeeping */ + memset(ctx->physreg_dst, 0, sizeof(ctx->physreg_dst)); + memset(ctx->physreg_use_count, 0, sizeof(ctx->physreg_use_count)); + + for (unsigned i = 0; i < ctx->entry_count; i++) { + struct copy_entry *entry = &ctx->entries[i]; + for (unsigned j = 0; j < copy_entry_size(entry); j++) { + if (!entry->src.flags) + ctx->physreg_use_count[entry->src.reg + j]++; + + /* Copies should not have overlapping destinations. */ + assert(!ctx->physreg_dst[entry->dst + j]); + ctx->physreg_dst[entry->dst + j] = entry; + } + } + + bool progress = true; + while (progress) { + progress = false; + + /* Step 1: resolve paths in the transfer graph. This means finding + * copies whose destination aren't blocked by something else and then + * emitting them, continuing this process until every copy is blocked + * and there are only cycles left. + * + * TODO: We should note that src is also available in dst to unblock + * cycles that src is involved in. + */ + + for (unsigned i = 0; i < ctx->entry_count; i++) { + struct copy_entry *entry = &ctx->entries[i]; + if (!entry->done && !entry_blocked(entry, ctx)) { + entry->done = true; + progress = true; + do_copy(instr, entry); + for (unsigned j = 0; j < copy_entry_size(entry); j++) { + if (!entry->src.flags) + ctx->physreg_use_count[entry->src.reg + j]--; + ctx->physreg_dst[entry->dst + j] = NULL; + } + } + } + + if (progress) + continue; + + /* Step 2: Find partially blocked copies and split them. In the + * mergedregs case, we can 32-bit copies which are only blocked on one + * 16-bit half, and splitting them helps get things moving. + * + * We can skip splitting copies if the source isn't a register, + * however, because it does not unblock anything and therefore doesn't + * contribute to making forward progress with step 1. These copies + * should still be resolved eventually in step 1 because they can't be + * part of a cycle. + */ + for (unsigned i = 0; i < ctx->entry_count; i++) { + struct copy_entry *entry = &ctx->entries[i]; + if (entry->done || entry->flags & IR3_REG_HALF) + continue; + + if (((ctx->physreg_use_count[entry->dst] == 0 || + ctx->physreg_use_count[entry->dst + 1] == 0)) && + !(entry->flags & (IR3_REG_IMMED | IR3_REG_CONST))) { + split_32bit_copy(ctx, entry); + progress = true; + } + } + } + + /* Step 3: resolve cycles through swapping. + * + * At this point, the transfer graph should consist of only cycles. 
+ * The reason is that, given any physreg n_1 that's the source of a + * remaining entry, it has a destination n_2, which (because every + * copy is blocked) is the source of some other copy whose destination + * is n_3, and so we can follow the chain until we get a cycle. If we + * reached some other node than n_1: + * + * n_1 -> n_2 -> ... -> n_i + * ^ | + * |-------------| + * + * then n_2 would be the destination of 2 copies, which is illegal + * (checked above in an assert). So n_1 must be part of a cycle: + * + * n_1 -> n_2 -> ... -> n_i + * ^ | + * |---------------------| + * + * and this must be only cycle n_1 is involved in, because any other + * path starting from n_1 would also have to end in n_1, resulting in + * a node somewhere along the way being the destination of 2 copies + * when the 2 paths merge. + * + * The way we resolve the cycle is through picking a copy (n_1, n_2) + * and swapping n_1 and n_2. This moves n_1 to n_2, so n_2 is taken + * out of the cycle: + * + * n_1 -> ... -> n_i + * ^ | + * |--------------| + * + * and we can keep repeating this until the cycle is empty. + */ + + for (unsigned i = 0; i < ctx->entry_count; i++) { + struct copy_entry *entry = &ctx->entries[i]; + if (entry->done) + continue; + + assert(!entry->src.flags); + + /* catch trivial copies */ + if (entry->dst == entry->src.reg) { + entry->done = true; + continue; + } + + do_swap(instr, entry); + + /* Split any blocking copies whose sources are only partially + * contained within our destination. + */ + if (entry->flags & IR3_REG_HALF) { + for (unsigned j = 0; j < ctx->entry_count; j++) { + struct copy_entry *blocking = &ctx->entries[j]; + + if (blocking->done) + continue; + + if (blocking->src.reg <= entry->dst && + blocking->src.reg + 1 >= entry->dst && + !(blocking->flags & IR3_REG_HALF)) { + split_32bit_copy(ctx, blocking); + } + } + } + + /* Update sources of blocking copies. + * + * Note: at this point, every blocking copy's source should be + * contained within our destination. + */ + for (unsigned j = 0; j < ctx->entry_count; j++) { + struct copy_entry *blocking = &ctx->entries[j]; + if (blocking->src.reg >= entry->dst && + blocking->src.reg < entry->dst + copy_entry_size(entry)) { + blocking->src.reg = entry->src.reg + (blocking->src.reg - entry->dst); + } + } + } +} + +static void +handle_copies(struct ir3_instruction *instr, struct copy_entry *entries, + unsigned entry_count, bool mergedregs) +{ + struct copy_ctx ctx; + + if (mergedregs) { + /* Half regs and full regs are in the same file, so handle everything + * at once. + */ + memcpy(ctx.entries, entries, sizeof(struct copy_entry) * entry_count); + ctx.entry_count = entry_count; + _handle_copies(instr, &ctx); + } else { + /* There may be both half copies and full copies, so we have to split + * them up since they don't interfere. 
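A minimal standalone sketch of the path/cycle resolution performed by _handle_copies above, under simplifying assumptions: every location is a single full register, every source is a register (no immediates or consts), and no copy ever needs splitting, so step 2 disappears. struct copy, resolve_parallel_copy and NUM_REGS are invented for the sketch, and swaps are printed abstractly rather than lowered to the xor sequence used by do_swap:

#include <stdbool.h>
#include <stdio.h>

#define NUM_REGS 8

struct copy {
   int dst, src;
   bool done;
};

void
resolve_parallel_copy(struct copy *copies, int n)
{
   int use_count[NUM_REGS] = {0};
   for (int i = 0; i < n; i++)
      use_count[copies[i].src]++;

   /* Step 1: emit copies whose destination is not a pending source. */
   bool progress = true;
   while (progress) {
      progress = false;
      for (int i = 0; i < n; i++) {
         struct copy *c = &copies[i];
         if (c->done || use_count[c->dst] != 0)
            continue;
         printf("mov r%d, r%d\n", c->dst, c->src);
         use_count[c->src]--;
         c->done = true;
         progress = true;
      }
   }

   /* Step 3: only cycles remain. The swap satisfies this copy and moves
    * dst's old value into src, so redirect any remaining copy that read
    * dst to read src instead.
    */
   for (int i = 0; i < n; i++) {
      struct copy *c = &copies[i];
      if (c->done || c->dst == c->src)
         continue;
      printf("swap r%d, r%d\n", c->dst, c->src);
      for (int j = 0; j < n; j++) {
         if (!copies[j].done && copies[j].src == c->dst)
            copies[j].src = c->src;
      }
      c->done = true;
   }
}

int
main(void)
{
   /* A path (r2 <- r1) hanging off a cycle (r0 <-> r1). */
   struct copy copies[] = {
      { .dst = 0, .src = 1 },
      { .dst = 1, .src = 0 },
      { .dst = 2, .src = 1 },
   };
   resolve_parallel_copy(copies, 3);
   return 0;
}

On this input the sketch emits "mov r2, r1" first (the path), then breaks the remaining cycle with a single "swap r0, r1".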
+ */ + ctx.entry_count = 0; + for (unsigned i = 0; i < entry_count; i++) { + if (entries[i].flags & IR3_REG_HALF) + ctx.entries[ctx.entry_count++] = entries[i]; + } + _handle_copies(instr, &ctx); + + ctx.entry_count = 0; + for (unsigned i = 0; i < entry_count; i++) { + if (!(entries[i].flags & IR3_REG_HALF)) + ctx.entries[ctx.entry_count++] = entries[i]; + } + _handle_copies(instr, &ctx); + } +} + +void +ir3_lower_copies(struct ir3_shader_variant *v) +{ + DECLARE_ARRAY(struct copy_entry, copies); + copies_count = copies_sz = 0; + copies = NULL; + + foreach_block (block, &v->ir->block_list) { + foreach_instr_safe (instr, &block->instr_list) { + if (instr->opc == OPC_META_PARALLEL_COPY) { + copies_count = 0; + for (unsigned i = 0; i < instr->regs_count / 2; i++) { + struct ir3_register *dst = instr->regs[i]; + struct ir3_register *src = instr->regs[i + instr->regs_count / 2]; + unsigned flags = src->flags & (IR3_REG_HALF | IR3_REG_SHARED); + for (unsigned j = 0; j < reg_elems(dst); j++) { + array_insert(NULL, copies, (struct copy_entry) { + .dst = ra_num_to_physreg(dst->num + j, flags), + .src = get_copy_src(src, j * reg_elem_size(dst)), + .flags = flags, + }); + } + } + handle_copies(instr, copies, copies_count, v->mergedregs); + list_del(&instr->node); + } else if (instr->opc == OPC_META_COLLECT) { + copies_count = 0; + struct ir3_register *dst = instr->regs[0]; + unsigned flags = dst->flags & (IR3_REG_HALF | IR3_REG_SHARED); + for (unsigned i = 1; i < instr->regs_count; i++) { + struct ir3_register *src = instr->regs[i]; + array_insert(NULL, copies, (struct copy_entry) { + .dst = ra_num_to_physreg(dst->num + i - 1, flags), + .src = get_copy_src(src, 0), + .flags = flags, + }); + } + handle_copies(instr, copies, copies_count, v->mergedregs); + list_del(&instr->node); + } else if (instr->opc == OPC_META_SPLIT) { + copies_count = 0; + struct ir3_register *dst = instr->regs[0]; + struct ir3_register *src = instr->regs[1]; + unsigned flags = src->flags & (IR3_REG_HALF | IR3_REG_SHARED); + array_insert(NULL, copies, (struct copy_entry) { + .dst = ra_reg_get_physreg(dst), + .src = get_copy_src(src, instr->split.off * reg_elem_size(dst)), + .flags = flags, + }); + handle_copies(instr, copies, copies_count, v->mergedregs); + list_del(&instr->node); + } else if (instr->opc == OPC_META_PHI) { + list_del(&instr->node); + } + } + } + + if (copies) + ralloc_free(copies); +} + diff --git a/src/freedreno/ir3/ir3_merge_regs.c b/src/freedreno/ir3/ir3_merge_regs.c new file mode 100644 index 0000000..abb8e86 --- /dev/null +++ b/src/freedreno/ir3/ir3_merge_regs.c @@ -0,0 +1,574 @@ +/* + * Copyright (C) 2021 Valve Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "ir3_ra.h" +#include "ir3_compiler.h" +#include "ralloc.h" + +/* This pass "merges" compatible phi-web SSA values. First, we insert a bunch + * of parallelcopy's to trivially turn the program into CSSA form. Then we + * try to "merge" SSA def's into "merge sets" which could be allocated to a + * single register in order to eliminate copies. First we merge phi nodes, + * which should always succeed because of the parallelcopy's we inserted, and + * then we try to coalesce the copies we introduced. + * + * The merged registers are used for three purposes: + * + * 1. We always use the same pvtmem slot for spilling all SSA defs in each + * merge set. This prevents us from having to insert memory-to-memory copies + * in the spiller and makes sure we don't insert unecessary copies. + * 2. When two values are live at the same time, part of the same merge + * set, and they overlap each other in the merge set, they always occupy + * overlapping physical registers in RA. This reduces register pressure and + * copies in several important scenarios: + * - When sources of a collect are used later by something else, we don't + * have to introduce copies. + * - We can handle sequences of extracts that "explode" a vector into its + * components without any additional copying. + * 3. We use the merge sets for affinities in register allocation: That is, we + * try to allocate all the definitions in the same merge set to the + * same/compatible registers. This helps us e.g. allocate sources of a collect + * to contiguous registers without too much special code in RA. + * + * In a "normal" register allocator, or when spilling, we'd just merge + * registers in the same merge set to the same register, but with SSA-based + * register allocation we may have to split the live interval. + * + * The implementation is based on "Revisiting Out-of-SSA Translation for + * Correctness, CodeQuality, and Efficiency," and is broadly similar to the + * implementation in nir_from_ssa, with the twist that we also try to coalesce + * META_SPLIT and META_COLLECT. This makes this pass more complicated but + * prevents us from needing to handle these specially in RA and the spiller, + * which are already complicated enough. This also forces us to implement that + * value-comparison optimization they explain, as without it we wouldn't be + * able to coalesce META_SPLIT even in the simplest of cases. 
+ */ + +/* In order to dynamically reconstruct the dominance forest, we need the + * instructions ordered by a preorder traversal of the dominance tree: + */ + +static unsigned +index_instrs(struct ir3_block *block, unsigned index) +{ + foreach_instr (instr, &block->instr_list) + instr->ip = index++; + + for (unsigned i = 0; i < block->dom_children_count; i++) + index = index_instrs(block->dom_children[i], index); + + return index; +} + +/* Definitions within a merge set are ordered by instr->ip as set above: */ + +static bool +def_after(struct ir3_register *a, struct ir3_register *b) +{ + return a->instr->ip > b->instr->ip; +} + +static bool +def_dominates(struct ir3_register *a, struct ir3_register *b) +{ + if (def_after(a, b)) { + return false; + } else if (a->instr->block == b->instr->block) { + return def_after(b, a); + } else { + return ir3_block_dominates(a->instr->block, b->instr->block); + } +} + +/* This represents a region inside a register. The offset is relative to the + * start of the register, and offset + size <= size(reg). + */ +struct def_value { + struct ir3_register *reg; + unsigned offset, size; +}; + +/* Chase any copies to get the source of a region inside a register. This is + * Value(a) in the paper. + */ +static struct def_value +chase_copies(struct def_value value) +{ + while (true) { + struct ir3_instruction *instr = value.reg->instr; + if (instr->opc == OPC_META_SPLIT) { + value.offset += instr->split.off * reg_elem_size(value.reg); + value.reg = instr->regs[1]->def; + } else if (instr->opc == OPC_META_COLLECT) { + if (value.offset % reg_elem_size(value.reg) != 0 || + value.size > reg_elem_size(value.reg) || + value.offset + value.size > reg_size(value.reg)) + break; + struct ir3_register *src = instr->regs[1 + value.offset / reg_elem_size(value.reg)]; + if (!src->def) + break; + value.offset = 0; + value.reg = src->def; + } else { + /* TODO: parallelcopy */ + break; + } + } + + return value; +} + +/* This represents an entry in the merge set, and consists of a register + + * offset from the merge set base. + */ +struct merge_def { + struct ir3_register *reg; + unsigned offset; +}; + +static bool +can_skip_interference(const struct merge_def *a, const struct merge_def *b) +{ + unsigned a_start = a->offset; + unsigned b_start = b->offset; + unsigned a_end = a_start + reg_size(a->reg); + unsigned b_end = b_start + reg_size(b->reg); + + /* Registers that don't overlap never interfere */ + if (a_end <= b_start || b_end <= a_start) + return true; + + /* Disallow skipping interference unless one definition contains the + * other. This restriction is important for register allocation, because + * it means that at any given point in the program, the live values in a + * given merge set will form a tree. If they didn't, then one live value + * would partially overlap another, and they would have overlapping live + * ranges because they're live at the same point. This simplifies register + * allocation and spilling. + */ + if (!((a_start <= b_start && a_end >= b_end) || + (b_start <= a_start && b_end >= a_end))) + return false; + + /* For each register, chase the intersection of a and b to find the + * ultimate source. 
+ */ + unsigned start = MAX2(a_start, b_start); + unsigned end = MIN2(a_end, b_end); + struct def_value a_value = + chase_copies((struct def_value) { + .reg = a->reg, + .offset = start - a_start, + .size = end - start, + }); + struct def_value b_value = + chase_copies((struct def_value) { + .reg = b->reg, + .offset = start - b_start, + .size = end - start, + }); + return a_value.reg == b_value.reg && a_value.offset == b_value.offset; +} + +static struct ir3_merge_set * +get_merge_set(struct ir3_register *def) +{ + if (def->merge_set) + return def->merge_set; + + struct ir3_merge_set *set = ralloc(def, struct ir3_merge_set); + set->preferred_reg = ~0; + set->interval_start = ~0; + set->size = reg_size(def); + set->alignment = (def->flags & IR3_REG_HALF) ? 1 : 2; + set->regs_count = 1; + set->regs = ralloc(set, struct ir3_register *); + set->regs[0] = def; + + return set; +} + +/* Merges b into a */ +static struct ir3_merge_set * +merge_merge_sets(struct ir3_merge_set *a, struct ir3_merge_set *b, + int b_offset) +{ + if (b_offset < 0) + return merge_merge_sets(b, a, -b_offset); + + struct ir3_register **new_regs = + rzalloc_array(a, struct ir3_register *, a->regs_count + b->regs_count); + + unsigned a_index = 0, b_index = 0, new_index = 0; + for (; a_index < a->regs_count || b_index < b->regs_count; new_index++) { + if (b_index < b->regs_count && + (a_index == a->regs_count || + def_after(a->regs[a_index], b->regs[b_index]))) { + new_regs[new_index] = b->regs[b_index++]; + new_regs[new_index]->merge_set_offset += b_offset; + } else { + new_regs[new_index] = a->regs[a_index++]; + } + new_regs[new_index]->merge_set = a; + } + + assert(new_index == a->regs_count + b->regs_count); + + /* Technically this should be the lcm, but because alignment is only 1 or + * 2 so far this should be ok. + */ + a->alignment = MAX2(a->alignment, b->alignment); + a->regs_count += b->regs_count; + ralloc_free(a->regs); + a->regs = new_regs; + a->size = MAX2(a->size, b->size + b_offset); + + return a; +} + +static bool +merge_sets_interfere(struct ir3_liveness *live, + struct ir3_merge_set *a, struct ir3_merge_set *b, + int b_offset) +{ + if (b_offset < 0) + return merge_sets_interfere(live, b, a, -b_offset); + + struct merge_def dom[a->regs_count + b->regs_count]; + unsigned a_index = 0, b_index = 0; + int dom_index = -1; + + /* Reject trying to merge the sets if the alignment doesn't work out */ + if (b_offset % a->alignment != 0) + return true; + + while (a_index < a->regs_count || b_index < b->regs_count) { + struct merge_def current; + if (a_index == a->regs_count) { + current.reg = b->regs[b_index]; + current.offset = current.reg->merge_set_offset + b_offset; + b_index++; + } else if (b_index == b->regs_count) { + current.reg = a->regs[a_index]; + current.offset = current.reg->merge_set_offset; + a_index++; + } else { + if (def_after(b->regs[b_index], a->regs[a_index])) { + current.reg = a->regs[a_index]; + current.offset = current.reg->merge_set_offset; + a_index++; + } else { + current.reg = b->regs[b_index]; + current.offset = current.reg->merge_set_offset + b_offset; + b_index++; + } + } + + while (dom_index >= 0 && + !def_dominates(dom[dom_index].reg, current.reg)) { + dom_index--; + } + + /* TODO: in the original paper, just dom[dom_index] needs to be + * checked for interference. 
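As a rough illustration of that stack-based walk, here is a toy version in which dominance and liveness are reduced to integer comparisons: each def carries a precomputed preorder interval for its defining block in the dominance tree plus the ip of its last use. toy_def, dominates and any_interference are invented for the sketch and ignore the value-chasing and sub-register extensions entirely:

#include <assert.h>
#include <stdbool.h>

struct toy_def {
   int ip;                   /* position in a dominance-tree preorder walk */
   int dom_entry, dom_exit;  /* preorder interval of the defining block */
   int last_use_ip;          /* conservative end of the live range */
};

static bool
dominates(const struct toy_def *a, const struct toy_def *b)
{
   /* a's block dominates b's block, and a comes first within a block. */
   return a->dom_entry <= b->dom_entry && b->dom_entry < a->dom_exit &&
          a->ip <= b->ip;
}

/* defs[] must be sorted by ip. Maintain the stack of defs that dominate
 * the current one; any of them that is still live at the current def's ip
 * interferes with it.
 */
bool
any_interference(const struct toy_def *defs, int n)
{
   const struct toy_def *stack[64];
   int top = -1;

   assert(n <= 64);

   for (int i = 0; i < n; i++) {
      /* Pop defs that do not dominate the current one. */
      while (top >= 0 && !dominates(stack[top], &defs[i]))
         top--;

      /* Check the whole remaining stack, as the pass above does. */
      for (int j = 0; j <= top; j++) {
         if (stack[j]->last_use_ip > defs[i].ip)
            return true;
      }

      stack[++top] = &defs[i];
   }

   return false;
}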
We implement the value-chasing extension + * as well as support for sub-registers, which complicates this + * significantly because it's no longer the case that if a dominates b + * dominates c and a and b don't interfere then we only need to check + * interference between b and c to be sure a and c don't interfere -- + * this means we may have to check for interference against values + * higher in the stack then dom[dom_index]. In the paper there's a + * description of a way to do less interference tests with the + * value-chasing extension, but we'd have to come up with something + * ourselves for handling the similar problems that come up with + * allowing values to contain subregisters. For now we just test + * everything in the stack. + */ + for (int i = 0; i <= dom_index; i++) { + if (can_skip_interference(¤t, &dom[i])) + continue; + + /* Ok, now we actually have to check interference. Since we know + * that dom[i] dominates current, this boils down to checking + * whether dom[i] is live after current. + */ + if (ir3_def_live_after(live, dom[i].reg, current.reg->instr)) + return true; + } + + dom[++dom_index] = current; + } + + return false; +} + +static void +try_merge_defs(struct ir3_liveness *live, + struct ir3_register *a, struct ir3_register *b, + unsigned b_offset) +{ + struct ir3_merge_set *a_set = get_merge_set(a); + struct ir3_merge_set *b_set = get_merge_set(b); + + if (a_set == b_set) { + /* Note: Even in this case we may not always successfully be able to + * coalesce this copy, if the offsets don't line up. But in any + * case, we can't do anything. + */ + return; + } + + int b_set_offset = a->merge_set_offset + b_offset - b->merge_set_offset; + + if (!merge_sets_interfere(live, a_set, b_set, b_set_offset)) + merge_merge_sets(a_set, b_set, b_set_offset); +} + +static void +coalesce_phi(struct ir3_liveness *live, + struct ir3_instruction *phi) +{ + for (unsigned i = 1; i < phi->regs_count; i++) { + if (phi->regs[i]->def) + try_merge_defs(live, phi->regs[0], phi->regs[i]->def, 0); + } +} + +static void +aggressive_coalesce_parallel_copy(struct ir3_liveness *live, + struct ir3_instruction *pcopy) +{ + unsigned copies = pcopy->regs_count / 2; + for (unsigned i = 0; i < copies; i++) { + if (!(pcopy->regs[copies + i]->flags & IR3_REG_SSA)) + continue; + try_merge_defs(live, pcopy->regs[i], pcopy->regs[copies + i]->def, 0); + } +} + +static void +aggressive_coalesce_split(struct ir3_liveness *live, + struct ir3_instruction *split) +{ + try_merge_defs(live, split->regs[1]->def, split->regs[0], + split->split.off * reg_elem_size(split->regs[0])); +} + +static void +aggressive_coalesce_collect(struct ir3_liveness *live, + struct ir3_instruction *collect) +{ + for (unsigned i = 1, offset = 0; i < collect->regs_count; + offset += reg_elem_size(collect->regs[i]), i++) { + if (!(collect->regs[i]->flags & IR3_REG_SSA)) + continue; + try_merge_defs(live, collect->regs[0], collect->regs[i]->def, offset); + } +} + +static void +create_parallel_copy(struct ir3_block *block) +{ + for (unsigned i = 0; i < 2; i++) { + if (!block->successors[i]) + continue; + + struct ir3_block *succ = block->successors[i]; + + unsigned pred_idx = ir3_block_get_pred_index(succ, block); + + unsigned phi_count = 0; + foreach_instr (phi, &succ->instr_list) { + if (phi->opc != OPC_META_PHI) + break; + + /* Avoid undef */ + if ((phi->regs[1 + pred_idx]->flags & IR3_REG_SSA) && + !phi->regs[1 + pred_idx]->def) + continue; + + /* We don't support critical edges. 
If we were to support them, + * we'd need to insert parallel copies after the phi node to solve + * the lost-copy problem. + */ + assert(i == 0 && !block->successors[1]); + phi_count++; + } + + if (phi_count == 0) + continue; + + struct ir3_register *src[phi_count]; + unsigned j = 0; + foreach_instr (phi, &succ->instr_list) { + if (phi->opc != OPC_META_PHI) + break; + if ((phi->regs[1 + pred_idx]->flags & IR3_REG_SSA) && + !phi->regs[1 + pred_idx]->def) + continue; + src[j++] = phi->regs[pred_idx + 1]; + } + assert(j == phi_count); + + struct ir3_instruction *pcopy = + ir3_instr_create(block, OPC_META_PARALLEL_COPY, 2 * phi_count); + + for (j = 0; j < phi_count; j++) { + struct ir3_register *reg = __ssa_dst(pcopy); + reg->flags |= src[j]->flags & (IR3_REG_HALF | IR3_REG_ARRAY); + reg->size = reg_elems(src[j]); + } + + for (j = 0; j < phi_count; j++) { + pcopy->regs[pcopy->regs_count++] = ir3_reg_clone(block->shader, src[j]); + } + + j = 0; + foreach_instr (phi, &succ->instr_list) { + if (phi->opc != OPC_META_PHI) + break; + if ((phi->regs[1 + pred_idx]->flags & IR3_REG_SSA) && + !phi->regs[1 + pred_idx]->def) + continue; + phi->regs[1 + pred_idx]->def = pcopy->regs[j]; + phi->regs[1 + pred_idx]->flags = pcopy->regs[j]->flags & ~IR3_REG_DEST; + j++; + } + assert(j == phi_count); + } +} + +void +ir3_create_parallel_copies(struct ir3 *ir) +{ + foreach_block (block, &ir->block_list) { + create_parallel_copy(block); + } +} + +static void +index_merge_sets(struct ir3 *ir) +{ + unsigned offset = 0; + foreach_block (block, &ir->block_list) { + foreach_instr (instr, &block->instr_list) { + for (unsigned i = 0; i < instr->regs_count; i++) { + struct ir3_register *dst = instr->regs[i]; + if (!(dst->flags & IR3_REG_DEST)) + continue; + + unsigned dst_offset; + struct ir3_merge_set *merge_set = dst->merge_set; + unsigned size = reg_size(dst); + if (merge_set) { + if (merge_set->interval_start == ~0) { + merge_set->interval_start = offset; + offset += merge_set->size; + } + dst_offset = merge_set->interval_start + dst->merge_set_offset; + } else { + dst_offset = offset; + offset += size; + } + + dst->interval_start = dst_offset; + dst->interval_end = dst_offset + size; + } + } + } +} + +#define RESET "\x1b[0m" +#define BLUE "\x1b[0;34m" +#define SYN_SSA(x) BLUE x RESET + +static void +dump_merge_sets(struct ir3 *ir) +{ + printf("merge sets:\n"); + struct set *merge_sets = _mesa_pointer_set_create(NULL); + foreach_block (block, &ir->block_list) { + foreach_instr (instr, &block->instr_list) { + for (unsigned i = 0; i < instr->regs_count; i++) { + struct ir3_register *dst = instr->regs[i]; + if (!(dst->flags & IR3_REG_DEST)) + continue; + + struct ir3_merge_set *merge_set = dst->merge_set; + if (!merge_set || _mesa_set_search(merge_sets, merge_set)) + continue; + + printf("merge set, size %u, align %u:\n", merge_set->size, merge_set->alignment); + for (unsigned j = 0; j < merge_set->regs_count; j++) { + struct ir3_register *reg = merge_set->regs[j]; + printf("\t"SYN_SSA("ssa_%u")":%u, offset %u\n", reg->instr->serialno, + reg->name, reg->merge_set_offset); + } + + _mesa_set_add(merge_sets, merge_set); + } + } + } + + ralloc_free(merge_sets); +} + +void +ir3_merge_regs(struct ir3_liveness *live, struct ir3 *ir) +{ + index_instrs(ir3_start_block(ir), 0); + + /* First pass: coalesce phis, which must be together. 
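+	 * If a phi and all of its sources end up in the same merge set, they can
+	 * all be given the same physical register and the phi itself needs no
+	 * copies; otherwise the parallel copies created by
+	 * ir3_create_parallel_copies() above have to shuffle the values on the
+	 * incoming edges.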
*/ + foreach_block (block, &ir->block_list) { + foreach_instr (instr, &block->instr_list) { + if (instr->opc != OPC_META_PHI) + break; + + coalesce_phi(live, instr); + } + } + + /* Second pass: aggressively coalesce parallelcopy, split, collect */ + foreach_block (block, &ir->block_list) { + foreach_instr (instr, &block->instr_list) { + switch (instr->opc) { + case OPC_META_SPLIT: + aggressive_coalesce_split(live, instr); + break; + case OPC_META_COLLECT: + aggressive_coalesce_collect(live, instr); + break; + case OPC_META_PARALLEL_COPY: + aggressive_coalesce_parallel_copy(live, instr); + break; + default: + break; + } + } + } + + index_merge_sets(ir); + + if (ir3_shader_debug & IR3_DBG_RAMSGS) + dump_merge_sets(ir); +} + diff --git a/src/freedreno/ir3/ir3_postsched.c b/src/freedreno/ir3/ir3_postsched.c index 6a79261..3b246ed 100644 --- a/src/freedreno/ir3/ir3_postsched.c +++ b/src/freedreno/ir3/ir3_postsched.c @@ -728,23 +728,14 @@ cleanup_self_movs(struct ir3 *ir) { foreach_block (block, &ir->block_list) { foreach_instr_safe (instr, &block->instr_list) { - - foreach_src (reg, instr) { - if (!reg->def) - continue; - - if (is_self_mov(reg->def->instr)) { - list_delinit(®->def->instr->node); - reg->def = reg->def->instr->regs[1]->def; - } - } - for (unsigned i = 0; i < instr->deps_count; i++) { if (instr->deps[i] && is_self_mov(instr->deps[i])) { - list_delinit(&instr->deps[i]->node); - instr->deps[i] = instr->deps[i]->regs[1]->def->instr; + instr->deps[i] = NULL; } } + + if (is_self_mov(instr)) + list_delinit(&instr->node); } } } diff --git a/src/freedreno/ir3/ir3_print.c b/src/freedreno/ir3/ir3_print.c index d1fac82..3646421 100644 --- a/src/freedreno/ir3/ir3_print.c +++ b/src/freedreno/ir3/ir3_print.c @@ -165,6 +165,8 @@ static void print_instr_name(struct ir3_instruction *instr, bool flags) static void print_ssa_def_name(struct ir3_register *reg) { printf(SYN_SSA("ssa_%u"), reg->instr->serialno); + if (reg->name != 0) + printf(":%u", reg->name); } static void print_ssa_name(struct ir3_register *reg, bool dst) @@ -177,6 +179,9 @@ static void print_ssa_name(struct ir3_register *reg, bool dst) } else { print_ssa_def_name(reg); } + + if (reg->num != INVALID_REG) + printf("("SYN_REG("r%u.%c")")", reg_num(reg), "xyzw"[reg_comp(reg)]); } static void print_reg_name(struct ir3_instruction *instr, struct ir3_register *reg) @@ -189,6 +194,11 @@ static void print_reg_name(struct ir3_instruction *instr, struct ir3_register *r else if (reg->flags & (IR3_REG_FABS | IR3_REG_SABS)) printf("(abs)"); + if (reg->flags & IR3_REG_FIRST_KILL) + printf("(kill)"); + if (reg->flags & IR3_REG_UNUSED) + printf("(unused)"); + if (reg->flags & IR3_REG_R) printf("(r)"); diff --git a/src/freedreno/ir3/ir3_ra.c b/src/freedreno/ir3/ir3_ra.c index 7151a2e..a8a523f 100644 --- a/src/freedreno/ir3/ir3_ra.c +++ b/src/freedreno/ir3/ir3_ra.c @@ -1,4 +1,5 @@ /* + * Copyright (C) 2021 Valve Corporation * Copyright (C) 2014 Rob Clark * * Permission is hereby granted, free of charge, to any person obtaining a @@ -19,1556 +20,1951 @@ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. 
- * - * Authors: - * Rob Clark */ +#include "ir3_ra.h" +#include "ir3_shader.h" +#include "util/rb_tree.h" #include "util/u_math.h" -#include "util/register_allocate.h" -#include "util/ralloc.h" -#include "util/bitset.h" -#include "ir3.h" -#include "ir3_shader.h" -#include "ir3_ra.h" +/* This file implements an SSA-based register allocator. Unlike other + * SSA-based allocators, it handles vector split/collect "smartly," meaning + * that multiple values may share the same register interval. From the + * perspective of the allocator itself, only the top-level intervals matter, + * and the allocator is only concerned with allocating top-level intervals, + * which may mean moving other top-level intervals around. Other intervals, + * like the destination of a split instruction or the source of a collect + * instruction, are "locked" to their parent interval. The details of this are + * mostly handled by ir3_merge_regs and ir3_reg_ctx. + * + * We currently don't do any backtracking, but we do use the merge sets as a + * form of affinity to try to avoid moves from phis/splits/collects. Each + * merge set is what a more "classic" graph-coloring or live-range based + * allocator would consider a single register, but here we use it as merely a + * hint, except when multiple overlapping values are live at the same time. + * Each merge set has a "preferred" register, and we try to honor that when + * allocating values in the merge set. + */ +/* ir3_reg_ctx implementation. */ -#ifdef DEBUG -#define RA_DEBUG (ir3_shader_debug & IR3_DBG_RAMSGS) -#else -#define RA_DEBUG 0 -#endif -#define d(fmt, ...) do { if (RA_DEBUG) { \ - printf("RA: "fmt"\n", ##__VA_ARGS__); \ -} } while (0) +static int +ir3_reg_interval_cmp(const struct rb_node *node, const void *data) +{ + physreg_t reg = *(const physreg_t *)data; + const struct ir3_reg_interval *interval = ir3_rb_node_to_interval_const(node); + if (interval->reg->interval_start > reg) + return -1; + else if (interval->reg->interval_end <= reg) + return 1; + else + return 0; +} -#define di(instr, fmt, ...) do { if (RA_DEBUG) { \ - printf("RA: "fmt": ", ##__VA_ARGS__); \ - ir3_print_instr(instr); \ -} } while (0) +static struct ir3_reg_interval * +ir3_reg_interval_search(struct rb_tree *tree, unsigned offset) +{ + struct rb_node *node = rb_tree_search(tree, &offset, ir3_reg_interval_cmp); + return node ? ir3_rb_node_to_interval(node) : NULL; +} -/* - * Register Assignment: - * - * Uses the register_allocate util, which implements graph coloring - * algo with interference classes. To handle the cases where we need - * consecutive registers (for example, texture sample instructions), - * we model these as larger (double/quad/etc) registers which conflict - * with the corresponding registers in other classes. - * - * Additionally we create additional classes for half-regs, which - * do not conflict with the full-reg classes. We do need at least - * sizes 1-4 (to deal w/ texture sample instructions output to half- - * reg). At the moment we don't create the higher order half-reg - * classes as half-reg frequently does not have enough precision - * for texture coords at higher resolutions. - * - * There are some additional cases that we need to handle specially, - * as the graph coloring algo doesn't understand "partial writes". - * For example, a sequence like: - * - * add r0.z, ... - * sam (f32)(xy)r0.x, ... - * ... - * sam (f32)(xyzw)r0.w, r0.x, ... 
; 3d texture, so r0.xyz are coord - * - * In this scenario, we treat r0.xyz as class size 3, which is written - * (from a use/def perspective) at the 'add' instruction and ignore the - * subsequent partial writes to r0.xy. So the 'add r0.z, ...' is the - * defining instruction, as it is the first to partially write r0.xyz. - * - * To address the fragmentation that this can potentially cause, a - * two pass register allocation is used. After the first pass the - * assignment of scalars is discarded, but the assignment of vecN (for - * N > 1) is used to pre-color in the second pass, which considers - * only scalars. - * - * Arrays of arbitrary size are handled via pre-coloring a consecutive - * sequence of registers. Additional scalar (single component) reg - * names are allocated starting at ctx->class_base[total_class_count] - * (see arr->base), which are pre-colored. In the use/def graph direct - * access is treated as a single element use/def, and indirect access - * is treated as use or def of all array elements. (Only the first - * def is tracked, in case of multiple indirect writes, etc.) - * - * TODO arrays that fit in one of the pre-defined class sizes should - * not need to be pre-colored, but instead could be given a normal - * vreg name. (Ignoring this for now since it is a good way to work - * out the kinks with arbitrary sized arrays.) - * - * TODO might be easier for debugging to split this into two passes, - * the first assigning vreg names in a way that we could ir3_print() - * the result. +static struct ir3_reg_interval * +ir3_reg_interval_search_sloppy(struct rb_tree *tree, unsigned offset) +{ + struct rb_node *node = rb_tree_search_sloppy(tree, &offset, ir3_reg_interval_cmp); + return node ? ir3_rb_node_to_interval(node) : NULL; +} + +/* Get the interval covering the reg, or the closest to the right if it + * doesn't exist. */ +static struct ir3_reg_interval * +ir3_reg_interval_search_right(struct rb_tree *tree, unsigned offset) +{ + struct ir3_reg_interval *interval = ir3_reg_interval_search_sloppy(tree, offset); + if (!interval) { + return NULL; + } else if (interval->reg->interval_end > offset) { + return interval; + } else { + /* There is no interval covering reg, and ra_file_search_sloppy() + * returned the closest range to the left, so the next interval to the + * right should be the closest to the right. + */ + return ir3_reg_interval_next_or_null(interval); + } +} + +static int +ir3_reg_interval_insert_cmp(const struct rb_node *_a, const struct rb_node *_b) +{ + const struct ir3_reg_interval *a = ir3_rb_node_to_interval_const(_a); + const struct ir3_reg_interval *b = ir3_rb_node_to_interval_const(_b); + return b->reg->interval_start - a->reg->interval_start; +} +static void +interval_insert(struct ir3_reg_ctx *ctx, struct rb_tree *tree, + struct ir3_reg_interval *interval) +{ + struct ir3_reg_interval *right = + ir3_reg_interval_search_right(tree, interval->reg->interval_start); + if (right && right->reg->interval_start < interval->reg->interval_end) { + /* We disallow trees where different members have different half-ness. + * This means that we can't treat bitcasts as copies like normal + * split/collect, so something like this would require an extra copy + * in mergedregs mode, and count as 4 half-units of register pressure + * instead of 2: + * + * f16vec2 foo = unpackFloat2x16(bar) + * ... = foo.x + * ... = bar + * + * However, relaxing this rule would open a huge can of worms. 
What + * happens when there's a vector of 16 things, and the fifth element + * has been bitcasted as a half-reg? Would that element alone have to + * be small enough to be used as a half-reg source? Let's keep that + * can of worms firmly shut for now. + */ + assert((interval->reg->flags & IR3_REG_HALF) == + (right->reg->flags & IR3_REG_HALF)); -static struct ir3_instruction * name_to_instr(struct ir3_ra_ctx *ctx, unsigned name); + if (right->reg->interval_end <= interval->reg->interval_end && + right->reg->interval_start >= interval->reg->interval_start) { + /* Check if we're inserting something that's already inserted */ + assert(interval != right); -static bool name_is_array(struct ir3_ra_ctx *ctx, unsigned name); -static struct ir3_array * name_to_array(struct ir3_ra_ctx *ctx, unsigned name); + /* "right" is contained in "interval" and must become a child of + * it. There may be further children too. + */ + for (struct ir3_reg_interval *next = ir3_reg_interval_next(right); + right && right->reg->interval_start < interval->reg->interval_end; + right = next, next = ir3_reg_interval_next_or_null(next)) { + /* "right" must be contained in "interval." */ + assert(right->reg->interval_end <= interval->reg->interval_end); + assert((interval->reg->flags & IR3_REG_HALF) == + (right->reg->flags & IR3_REG_HALF)); + if (!right->parent) + ctx->interval_delete(ctx, right); + right->parent = interval; + rb_tree_remove(tree, &right->node); + rb_tree_insert(&interval->children, &right->node, + ir3_reg_interval_insert_cmp); + } + } else { + /* "right" must contain "interval," since intervals must form a + * tree. + */ + assert(right->reg->interval_start <= interval->reg->interval_start); + interval->parent = right; + interval_insert(ctx, &right->children, interval); + return; + } + } -/* does it conflict? 
*/ -static inline bool -intersects(unsigned a_start, unsigned a_end, unsigned b_start, unsigned b_end) -{ - return !((a_start >= b_end) || (b_start >= a_end)); + if (!interval->parent) + ctx->interval_add(ctx, interval); + rb_tree_insert(tree, &interval->node, ir3_reg_interval_insert_cmp); + interval->inserted = true; } -static bool -instr_before(struct ir3_instruction *a, struct ir3_instruction *b) +void +ir3_reg_interval_insert(struct ir3_reg_ctx *ctx, struct ir3_reg_interval *interval) { - if (a->flags & IR3_INSTR_UNUSED) - return false; - return (a->ip < b->ip); + interval_insert(ctx, &ctx->intervals, interval); } -static struct ir3_instruction * -get_definer(struct ir3_ra_ctx *ctx, struct ir3_instruction *instr, - int *sz, int *off) +void +ir3_reg_interval_remove(struct ir3_reg_ctx *ctx, struct ir3_reg_interval *interval) { - struct ir3_ra_instr_data *id = &ctx->instrd[instr->ip]; - struct ir3_instruction *d = NULL; + if (interval->parent) { + rb_tree_remove(&interval->parent->children, &interval->node); + } else { + ctx->interval_delete(ctx, interval); + rb_tree_remove(&ctx->intervals, &interval->node); + } + + rb_tree_foreach_safe(struct ir3_reg_interval, child, &interval->children, node) { + rb_tree_remove(&interval->children, &child->node); + child->parent = interval->parent; - if (ctx->scalar_pass) { - id->defn = instr; - id->off = 0; - id->sz = 1; /* considering things as N scalar regs now */ + if (interval->parent) { + rb_tree_insert(&child->parent->children, &child->node, + ir3_reg_interval_insert_cmp); + } else { + ctx->interval_readd(ctx, interval, child); + rb_tree_insert(&ctx->intervals, &child->node, + ir3_reg_interval_insert_cmp); + } } - if (id->defn) { - *sz = id->sz; - *off = id->off; - return id->defn; + interval->inserted = false; +} + +void +ir3_reg_interval_remove_all(struct ir3_reg_ctx *ctx, struct ir3_reg_interval *interval) +{ + assert(!interval->parent); + + ctx->interval_delete(ctx, interval); + rb_tree_remove(&ctx->intervals, &interval->node); +} + +static void +interval_dump(struct ir3_reg_interval *interval, unsigned indent) +{ + for (unsigned i = 0; i < indent; i++) + printf("\t"); + printf("reg %u start %u\n", interval->reg->name, interval->reg->interval_start); + + rb_tree_foreach(struct ir3_reg_interval, child, &interval->children, node) { + interval_dump(child, indent + 1); } - if (instr->opc == OPC_META_COLLECT) { - /* What about the case where collect is subset of array, we - * need to find the distance between where actual array starts - * and collect.. that probably doesn't happen currently. - */ - int dsz, doff; + for (unsigned i = 0; i < indent; i++) + printf("\t"); + printf("reg %u end %u\n", interval->reg->name, interval->reg->interval_end); +} - /* note: don't use foreach_ssa_src as this gets called once - * while assigning regs (which clears SSA flag) - */ - foreach_src_n (src, n, instr) { - struct ir3_instruction *dd; - if (!src->def) - continue; +void +ir3_reg_interval_dump(struct ir3_reg_interval *interval) +{ + interval_dump(interval, 0); +} - dd = get_definer(ctx, src->def->instr, &dsz, &doff); +/* These are the core datastructures used by the register allocator. 
First + * ra_interval and ra_file, which are used for intra-block tracking and use + * the ir3_reg_ctx infrastructure: + */ - if ((!d) || instr_before(dd, d)) { - d = dd; - *sz = dsz; - *off = doff - n; - } - } +struct ra_interval { + struct ir3_reg_interval interval; - } else if (instr->cp.right || instr->cp.left) { - /* covers also the meta:fo case, which ends up w/ single - * scalar instructions for each component: - */ - struct ir3_instruction *f = ir3_neighbor_first(instr); + struct rb_node physreg_node; + physreg_t physreg_start, physreg_end; - /* by definition, the entire sequence forms one linked list - * of single scalar register nodes (even if some of them may - * be splits from a texture sample (for example) instr. We - * just need to walk the list finding the first element of - * the group defined (lowest ip) - */ - int cnt = 0; + /* True if this is a source of the current instruction which is entirely + * killed. This means we can allocate the dest over it, but we can't break + * it up. + */ + bool is_killed; - /* need to skip over unused in the group: */ - while (f && (f->flags & IR3_INSTR_UNUSED)) { - f = f->cp.right; - cnt++; - } + /* True if this interval cannot be moved from its position. This is only + * used for precolored inputs to ensure that other inputs don't get + * allocated on top of them. + */ + bool frozen; +}; - while (f) { - if ((!d) || instr_before(f, d)) - d = f; - if (f == instr) - *off = cnt; - f = f->cp.right; - cnt++; - } +struct ra_file { + struct ir3_reg_ctx reg_ctx; + + BITSET_DECLARE(available, RA_MAX_FILE_SIZE); + BITSET_DECLARE(available_to_evict, RA_MAX_FILE_SIZE); + + struct rb_tree physreg_intervals; + + unsigned size; + unsigned start; +}; + +/* State for inter-block tracking. When we split a live range to make space + * for a vector, we may need to insert fixup code when a block has multiple + * predecessors that have moved the same live value to different registers. + * This keeps track of state required to do that. + */ + +struct ra_block_state { + /* Map of defining ir3_register -> physreg it was allocated to at the end + * of the block. + */ + struct hash_table *renames; + + /* For loops, we need to process a block before all its predecessors have + * been processed. In particular, we need to pick registers for values + * without knowing if all the predecessors have been renamed. This keeps + * track of the registers we chose so that when we visit the back-edge we + * can move them appropriately. If all predecessors have been visited + * before this block is visited then we don't need to fill this out. This + * is a map from ir3_register -> physreg. + */ + struct hash_table *entry_regs; + + /* True if the block has been visited and "renames" is complete. + */ + bool visited; +}; + +struct ra_parallel_copy { + struct ra_interval *interval; + physreg_t src; +}; + +/* The main context: */ + +struct ra_ctx { + /* r0.x - r47.w. On a6xx with merged-regs, hr0.x-hr47.w go into the bottom + * half of this file too. + */ + struct ra_file full; + + /* hr0.x - hr63.w, only used without merged-regs. */ + struct ra_file half; + + /* Shared regs. 
*/ + struct ra_file shared; + + struct ir3_liveness *live; + + struct ir3_block *block; + + const struct ir3_compiler *compiler; + gl_shader_stage stage; - *sz = cnt; + /* Pending moves of top-level intervals that will be emitted once we're + * finished: + */ + DECLARE_ARRAY(struct ra_parallel_copy, parallel_copies); + + struct ra_interval *intervals; + struct ra_block_state *blocks; + + bool merged_regs; +}; + +#define foreach_interval(interval, file) \ + rb_tree_foreach(struct ra_interval, interval, &(file)->physreg_intervals, physreg_node) +#define foreach_interval_rev(interval, file) \ + rb_tree_foreach(struct ra_interval, interval, &(file)->physreg_intervals, physreg_node) +#define foreach_interval_safe(interval, file) \ + rb_tree_foreach_safe(struct ra_interval, interval, &(file)->physreg_intervals, physreg_node) +#define foreach_interval_rev_safe(interval, file) \ + rb_tree_foreach_rev_safe(struct ra_interval, interval, &(file)->physreg_intervals, physreg_node) + +static struct ra_interval * +rb_node_to_interval(struct rb_node *node) +{ + return rb_node_data(struct ra_interval, node, physreg_node); +} + +static const struct ra_interval * +rb_node_to_interval_const(const struct rb_node *node) +{ + return rb_node_data(struct ra_interval, node, physreg_node); +} + +static struct ra_interval * +ra_interval_next(struct ra_interval *interval) +{ + struct rb_node *next = rb_node_next(&interval->physreg_node); + return next ? rb_node_to_interval(next) : NULL; +} +static struct ra_interval * +ra_interval_next_or_null(struct ra_interval *interval) +{ + return interval ? ra_interval_next(interval) : NULL; +} + +static int +ra_interval_cmp(const struct rb_node *node, const void *data) +{ + physreg_t reg = *(const physreg_t *)data; + const struct ra_interval *interval = rb_node_to_interval_const(node); + if (interval->physreg_start > reg) + return -1; + else if (interval->physreg_end <= reg) + return 1; + else + return 0; +} + +static struct ra_interval * +ra_interval_search_sloppy(struct rb_tree *tree, physreg_t reg) +{ + struct rb_node *node = rb_tree_search_sloppy(tree, ®, ra_interval_cmp); + return node ? rb_node_to_interval(node) : NULL; +} + +/* Get the interval covering the reg, or the closest to the right if it + * doesn't exist. + */ +static struct ra_interval * +ra_interval_search_right(struct rb_tree *tree, physreg_t reg) +{ + struct ra_interval *interval = ra_interval_search_sloppy(tree, reg); + if (!interval) { + return NULL; + } else if (interval->physreg_end > reg) { + return interval; } else { - /* second case is looking directly at the instruction which - * produces multiple values (eg, texture sample), rather - * than the split nodes that point back to that instruction. - * This isn't quite right, because it may be part of a larger - * group, such as: - * - * sam (f32)(xyzw)r0.x, ... - * add r1.x, ... - * add r1.y, ... - * sam (f32)(xyzw)r2.x, r0.w <-- (r0.w, r1.x, r1.y) - * - * need to come up with a better way to handle that case. + /* There is no interval covering reg, and ra_file_search_sloppy() + * returned the closest range to the left, so the next interval to the + * right should be the closest to the right. 
*/ - if (instr->address) { - *sz = instr->regs[0]->size; - } else { - *sz = util_last_bit(instr->regs[0]->wrmask); - } - *off = 0; - d = instr; + return ra_interval_next_or_null(interval); } +} + +static struct ra_interval * +ra_file_search_right(struct ra_file *file, physreg_t reg) +{ + return ra_interval_search_right(&file->physreg_intervals, reg); +} + +static int +ra_interval_insert_cmp(const struct rb_node *_a, const struct rb_node *_b) +{ + const struct ra_interval *a = rb_node_to_interval_const(_a); + const struct ra_interval *b = rb_node_to_interval_const(_b); + return b->physreg_start - a->physreg_start; +} + +static struct ra_interval * +ir3_reg_interval_to_ra_interval(struct ir3_reg_interval *interval) +{ + return rb_node_data(struct ra_interval, interval, interval); +} - if (d->opc == OPC_META_SPLIT) { - struct ir3_instruction *dd; - int dsz, doff; +static struct ra_file * +ir3_reg_ctx_to_file(struct ir3_reg_ctx *ctx) +{ + return rb_node_data(struct ra_file, ctx, reg_ctx); +} - dd = get_definer(ctx, d->regs[1]->def->instr, &dsz, &doff); +static void +interval_add(struct ir3_reg_ctx *ctx, struct ir3_reg_interval *_interval) +{ + struct ra_interval *interval = ir3_reg_interval_to_ra_interval(_interval); + struct ra_file *file = ir3_reg_ctx_to_file(ctx); - /* by definition, should come before: */ - ra_assert(ctx, instr_before(dd, d)); + /* We can assume in this case that physreg_start/physreg_end is already + * initialized. + */ + for (physreg_t i = interval->physreg_start; i < interval->physreg_end; i++) { + BITSET_CLEAR(file->available, i); + BITSET_CLEAR(file->available_to_evict, i); + } - *sz = MAX2(*sz, dsz); + rb_tree_insert(&file->physreg_intervals, &interval->physreg_node, + ra_interval_insert_cmp); +} - if (instr->opc == OPC_META_SPLIT) - *off = MAX2(*off, instr->split.off); +static void +interval_delete(struct ir3_reg_ctx *ctx, struct ir3_reg_interval *_interval) +{ + struct ra_interval *interval = ir3_reg_interval_to_ra_interval(_interval); + struct ra_file *file = ir3_reg_ctx_to_file(ctx); - d = dd; + for (physreg_t i = interval->physreg_start; i < interval->physreg_end; i++) { + BITSET_SET(file->available, i); + BITSET_SET(file->available_to_evict, i); } - ra_assert(ctx, d->opc != OPC_META_SPLIT); + rb_tree_remove(&file->physreg_intervals, &interval->physreg_node); +} + +static void +interval_readd(struct ir3_reg_ctx *ctx, struct ir3_reg_interval *_parent, + struct ir3_reg_interval *_child) +{ + struct ra_interval *parent = ir3_reg_interval_to_ra_interval(_parent); + struct ra_interval *child = ir3_reg_interval_to_ra_interval(_child); - id->defn = d; - id->sz = *sz; - id->off = *off; + child->physreg_start = parent->physreg_start + + (child->interval.reg->interval_start - parent->interval.reg->interval_start); + child->physreg_end = child->physreg_start + + (child->interval.reg->interval_end - child->interval.reg->interval_start); - return d; + interval_add(ctx, _child); } + static void -ra_block_find_definers(struct ir3_ra_ctx *ctx, struct ir3_block *block) +ra_file_init(struct ra_file *file) { - foreach_instr (instr, &block->instr_list) { - struct ir3_ra_instr_data *id = &ctx->instrd[instr->ip]; - if (instr->regs_count == 0) - continue; - /* couple special cases: */ - if (writes_addr0(instr) || writes_addr1(instr) || writes_pred(instr)) { - id->cls = -1; - } else if (instr->regs[0]->flags & IR3_REG_ARRAY) { - id->cls = total_class_count; - } else { - /* and the normal case: */ - id->defn = get_definer(ctx, instr, &id->sz, &id->off); - id->cls = 
ra_size_to_class(id->sz, is_half(id->defn), is_shared(id->defn)); - - /* this is a bit of duct-tape.. if we have a scenario like: - * - * sam (f32)(x) out.x, ... - * sam (f32)(x) out.y, ... - * - * Then the fanout/split meta instructions for the two different - * tex instructions end up grouped as left/right neighbors. The - * upshot is that in when you get_definer() on one of the meta:fo's - * you get definer as the first sam with sz=2, but when you call - * get_definer() on the either of the sam's you get itself as the - * definer with sz=1. - * - * (We actually avoid this scenario exactly, the neighbor links - * prevent one of the output mov's from being eliminated, so this - * hack should be enough. But probably we need to rethink how we - * find the "defining" instruction.) - * - * TODO how do we figure out offset properly... - */ - if (id->defn != instr) { - struct ir3_ra_instr_data *did = &ctx->instrd[id->defn->ip]; - if (did->sz < id->sz) { - did->sz = id->sz; - did->cls = id->cls; - } - } - } + for (unsigned i = 0; i < file->size; i++) { + BITSET_SET(file->available, i); + BITSET_SET(file->available_to_evict, i); } + + file->start = 0; + + rb_tree_init(&file->reg_ctx.intervals); + rb_tree_init(&file->physreg_intervals); + + file->reg_ctx.interval_add = interval_add; + file->reg_ctx.interval_delete = interval_delete; + file->reg_ctx.interval_readd = interval_readd; } -/* give each instruction a name (and ip), and count up the # of names - * of each class - */ static void -ra_block_name_instructions(struct ir3_ra_ctx *ctx, struct ir3_block *block) +ra_file_insert(struct ra_file *file, struct ra_interval *interval) { - foreach_instr (instr, &block->instr_list) { - struct ir3_ra_instr_data *id = &ctx->instrd[instr->ip]; + assert(interval->physreg_start < interval->physreg_end); + assert(interval->physreg_end <= file->size); + if (interval->interval.reg->flags & IR3_REG_HALF) + assert(interval->physreg_end <= RA_HALF_SIZE); -#ifdef DEBUG - instr->name = ~0; -#endif + ir3_reg_interval_insert(&file->reg_ctx, &interval->interval); +} - ctx->instr_cnt++; +static void +ra_file_remove(struct ra_file *file, struct ra_interval *interval) +{ + ir3_reg_interval_remove(&file->reg_ctx, &interval->interval); +} - if (!writes_gpr(instr)) - continue; +static void +ra_file_mark_killed(struct ra_file *file, struct ra_interval *interval) +{ + assert(!interval->interval.parent); - if (id->defn != instr) - continue; + for (physreg_t i = interval->physreg_start; i < interval->physreg_end; i++) { + BITSET_SET(file->available, i); + } - /* In scalar pass, collect/split don't get their own names, - * but instead inherit them from their src(s): - * - * Possibly we don't need this because of scalar_name(), but - * it does make the ir3_print() dumps easier to read. - */ - if (ctx->scalar_pass) { - if (instr->opc == OPC_META_SPLIT) { - instr->name = instr->regs[1]->def->instr->name + instr->split.off; - continue; - } + interval->is_killed = true; +} - if (instr->opc == OPC_META_COLLECT) { - instr->name = instr->regs[1]->def->instr->name; - continue; - } - } +static physreg_t +ra_interval_get_physreg(const struct ra_interval *interval) +{ + unsigned child_start = interval->interval.reg->interval_start; - /* arrays which don't fit in one of the pre-defined class - * sizes are pre-colored: - */ - if ((id->cls >= 0) && (id->cls < total_class_count)) { - /* in the scalar pass, we generate a name for each - * scalar component, instr->name is the name of the - * first component. - */ - unsigned n = ctx->scalar_pass ? 
dest_regs(instr) : 1; - instr->name = ctx->class_alloc_count[id->cls]; - ctx->class_alloc_count[id->cls] += n; - ctx->alloc_count += n; - } + while (interval->interval.parent) { + interval = ir3_reg_interval_to_ra_interval(interval->interval.parent); } + + return interval->physreg_start + + (child_start - interval->interval.reg->interval_start); +} + +static unsigned +ra_interval_get_num(const struct ra_interval *interval) +{ + return ra_physreg_to_num(ra_interval_get_physreg(interval), + interval->interval.reg->flags); } -/** - * Set a value for max register target. - * - * Currently this just rounds up to a multiple of full-vec4 (ie. the - * granularity that we configure the hw for.. there is no point to - * using r3.x if you aren't going to make r3.yzw available). But - * in reality there seems to be multiple thresholds that affect the - * number of waves.. and we should round up the target to the next - * threshold when we round-robin registers, to give postsched more - * options. When we understand that better, this is where we'd - * implement that. - */ static void -ra_set_register_target(struct ir3_ra_ctx *ctx, unsigned max_target) +ra_interval_init(struct ra_interval *interval, struct ir3_register *reg) { - const unsigned hvec4 = 4; - const unsigned vec4 = 2 * hvec4; + ir3_reg_interval_init(&interval->interval, reg); + interval->is_killed = false; + interval->frozen = false; +} - ctx->max_target = align(max_target, vec4); +static void +ra_interval_dump(struct ra_interval *interval) +{ + printf("physreg %u ", interval->physreg_start); - d("New max_target=%u", ctx->max_target); + ir3_reg_interval_dump(&interval->interval); } -static int -pick_in_range(BITSET_WORD *regs, unsigned min, unsigned max) +static void +ra_file_dump(struct ra_file *file) { - for (unsigned i = min; i <= max; i++) { - if (BITSET_TEST(regs, i)) { - return i; - } + rb_tree_foreach(struct ra_interval, interval, &file->physreg_intervals, physreg_node) { + ra_interval_dump(interval); } - return -1; + + unsigned start, end; + printf("available:\n"); + BITSET_FOREACH_RANGE(start, end, file->available, file->size) { + printf("%u-%u ", start, end); + } + printf("\n"); + + printf("available to evict:\n"); + BITSET_FOREACH_RANGE(start, end, file->available_to_evict, file->size) { + printf("%u-%u ", start, end); + } + printf("\n"); + printf("start: %u\n", file->start); } -static int -pick_in_range_rev(BITSET_WORD *regs, int min, int max) +static void +ra_ctx_dump(struct ra_ctx *ctx) { - for (int i = max; i >= min; i--) { - if (BITSET_TEST(regs, i)) { - return i; - } + printf("full:\n"); + ra_file_dump(&ctx->full); + printf("half:\n"); + ra_file_dump(&ctx->half); + printf("shared:\n"); + ra_file_dump(&ctx->shared); +} + +static unsigned +reg_file_size(struct ra_file *file, struct ir3_register *reg) +{ + /* Half-regs can only take up the first half of the combined regfile */ + if (reg->flags & IR3_REG_HALF) + return MIN2(file->size, RA_HALF_SIZE); + else + return file->size; +} + +/* ra_pop_interval/ra_push_interval provide an API to shuffle around multiple + * top-level intervals at once. Pop multiple intervals, then push them back in + * any order. 
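+ *
+ * A rough usage sketch (interval names and destination registers here are
+ * only illustrative):
+ *
+ *    struct ra_removed_interval tmp_a = ra_pop_interval(ctx, file, a);
+ *    struct ra_removed_interval tmp_b = ra_pop_interval(ctx, file, b);
+ *    ra_push_interval(ctx, file, &tmp_b, dst_b);
+ *    ra_push_interval(ctx, file, &tmp_a, dst_a);
+ *
+ * ra_pop_interval() also records the interval's original location in
+ * ctx->parallel_copies (if it hasn't been recorded already), so that the
+ * actual moves can be emitted later by insert_parallel_copy_instr().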
+ */ + +struct ra_removed_interval { + struct ra_interval *interval; + unsigned size; +}; + +static struct ra_removed_interval +ra_pop_interval(struct ra_ctx *ctx, struct ra_file *file, + struct ra_interval *interval) +{ + assert(!interval->interval.parent); + + /* Check if we've already moved this reg before */ + unsigned pcopy_index; + for (pcopy_index = 0; pcopy_index < ctx->parallel_copies_count; pcopy_index++) { + if (ctx->parallel_copies[pcopy_index].interval == interval) + break; + } + + if (pcopy_index == ctx->parallel_copies_count) { + array_insert(ctx, ctx->parallel_copies, (struct ra_parallel_copy) { + .interval = interval, + .src = interval->physreg_start, + }); } - return -1; + + ir3_reg_interval_remove_all(&file->reg_ctx, &interval->interval); + + return (struct ra_removed_interval) { + .interval = interval, + .size = interval->physreg_end - interval->physreg_start, + }; } -/* register selector for the a6xx+ merged register file: */ -static unsigned int -ra_select_reg_merged(unsigned int n, BITSET_WORD *regs, void *data) +static void +ra_push_interval(struct ra_ctx *ctx, struct ra_file *file, + const struct ra_removed_interval *removed, physreg_t dst) { - struct ir3_ra_ctx *ctx = data; - struct ra_class *classp = ra_get_node_class(ctx->g, n); - unsigned int class = ra_class_index(classp); - bool half, shared; - int sz = ra_class_to_size(class, &half, &shared); + struct ra_interval *interval = removed->interval; - assert (sz > 0); + interval->physreg_start = dst; + interval->physreg_end = dst + removed->size; - /* dimensions within the register class: */ - unsigned max_target, start; + ir3_reg_interval_insert(&file->reg_ctx, &interval->interval); +} - /* the regs bitset will include *all* of the virtual regs, but we lay - * out the different classes consecutively in the virtual register - * space. So we just need to think about the base offset of a given - * class within the virtual register space, and offset the register - * space we search within by that base offset. - */ - unsigned base; - - /* TODO I think eventually we want to round-robin in vector pass - * as well, but needs some more work to calculate # of live vals - * for this. (Maybe with some work, we could just figure out - * the scalar target and use that, since that is what we care - * about in the end.. but that would mean setting up use-def/ - * liveranges for scalar pass before doing vector pass.) - * - * For now, in the vector class, just move assignments for scalar - * vals higher to hopefully prevent them from limiting where vecN - * values can be placed. Since the scalar values are re-assigned - * in the 2nd pass, we don't really care where they end up in the - * vector pass. - */ - if (!ctx->scalar_pass) { - base = ctx->set->gpr_to_ra_reg[class][0]; - if (shared) { - max_target = SHARED_CLASS_REGS(class - SHARED_OFFSET); - } else if (half) { - max_target = HALF_CLASS_REGS(class - HALF_OFFSET); - } else { - max_target = CLASS_REGS(class); +/* Pick up the interval and place it at "dst". */ +static void +ra_move_interval(struct ra_ctx *ctx, struct ra_file *file, + struct ra_interval *interval, physreg_t dst) +{ + struct ra_removed_interval temp = ra_pop_interval(ctx, file, interval); + ra_push_interval(ctx, file, &temp, dst); +} + +static bool +get_reg_specified(struct ra_file *file, struct ir3_register *reg, physreg_t physreg, bool is_source) +{ + for (unsigned i = 0; i < reg_size(reg); i++) { + if (!BITSET_TEST(is_source ? 
file->available_to_evict : file->available, physreg + i)) + return false; + } + + return true; +} + +/* Try to evict any registers conflicting with the proposed spot "physreg" for + * "reg". That is, move them to other places so that we can allocate "physreg" + * here. + */ + +static bool +try_evict_regs(struct ra_ctx *ctx, struct ra_file *file, + struct ir3_register *reg, physreg_t physreg, + unsigned *_eviction_count, bool is_source, bool speculative) +{ + BITSET_DECLARE(available_to_evict, RA_MAX_FILE_SIZE); + memcpy(available_to_evict, file->available_to_evict, sizeof(available_to_evict)); + + for (unsigned i = 0; i < reg_size(reg); i++) + BITSET_CLEAR(available_to_evict, physreg + i); + + unsigned eviction_count = 0; + /* Iterate over each range conflicting with physreg */ + for (struct ra_interval *conflicting = ra_file_search_right(file, physreg), + *next = ra_interval_next_or_null(conflicting); + conflicting != NULL && conflicting->physreg_start < physreg + reg_size(reg); + conflicting = next, next = ra_interval_next_or_null(next)) { + if (!is_source && conflicting->is_killed) + continue; + + if (conflicting->frozen) { + assert(speculative); + return false; } - if ((sz == 1) && !shared) { - return pick_in_range_rev(regs, base, base + max_target); - } else { - return pick_in_range(regs, base, base + max_target); + unsigned avail_start, avail_end; + bool evicted = false; + BITSET_FOREACH_RANGE(avail_start, avail_end, available_to_evict, + reg_file_size(file, conflicting->interval.reg)) { + unsigned size = avail_end - avail_start; + + /* non-half registers must be aligned */ + if (!(conflicting->interval.reg->flags & IR3_REG_HALF) && avail_start % 2 == 1) { + avail_start++; + size--; + } + + if (size >= conflicting->physreg_end - conflicting->physreg_start) { + for (unsigned i = 0; i < conflicting->physreg_end - conflicting->physreg_start; i++) + BITSET_CLEAR(available_to_evict, avail_start + i); + eviction_count += conflicting->physreg_end - conflicting->physreg_start; + if (!speculative) + ra_move_interval(ctx, file, conflicting, avail_start); + evicted = true; + break; + } } - } else { - ra_assert(ctx, sz == 1); + + if (!evicted) + return false; } - /* NOTE: this is only used in scalar pass, so the register - * class will be one of the scalar classes (ie. idx==0): + *_eviction_count = eviction_count; + return true; +} + +static int removed_interval_cmp(const void *_i1, const void *_i2) +{ + const struct ra_removed_interval *i1 = _i1; + const struct ra_removed_interval *i2 = _i2; + + /* We sort the registers as follows: + * + * |--------------------------------------------------------------------| + * | | | | | + * | Half live-through | Half killed | Full killed | Full live-through | + * | | | | | + * |--------------------------------------------------------------------| + * | | + * | Destination | + * | | + * |-----------------| + * + * Half-registers have to be first so that they stay in the low half of + * the register file. Then half and full killed must stay together so that + * there's a contiguous range where we can put the register. With this + * structure we should be able to accomodate any collection of intervals + * such that the total number of half components is within the half limit + * and the combined components are within the full limit. 
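+	 *
+	 * As a purely illustrative example, popping a live-through hr0.x, a
+	 * killed hr1.x, a killed r2.xy and a live-through r3.x would sort them
+	 * back in the order hr0.x, hr1.x, r2.xy, r3.x, keeping the killed hr1.x
+	 * and r2.xy contiguous so that the destination can be laid over them.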
*/ - base = ctx->set->gpr_to_ra_reg[class][0]; - if (shared) { - max_target = SHARED_CLASS_REGS(0); - start = 0; - } else if (half) { - max_target = ctx->max_target; - start = ctx->start_search_reg; + + unsigned i1_align = reg_elem_size(i1->interval->interval.reg); + unsigned i2_align = reg_elem_size(i2->interval->interval.reg); + if (i1_align > i2_align) + return 1; + if (i1_align < i2_align) + return -1; + + if (i1_align == 1) { + if (i2->interval->is_killed) + return -1; + if (i1->interval->is_killed) + return 1; } else { - max_target = ctx->max_target / 2; - start = ctx->start_search_reg; + if (i2->interval->is_killed) + return 1; + if (i1->interval->is_killed) + return -1; } - /* For cat4 instructions, if the src reg is already assigned, and - * avail to pick, use it. Because this doesn't introduce unnecessary - * dependencies, and it potentially avoids needing (ss) syncs to - * for write after read hazards: - */ - struct ir3_instruction *instr = name_to_instr(ctx, n); - if (is_sfu(instr)) { - struct ir3_register *src = instr->regs[1]; - int src_n; - - if ((src->flags & IR3_REG_ARRAY) && !(src->flags & IR3_REG_RELATIV)) { - struct ir3_array *arr = ir3_lookup_array(ctx->ir, src->array.id); - src_n = arr->base + src->array.offset; + return 0; +} + +/* "Compress" all the live intervals so that there is enough space for the + * destination register. As there can be gaps when a more-aligned interval + * follows a less-aligned interval, this also sorts them to remove such + * "padding", which may be required when space is very tight. This isn't + * amazing, but should be used only as a last resort in case the register file + * is almost full and badly fragmented. + * + * Return the physreg to use. + */ +static physreg_t +compress_regs_left(struct ra_ctx *ctx, struct ra_file *file, unsigned size, + unsigned align, bool is_source) +{ + DECLARE_ARRAY(struct ra_removed_interval, intervals); + intervals_count = intervals_sz = 0; + intervals = NULL; + + unsigned removed_full_size = 0; + unsigned removed_half_size = 0; + unsigned file_size = align == 1 ? MIN2(file->size, RA_HALF_SIZE) : file->size; + physreg_t start_reg = 0; + + foreach_interval_rev_safe(interval, file) { + /* Check if we can sort the intervals *after* this one and have + * enough space leftover to accomodate "size" units. + */ + if (align == 1) { + if (interval->physreg_end + removed_half_size <= file_size - size) { + start_reg = interval->physreg_end; + break; + } } else { - src_n = scalar_name(ctx, src->def->instr, 0); + if (interval->physreg_end + removed_half_size <= file_size - + removed_full_size - size) { + start_reg = interval->physreg_end; + break; + } } - unsigned reg = ra_get_node_reg(ctx->g, src_n); + /* We assume that all frozen intervals are at the start and that we + * can avoid popping them. + */ + assert(!interval->frozen); - /* Check if the src register has been assigned yet: */ - if (reg != NO_REG) { - if (BITSET_TEST(regs, reg)) { - return reg; - } + /* Killed sources don't count because they go at the end and can + * overlap the register we're trying to add. 
+ */ + if (!interval->is_killed && !is_source) { + if (interval->interval.reg->flags & IR3_REG_HALF) + removed_half_size += interval->physreg_end - interval->physreg_start; + else + removed_full_size += interval->physreg_end - interval->physreg_start; } - } - int r = pick_in_range(regs, base + start, base + max_target); - if (r < 0) { - /* wrap-around: */ - r = pick_in_range(regs, base, base + start); + /* Now that we've done the accounting, pop this off */ + d("popping interval %u physreg %u\n", interval->interval.reg->name, interval->physreg_start); + array_insert(ctx, intervals, ra_pop_interval(ctx, file, interval)); } - if (r < 0) { - /* overflow, we need to increase max_target: */ - ra_set_register_target(ctx, ctx->max_target + 1); - return ra_select_reg_merged(n, regs, data); + /* TODO: In addition to skipping registers at the beginning that are + * well-packed, we should try to skip registers at the end. + */ + + qsort(intervals, intervals_count, sizeof(*intervals), removed_interval_cmp); + + physreg_t physreg = start_reg; + physreg_t ret_reg = (physreg_t) ~0; + for (unsigned i = 0; i < intervals_count; i++) { + if (ret_reg == (physreg_t) ~0 && + ((intervals[i].interval->is_killed && !is_source) || + !(intervals[i].interval->interval.reg->flags & IR3_REG_HALF))) { + ret_reg = ALIGN(physreg, align); + } + + if (ret_reg != (physreg_t) ~0 && + (is_source || !intervals[i].interval->is_killed)) { + physreg = MAX2(physreg, ret_reg + size); + } + + if (!(intervals[i].interval->interval.reg->flags & IR3_REG_HALF)) { + physreg = ALIGN(physreg, 2); + } + + if (physreg + intervals[i].size > + reg_file_size(file, intervals[i].interval->interval.reg)) { + d("ran out of room for interval %u!\n", intervals[i].interval->interval.reg->name); + unreachable("reg pressure calculation was wrong!"); + return 0; + } + + d("pushing interval %u physreg %u\n", intervals[i].interval->interval.reg->name, physreg); + ra_push_interval(ctx, file, &intervals[i], physreg); + + physreg += intervals[i].size; } - if (classp == ctx->set->half_classes[0]) { - int n = r - base; - ctx->start_search_reg = (n + 1) % ctx->max_target; - } else if (classp == ctx->set->classes[0]) { - int n = (r - base) * 2; - ctx->start_search_reg = (n + 1) % ctx->max_target; + if (ret_reg == (physreg_t) ~0) + ret_reg = physreg; + + ret_reg = ALIGN(ret_reg, align); + if (ret_reg + size > file_size) { + d("ran out of room for the new interval!\n"); + unreachable("reg pressure calculation was wrong!"); + return 0; } - return r; + return ret_reg; } static void -ra_init(struct ir3_ra_ctx *ctx) +update_affinity(struct ir3_register *reg, physreg_t physreg) +{ + if (!reg->merge_set || reg->merge_set->preferred_reg != (physreg_t) ~0) + return; + + if (physreg < reg->merge_set_offset) + return; + + reg->merge_set->preferred_reg = physreg - reg->merge_set_offset; +} + +/* Try to find free space for a register without shuffling anything. This uses + * a round-robin algorithm to reduce false dependencies. + */ +static physreg_t +find_best_gap(struct ra_file *file, unsigned file_size, + unsigned size, unsigned align, bool is_source) { - unsigned n, base; + BITSET_WORD *available = is_source ? 
file->available_to_evict : file->available; + + unsigned start = ALIGN(file->start, align) % (file_size - size + align); + unsigned candidate = start; + do { + bool is_available = true; + for (unsigned i = 0; i < size; i++) { + if (!BITSET_TEST(available, candidate + i)) { + is_available = false; + break; + } + } - ir3_clear_mark(ctx->ir); - n = ir3_count_instructions_ra(ctx->ir); + if (is_available) { + file->start = (candidate + size) % file_size; + return candidate; + } - ctx->instrd = rzalloc_array(NULL, struct ir3_ra_instr_data, n); + candidate += align; + if (candidate + size > file_size) + candidate = 0; + } while (candidate != start); + + return (physreg_t) ~0; +} - foreach_block (block, &ctx->ir->block_list) { - ra_block_find_definers(ctx, block); - } +static struct ra_file * +ra_get_file(struct ra_ctx *ctx, struct ir3_register *reg) +{ + if (reg->flags & IR3_REG_SHARED) + return &ctx->shared; + else if (ctx->merged_regs || !(reg->flags & IR3_REG_HALF)) + return &ctx->full; + else + return &ctx->half; +} - foreach_block (block, &ctx->ir->block_list) { - ra_block_name_instructions(ctx, block); +/* This is the main entrypoint for picking a register. Pick a free register + * for "reg", shuffling around sources if necessary. In the normal case where + * "is_source" is false, this register can overlap with killed sources + * (intervals with "is_killed == true"). If "is_source" is true, then + * is_killed is ignored and the register returned must not overlap with killed + * sources. This must be used for tied registers, because we're actually + * allocating the destination and the tied source at the same time. + */ + +static physreg_t +get_reg(struct ra_ctx *ctx, struct ra_file *file, struct ir3_register *reg, + bool is_source) +{ + unsigned file_size = reg_file_size(file, reg); + if (reg->merge_set && reg->merge_set->preferred_reg != (physreg_t) ~0) { + physreg_t preferred_reg = + reg->merge_set->preferred_reg + reg->merge_set_offset; + if (preferred_reg < file_size && + preferred_reg % reg_elem_size(reg) == 0 && + get_reg_specified(file, reg, preferred_reg, is_source)) + return preferred_reg; } - /* figure out the base register name for each class. The - * actual ra name is class_base[cls] + instr->name; + /* If this register is a subset of a merge set which we have not picked a + * register for, first try to allocate enough space for the entire merge + * set. */ - ctx->class_base[0] = 0; - for (unsigned i = 1; i <= total_class_count; i++) { - ctx->class_base[i] = ctx->class_base[i-1] + - ctx->class_alloc_count[i-1]; + unsigned size = reg_size(reg); + if (reg->merge_set && reg->merge_set->preferred_reg == (physreg_t)~0 && + size < reg->merge_set->size) { + physreg_t best_reg = + find_best_gap(file, file_size, reg->merge_set->size, reg->merge_set->alignment, is_source); + if (best_reg != (physreg_t) ~0u) { + best_reg += reg->merge_set_offset; + return best_reg; + } } - /* and vreg names for array elements: */ - base = ctx->class_base[total_class_count]; - foreach_array (arr, &ctx->ir->array_list) { - arr->base = base; - ctx->class_alloc_count[total_class_count] += arr->length; - base += arr->length; + /* For ALU and SFU instructions, if the src reg is avail to pick, use it. 
+ * Because this doesn't introduce unnecessary dependencies, and it + * potentially avoids needing (ss) syncs for write after read hazards for + * SFU instructions: + */ + if (is_sfu(reg->instr) || is_alu(reg->instr)) { + for (unsigned i = 1; i < reg->instr->regs_count; i++) { + struct ir3_register *src = reg->instr->regs[i]; + if (!ra_reg_is_src(src)) + continue; + if (ra_get_file(ctx, src) == file && reg_size(src) >= size) { + struct ra_interval *src_interval = + &ctx->intervals[src->def->name]; + physreg_t src_physreg = ra_interval_get_physreg(src_interval); + if (src_physreg % reg_elem_size(reg) == 0 && + src_physreg + size <= file_size && + get_reg_specified(file, reg, src_physreg, is_source)) + return src_physreg; + } + } } - ctx->alloc_count += ctx->class_alloc_count[total_class_count]; - - /* Add vreg names for r0.xyz */ - ctx->r0_xyz_nodes = ctx->alloc_count; - ctx->alloc_count += 3; - ctx->hr0_xyz_nodes = ctx->alloc_count; - ctx->alloc_count += 3; - /* Add vreg name for prefetch-exclusion range: */ - ctx->prefetch_exclude_node = ctx->alloc_count++; + physreg_t best_reg = + find_best_gap(file, file_size, size, reg_elem_size(reg), is_source); + if (best_reg != (physreg_t) ~0u) { + return best_reg; + } - if (RA_DEBUG) { - d("INSTRUCTION VREG NAMES:"); - foreach_block (block, &ctx->ir->block_list) { - foreach_instr (instr, &block->instr_list) { - if (!ctx->instrd[instr->ip].defn) - continue; - if (!writes_gpr(instr)) - continue; - di(instr, "%04u", scalar_name(ctx, instr, 0)); + /* Ok, we couldn't find anything that fits. Here is where we have to start + * moving things around to make stuff fit. First try solely evicting + * registers in the way. + */ + unsigned best_eviction_count = ~0; + for (physreg_t i = 0; i + size <= file_size; i += reg_elem_size(reg)) { + unsigned eviction_count; + if (try_evict_regs(ctx, file, reg, i, &eviction_count, is_source, true)) { + if (eviction_count < best_eviction_count) { + best_eviction_count = eviction_count; + best_reg = i; } } - d("ARRAY VREG NAMES:"); - foreach_array (arr, &ctx->ir->array_list) { - d("%04u: arr%u", arr->base, arr->id); - } - d("EXTRA VREG NAMES:"); - d("%04u: r0_xyz_nodes", ctx->r0_xyz_nodes); - d("%04u: hr0_xyz_nodes", ctx->hr0_xyz_nodes); - d("%04u: prefetch_exclude_node", ctx->prefetch_exclude_node); + } + + if (best_eviction_count != ~0) { + ASSERTED bool result = + try_evict_regs(ctx, file, reg, best_reg, &best_eviction_count, is_source, false); + assert(result); + return best_reg; } - ctx->g = ra_alloc_interference_graph(ctx->set->regs, ctx->alloc_count); - ralloc_steal(ctx->g, ctx->instrd); - ctx->def = rzalloc_array(ctx->g, unsigned, ctx->alloc_count); - ctx->use = rzalloc_array(ctx->g, unsigned, ctx->alloc_count); - - /* TODO add selector callback for split (pre-a6xx) register file: */ - if (ctx->v->mergedregs) { - ra_set_select_reg_callback(ctx->g, ra_select_reg_merged, ctx); + /* Use the dumb fallback only if try_evict_regs() fails. 
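+	 * To recap, the strategies tried above are, in order: the merge set's
+	 * preferred register, a gap large enough for the entire merge set,
+	 * reusing an ALU/SFU source register, any free gap from find_best_gap(),
+	 * and evicting conflicting intervals; compress_regs_left() below is the
+	 * last resort.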
*/ + return compress_regs_left(ctx, file, reg_size(reg), reg_elem_size(reg), is_source); +} - if (ctx->scalar_pass) { - ctx->name_to_instr = _mesa_hash_table_create(ctx->g, - _mesa_hash_int, _mesa_key_int_equal); - } +static void +assign_reg(struct ir3_instruction *instr, struct ir3_register *reg, unsigned num) +{ + if (reg->flags & IR3_REG_ARRAY) { + reg->array.base = num; + if (reg->flags & IR3_REG_RELATIV) + reg->array.offset += num; + else + reg->num = num + reg->array.offset; + } else { + reg->num = num; } } -/* Map the name back to instruction: */ -static struct ir3_instruction * -name_to_instr(struct ir3_ra_ctx *ctx, unsigned name) +static void +mark_src_killed(struct ra_ctx *ctx, struct ir3_register *src) { - ra_assert(ctx, !name_is_array(ctx, name)); - struct hash_entry *entry = _mesa_hash_table_search(ctx->name_to_instr, &name); - if (entry) - return entry->data; - ra_unreachable(ctx, "invalid instr name"); - return NULL; + struct ra_interval *interval = &ctx->intervals[src->def->name]; + + if (!(src->flags & IR3_REG_FIRST_KILL) || interval->is_killed || + interval->interval.parent || !rb_tree_is_empty(&interval->interval.children)) + return; + + ra_file_mark_killed(ra_get_file(ctx, src), interval); } -static bool -name_is_array(struct ir3_ra_ctx *ctx, unsigned name) +static void +insert_dst(struct ra_ctx *ctx, struct ir3_register *dst) { - return name >= ctx->class_base[total_class_count]; + struct ra_file *file = ra_get_file(ctx, dst); + struct ra_interval *interval = &ctx->intervals[dst->name]; + + d("insert dst %u physreg %u", dst->name, ra_interval_get_physreg(interval)); + + if (!(dst->flags & IR3_REG_UNUSED)) + ra_file_insert(file, interval); + + assign_reg(dst->instr, dst, ra_interval_get_num(interval)); } -static struct ir3_array * -name_to_array(struct ir3_ra_ctx *ctx, unsigned name) +static void +allocate_dst_fixed(struct ra_ctx *ctx, struct ir3_register *dst, physreg_t physreg) { - ra_assert(ctx, name_is_array(ctx, name)); - foreach_array (arr, &ctx->ir->array_list) { - if (name < (arr->base + arr->length)) - return arr; - } - ra_unreachable(ctx, "invalid array name"); - return NULL; + struct ra_interval *interval = &ctx->intervals[dst->name]; + update_affinity(dst, physreg); + + ra_interval_init(interval, dst); + interval->physreg_start = physreg; + interval->physreg_end = physreg + reg_size(dst); } static void -ra_destroy(struct ir3_ra_ctx *ctx) +allocate_dst(struct ra_ctx *ctx, struct ir3_register *dst) { - ralloc_free(ctx->g); + struct ra_file *file = ra_get_file(ctx, dst); + + struct ir3_register *tied = ra_dst_get_tied_src(ctx->compiler, dst); + if (tied) { + struct ra_interval *tied_interval = &ctx->intervals[tied->def->name]; + struct ra_interval *dst_interval = &ctx->intervals[dst->name]; + physreg_t tied_physreg = ra_interval_get_physreg(tied_interval); + if (tied_interval->is_killed) { + /* The easy case: the source is killed, so we can just reuse it + * for the destination. + */ + allocate_dst_fixed(ctx, dst, ra_interval_get_physreg(tied_interval)); + } else { + /* The source is live-through, so we need to get a free register + * (which is free for both the source and destination!), copy the + * original source to it, then use that for the source and + * destination. 
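+			 *
+			 * For example (register numbers purely illustrative): if the
+			 * tied source lives in r1.x and is used again later, we pick a
+			 * fresh r4.x, record a parallel copy r4.x <- r1.x, and give both
+			 * the destination and the tied source r4.x, leaving r1.x intact
+			 * for the later uses.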
+ */ + physreg_t physreg = get_reg(ctx, file, dst, true); + allocate_dst_fixed(ctx, dst, physreg); + array_insert(ctx, ctx->parallel_copies, (struct ra_parallel_copy) { + .interval = dst_interval, + .src = tied_physreg, + }); + } + + return; + } + + /* All the hard work is done by get_reg here. */ + physreg_t physreg = get_reg(ctx, file, dst, false); + + allocate_dst_fixed(ctx, dst, physreg); } static void -__def(struct ir3_ra_ctx *ctx, struct ir3_ra_block_data *bd, unsigned name, - struct ir3_instruction *instr) +assign_src(struct ra_ctx *ctx, struct ir3_instruction *instr, struct ir3_register *src) { - ra_assert(ctx, name < ctx->alloc_count); + struct ra_interval *interval = &ctx->intervals[src->def->name]; + struct ra_file *file = ra_get_file(ctx, src); - /* split/collect do not actually define any real value */ - if ((instr->opc == OPC_META_SPLIT) || (instr->opc == OPC_META_COLLECT)) - return; + bool array_rmw = ra_reg_is_array_rmw(src); - /* defined on first write: */ - if (!ctx->def[name]) - ctx->def[name] = instr->ip; - ctx->use[name] = MAX2(ctx->use[name], instr->ip); - BITSET_SET(bd->def, name); + struct ir3_register *tied = ra_src_get_tied_dst(ctx->compiler, instr, src); + physreg_t physreg; + if (tied) { + struct ra_interval *tied_interval = &ctx->intervals[tied->name]; + physreg = ra_interval_get_physreg(tied_interval); + } else { + physreg = ra_interval_get_physreg(interval); + } + + assign_reg(instr, src, ra_physreg_to_num(physreg, src->flags)); + + if (src->flags & IR3_REG_FIRST_KILL) + ra_file_remove(file, interval); + + /* This source is also a destination. */ + if (array_rmw) { + struct ra_interval *dst_interval = &ctx->intervals[src->name]; + ra_interval_init(dst_interval, src); + dst_interval->physreg_start = physreg; + dst_interval->physreg_end = physreg + src->size * reg_elem_size(src); + ra_file_insert(file, dst_interval); + } } +/* Insert a parallel copy instruction before the instruction with the parallel + * copy entries we've built up. 
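+ *
+ * The meta instruction lists all destinations first and then all
+ * sources, e.g. with two entries (registers chosen for illustration
+ * only):
+ *
+ *    parallel_copy r2.x, r4.y, r0.x, r1.z
+ *
+ * meaning { r2.x <- r0.x, r4.y <- r1.z }, performed as one parallel
+ * move so that overlapping or swapped copies stay correct.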
+ */ static void -__use(struct ir3_ra_ctx *ctx, struct ir3_ra_block_data *bd, unsigned name, - struct ir3_instruction *instr) +insert_parallel_copy_instr(struct ra_ctx *ctx, struct ir3_instruction *instr) { - ra_assert(ctx, name < ctx->alloc_count); - ctx->use[name] = MAX2(ctx->use[name], instr->ip); - if (!BITSET_TEST(bd->def, name)) - BITSET_SET(bd->use, name); + if (ctx->parallel_copies_count == 0) + return; + + struct ir3_instruction *pcopy = + ir3_instr_create(instr->block, OPC_META_PARALLEL_COPY, 2 * ctx->parallel_copies_count); + + for (unsigned i = 0; i < ctx->parallel_copies_count; i++) { + struct ra_parallel_copy *entry = &ctx->parallel_copies[i]; + struct ir3_register *reg = + ir3_reg_create(pcopy, ra_interval_get_num(entry->interval), + entry->interval->interval.reg->flags & ~IR3_REG_SSA); + reg->size = entry->interval->interval.reg->size; + reg->wrmask = entry->interval->interval.reg->wrmask; + } + + for (unsigned i = 0; i < ctx->parallel_copies_count; i++) { + struct ra_parallel_copy *entry = &ctx->parallel_copies[i]; + struct ir3_register *reg = + ir3_reg_create(pcopy, + ra_physreg_to_num(entry->src, entry->interval->interval.reg->flags), + entry->interval->interval.reg->flags & ~(IR3_REG_DEST | IR3_REG_SSA)); + reg->size = entry->interval->interval.reg->size; + reg->wrmask = entry->interval->interval.reg->wrmask; + } + + list_del(&pcopy->node); + list_addtail(&pcopy->node, &instr->node); + ctx->parallel_copies_count = 0; } static void -ra_block_compute_live_ranges(struct ir3_ra_ctx *ctx, struct ir3_block *block) +handle_normal_instr(struct ra_ctx *ctx, struct ir3_instruction *instr) { - struct ir3_ra_block_data *bd; - unsigned bitset_words = BITSET_WORDS(ctx->alloc_count); + /* First, mark sources as going-to-be-killed while allocating the dest. */ + ra_foreach_src(src, instr) { + mark_src_killed(ctx, src); + } -#define def(name, instr) __def(ctx, bd, name, instr) -#define use(name, instr) __use(ctx, bd, name, instr) + /* Allocate the destination. */ + ra_foreach_dst(dst, instr) { + if (ra_reg_is_array_rmw(dst)) + continue; + allocate_dst(ctx, dst); + } - bd = rzalloc(ctx->g, struct ir3_ra_block_data); + /* Now handle sources. Go backward so that in case there are multiple + * sources with the same def and that def is killed we only remove it at + * the end. + */ + ra_foreach_src_rev(src, instr) { + assign_src(ctx, instr, src); + } - bd->def = rzalloc_array(bd, BITSET_WORD, bitset_words); - bd->use = rzalloc_array(bd, BITSET_WORD, bitset_words); - bd->livein = rzalloc_array(bd, BITSET_WORD, bitset_words); - bd->liveout = rzalloc_array(bd, BITSET_WORD, bitset_words); + /* Now finally insert the destination into the map. 
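+ * At this point assign_src() has already removed any killed source
+ * intervals from the file, so a destination that was allocated on top
+ * of a killed source can be inserted without overlapping them.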
*/ + ra_foreach_dst(dst, instr) { + if (ra_reg_is_array_rmw(dst)) + continue; + insert_dst(ctx, dst); + } - block->data = bd; + insert_parallel_copy_instr(ctx, instr); +} - struct ir3_instruction *first_non_input = NULL; - foreach_instr (instr, &block->instr_list) { - if (instr->opc != OPC_META_INPUT) { - first_non_input = instr; - break; - } +static void +handle_split(struct ra_ctx *ctx, struct ir3_instruction *instr) +{ + struct ir3_register *dst = instr->regs[0]; + struct ir3_register *src = instr->regs[1]; + + if (dst->merge_set == NULL || src->def->merge_set != dst->merge_set) { + handle_normal_instr(ctx, instr); + return; } - foreach_instr (instr, &block->instr_list) { - foreach_def (name, ctx, instr) { - if (name_is_array(ctx, name)) { - struct ir3_array *arr = name_to_array(ctx, name); - - arr->start_ip = MIN2(arr->start_ip, instr->ip); - arr->end_ip = MAX2(arr->end_ip, instr->ip); - - for (unsigned i = 0; i < arr->length; i++) { - unsigned name = arr->base + i; - if(arr->half) - ra_set_node_class(ctx->g, name, ctx->set->half_classes[0]); - else - ra_set_node_class(ctx->g, name, ctx->set->classes[0]); - } - } else { - struct ir3_ra_instr_data *id = &ctx->instrd[instr->ip]; - if (is_shared(instr)) { - ra_set_node_class(ctx->g, name, - ctx->set->shared_classes[id->cls - SHARED_OFFSET]); - } else if (is_half(instr)) { - ra_set_node_class(ctx->g, name, - ctx->set->half_classes[id->cls - HALF_OFFSET]); - } else { - ra_set_node_class(ctx->g, name, - ctx->set->classes[id->cls]); - } - } + struct ra_interval *src_interval = &ctx->intervals[src->def->name]; - def(name, instr); + physreg_t physreg = ra_interval_get_physreg(src_interval); + assign_src(ctx, instr, src); - if ((instr->opc == OPC_META_INPUT) && first_non_input) - use(name, first_non_input); + allocate_dst_fixed(ctx, dst, physreg - src->def->merge_set_offset + dst->merge_set_offset); + insert_dst(ctx, dst); +} - /* Texture instructions with writemasks can be treated as smaller - * vectors (or just scalars!) to allocate knowing that the - * masked-out regs won't be written, but we need to make sure that - * the start of the vector doesn't come before the first register - * or we'll wrap. - */ - if (is_tex_or_prefetch(instr)) { - int writemask_skipped_regs = ffs(instr->regs[0]->wrmask) - 1; - int r0_xyz = is_half(instr) ? - ctx->hr0_xyz_nodes : ctx->r0_xyz_nodes; - for (int i = 0; i < writemask_skipped_regs; i++) - ra_add_node_interference(ctx->g, name, r0_xyz + i); - } +static void +handle_collect(struct ra_ctx *ctx, struct ir3_instruction *instr) +{ + struct ir3_merge_set *dst_set = instr->regs[0]->merge_set; + unsigned dst_offset = instr->regs[0]->merge_set_offset; + + if (!dst_set || dst_set->regs_count == 1) { + handle_normal_instr(ctx, instr); + return; + } - /* Pre-fetched textures have a lower limit for bits to encode dst - * register, so add additional interference with registers above - * that limit. - */ - if (instr->opc == OPC_META_TEX_PREFETCH) { - ra_add_node_interference(ctx->g, name, - ctx->prefetch_exclude_node); - } - } + /* We need to check if any of the sources are contained in an interval + * that is at least as large as the vector. In this case, we should put + * the vector inside that larger interval. (There should be one + * unambiguous place to put it, because values sharing the same merge set + * should be allocated together.) This can happen in a case like: + * + * ssa_1 (wrmask=0xf) = ... 
+ * ssa_2 = split ssa_1 off:0 + * ssa_3 = split ssa_1 off:1 + * ssa_4 (wrmask=0x3) = collect (kill)ssa_2, (kill)ssa_3 + * ... = (kill)ssa_1 + * ... = (kill)ssa_4 + * + * ssa_4 will be coalesced with ssa_1 and needs to be allocated inside it. + */ + physreg_t dst_fixed = (physreg_t) ~0u; - foreach_use (name, ctx, instr) { - if (name_is_array(ctx, name)) { - struct ir3_array *arr = name_to_array(ctx, name); + for (unsigned i = 1; i < instr->regs_count; i++) { + if (!ra_reg_is_src(instr->regs[i])) + continue; - arr->start_ip = MIN2(arr->start_ip, instr->ip); - arr->end_ip = MAX2(arr->end_ip, instr->ip); + if (instr->regs[i]->flags & IR3_REG_FIRST_KILL) { + mark_src_killed(ctx, instr->regs[i]); + } - /* NOTE: arrays are not SSA so unconditionally - * set use bit: - */ - BITSET_SET(bd->use, name); - } + struct ir3_register *src = instr->regs[i]; + struct ra_interval *interval = &ctx->intervals[src->def->name]; - use(name, instr); + if (src->def->merge_set != dst_set || interval->is_killed) + continue; + while (interval->interval.parent != NULL) { + interval = ir3_reg_interval_to_ra_interval(interval->interval.parent); } - - foreach_name (name, ctx, instr) { - /* split/collect instructions have duplicate names - * as real instructions, so they skip the hashtable: + if (reg_size(interval->interval.reg) >= reg_size(instr->regs[0])) { + dst_fixed = interval->physreg_start - interval->interval.reg->merge_set_offset + dst_offset; + } else { + /* For sources whose root interval is smaller than the + * destination (i.e. the normal case), we will shuffle them + * around after allocating the destination. Mark them killed so + * that the destination can be allocated over them, even if they + * aren't actually killed. */ - if (ctx->name_to_instr && !((instr->opc == OPC_META_SPLIT) || - (instr->opc == OPC_META_COLLECT))) { - /* this is slightly annoying, we can't just use an - * integer on the stack - */ - unsigned *key = ralloc(ctx->name_to_instr, unsigned); - *key = name; - ra_assert(ctx, !_mesa_hash_table_search(ctx->name_to_instr, key)); - _mesa_hash_table_insert(ctx->name_to_instr, key, instr); - } + ra_file_mark_killed(ra_get_file(ctx, src), interval); } } -} - -static bool -ra_compute_livein_liveout(struct ir3_ra_ctx *ctx) -{ - unsigned bitset_words = BITSET_WORDS(ctx->alloc_count); - bool progress = false; - foreach_block (block, &ctx->ir->block_list) { - struct ir3_ra_block_data *bd = block->data; + if (dst_fixed != (physreg_t) ~0u) + allocate_dst_fixed(ctx, instr->regs[0], dst_fixed); + else + allocate_dst(ctx, instr->regs[0]); - /* update livein: */ - for (unsigned i = 0; i < bitset_words; i++) { - /* anything used but not def'd within a block is - * by definition a live value coming into the block: - */ - BITSET_WORD new_livein = - (bd->use[i] | (bd->liveout[i] & ~bd->def[i])); + /* Remove the temporary is_killed we added */ + for (unsigned i = 1; i < instr->regs_count; i++) { + if (!ra_reg_is_src(instr->regs[i])) + continue; - if (new_livein & ~bd->livein[i]) { - bd->livein[i] |= new_livein; - progress = true; - } + struct ir3_register *src = instr->regs[i]; + struct ra_interval *interval = &ctx->intervals[src->def->name]; + while (interval->interval.parent != NULL) { + interval = ir3_reg_interval_to_ra_interval(interval->interval.parent); } - /* update liveout: */ - for (unsigned j = 0; j < ARRAY_SIZE(block->successors); j++) { - struct ir3_block *succ = block->successors[j]; - struct ir3_ra_block_data *succ_bd; + /* Filter out cases where it actually should be killed */ + if (interval != 
&ctx->intervals[src->def->name] || + !(src->flags & IR3_REG_KILL)) + interval->is_killed = false; + } - if (!succ) - continue; - succ_bd = succ->data; + ra_foreach_src_rev(src, instr) { + assign_src(ctx, instr, src); + } - for (unsigned i = 0; i < bitset_words; i++) { - /* add anything that is livein in a successor block - * to our liveout: - */ - BITSET_WORD new_liveout = - (succ_bd->livein[i] & ~bd->liveout[i]); + /* Note: insert_dst will automatically shuffle around any intervals that + * are a child of the collect by making them children of the collect. + */ - if (new_liveout) { - bd->liveout[i] |= new_liveout; - progress = true; - } - } - } - } + insert_dst(ctx, instr->regs[0]); - return progress; + insert_parallel_copy_instr(ctx, instr); } +/* Parallel copies before RA should only be at the end of the block, for + * phi's. For these we only need to fill in the sources, and then we fill in + * the destinations in the successor block. + */ static void -print_bitset(const char *name, BITSET_WORD *bs, unsigned cnt) -{ - bool first = true; - debug_printf("RA: %s:", name); - for (unsigned i = 0; i < cnt; i++) { - if (BITSET_TEST(bs, i)) { - if (!first) - debug_printf(","); - debug_printf(" %04u", i); - first = false; - } - } - debug_printf("\n"); -} - -/* size of one component of instruction result, ie. half vs full: */ -static unsigned -live_size(struct ir3_instruction *instr) +handle_pcopy(struct ra_ctx *ctx, struct ir3_instruction *instr) { - if (is_half(instr)) { - return 1; - } else if (is_shared(instr)) { - /* doesn't count towards footprint */ - return 0; - } else { - return 2; + ra_foreach_src_rev(src, instr) { + assign_src(ctx, instr, src); } } -static unsigned -name_size(struct ir3_ra_ctx *ctx, unsigned name) +/* Some inputs may need to be precolored. We need to handle those first, so + * that other non-precolored inputs don't accidentally get allocated over + * them. Inputs are the very first thing in the shader, so it shouldn't be a + * problem to allocate them to a specific physreg. + */ + +static void +handle_precolored_input(struct ra_ctx *ctx, struct ir3_instruction *instr) { - if (name_is_array(ctx, name)) { - struct ir3_array *arr = name_to_array(ctx, name); - return arr->half ? 1 : 2; - } else { - struct ir3_instruction *instr = name_to_instr(ctx, name); - /* in scalar pass, each name represents on scalar value, - * half or full precision - */ - return live_size(instr); - } + if (instr->regs[0]->num == INVALID_REG) + return; + + struct ra_interval *interval = &ctx->intervals[instr->regs[0]->name]; + physreg_t physreg = ra_reg_get_physreg(instr->regs[0]); + allocate_dst_fixed(ctx, instr->regs[0], physreg); + insert_dst(ctx, instr->regs[0]); + interval->frozen = true; } -static unsigned -ra_calc_block_live_values(struct ir3_ra_ctx *ctx, struct ir3_block *block) +static void +handle_input(struct ra_ctx *ctx, struct ir3_instruction *instr) { - struct ir3_ra_block_data *bd = block->data; - unsigned name; + if (instr->regs[0]->num != INVALID_REG) + return; - ra_assert(ctx, ctx->name_to_instr); + allocate_dst(ctx, instr->regs[0]); - /* TODO this gets a bit more complicated in non-scalar pass.. but - * possibly a lowball estimate is fine to start with if we do - * round-robin in non-scalar pass? Maybe we just want to handle - * that in a different fxn? 
- */ - ra_assert(ctx, ctx->scalar_pass); + struct ra_file *file = ra_get_file(ctx, instr->regs[0]); + struct ra_interval *interval = &ctx->intervals[instr->regs[0]->name]; + ra_file_insert(file, interval); +} - BITSET_WORD *live = - rzalloc_array(bd, BITSET_WORD, BITSET_WORDS(ctx->alloc_count)); +static void +assign_input(struct ra_ctx *ctx, struct ir3_instruction *instr) +{ + struct ra_interval *interval = &ctx->intervals[instr->regs[0]->name]; + struct ra_file *file = ra_get_file(ctx, instr->regs[0]); - /* Add the live input values: */ - unsigned livein = 0; - BITSET_FOREACH_SET (name, bd->livein, ctx->alloc_count) { - livein += name_size(ctx, name); - BITSET_SET(live, name); + if (instr->regs[0]->num == INVALID_REG) { + assign_reg(instr, instr->regs[0], ra_interval_get_num(interval)); + } else { + interval->frozen = false; } - d("---------------------"); - d("block%u: LIVEIN: %u", block_id(block), livein); - - unsigned max = livein; - int cur_live = max; - - /* Now that we know the live inputs to the block, iterate the - * instructions adjusting the current # of live values as we - * see their last use: - */ - foreach_instr (instr, &block->instr_list) { - if (RA_DEBUG) - print_bitset("LIVE", live, ctx->alloc_count); - di(instr, "CALC"); - - unsigned new_live = 0; /* newly live values */ - unsigned new_dead = 0; /* newly no-longer live values */ - unsigned next_dead = 0; /* newly dead following this instr */ - - foreach_def (name, ctx, instr) { - /* NOTE: checking ctx->def filters out things like split/ - * collect which are just redefining existing live names - * or array writes to already live array elements: - */ - if (ctx->def[name] != instr->ip) - continue; - new_live += live_size(instr); - d("NEW_LIVE: %u (new_live=%u, use=%u)", name, new_live, ctx->use[name]); - BITSET_SET(live, name); - /* There can be cases where this is *also* the last use - * of a value, for example instructions that write multiple - * values, only some of which are used. These values are - * dead *after* (rather than during) this instruction. - */ - if (ctx->use[name] != instr->ip) - continue; - next_dead += live_size(instr); - d("NEXT_DEAD: %u (next_dead=%u)", name, next_dead); - BITSET_CLEAR(live, name); - } - - /* To be more resilient against special cases where liverange - * is extended (like first_non_input), rather than using the - * foreach_use() iterator, we iterate the current live values - * instead: - */ - BITSET_FOREACH_SET (name, live, ctx->alloc_count) { - /* Is this the last use? */ - if (ctx->use[name] != instr->ip) - continue; - new_dead += name_size(ctx, name); - d("NEW_DEAD: %u (new_dead=%u)", name, new_dead); - BITSET_CLEAR(live, name); - } + if (instr->regs[0]->flags & IR3_REG_UNUSED) + ra_file_remove(file, interval); - cur_live += new_live; - cur_live -= new_dead; + ra_foreach_src_rev(src, instr) + assign_src(ctx, instr, src); +} - ra_assert(ctx, cur_live >= 0); - d("CUR_LIVE: %u", cur_live); +/* chmask is a bit weird, because it has pre-colored sources due to the need + * to pass some registers to the next stage. Fortunately there are only at + * most two, and there should be no other live values by the time we get to + * this instruction, so we only have to do the minimum and don't need any + * fancy fallbacks. + * + * TODO: Add more complete handling of precolored sources, e.g. for function + * argument handling. 
We'd need a way to mark sources as fixed so that they + * don't get moved around when placing other sources in the fallback case, and + * a duplication of much of the logic in get_reg(). This also opens another + * can of worms, e.g. what if the precolored source is a split of a vector + * which is still live -- this breaks our assumption that splits don't incur + * any "extra" register requirements and we'd have to break it out of the + * parent ra_interval. + */ - max = MAX2(max, cur_live); +static void +handle_precolored_source(struct ra_ctx *ctx, struct ir3_register *src) +{ + struct ra_file *file = ra_get_file(ctx, src); + struct ra_interval *interval = &ctx->intervals[src->def->name]; + physreg_t physreg = ra_reg_get_physreg(src); - /* account for written values which are not used later, - * but after updating max (since they are for one cycle - * live) - */ - cur_live -= next_dead; - ra_assert(ctx, cur_live >= 0); + if (ra_interval_get_num(interval) == src->num) + return; - if (RA_DEBUG) { - unsigned cnt = 0; - BITSET_FOREACH_SET (name, live, ctx->alloc_count) { - cnt += name_size(ctx, name); - } - ra_assert(ctx, cur_live == cnt); + /* Try evicting stuff in our way if it isn't free. This won't move + * anything unless it overlaps with our precolored physreg, so we don't + * have to worry about evicting other precolored sources. + */ + if (!get_reg_specified(file, src, physreg, true)) { + unsigned eviction_count; + if (!try_evict_regs(ctx, file, src, physreg, &eviction_count, true, false)) { + unreachable("failed to evict for precolored source!"); + return; } } - d("block%u max=%u", block_id(block), max); + ra_move_interval(ctx, file, interval, physreg); +} - /* the remaining live should match liveout (for extra sanity testing): */ - if (RA_DEBUG) { - unsigned new_dead = 0; - BITSET_FOREACH_SET (name, live, ctx->alloc_count) { - /* Is this the last use? */ - if (ctx->use[name] != block->end_ip) - continue; - new_dead += name_size(ctx, name); - d("NEW_DEAD: %u (new_dead=%u)", name, new_dead); - BITSET_CLEAR(live, name); - } - unsigned liveout = 0; - BITSET_FOREACH_SET (name, bd->liveout, ctx->alloc_count) { - liveout += name_size(ctx, name); - BITSET_CLEAR(live, name); - } +static void +handle_chmask(struct ra_ctx *ctx, struct ir3_instruction *instr) +{ + /* Note: we purposely don't mark sources as killed, so that we can reuse + * some of the get_reg() machinery as-if the source is a destination. + * Marking it as killed would make e.g. get_reg_specified() wouldn't work + * correctly. + */ + ra_foreach_src(src, instr) { + assert(src->num != INVALID_REG); + handle_precolored_source(ctx, src); + } - if (cur_live != liveout) { - print_bitset("LEAKED", live, ctx->alloc_count); - /* TODO there are a few edge cases where live-range extension - * tells us a value is livein. But not used by the block or - * liveout for the block. Possibly a bug in the liverange - * extension. 
But for now leave the assert disabled: - ra_assert(ctx, cur_live == liveout); - */ - } + ra_foreach_src(src, instr) { + struct ra_file *file = ra_get_file(ctx, src); + struct ra_interval *interval = &ctx->intervals[src->def->name]; + if (src->flags & IR3_REG_FIRST_KILL) + ra_file_remove(file, interval); } - ralloc_free(live); + /* add dummy destination for validation */ + assign_reg(instr, instr->regs[0], 0); - return max; + insert_parallel_copy_instr(ctx, instr); } -static unsigned -ra_calc_max_live_values(struct ir3_ra_ctx *ctx) +static physreg_t +read_register(struct ra_ctx *ctx, struct ir3_block *block, struct ir3_register *def) { - unsigned max = 0; - - foreach_block (block, &ctx->ir->block_list) { - unsigned block_live = ra_calc_block_live_values(ctx, block); - max = MAX2(max, block_live); + struct ra_block_state *state = &ctx->blocks[block->index]; + if (state->renames) { + struct hash_entry *entry = _mesa_hash_table_search(state->renames, def); + if (entry) { + return (physreg_t)(uintptr_t)entry->data; + } } - return max; + return ra_reg_get_physreg(def); } static void -ra_add_interference(struct ir3_ra_ctx *ctx) +handle_live_in(struct ra_ctx *ctx, struct ir3_register *def) { - struct ir3 *ir = ctx->ir; + physreg_t physreg = ~0; + for (unsigned i = 0; i < ctx->block->predecessors_count; i++) { + struct ir3_block *pred = ctx->block->predecessors[i]; + struct ra_block_state *pred_state = &ctx->blocks[pred->index]; - /* initialize array live ranges: */ - foreach_array (arr, &ir->array_list) { - arr->start_ip = ~0; - arr->end_ip = 0; - } + if (!pred_state->visited) + continue; - /* set up the r0.xyz precolor regs. */ - for (int i = 0; i < 3; i++) { - ra_set_node_reg(ctx->g, ctx->r0_xyz_nodes + i, i); - ra_set_node_reg(ctx->g, ctx->hr0_xyz_nodes + i, - ctx->set->first_half_reg + i); + physreg = read_register(ctx, pred, def); + break; } - /* pre-color node that conflict with half/full regs higher than what - * can be encoded for tex-prefetch: - */ - ra_set_node_reg(ctx->g, ctx->prefetch_exclude_node, - ctx->set->prefetch_exclude_reg); + assert(physreg != (physreg_t)~0); + + struct ra_interval *interval = &ctx->intervals[def->name]; + struct ra_file *file = ra_get_file(ctx, def); + ra_interval_init(interval, def); + interval->physreg_start = physreg; + interval->physreg_end = physreg + reg_size(def); + ra_file_insert(file, interval); +} - /* compute live ranges (use/def) on a block level, also updating - * block's def/use bitmasks (used below to calculate per-block - * livein/liveout): +static void +handle_live_out(struct ra_ctx *ctx, struct ir3_register *def) +{ + /* Skip parallelcopy's which in the original program are only used as phi + * arguments. Even though phi arguments are live out, they are only + * assigned when the phi is. 
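+ *
+ * For everything else, if the value ended up in a different physreg
+ * than the one recorded in its def (because it was moved or evicted),
+ * we remember that as a "rename" so that successor blocks can look up
+ * the value's actual location via read_register().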
*/ - foreach_block (block, &ir->block_list) { - ra_block_compute_live_ranges(ctx, block); + if (def->instr->opc == OPC_META_PARALLEL_COPY) + return; + + struct ra_block_state *state = &ctx->blocks[ctx->block->index]; + struct ra_interval *interval = &ctx->intervals[def->name]; + physreg_t physreg = ra_interval_get_physreg(interval); + if (physreg != ra_reg_get_physreg(def)) { + if (!state->renames) + state->renames = _mesa_pointer_hash_table_create(ctx); + _mesa_hash_table_insert(state->renames, def, (void *)(uintptr_t)physreg); } +} - /* update per-block livein/liveout: */ - while (ra_compute_livein_liveout(ctx)) {} +static void +handle_phi(struct ra_ctx *ctx, struct ir3_register *def) +{ + struct ra_file *file = ra_get_file(ctx, def); + struct ra_interval *interval = &ctx->intervals[def->name]; - if (RA_DEBUG) { - d("AFTER LIVEIN/OUT:"); - foreach_block (block, &ir->block_list) { - struct ir3_ra_block_data *bd = block->data; - d("block%u:", block_id(block)); - print_bitset(" def", bd->def, ctx->alloc_count); - print_bitset(" use", bd->use, ctx->alloc_count); - print_bitset(" l/i", bd->livein, ctx->alloc_count); - print_bitset(" l/o", bd->liveout, ctx->alloc_count); - } - foreach_array (arr, &ir->array_list) { - d("array%u:", arr->id); - d(" length: %u", arr->length); - d(" start_ip: %u", arr->start_ip); - d(" end_ip: %u", arr->end_ip); - } + /* phis are always scalar, so they should already be the smallest possible + * size. However they may be coalesced with other live-in values/phi + * nodes, so check for that here. + */ + struct ir3_reg_interval *parent_ir3 = + ir3_reg_interval_search(&file->reg_ctx.intervals, def->interval_start); + physreg_t physreg; + if (parent_ir3) { + struct ra_interval *parent = ir3_reg_interval_to_ra_interval(parent_ir3); + physreg = ra_interval_get_physreg(parent) + + (def->interval_start - parent_ir3->reg->interval_start); + } else { + physreg = get_reg(ctx, file, def, false); } - /* extend start/end ranges based on livein/liveout info from cfg: */ - foreach_block (block, &ir->block_list) { - struct ir3_ra_block_data *bd = block->data; - - for (unsigned i = 0; i < ctx->alloc_count; i++) { - if (BITSET_TEST(bd->livein, i)) { - ctx->def[i] = MIN2(ctx->def[i], block->start_ip); - ctx->use[i] = MAX2(ctx->use[i], block->start_ip); - } + allocate_dst_fixed(ctx, def, physreg); - if (BITSET_TEST(bd->liveout, i)) { - ctx->def[i] = MIN2(ctx->def[i], block->end_ip); - ctx->use[i] = MAX2(ctx->use[i], block->end_ip); - } - } + ra_file_insert(file, interval); +} - foreach_array (arr, &ctx->ir->array_list) { - for (unsigned i = 0; i < arr->length; i++) { - if (BITSET_TEST(bd->livein, i + arr->base)) { - arr->start_ip = MIN2(arr->start_ip, block->start_ip); - } - if (BITSET_TEST(bd->liveout, i + arr->base)) { - arr->end_ip = MAX2(arr->end_ip, block->end_ip); - } - } +static void +assign_phi(struct ra_ctx *ctx, struct ir3_instruction *phi) +{ + struct ra_file *file = ra_get_file(ctx, phi->regs[0]); + struct ra_interval *interval = &ctx->intervals[phi->regs[0]->name]; + assert(!interval->interval.parent); + unsigned num = ra_interval_get_num(interval); + assign_reg(phi, phi->regs[0], num); + + /* Assign the parallelcopy sources of this phi */ + for (unsigned i = 1; i < phi->regs_count; i++) { + if (phi->regs[i]->def) { + assign_reg(phi, phi->regs[i], num); + assign_reg(phi, phi->regs[i]->def, num); } } - if (ctx->name_to_instr) { - unsigned max = ra_calc_max_live_values(ctx); - ra_set_register_target(ctx, max); - } - - for (unsigned i = 0; i < ctx->alloc_count; i++) { - for 
(unsigned j = 0; j < ctx->alloc_count; j++) { - if (intersects(ctx->def[i], ctx->use[i], - ctx->def[j], ctx->use[j])) { - ra_add_node_interference(ctx->g, i, j); - } - } - } + if (phi->regs[0]->flags & IR3_REG_UNUSED) + ra_file_remove(file, interval); } -/* NOTE: instr could be NULL for IR3_REG_ARRAY case, for the first - * array access(es) which do not have any previous access to depend - * on from scheduling point of view +/* When we split a live range, we sometimes need to emit fixup code at the end + * of a block. For example, something like: + * + * a = ... + * if (...) { + * ... + * a' = a + * b = ... // a evicted to make room for b + * ... + * } + * ... = a + * + * When we insert the copy to a' in insert_parallel_copy_instr(), this forces + * to insert another copy "a = a'" at the end of the if. Normally this would + * also entail adding a phi node, but since we're about to go out of SSA + * anyway we just insert an extra move. Note, however, that "b" might be used + * in a phi node at the end of the if and share registers with "a", so we + * have to be careful to extend any preexisting parallelcopy instruction + * instead of creating our own in order to guarantee that they properly get + * swapped. */ + static void -reg_assign(struct ir3_ra_ctx *ctx, struct ir3_register *reg, - struct ir3_instruction *instr) +insert_liveout_copy(struct ir3_block *block, physreg_t dst, physreg_t src, + struct ir3_register *reg) { - struct ir3_ra_instr_data *id; - - if (reg->flags & IR3_REG_ARRAY) { - struct ir3_array *arr = - ir3_lookup_array(ctx->ir, reg->array.id); - unsigned name = arr->base + reg->array.offset; - unsigned r = ra_get_node_reg(ctx->g, name); - unsigned num = ctx->set->ra_reg_to_gpr[r]; - - if (reg->flags & IR3_REG_RELATIV) { - reg->array.base = arr->reg; - reg->array.offset = num; - } else { - reg->num = num; - reg->flags &= ~IR3_REG_SSA; - } + struct ir3_instruction *old_pcopy = NULL; + if (!list_is_empty(&block->instr_list)) { + struct ir3_instruction *last = + LIST_ENTRY(struct ir3_instruction, block->instr_list.prev, node); + if (last->opc == OPC_META_PARALLEL_COPY) + old_pcopy = last; + } - reg->flags &= ~IR3_REG_ARRAY; - } else if ((id = &ctx->instrd[instr->ip]) && id->defn) { - unsigned first_component = 0; + unsigned old_pcopy_regs = old_pcopy ? old_pcopy->regs_count : 0; + struct ir3_instruction *pcopy = + ir3_instr_create(block, OPC_META_PARALLEL_COPY, + 2 + old_pcopy_regs); - /* Special case for tex instructions, which may use the wrmask - * to mask off the first component(s). In the scalar pass, - * this means the masked off component(s) are not def'd/use'd, - * so we get a bogus value when we ask the register_allocate - * algo to get the assigned reg for the unused/untouched - * component. 
So we need to consider the first used component: - */ - if (ctx->scalar_pass && is_tex_or_prefetch(id->defn)) { - unsigned n = ffs(id->defn->regs[0]->wrmask); - ra_assert(ctx, n > 0); - first_component = n - 1; - } + for (unsigned i = 0; i < old_pcopy_regs / 2; i++) { + pcopy->regs[pcopy->regs_count++] = old_pcopy->regs[i]; + } - unsigned name = scalar_name(ctx, id->defn, first_component); - unsigned r = ra_get_node_reg(ctx->g, name); - unsigned num = ctx->set->ra_reg_to_gpr[r] + id->off; + struct ir3_register *dst_reg = + ir3_reg_create(pcopy, ra_physreg_to_num(dst, reg->flags), reg->flags); + dst_reg->wrmask = reg->wrmask; + dst_reg->size = reg->size; - ra_assert(ctx, !(reg->flags & IR3_REG_RELATIV)); + for (unsigned i = old_pcopy_regs / 2; i < old_pcopy_regs; i++) { + pcopy->regs[pcopy->regs_count++] = old_pcopy->regs[i]; + } - ra_assert(ctx, num >= first_component); + struct ir3_register *src_reg = + ir3_reg_create(pcopy, ra_physreg_to_num(src, reg->flags), + reg->flags & ~IR3_REG_DEST); + src_reg->wrmask = reg->wrmask; + src_reg->size = reg->size; - if (is_shared(id->defn)) - num += FIRST_SHARED_REG; + if (old_pcopy) + list_del(&old_pcopy->node); +} - reg->num = num - first_component; +static void +insert_live_in_move(struct ra_ctx *ctx, struct ra_interval *interval) +{ + physreg_t physreg = ra_interval_get_physreg(interval); + + for (unsigned i = 0; i < ctx->block->predecessors_count; i++) { + struct ir3_block *pred = ctx->block->predecessors[i]; + struct ra_block_state *pred_state = &ctx->blocks[pred->index]; - reg->flags &= ~IR3_REG_SSA; + if (!pred_state->visited) + continue; - if (is_half(id->defn)) - reg->flags |= IR3_REG_HALF; + physreg_t pred_reg = read_register(ctx, pred, interval->interval.reg); + if (pred_reg != physreg) { + insert_liveout_copy(pred, physreg, pred_reg, interval->interval.reg); + } } } -/* helper to determine which regs to assign in which pass: */ -static bool -should_assign(struct ir3_ra_ctx *ctx, struct ir3_instruction *instr) +static void +insert_file_live_in_moves(struct ra_ctx *ctx, struct ra_file *file) { - /* Array regs are precolored completely separately, and we need to keep - * their array-ness until the end to be able to compute the array reg's - * live interval in the scalar pass. - */ - if (instr->regs[0]->flags & IR3_REG_ARRAY) - return ctx->scalar_pass; - - if ((instr->opc == OPC_META_SPLIT) && - (util_bitcount(instr->regs[1]->wrmask) > 1)) - return !ctx->scalar_pass; - if ((instr->opc == OPC_META_COLLECT) && - (util_bitcount(instr->regs[0]->wrmask) > 1)) - return !ctx->scalar_pass; - return ctx->scalar_pass; + BITSET_WORD *live_in = ctx->live->live_in[ctx->block->index]; + rb_tree_foreach(struct ra_interval, interval, &file->physreg_intervals, physreg_node) { + /* Skip phi nodes. This needs to happen after phi nodes are allocated, + * because we may have to move live-ins around to make space for phi + * nodes, but we shouldn't be handling phi nodes here. + */ + if (BITSET_TEST(live_in, interval->interval.reg->name)) + insert_live_in_move(ctx, interval); + } } static void -ra_block_alloc(struct ir3_ra_ctx *ctx, struct ir3_block *block) +insert_entry_regs(struct ra_block_state *state, struct ra_file *file) { - foreach_instr (instr, &block->instr_list) { - - if (writes_gpr(instr)) { - if (should_assign(ctx, instr)) { - reg_assign(ctx, instr->regs[0], instr); - } - } - - foreach_src_n (reg, n, instr) { - struct ir3_instruction *src = reg->def ? 
reg->def->instr : NULL; - - if (src && should_assign(ctx, instr)) - reg_assign(ctx, src->regs[0], src); - - /* Note: reg->def could be null for IR3_REG_ARRAY */ - if (((reg->flags & IR3_REG_ARRAY) && ctx->scalar_pass) || - (src && should_assign(ctx, src))) { - reg_assign(ctx, instr->regs[n+1], src); - } - } - } - - /* We need to pre-color outputs for the scalar pass in - * ra_precolor_assigned(), so we need to actually assign - * them in the first pass: - */ - if (!ctx->scalar_pass) { - foreach_input (in, ctx->ir) { - reg_assign(ctx, in->regs[0], in); - } + rb_tree_foreach(struct ra_interval, interval, &file->physreg_intervals, physreg_node) { + _mesa_hash_table_insert(state->entry_regs, interval->interval.reg, + (void *)(uintptr_t)interval->physreg_start); } } static void -assign_arr_base(struct ir3_ra_ctx *ctx, struct ir3_array *arr, - struct ir3_instruction **precolor, unsigned nprecolor) +insert_live_in_moves(struct ra_ctx *ctx) { - /* In the mergedregs case, we convert full precision arrays - * to their effective half-precision base, and find conflicts - * amongst all other arrays/inputs. - * - * In the splitregs case (halfreg file and fullreg file do - * not conflict), we ignore arrays and other pre-colors that - * are not the same precision. - */ - bool mergedregs = ctx->v->mergedregs; - unsigned base = 0; + insert_file_live_in_moves(ctx, &ctx->full); + insert_file_live_in_moves(ctx, &ctx->half); + insert_file_live_in_moves(ctx, &ctx->shared); - /* figure out what else we conflict with which has already - * been assigned: + /* If not all predecessors are visited, insert live-in regs so that + * insert_live_out_moves() will work. */ -retry: - foreach_array (arr2, &ctx->ir->array_list) { - if (arr2 == arr) + bool all_preds_visited = true; + for (unsigned i = 0; i < ctx->block->predecessors_count; i++) { + if (!ctx->blocks[ctx->block->predecessors[i]->index].visited) { + all_preds_visited = false; break; - ra_assert(ctx, arr2->start_ip <= arr2->end_ip); - - unsigned base2 = arr2->reg; - unsigned len2 = arr2->length; - unsigned len = arr->length; - - if (mergedregs) { - /* convert into half-reg space: */ - if (!arr2->half) { - base2 *= 2; - len2 *= 2; - } - if (!arr->half) { - len *= 2; - } - } else if (arr2->half != arr->half) { - /* for split-register-file mode, we only conflict with - * other arrays of same precision: - */ - continue; - } - - /* if it intersects with liverange AND register range.. 
*/ - if (intersects(arr->start_ip, arr->end_ip, - arr2->start_ip, arr2->end_ip) && - intersects(base, base + len, - base2, base2 + len2)) { - base = MAX2(base, base2 + len2); - goto retry; } } - /* also need to not conflict with any pre-assigned inputs: */ - for (unsigned i = 0; i < nprecolor; i++) { - struct ir3_instruction *instr = precolor[i]; + if (!all_preds_visited) { + struct ra_block_state *state = &ctx->blocks[ctx->block->index]; + state->entry_regs = _mesa_pointer_hash_table_create(ctx); + + insert_entry_regs(state, &ctx->full); + insert_entry_regs(state, &ctx->half); + insert_entry_regs(state, &ctx->shared); + } +} - if (!instr || (instr->flags & IR3_INSTR_UNUSED)) +static void +insert_live_out_move(struct ra_ctx *ctx, struct ra_interval *interval) +{ + for (unsigned i = 0; i < 2; i++) { + if (!ctx->block->successors[i]) continue; - struct ir3_ra_instr_data *id = &ctx->instrd[instr->ip]; + struct ir3_block *succ = ctx->block->successors[i]; + struct ra_block_state *succ_state = &ctx->blocks[succ->index]; - /* only consider the first component: */ - if (id->off > 0) + if (!succ_state->visited) continue; - unsigned name = ra_name(ctx, id); - unsigned regid = instr->regs[0]->num; - unsigned reglen = class_sizes[id->cls]; - unsigned len = arr->length; - - if (mergedregs) { - /* convert into half-reg space: */ - if (!is_half(instr)) { - regid *= 2; - reglen *= 2; - } - if (!arr->half) { - len *= 2; - } - } else if (is_half(instr) != arr->half) { - /* for split-register-file mode, we only conflict with - * other arrays of same precision: - */ + struct hash_entry *entry = + _mesa_hash_table_search(succ_state->entry_regs, interval->interval.reg); + if (!entry) continue; - } - /* Check if array intersects with liverange AND register - * range of the input: - */ - if (intersects(arr->start_ip, arr->end_ip, - ctx->def[name], ctx->use[name]) && - intersects(base, base + len, - regid, regid + reglen)) { - base = MAX2(base, regid + reglen); - goto retry; + physreg_t new_reg = (physreg_t)(uintptr_t)entry->data; + if (new_reg != interval->physreg_start) { + insert_liveout_copy(ctx->block, new_reg, interval->physreg_start, + interval->interval.reg); } } +} - /* convert back from half-reg space to fullreg space: */ - if (mergedregs && !arr->half) { - base = DIV_ROUND_UP(base, 2); +static void +insert_file_live_out_moves(struct ra_ctx *ctx, struct ra_file *file) +{ + rb_tree_foreach(struct ra_interval, interval, &file->physreg_intervals, physreg_node) { + insert_live_out_move(ctx, interval); } +} - arr->reg = base; +static void +insert_live_out_moves(struct ra_ctx *ctx) +{ + insert_file_live_out_moves(ctx, &ctx->full); + insert_file_live_out_moves(ctx, &ctx->half); + insert_file_live_out_moves(ctx, &ctx->shared); } -/* handle pre-colored registers. This includes "arrays" (which could be of - * length 1, used for phi webs lowered to registers in nir), as well as - * special shader input values that need to be pinned to certain registers. - */ static void -ra_precolor(struct ir3_ra_ctx *ctx, struct ir3_instruction **precolor, unsigned nprecolor) +handle_block(struct ra_ctx *ctx, struct ir3_block *block) { - for (unsigned i = 0; i < nprecolor; i++) { - if (precolor[i] && !(precolor[i]->flags & IR3_INSTR_UNUSED)) { - struct ir3_instruction *instr = precolor[i]; + ctx->block = block; + + /* Reset the register files from the last block */ + ra_file_init(&ctx->full); + ra_file_init(&ctx->half); + ra_file_init(&ctx->shared); + + /* Handle live-ins, phis, and input meta-instructions. 
These all appear + * live at the beginning of the block, and interfere with each other + * therefore need to be allocated "in parallel". This means that we + * have to allocate all of them, inserting them into the file, and then + * delay updating the IR until all of them are allocated. + * + * Handle precolored inputs first, because we need to make sure that other + * inputs don't overwrite them. We shouldn't have both live-ins/phi nodes + * and inputs at the same time, because the first block doesn't have + * predecessors. Therefore handle_live_in doesn't have to worry about + * them. + */ - if (instr->regs[0]->num == INVALID_REG) - continue; + foreach_instr (instr, &block->instr_list) { + if (instr->opc == OPC_META_INPUT) + handle_precolored_input(ctx, instr); + else + break; + } - struct ir3_ra_instr_data *id = &ctx->instrd[instr->ip]; - - ra_assert(ctx, !(instr->regs[0]->flags & (IR3_REG_HALF | IR3_REG_SHARED))); - - /* 'base' is in scalar (class 0) but we need to map that - * the conflicting register of the appropriate class (ie. - * input could be vec2/vec3/etc) - * - * Note that the higher class (larger than scalar) regs - * are setup to conflict with others in the same class, - * so for example, R1 (scalar) is also the first component - * of D1 (vec2/double): - * - * Single (base) | Double - * --------------+--------------- - * R0 | D0 - * R1 | D0 D1 - * R2 | D1 D2 - * R3 | D2 - * .. and so on.. - */ - unsigned regid = instr->regs[0]->num; - ra_assert(ctx, regid >= id->off); - regid -= id->off; + unsigned name; + BITSET_FOREACH_SET(name, ctx->live->live_in[block->index], + ctx->live->definitions_count) { + struct ir3_register *reg = ctx->live->definitions[name]; + handle_live_in(ctx, reg); + } - unsigned reg = ctx->set->gpr_to_ra_reg[id->cls][regid]; - unsigned name = ra_name(ctx, id); - ra_set_node_reg(ctx->g, name, reg); - } + foreach_instr (instr, &block->instr_list) { + if (instr->opc == OPC_META_PHI) + handle_phi(ctx, instr->regs[0]); + else if (instr->opc == OPC_META_INPUT || instr->opc == OPC_META_TEX_PREFETCH) + handle_input(ctx, instr); + else + break; } - /* - * Pre-assign array elements: + /* After this point, every live-in/phi/input has an interval assigned to + * it. We delay actually assigning values until everything has been + * allocated, so we can simply ignore any parallel copy entries created + * when shuffling them around. */ - foreach_array (arr, &ctx->ir->array_list) { - - if (arr->end_ip == 0) - continue; - - if (!ctx->scalar_pass) - assign_arr_base(ctx, arr, precolor, nprecolor); + ctx->parallel_copies_count = 0; - for (unsigned i = 0; i < arr->length; i++) { - unsigned cls = arr->half ? HALF_OFFSET : 0; + insert_live_in_moves(ctx); - ra_set_node_reg(ctx->g, - arr->base + i, /* vreg name */ - ctx->set->gpr_to_ra_reg[cls][arr->reg + i]); - } + if (RA_DEBUG) { + printf("after live-in block %u:\n", block->index); + ra_ctx_dump(ctx); } - if (ir3_shader_debug & IR3_DBG_OPTMSGS) { - foreach_array (arr, &ctx->ir->array_list) { - unsigned first = arr->reg; - unsigned last = arr->reg + arr->length - 1; - debug_printf("arr[%d] at r%d.%c->r%d.%c\n", arr->id, - (first >> 2), "xyzw"[first & 0x3], - (last >> 2), "xyzw"[last & 0x3]); + /* Now we're done with processing live-ins, and can handle the body of the + * block. 
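+ *
+ * Each instruction in the body is dispatched on its opcode below: phis
+ * and inputs already have intervals from the code above and only get
+ * their registers assigned here, split/collect/parallel-copy get
+ * coalescing-aware handling, chmask handles its precolored sources,
+ * and everything else goes through handle_normal_instr().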
+ */ + foreach_instr (instr, &block->instr_list) { + if (RA_DEBUG) { + printf("processing: "); + ir3_print_instr(instr); } + + if (instr->opc == OPC_META_PHI) + assign_phi(ctx, instr); + else if (instr->opc == OPC_META_INPUT || instr->opc == OPC_META_TEX_PREFETCH) + assign_input(ctx, instr); + else if (instr->opc == OPC_META_SPLIT) + handle_split(ctx, instr); + else if (instr->opc == OPC_META_COLLECT) + handle_collect(ctx, instr); + else if (instr->opc == OPC_META_PARALLEL_COPY) + handle_pcopy(ctx, instr); + else if (instr->opc == OPC_CHMASK) + handle_chmask(ctx, instr); + else + handle_normal_instr(ctx, instr); + + if (RA_DEBUG) + ra_ctx_dump(ctx); } -} -static void -precolor(struct ir3_ra_ctx *ctx, struct ir3_instruction *instr) -{ - struct ir3_ra_instr_data *id = &ctx->instrd[instr->ip]; - unsigned n = dest_regs(instr); - for (unsigned i = 0; i < n; i++) { - /* tex instructions actually have a wrmask, and - * don't touch masked out components. So we - * shouldn't precolor them:: - */ - if (is_tex_or_prefetch(instr) && - !(instr->regs[0]->wrmask & (1 << i))) - continue; + insert_live_out_moves(ctx); - unsigned name = scalar_name(ctx, instr, i); - unsigned regid = instr->regs[0]->num + i; + BITSET_FOREACH_SET(name, ctx->live->live_out[block->index], + ctx->live->definitions_count) { + struct ir3_register *reg = ctx->live->definitions[name]; + handle_live_out(ctx, reg); + } - if (instr->regs[0]->flags & IR3_REG_SHARED) - regid -= FIRST_SHARED_REG; + ctx->blocks[block->index].visited = true; - unsigned vreg = ctx->set->gpr_to_ra_reg[id->cls][regid]; - ra_set_node_reg(ctx->g, name, vreg); + for (unsigned i = 0; i < block->dom_children_count; i++) { + handle_block(ctx, block->dom_children[i]); } } -/* pre-color non-scalar registers based on the registers assigned in previous - * pass. Do this by looking actually at the fanout instructions. - */ -static void -ra_precolor_assigned(struct ir3_ra_ctx *ctx) +static unsigned +calc_target_full_pressure(struct ir3_shader_variant *v, unsigned pressure) { - ra_assert(ctx, ctx->scalar_pass); + /* Registers are allocated in units of vec4, so switch from units of + * half-regs to vec4. 
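+ *
+ * For example (illustrative numbers): a max pressure of 25 half-regs
+ * becomes DIV_ROUND_UP(25, 2 * 4) = 4 vec4 registers. We then raise
+ * the target as long as the threadsize decision is unchanged and the
+ * register-dependent wave count stays at or above the original target,
+ * and convert back to units of half-regs at the end.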
+ */ + unsigned reg_count = DIV_ROUND_UP(pressure, 2 * 4); + + bool double_threadsize = ir3_should_double_threadsize(v, reg_count); + + unsigned target = reg_count; + unsigned reg_independent_max_waves = + ir3_get_reg_independent_max_waves(v, double_threadsize); + unsigned reg_dependent_max_waves = + ir3_get_reg_dependent_max_waves(v->shader->compiler, reg_count, + double_threadsize); + unsigned target_waves = + MIN2(reg_independent_max_waves, reg_dependent_max_waves); + + while (target <= RA_FULL_SIZE / (2 * 4) && + ir3_should_double_threadsize(v, target) == double_threadsize && + ir3_get_reg_dependent_max_waves(v->shader->compiler, target, + double_threadsize) >= target_waves) + target++; + + return (target - 1) * 2 * 4; +} - foreach_block (block, &ctx->ir->block_list) { - foreach_instr (instr, &block->instr_list) { +int +ir3_ra(struct ir3_shader_variant *v) +{ + ir3_calc_dominance(v->ir); - if (!writes_gpr(instr)) - continue; + ir3_create_parallel_copies(v->ir); - if (should_assign(ctx, instr)) - continue; + struct ir3_liveness *live = ir3_calc_liveness(v); - precolor(ctx, instr); + ir3_debug_print(v->ir, "AFTER: create_parallel_copies"); - foreach_src (src, instr) { - if (!src->def) - continue; - precolor(ctx, src->def->instr); - } - } - } -} + ir3_merge_regs(live, v->ir); -static int -ra_alloc(struct ir3_ra_ctx *ctx) -{ - if (!ra_allocate(ctx->g)) - return -1; + struct ir3_pressure max_pressure; + ir3_calc_pressure(v, live, &max_pressure); + d("max pressure:"); + d("\tfull: %u", max_pressure.full); + d("\thalf: %u", max_pressure.half); + d("\tshared: %u", max_pressure.shared); - foreach_block (block, &ctx->ir->block_list) { - ra_block_alloc(ctx, block); + if (v->mergedregs) { + max_pressure.full += max_pressure.half; + max_pressure.half = 0; } - return 0; -} - -/* if we end up with split/collect instructions with non-matching src - * and dest regs, that means something has gone wrong. Which makes it - * a pretty good sanity check. - */ -static void -ra_sanity_check(struct ir3 *ir) -{ - foreach_block (block, &ir->block_list) { - foreach_instr (instr, &block->instr_list) { - if (instr->opc == OPC_META_SPLIT) { - struct ir3_register *dst = instr->regs[0]; - struct ir3_register *src = instr->regs[1]; - debug_assert(dst->num == (src->num + instr->split.off)); - } else if (instr->opc == OPC_META_COLLECT) { - struct ir3_register *dst = instr->regs[0]; - - foreach_src_n (src, n, instr) { - debug_assert(dst->num == (src->num - n)); - } - } - } + if (max_pressure.full > RA_FULL_SIZE || + max_pressure.half > RA_HALF_SIZE || + max_pressure.shared > RA_SHARED_SIZE) { + d("max pressure exceeded!"); + return 1; } -} - -static int -ir3_ra_pass(struct ir3_shader_variant *v, struct ir3_instruction **precolor, - unsigned nprecolor, bool scalar_pass) -{ - struct ir3_ra_ctx ctx = { - .v = v, - .ir = v->ir, - .set = v->mergedregs ? 
- v->ir->compiler->mergedregs_set : v->ir->compiler->set, - .scalar_pass = scalar_pass, - }; - int ret; - ret = setjmp(ctx.jmp_env); - if (ret) - goto fail; + struct ra_ctx *ctx = rzalloc(NULL, struct ra_ctx); - ra_init(&ctx); - ra_add_interference(&ctx); - ra_precolor(&ctx, precolor, nprecolor); - if (scalar_pass) - ra_precolor_assigned(&ctx); - ret = ra_alloc(&ctx); + ctx->merged_regs = v->mergedregs; + ctx->compiler = v->shader->compiler; + ctx->stage = v->type; + ctx->live = live; + ctx->intervals = rzalloc_array(ctx, struct ra_interval, live->definitions_count); + ctx->blocks = rzalloc_array(ctx, struct ra_block_state, live->block_count); -fail: - ra_destroy(&ctx); + ctx->full.size = calc_target_full_pressure(v, max_pressure.full); + d("full size: %u", ctx->full.size); + + if (!v->mergedregs) + ctx->half.size = RA_HALF_SIZE; - return ret; -} + ctx->shared.size = RA_SHARED_SIZE; -int -ir3_ra(struct ir3_shader_variant *v, struct ir3_instruction **precolor, - unsigned nprecolor) -{ - int ret; + handle_block(ctx, ir3_start_block(v->ir)); - /* First pass, assign the vecN (non-scalar) registers: */ - ret = ir3_ra_pass(v, precolor, nprecolor, false); - if (ret) - return ret; + /* Strip array-ness and SSA-ness at the end, because various helpers still + * need to work even on definitions that have already been assigned. For + * example, we need to preserve array-ness so that array live-ins have the + * right size. + */ + foreach_block (block, &v->ir->block_list) { + foreach_instr (instr, &block->instr_list) { + for (unsigned i = 0; i < instr->regs_count; i++) { + instr->regs[i]->flags &= ~IR3_REG_SSA; - ir3_debug_print(v->ir, "AFTER: ir3_ra (1st pass)"); + /* Parallel copies of array registers copy the whole register, + * and we need some way to let the parallel copy code know + * that this was an array whose size is determined by + * reg->size. So keep the array flag on those. + */ + if (!is_meta(instr)) + instr->regs[i]->flags &= ~IR3_REG_ARRAY; + } + } + } - /* Second pass, assign the scalar registers: */ - ret = ir3_ra_pass(v, precolor, nprecolor, true); - if (ret) - return ret; + ir3_debug_print(v->ir, "AFTER: register allocation"); - ir3_debug_print(v->ir, "AFTER: ir3_ra (2st pass)"); + ir3_lower_copies(v); -#ifdef DEBUG -# define SANITY_CHECK DEBUG -#else -# define SANITY_CHECK 0 -#endif - if (SANITY_CHECK) - ra_sanity_check(v->ir); + ir3_debug_print(v->ir, "AFTER: ir3_lower_copies"); - return ret; + ralloc_free(ctx); + ralloc_free(live); + return 0; } + diff --git a/src/freedreno/ir3/ir3_ra.h b/src/freedreno/ir3/ir3_ra.h index 34f2549..672b536 100644 --- a/src/freedreno/ir3/ir3_ra.h +++ b/src/freedreno/ir3/ir3_ra.h @@ -1,5 +1,5 @@ /* - * Copyright (C) 2014 Rob Clark + * Copyright (C) 2021 Valve Corporation * * Permission is hereby granted, free of charge, to any person obtaining a * copy of this software and associated documentation files (the "Software"), @@ -19,361 +19,282 @@ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. - * - * Authors: - * Rob Clark */ -#ifndef IR3_RA_H_ -#define IR3_RA_H_ +#ifndef _IR3_RA_H +#define _IR3_RA_H -#include +#include "ir3.h" +#include "ir3_compiler.h" +#include "util/rb_tree.h" -#include "util/bitset.h" +#ifdef DEBUG +#define RA_DEBUG (ir3_shader_debug & IR3_DBG_RAMSGS) +#else +#define RA_DEBUG 0 +#endif +#define d(fmt, ...) 
do { if (RA_DEBUG) { \ + printf("RA: "fmt"\n", ##__VA_ARGS__); \ +} } while (0) +#define di(instr, fmt, ...) do { if (RA_DEBUG) { \ + printf("RA: "fmt": ", ##__VA_ARGS__); \ + ir3_print_instr(instr); \ +} } while (0) -static const unsigned class_sizes[] = { - 1, 2, 3, 4, - 4 + 4, /* txd + 1d/2d */ - 4 + 6, /* txd + 3d */ -}; -#define class_count ARRAY_SIZE(class_sizes) +typedef uint16_t physreg_t; -static const unsigned half_class_sizes[] = { - 1, 2, 3, 4, -}; -#define half_class_count ARRAY_SIZE(half_class_sizes) +static inline unsigned +ra_physreg_to_num(physreg_t physreg, unsigned flags) +{ + if (!(flags & IR3_REG_HALF)) + physreg /= 2; + if (flags & IR3_REG_SHARED) + physreg += 48 * 4; + return physreg; +} -/* seems to just be used for compute shaders? Seems like vec1 and vec3 - * are sufficient (for now?) - */ -static const unsigned shared_class_sizes[] = { - 1, 3, -}; -#define shared_class_count ARRAY_SIZE(shared_class_sizes) +static inline physreg_t +ra_num_to_physreg(unsigned num, unsigned flags) +{ + if (flags & IR3_REG_SHARED) + num -= 48 * 4; + if (!(flags & IR3_REG_HALF)) + num *= 2; + return num; +} -#define total_class_count (class_count + half_class_count + shared_class_count) +static inline unsigned +ra_reg_get_num(const struct ir3_register *reg) +{ + return (reg->flags & IR3_REG_ARRAY) ? reg->array.base : reg->num; +} -/* Below a0.x are normal regs. RA doesn't need to assign a0.x/p0.x. */ -#define NUM_REGS (4 * 48) /* r0 to r47 */ -#define NUM_SHARED_REGS (4 * 8) /* r48 to r55 */ -#define FIRST_SHARED_REG (4 * 48) -/* Number of virtual regs in a given class: */ +static inline physreg_t +ra_reg_get_physreg(const struct ir3_register *reg) +{ + return ra_num_to_physreg(ra_reg_get_num(reg), reg->flags); +} -static inline unsigned CLASS_REGS(unsigned i) +static inline bool +def_is_gpr(const struct ir3_register *reg) { - assert(i < class_count); + return reg_num(reg) != REG_A0 && reg_num(reg) != REG_P0; +} - return (NUM_REGS - (class_sizes[i] - 1)); +/* Note: don't count undef as a source. + */ +static inline bool +ra_reg_is_src(const struct ir3_register *reg) +{ + return (reg->flags & IR3_REG_SSA) && reg->def && + def_is_gpr(reg->def); } -static inline unsigned HALF_CLASS_REGS(unsigned i) +/* Array destinations can act as a source, reading the previous array and then + * modifying it. Return true when the register is an array destination that + * acts like a source. + */ +static inline bool +ra_reg_is_array_rmw(const struct ir3_register *reg) { - assert(i < half_class_count); + return ((reg->flags & IR3_REG_ARRAY) && (reg->flags & IR3_REG_DEST) && reg->def); +} - return (NUM_REGS - (half_class_sizes[i] - 1)); +static inline bool +ra_reg_is_dst(const struct ir3_register *reg) +{ + return (reg->flags & IR3_REG_SSA) && (reg->flags & IR3_REG_DEST) && + def_is_gpr(reg) && + ((reg->flags & IR3_REG_ARRAY) || reg->wrmask); } -static inline unsigned SHARED_CLASS_REGS(unsigned i) +static inline struct ir3_register * +ra_dst_get_tied_src(const struct ir3_compiler *compiler, struct ir3_register *dst) { - assert(i < shared_class_count); + /* With the a6xx new cat6 encoding, the same register is used for the + * value and destination of atomic operations. 
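+ *
+ * In that case the tied source is regs[3] and the destination is
+ * regs[0]; ra_src_get_tied_dst() below is the inverse mapping.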
+ */ + if (compiler->gpu_id >= 600 && is_atomic(dst->instr->opc) && + (dst->instr->flags & IR3_INSTR_G)) { + return dst->instr->regs[3]; + } - return (NUM_SHARED_REGS - (shared_class_sizes[i] - 1)); + return NULL; } -#define HALF_OFFSET (class_count) -#define SHARED_OFFSET (class_count + half_class_count) +/* Iterators for sources and destinations which: + * - Don't include fake sources (irrelevant for RA) + * - Don't include non-SSA sources (immediates and constants, also irrelevant) + * - Consider array destinations as both a source and a destination + */ -/* register-set, created one time, used for all shaders: */ -struct ir3_ra_reg_set { - struct ra_regs *regs; - struct ra_class *classes[class_count]; - struct ra_class *half_classes[half_class_count]; - struct ra_class *shared_classes[shared_class_count]; +#define ra_foreach_src(__srcreg, __instr) \ + for (struct ir3_register *__srcreg = (void *)~0; __srcreg; __srcreg = NULL) \ + for (unsigned __cnt = (__instr)->regs_count, __i = 0; __i < __cnt; __i++) \ + if (ra_reg_is_src((__srcreg = (__instr)->regs[__i]))) + +#define ra_foreach_src_rev(__srcreg, __instr) \ + for (struct ir3_register *__srcreg = (void *)~0; __srcreg; __srcreg = NULL) \ + for (int __cnt = (__instr)->regs_count, __i = __cnt - 1; __i >= 0; __i--) \ + if (ra_reg_is_src((__srcreg = (__instr)->regs[__i]))) + +#define ra_foreach_dst(__srcreg, __instr) \ + for (struct ir3_register *__srcreg = (void *)~0; __srcreg; __srcreg = NULL) \ + for (unsigned __cnt = (__instr)->regs_count, __i = 0; __i < __cnt; __i++) \ + if (ra_reg_is_dst((__srcreg = (__instr)->regs[__i]))) + +static inline struct ir3_register * +ra_src_get_tied_dst(const struct ir3_compiler *compiler, + struct ir3_instruction *instr, + struct ir3_register *src) +{ + if (compiler->gpu_id >= 600 && is_atomic(instr->opc) && + (instr->flags & IR3_INSTR_G) && src == instr->regs[3]) { + return instr->regs[0]; + } - /* pre-fetched tex dst is limited, on current gens to regs - * 0x3f and below. An additional register class, with one - * vreg, that is setup to conflict with any regs above that - * limit. - */ - struct ra_class *prefetch_exclude_class; - unsigned prefetch_exclude_reg; - - /* The virtual register space flattens out all the classes, - * starting with full, followed by half and then shared, ie: - * - * scalar full (starting at zero) - * vec2 full - * vec3 full - * ... - * vecN full - * scalar half (starting at first_half_reg) - * vec2 half - * ... - * vecN half - * scalar shared (starting at first_shared_reg) - * ... 
- * vecN shared - * - */ - unsigned first_half_reg, first_shared_reg; + return NULL; +} - /* maps flat virtual register space to base gpr: */ - uint16_t *ra_reg_to_gpr; - /* maps cls,gpr to flat virtual register space: */ - uint16_t **gpr_to_ra_reg; -}; -/* additional block-data (per-block) */ -struct ir3_ra_block_data { - BITSET_WORD *def; /* variables defined before used in block */ - BITSET_WORD *use; /* variables used before defined in block */ - BITSET_WORD *livein; /* which defs reach entry point of block */ - BITSET_WORD *liveout; /* which defs reach exit point of block */ -}; +#define RA_HALF_SIZE (4 * 48) +#define RA_FULL_SIZE (4 * 48 * 2) +#define RA_SHARED_SIZE (2 * 4 * 8) +#define RA_MAX_FILE_SIZE RA_FULL_SIZE -/* additional instruction-data (per-instruction) */ -struct ir3_ra_instr_data { - /* cached instruction 'definer' info: */ - struct ir3_instruction *defn; - int off, sz, cls; +struct ir3_liveness { + unsigned block_count; + DECLARE_ARRAY(struct ir3_register *, definitions); + DECLARE_ARRAY(BITSET_WORD *, live_out); + DECLARE_ARRAY(BITSET_WORD *, live_in); }; -/* register-assign context, per-shader */ -struct ir3_ra_ctx { - struct ir3_shader_variant *v; - struct ir3 *ir; +struct ir3_liveness *ir3_calc_liveness(struct ir3_shader_variant *v); - struct ir3_ra_reg_set *set; - struct ra_graph *g; +bool ir3_def_live_after(struct ir3_liveness *live, struct ir3_register *def, + struct ir3_instruction *instr); - /* Are we in the scalar assignment pass? In this pass, all larger- - * than-vec1 vales have already been assigned and pre-colored, so - * we only consider scalar values. - */ - bool scalar_pass; - - unsigned alloc_count; - unsigned r0_xyz_nodes; /* ra node numbers for r0.[xyz] precolors */ - unsigned hr0_xyz_nodes; /* ra node numbers for hr0.[xyz] precolors */ - unsigned prefetch_exclude_node; - /* one per class, plus one slot for arrays: */ - unsigned class_alloc_count[total_class_count + 1]; - unsigned class_base[total_class_count + 1]; - unsigned instr_cnt; - unsigned *def, *use; /* def/use table */ - struct ir3_ra_instr_data *instrd; - - /* Mapping vreg name back to instruction, used select reg callback: */ - struct hash_table *name_to_instr; - - /* Tracking for select_reg callback */ - unsigned start_search_reg; - unsigned max_target; - - /* Temporary buffer for def/use iterators - * - * The worst case should probably be an array w/ relative access (ie. - * all elements are def'd or use'd), and that can't be larger than - * the number of registers. - * - * NOTE we could declare this on the stack if needed, but I don't - * think there is a need for nested iterators. - */ - unsigned namebuf[NUM_REGS]; - unsigned namecnt, nameidx; +void ir3_create_parallel_copies(struct ir3 *ir); + +void ir3_merge_regs(struct ir3_liveness *live, struct ir3 *ir); - /* Error handling: */ - jmp_buf jmp_env; +struct ir3_pressure { + unsigned full, half, shared; }; -#define ra_assert(ctx, expr) do { \ - if (!(expr)) { \ - _debug_printf("RA: %s:%u: %s: Assertion `%s' failed.\n", __FILE__, __LINE__, __func__, #expr); \ - longjmp((ctx)->jmp_env, -1); \ - } \ - } while (0) -#define ra_unreachable(ctx, str) ra_assert(ctx, !str) +void ir3_calc_pressure(struct ir3_shader_variant *v, + struct ir3_liveness *live, + struct ir3_pressure *max_pressure); -static inline int -ra_name(struct ir3_ra_ctx *ctx, struct ir3_ra_instr_data *id) -{ - unsigned name; - debug_assert(id->cls >= 0); - debug_assert(id->cls < total_class_count); /* we shouldn't get arrays here.. 
*/ - name = ctx->class_base[id->cls] + id->defn->name; - debug_assert(name < ctx->alloc_count); - return name; -} +void ir3_lower_copies(struct ir3_shader_variant *v); -/* Get the scalar name of the n'th component of an instruction dst: */ -static inline int -scalar_name(struct ir3_ra_ctx *ctx, struct ir3_instruction *instr, unsigned n) -{ - if (ctx->scalar_pass) { - if (instr->opc == OPC_META_SPLIT) { - debug_assert(n == 0); /* split results in a scalar */ - struct ir3_instruction *src = instr->regs[1]->def->instr; - return scalar_name(ctx, src, instr->split.off); - } else if (instr->opc == OPC_META_COLLECT) { - debug_assert(n < (instr->regs_count + 1)); - struct ir3_instruction *src = instr->regs[n + 1]->def->instr; - return scalar_name(ctx, src, 0); - } - } else { - debug_assert(n == 0); - } +/* Register interval datastructure + * + * ir3_reg_ctx is used to track which registers are live. The tricky part is + * that some registers may overlap each other, when registers with overlapping + * live ranges get coalesced. For example, splits will overlap with their + * parent vector and sometimes collect sources will also overlap with the + * collect'ed vector. ir3_merge_regs guarantees for us that none of the + * registers in a merge set that are live at any given point partially + * overlap, which means that we can organize them into a forest. While each + * register has a per-merge-set offset, ir3_merge_regs also computes a + * "global" offset which allows us to throw away the original merge sets and + * think of registers as just intervals in a forest of live intervals. When a + * register becomes live, we insert it into the forest, and when it dies we + * remove it from the forest (and then its children get moved up a level). We + * use red-black trees to keep track of each level of the forest, so insertion + * and deletion should be fast operations. ir3_reg_ctx handles all the + * internal bookkeeping for this, so that it can be shared between RA, + * spilling, and register pressure tracking. + */ - return ra_name(ctx, &ctx->instrd[instr->ip]) + n; -} +struct ir3_reg_interval { + struct rb_node node; -#define NO_NAME ~0 + struct rb_tree children; -/* - * Iterators to iterate the vreg names of an instructions def's and use's - */ + struct ir3_reg_interval *parent; -static inline unsigned -__ra_name_cnt(struct ir3_ra_ctx *ctx, struct ir3_instruction *instr) -{ - if (!instr) - return 0; + struct ir3_register *reg; + + bool inserted; +}; - /* Filter special cases, ie. writes to a0.x or p0.x, or non-ssa: */ - if (!writes_gpr(instr) || (instr->regs[0]->flags & IR3_REG_ARRAY)) - return 0; +struct ir3_reg_ctx { + /* The tree of top-level intervals in the forest. */ + struct rb_tree intervals; - /* in scalar pass, we aren't considering virtual register classes, ie. - * if an instruction writes a vec2, then it defines two different scalar - * register names. + /* Users of ir3_reg_ctx need to keep around additional state that is + * modified when top-level intervals are added or removed. For register + * pressure tracking, this is just the register pressure, but for RA we + * need to keep track of the physreg of each top-level interval. These + * callbacks provide a place to let users deriving from ir3_reg_ctx update + * their state when top-level intervals are inserted/removed. */ - if (ctx->scalar_pass) - return dest_regs(instr); - return 1; -} + /* Called when an interval is added and it turns out to be at the top + * level. 
+ */ + void (*interval_add)(struct ir3_reg_ctx *ctx, + struct ir3_reg_interval *interval); -#define foreach_name_n(__name, __n, __ctx, __instr) \ - for (unsigned __cnt = __ra_name_cnt(__ctx, __instr), __n = 0, __name; \ - (__n < __cnt) && ({__name = scalar_name(__ctx, __instr, __n); 1;}); __n++) + /* Called when an interval is deleted from the top level. */ + void (*interval_delete)(struct ir3_reg_ctx *ctx, + struct ir3_reg_interval *interval); -#define foreach_name(__name, __ctx, __instr) \ - foreach_name_n(__name, __n, __ctx, __instr) + /* Called when an interval is deleted and its child becomes top-level. + */ + void (*interval_readd)(struct ir3_reg_ctx *ctx, + struct ir3_reg_interval *parent, + struct ir3_reg_interval *child); +}; -static inline unsigned -__ra_itr_pop(struct ir3_ra_ctx *ctx) +static inline struct ir3_reg_interval * +ir3_rb_node_to_interval(struct rb_node *node) { - if (ctx->nameidx < ctx->namecnt) - return ctx->namebuf[ctx->nameidx++]; - return NO_NAME; + return rb_node_data(struct ir3_reg_interval, node, node); } -static inline void -__ra_itr_push(struct ir3_ra_ctx *ctx, unsigned name) +static inline const struct ir3_reg_interval * +ir3_rb_node_to_interval_const(const struct rb_node *node) { - assert(ctx->namecnt < ARRAY_SIZE(ctx->namebuf)); - ctx->namebuf[ctx->namecnt++] = name; + return rb_node_data(struct ir3_reg_interval, node, node); } -static inline unsigned -__ra_init_def_itr(struct ir3_ra_ctx *ctx, struct ir3_instruction *instr) +static inline struct ir3_reg_interval * +ir3_reg_interval_next(struct ir3_reg_interval *interval) { - /* nested use is not supported: */ - assert(ctx->namecnt == ctx->nameidx); - - ctx->namecnt = ctx->nameidx = 0; - - if (!writes_gpr(instr)) - return NO_NAME; - - struct ir3_ra_instr_data *id = &ctx->instrd[instr->ip]; - struct ir3_register *dst = instr->regs[0]; - - if (dst->flags & IR3_REG_ARRAY) { - struct ir3_array *arr = ir3_lookup_array(ctx->ir, dst->array.id); - - /* indirect write is treated like a write to all array - * elements, since we don't know which one is actually - * written: - */ - if (dst->flags & IR3_REG_RELATIV) { - for (unsigned i = 0; i < arr->length; i++) { - __ra_itr_push(ctx, arr->base + i); - } - } else { - __ra_itr_push(ctx, arr->base + dst->array.offset); - debug_assert(dst->array.offset < arr->length); - } - } else if (id->defn == instr) { - foreach_name_n (name, i, ctx, instr) { - /* tex instructions actually have a wrmask, and - * don't touch masked out components. We can't do - * anything useful about that in the first pass, - * but in the scalar pass we can realize these - * registers are available: - */ - if (ctx->scalar_pass && is_tex_or_prefetch(instr) && - !(instr->regs[0]->wrmask & (1 << i))) - continue; - __ra_itr_push(ctx, name); - } - } - - return __ra_itr_pop(ctx); + struct rb_node *next = rb_node_next(&interval->node); + return next ? 
ir3_rb_node_to_interval(next) : NULL; } -static inline unsigned -__ra_init_use_itr(struct ir3_ra_ctx *ctx, struct ir3_instruction *instr) +static inline struct ir3_reg_interval * +ir3_reg_interval_next_or_null(struct ir3_reg_interval *interval) { - /* nested use is not supported: */ - assert(ctx->namecnt == ctx->nameidx); - - ctx->namecnt = ctx->nameidx = 0; - - foreach_src (reg, instr) { - if (reg->flags & IR3_REG_ARRAY) { - struct ir3_array *arr = - ir3_lookup_array(ctx->ir, reg->array.id); - - /* indirect read is treated like a read from all array - * elements, since we don't know which one is actually - * read: - */ - if (reg->flags & IR3_REG_RELATIV) { - for (unsigned i = 0; i < arr->length; i++) { - __ra_itr_push(ctx, arr->base + i); - } - } else { - __ra_itr_push(ctx, arr->base + reg->array.offset); - debug_assert(reg->array.offset < arr->length); - } - } else if (reg->def) { - foreach_name_n (name, i, ctx, reg->def->instr) { - /* split takes a src w/ wrmask potentially greater - * than 0x1, but it really only cares about a single - * component. This shows up in splits coming out of - * a tex instruction w/ wrmask=.z, for example. - */ - if (ctx->scalar_pass && (instr->opc == OPC_META_SPLIT) && - !(i == instr->split.off)) - continue; - __ra_itr_push(ctx, name); - } - } - } + return interval ? ir3_reg_interval_next(interval) : NULL; +} - return __ra_itr_pop(ctx); +static inline void +ir3_reg_interval_init(struct ir3_reg_interval *interval, struct ir3_register *reg) +{ + rb_tree_init(&interval->children); + interval->reg = reg; + interval->parent = NULL; + interval->inserted = false; } -#define foreach_def(__name, __ctx, __instr) \ - for (unsigned __name = __ra_init_def_itr(__ctx, __instr); \ - __name != NO_NAME; __name = __ra_itr_pop(__ctx)) +void +ir3_reg_interval_dump(struct ir3_reg_interval *interval); + +void ir3_reg_interval_insert(struct ir3_reg_ctx *ctx, + struct ir3_reg_interval *interval); + +void ir3_reg_interval_remove(struct ir3_reg_ctx *ctx, + struct ir3_reg_interval *interval); -#define foreach_use(__name, __ctx, __instr) \ - for (unsigned __name = __ra_init_use_itr(__ctx, __instr); \ - __name != NO_NAME; __name = __ra_itr_pop(__ctx)) +void ir3_reg_interval_remove_all(struct ir3_reg_ctx *ctx, + struct ir3_reg_interval *interval); -int ra_size_to_class(unsigned sz, bool half, bool shared); -int ra_class_to_size(unsigned class, bool *half, bool *shared); +#endif -#endif /* IR3_RA_H_ */ diff --git a/src/freedreno/ir3/ir3_ra_regset.c b/src/freedreno/ir3/ir3_ra_regset.c deleted file mode 100644 index d7290bf..0000000 --- a/src/freedreno/ir3/ir3_ra_regset.c +++ /dev/null @@ -1,249 +0,0 @@ -/* - * Copyright (C) 2014 Rob Clark - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the "Software"), - * to deal in the Software without restriction, including without limitation - * the rights to use, copy, modify, merge, publish, distribute, sublicense, - * and/or sell copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice (including the next - * paragraph) shall be included in all copies or substantial portions of the - * Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL - * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - * - * Authors: - * Rob Clark - */ - -#include "util/u_math.h" -#include "util/register_allocate.h" -#include "util/ralloc.h" -#include "util/bitset.h" - -#include "ir3.h" -#include "ir3_compiler.h" -#include "ir3_ra.h" - -static void -setup_conflicts(struct ir3_ra_reg_set *set) -{ - unsigned reg; - - reg = 0; - for (unsigned i = 0; i < class_count; i++) { - for (unsigned j = 0; j < CLASS_REGS(i); j++) { - for (unsigned br = j; br < j + class_sizes[i]; br++) { - ra_add_transitive_reg_conflict(set->regs, br, reg); - } - - reg++; - } - } - - for (unsigned i = 0; i < half_class_count; i++) { - for (unsigned j = 0; j < HALF_CLASS_REGS(i); j++) { - for (unsigned br = j; br < j + half_class_sizes[i]; br++) { - ra_add_transitive_reg_conflict(set->regs, - br + set->first_half_reg, reg); - } - - reg++; - } - } - - for (unsigned i = 0; i < shared_class_count; i++) { - for (unsigned j = 0; j < SHARED_CLASS_REGS(i); j++) { - for (unsigned br = j; br < j + shared_class_sizes[i]; br++) { - ra_add_transitive_reg_conflict(set->regs, - br + set->first_shared_reg, reg); - } - - reg++; - } - } - - /* - * Setup conflicts with registers over 0x3f for the special vreg - * that exists to use as interference for tex-prefetch: - */ - - for (unsigned i = 0x40; i < CLASS_REGS(0); i++) { - ra_add_transitive_reg_conflict(set->regs, i, - set->prefetch_exclude_reg); - } - - for (unsigned i = 0x40; i < HALF_CLASS_REGS(0); i++) { - ra_add_transitive_reg_conflict(set->regs, i + set->first_half_reg, - set->prefetch_exclude_reg); - } -} - -/* One-time setup of RA register-set, which describes all the possible - * "virtual" registers and their interferences. Ie. double register - * occupies (and conflicts with) two single registers, and so forth. - * Since registers do not need to be aligned to their class size, they - * can conflict with other registers in the same class too. Ie: - * - * Single (base) | Double - * --------------+--------------- - * R0 | D0 - * R1 | D0 D1 - * R2 | D1 D2 - * R3 | D2 - * .. and so on.. - * - * (NOTE the disassembler uses notation like r0.x/y/z/w but those are - * really just four scalar registers. Don't let that confuse you.) - */ -struct ir3_ra_reg_set * -ir3_ra_alloc_reg_set(struct ir3_compiler *compiler, bool mergedregs) -{ - struct ir3_ra_reg_set *set = rzalloc(compiler, struct ir3_ra_reg_set); - unsigned ra_reg_count, reg, base; - - /* calculate # of regs across all classes: */ - ra_reg_count = 0; - for (unsigned i = 0; i < class_count; i++) - ra_reg_count += CLASS_REGS(i); - for (unsigned i = 0; i < half_class_count; i++) - ra_reg_count += HALF_CLASS_REGS(i); - for (unsigned i = 0; i < shared_class_count; i++) - ra_reg_count += SHARED_CLASS_REGS(i); - - ra_reg_count += 1; /* for tex-prefetch excludes */ - - /* allocate the reg-set.. */ - set->regs = ra_alloc_reg_set(set, ra_reg_count, true); - set->ra_reg_to_gpr = ralloc_array(set, uint16_t, ra_reg_count); - set->gpr_to_ra_reg = ralloc_array(set, uint16_t *, total_class_count); - - /* .. 
and classes */ - reg = 0; - for (unsigned i = 0; i < class_count; i++) { - set->classes[i] = ra_alloc_reg_class(set->regs); - - set->gpr_to_ra_reg[i] = ralloc_array(set, uint16_t, CLASS_REGS(i)); - - for (unsigned j = 0; j < CLASS_REGS(i); j++) { - ra_class_add_reg(set->classes[i], reg); - - set->ra_reg_to_gpr[reg] = j; - set->gpr_to_ra_reg[i][j] = reg; - - reg++; - } - } - - set->first_half_reg = reg; - base = HALF_OFFSET; - - for (unsigned i = 0; i < half_class_count; i++) { - set->half_classes[i] = ra_alloc_reg_class(set->regs); - - set->gpr_to_ra_reg[base + i] = - ralloc_array(set, uint16_t, HALF_CLASS_REGS(i)); - - for (unsigned j = 0; j < HALF_CLASS_REGS(i); j++) { - ra_class_add_reg(set->half_classes[i], reg); - - set->ra_reg_to_gpr[reg] = j; - set->gpr_to_ra_reg[base + i][j] = reg; - - reg++; - } - } - - set->first_shared_reg = reg; - base = SHARED_OFFSET; - - for (unsigned i = 0; i < shared_class_count; i++) { - set->shared_classes[i] = ra_alloc_reg_class(set->regs); - - set->gpr_to_ra_reg[base + i] = - ralloc_array(set, uint16_t, SHARED_CLASS_REGS(i)); - - for (unsigned j = 0; j < SHARED_CLASS_REGS(i); j++) { - ra_class_add_reg(set->shared_classes[i], reg); - - set->ra_reg_to_gpr[reg] = j; - set->gpr_to_ra_reg[base + i][j] = reg; - - reg++; - } - } - - /* - * Setup an additional class, with one vreg, to simply conflict - * with registers that are too high to encode tex-prefetch. This - * vreg is only used to setup additional conflicts so that RA - * knows to allocate prefetch dst regs below the limit: - */ - set->prefetch_exclude_class = ra_alloc_reg_class(set->regs); - ra_class_add_reg(set->prefetch_exclude_class, reg); - set->prefetch_exclude_reg = reg++; - - /* - * And finally setup conflicts. Starting a6xx, half precision regs - * conflict w/ full precision regs (when using MERGEDREGS): - */ - if (mergedregs) { - for (unsigned i = 0; i < CLASS_REGS(0) / 2; i++) { - unsigned freg = set->gpr_to_ra_reg[0][i]; - unsigned hreg0 = set->gpr_to_ra_reg[0 + HALF_OFFSET][(i * 2) + 0]; - unsigned hreg1 = set->gpr_to_ra_reg[0 + HALF_OFFSET][(i * 2) + 1]; - - ra_add_transitive_reg_pair_conflict(set->regs, freg, hreg0, hreg1); - } - } - - setup_conflicts(set); - - ra_set_finalize(set->regs, NULL); - - return set; -} - -int -ra_size_to_class(unsigned sz, bool half, bool shared) -{ - if (shared) { - for (unsigned i = 0; i < shared_class_count; i++) - if (shared_class_sizes[i] >= sz) - return i + SHARED_OFFSET; - } else if (half) { - for (unsigned i = 0; i < half_class_count; i++) - if (half_class_sizes[i] >= sz) - return i + HALF_OFFSET; - } else { - for (unsigned i = 0; i < class_count; i++) - if (class_sizes[i] >= sz) - return i; - } - debug_assert(0); - return -1; -} - -int -ra_class_to_size(unsigned class, bool *half, bool *shared) -{ - *half = *shared = false; - - if (class >= SHARED_OFFSET) { - *shared = true; - return shared_class_sizes[class - SHARED_OFFSET]; - } else if (class >= HALF_OFFSET) { - *half = true; - return half_class_sizes[class - HALF_OFFSET]; - } else { - return class_sizes[class]; - } -} diff --git a/src/freedreno/ir3/ir3_spill.c b/src/freedreno/ir3/ir3_spill.c new file mode 100644 index 0000000..7342c9b --- /dev/null +++ b/src/freedreno/ir3/ir3_spill.c @@ -0,0 +1,362 @@ +/* + * Copyright (C) 2021 Valve Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to 
use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "ir3_ra.h" +#include "ir3_shader.h" +#include "util/rb_tree.h" + +/* + * This pass does one thing so far: + * + * 1. Calculates the maximum register pressure. To do this, we need to use the + * exact same technique that RA uses for combining meta_split instructions + * with their sources, so that our calculation agrees with RA. + * + * It will also optionally spill registers once that's implemented. + */ + +struct ra_spill_interval { + struct ir3_reg_interval interval; +}; + +struct ra_spill_ctx { + struct ir3_reg_ctx reg_ctx; + + struct ra_spill_interval *intervals; + + struct ir3_pressure cur_pressure, max_pressure; + + struct ir3_liveness *live; + + const struct ir3_compiler *compiler; +}; + +static void +ra_spill_interval_init(struct ra_spill_interval *interval, struct ir3_register *reg) +{ + ir3_reg_interval_init(&interval->interval, reg); +} + +static void +ra_pressure_add(struct ir3_pressure *pressure, struct ra_spill_interval *interval) +{ + unsigned size = reg_size(interval->interval.reg); + if (interval->interval.reg->flags & IR3_REG_SHARED) + pressure->shared += size; + else if (interval->interval.reg->flags & IR3_REG_HALF) + pressure->half += size; + else + pressure->full += size; +} + +static void +ra_pressure_sub(struct ir3_pressure *pressure, struct ra_spill_interval *interval) +{ + unsigned size = reg_size(interval->interval.reg); + if (interval->interval.reg->flags & IR3_REG_SHARED) + pressure->shared -= size; + else if (interval->interval.reg->flags & IR3_REG_HALF) + pressure->half -= size; + else + pressure->full -= size; +} + +static struct ra_spill_interval * +ir3_reg_interval_to_interval(struct ir3_reg_interval *interval) +{ + return rb_node_data(struct ra_spill_interval, interval, interval); +} + +static struct ra_spill_ctx * +ir3_reg_ctx_to_ctx(struct ir3_reg_ctx *ctx) +{ + return rb_node_data(struct ra_spill_ctx, ctx, reg_ctx); +} + +static void +interval_add(struct ir3_reg_ctx *_ctx, struct ir3_reg_interval *_interval) +{ + struct ra_spill_interval *interval = ir3_reg_interval_to_interval(_interval); + struct ra_spill_ctx *ctx = ir3_reg_ctx_to_ctx(_ctx); + + ra_pressure_add(&ctx->cur_pressure, interval); +} + +static void +interval_delete(struct ir3_reg_ctx *_ctx, struct ir3_reg_interval *_interval) +{ + struct ra_spill_interval *interval = ir3_reg_interval_to_interval(_interval); + struct ra_spill_ctx *ctx = ir3_reg_ctx_to_ctx(_ctx); + + ra_pressure_sub(&ctx->cur_pressure, interval); +} + +static void +interval_readd(struct ir3_reg_ctx *_ctx, struct ir3_reg_interval *_parent, + struct ir3_reg_interval *_child) +{ + interval_add(_ctx, _child); +} + +static void +spill_ctx_init(struct 
ra_spill_ctx *ctx) +{ + rb_tree_init(&ctx->reg_ctx.intervals); + ctx->reg_ctx.interval_add = interval_add; + ctx->reg_ctx.interval_delete = interval_delete; + ctx->reg_ctx.interval_readd = interval_readd; +} + +static void +ra_spill_ctx_insert(struct ra_spill_ctx *ctx, struct ra_spill_interval *interval) +{ + ir3_reg_interval_insert(&ctx->reg_ctx, &interval->interval); +} + +static void +ra_spill_ctx_remove(struct ra_spill_ctx *ctx, struct ra_spill_interval *interval) +{ + ir3_reg_interval_remove(&ctx->reg_ctx, &interval->interval); +} + +static void +init_dst(struct ra_spill_ctx *ctx, struct ir3_register *dst) +{ + struct ra_spill_interval *interval = &ctx->intervals[dst->name]; + ra_spill_interval_init(interval, dst); +} + +static void +insert_dst(struct ra_spill_ctx *ctx, struct ir3_register *dst) +{ + struct ra_spill_interval *interval = &ctx->intervals[dst->name]; + if (interval->interval.inserted) + return; + + ra_spill_ctx_insert(ctx, interval); + + /* For precolored inputs, make sure we leave enough registers to allow for + * holes in the inputs. It can happen that the binning shader has a lower + * register pressure than the main shader, but the main shader decided to + * add holes between the inputs which means that the binning shader has a + * higher register demand. + */ + if (dst->instr->opc == OPC_META_INPUT && + dst->num != INVALID_REG) { + physreg_t physreg = ra_reg_get_physreg(dst); + physreg_t max = physreg + reg_size(dst); + + if (interval->interval.reg->flags & IR3_REG_SHARED) + ctx->max_pressure.shared = MAX2(ctx->max_pressure.shared, max); + else if (interval->interval.reg->flags & IR3_REG_HALF) + ctx->max_pressure.half = MAX2(ctx->max_pressure.half, max); + else + ctx->max_pressure.full = MAX2(ctx->max_pressure.full, max); + } +} + +static void +remove_src_early(struct ra_spill_ctx *ctx, struct ir3_instruction *instr, struct ir3_register *src) +{ + if (!(src->flags & IR3_REG_FIRST_KILL)) + return; + + struct ra_spill_interval *interval = &ctx->intervals[src->def->name]; + + if (!interval->interval.inserted || interval->interval.parent || + !rb_tree_is_empty(&interval->interval.children)) + return; + + ra_spill_ctx_remove(ctx, interval); +} + +static void +remove_src(struct ra_spill_ctx *ctx, struct ir3_instruction *instr, struct ir3_register *src) +{ + if (!(src->flags & IR3_REG_FIRST_KILL)) + return; + + struct ra_spill_interval *interval = &ctx->intervals[src->def->name]; + + if (!interval->interval.inserted) + return; + + ra_spill_ctx_remove(ctx, interval); +} + +static void +remove_dst(struct ra_spill_ctx *ctx, struct ir3_register *dst) +{ + struct ra_spill_interval *interval = &ctx->intervals[dst->name]; + + if (!interval->interval.inserted) + return; + + ra_spill_ctx_remove(ctx, interval); +} + +static void +update_max_pressure(struct ra_spill_ctx *ctx) +{ + d("pressure:"); + d("\tfull: %u", ctx->cur_pressure.full); + d("\thalf: %u", ctx->cur_pressure.half); + d("\tshared: %u", ctx->cur_pressure.shared); + + ctx->max_pressure.full = + MAX2(ctx->max_pressure.full, ctx->cur_pressure.full); + ctx->max_pressure.half = + MAX2(ctx->max_pressure.half, ctx->cur_pressure.half); + ctx->max_pressure.shared = + MAX2(ctx->max_pressure.shared, ctx->cur_pressure.shared); +} + +static void +handle_instr(struct ra_spill_ctx *ctx, struct ir3_instruction *instr) +{ + if (RA_DEBUG) { + printf("processing: "); + ir3_print_instr(instr); + } + + ra_foreach_dst(dst, instr) { + init_dst(ctx, dst); + } + + /* Handle tied destinations. 
If a destination is tied to a source and that + * source is live-through, then we need to allocate a new register for the + * destination which is live-through itself and cannot overlap the + * sources. + */ + + ra_foreach_dst(dst, instr) { + if (!ra_reg_is_array_rmw(dst)) { + struct ir3_register *tied_src = + ra_dst_get_tied_src(ctx->compiler, dst); + if (tied_src && !(tied_src->flags & IR3_REG_FIRST_KILL)) + insert_dst(ctx, dst); + } + } + + update_max_pressure(ctx); + + ra_foreach_src(src, instr) { + if (src->flags & IR3_REG_FIRST_KILL) + remove_src_early(ctx, instr, src); + } + + + ra_foreach_dst(dst, instr) { + insert_dst(ctx, dst); + } + + update_max_pressure(ctx); + + for (unsigned i = 0; i < instr->regs_count; i++) { + if (ra_reg_is_src(instr->regs[i]) && + (instr->regs[i]->flags & IR3_REG_FIRST_KILL)) + remove_src(ctx, instr, instr->regs[i]); + else if (ra_reg_is_dst(instr->regs[i]) && + (instr->regs[i]->flags & IR3_REG_UNUSED)) + remove_dst(ctx, instr->regs[i]); + } +} + +static void +handle_input_phi(struct ra_spill_ctx *ctx, struct ir3_instruction *instr) +{ + init_dst(ctx, instr->regs[0]); + insert_dst(ctx, instr->regs[0]); +} + +static void +remove_input_phi(struct ra_spill_ctx *ctx, struct ir3_instruction *instr) +{ + ra_foreach_src(src, instr) + remove_src(ctx, instr, src); + if (instr->regs[0]->flags & IR3_REG_UNUSED) + remove_dst(ctx, instr->regs[0]); +} + +static void +handle_live_in(struct ra_spill_ctx *ctx, struct ir3_register *def) +{ + struct ra_spill_interval *interval = &ctx->intervals[def->name]; + ra_spill_interval_init(interval, def); + insert_dst(ctx, def); +} + +static void +handle_block(struct ra_spill_ctx *ctx, struct ir3_block *block) +{ + memset(&ctx->cur_pressure, 0, sizeof(ctx->cur_pressure)); + rb_tree_init(&ctx->reg_ctx.intervals); + + unsigned name; + BITSET_FOREACH_SET(name, ctx->live->live_in[block->index], + ctx->live->definitions_count) { + struct ir3_register *reg = ctx->live->definitions[name]; + handle_live_in(ctx, reg); + } + + foreach_instr (instr, &block->instr_list) { + if (instr->opc != OPC_META_PHI && instr->opc != OPC_META_INPUT && + instr->opc != OPC_META_TEX_PREFETCH) + break; + handle_input_phi(ctx, instr); + } + + update_max_pressure(ctx); + + foreach_instr (instr, &block->instr_list) { + if (instr->opc == OPC_META_PHI || instr->opc == OPC_META_INPUT || + instr->opc == OPC_META_TEX_PREFETCH) + remove_input_phi(ctx, instr); + else + handle_instr(ctx, instr); + } +} + +void +ir3_calc_pressure(struct ir3_shader_variant *v, struct ir3_liveness *live, + struct ir3_pressure *max_pressure) +{ + struct ra_spill_ctx ctx = {}; + ctx.live = live; + ctx.intervals = calloc(live->definitions_count, sizeof(*ctx.intervals)); + ctx.compiler = v->shader->compiler; + spill_ctx_init(&ctx); + + foreach_block (block, &v->ir->block_list) { + handle_block(&ctx, block); + } + + assert(ctx.cur_pressure.full == 0); + assert(ctx.cur_pressure.half == 0); + assert(ctx.cur_pressure.shared == 0); + + free(ctx.intervals); + + *max_pressure = ctx.max_pressure; +} + diff --git a/src/freedreno/ir3/meson.build b/src/freedreno/ir3/meson.build index b3799f1..c27b267 100644 --- a/src/freedreno/ir3/meson.build +++ b/src/freedreno/ir3/meson.build @@ -81,11 +81,13 @@ libfreedreno_ir3_files = files( 'ir3_delay.c', 'ir3_dominance.c', 'ir3_disk_cache.c', - 'ir3_group.c', 'ir3_image.c', 'ir3_image.h', 'ir3.h', 'ir3_legalize.c', + 'ir3_liveness.c', + 'ir3_lower_parallelcopy.c', + 'ir3_merge_regs.c', 'ir3_nir.c', 'ir3_nir.h', 'ir3_nir_analyze_ubo_ranges.c', @@ -100,10 +102,10 @@ 
libfreedreno_ir3_files = files( 'ir3_print.c', 'ir3_ra.c', 'ir3_ra.h', - 'ir3_ra_regset.c', 'ir3_sched.c', 'ir3_shader.c', 'ir3_shader.h', + 'ir3_spill.c', 'ir3_validate.c', 'regmask.h', ) diff --git a/src/gallium/drivers/freedreno/ci/piglit-freedreno-a530-fails.txt b/src/gallium/drivers/freedreno/ci/piglit-freedreno-a530-fails.txt index 3a15fd3..cf601d8 100644 --- a/src/gallium/drivers/freedreno/ci/piglit-freedreno-a530-fails.txt +++ b/src/gallium/drivers/freedreno/ci/piglit-freedreno-a530-fails.txt @@ -336,6 +336,6 @@ wgl@wgl-multi-context-single-window,Fail wgl@wgl-multi-window-single-context,Fail wgl@wgl-sanity,Fail spec@glsl-1.30@execution@clipping@fs-clip-distance-interpolated,Crash -spec@glsl-1.30@execution@fs-large-local-array-vec2,Crash -spec@glsl-1.30@execution@fs-large-local-array-vec3,Crash -spec@glsl-1.30@execution@fs-large-local-array-vec4,Crash +spec@glsl-1.30@execution@fs-large-local-array-vec2,Fail +spec@glsl-1.30@execution@fs-large-local-array-vec3,Fail +spec@glsl-1.30@execution@fs-large-local-array-vec4,Fail diff --git a/src/gallium/drivers/freedreno/ci/piglit-freedreno-a630-fails.txt b/src/gallium/drivers/freedreno/ci/piglit-freedreno-a630-fails.txt index 5c4d78e..f149728 100644 --- a/src/gallium/drivers/freedreno/ci/piglit-freedreno-a630-fails.txt +++ b/src/gallium/drivers/freedreno/ci/piglit-freedreno-a630-fails.txt @@ -475,8 +475,6 @@ spec@arb_tessellation_shader@execution@variable-indexing@tes-input-array-float-i spec@arb_tessellation_shader@execution@variable-indexing@tes-input-array-vec2-index-rd,Crash spec@arb_tessellation_shader@execution@variable-indexing@tes-input-array-vec3-index-rd,Crash spec@arb_tessellation_shader@execution@variable-indexing@tes-input-array-vec4-index-rd,Crash -spec@arb_tessellation_shader@execution@variable-indexing@vs-output-array-vec3-index-wr-before-tcs,Fail -spec@arb_tessellation_shader@execution@variable-indexing@vs-output-array-vec4-index-wr-before-tcs,Fail spec@arb_tessellation_shader@execution@vertex-partial-write,Crash spec@arb_tessellation_shader@execution@vs-tes-max-in-out-components,Fail spec@arb_tessellation_shader@execution@vs-tes-tessinner-tessouter-inputs-quads,Fail @@ -507,11 +505,8 @@ spec@glsl-1.50@execution@compatibility@vs-gs-texcoord-array-2,Crash spec@glsl-1.50@execution@geometry@max-input-components,Fail spec@glsl-1.50@execution@primitive-id-no-gs-quad-strip,Fail spec@glsl-1.50@execution@primitive-id-no-gs-quads,Fail -spec@glsl-1.50@execution@variable-indexing@gs-input-array-float-index-rd,Fail spec@glsl-1.50@execution@variable-indexing@gs-input-array-vec2-index-rd,Fail spec@glsl-1.50@execution@variable-indexing@gs-input-array-vec3-index-rd,Fail spec@glsl-1.50@execution@variable-indexing@gs-output-array-vec3-index-wr,Fail spec@glsl-1.50@execution@variable-indexing@gs-output-array-vec4-index-wr,Crash -spec@glsl-1.50@execution@variable-indexing@vs-output-array-vec3-index-wr-before-gs,Fail spec@glsl-1.50@execution@variable-indexing@vs-output-array-vec4-index-wr-before-gs,Fail -spec@glsl-es-3.10@execution@cs-image-atomic-if-else-2,Fail -- 2.7.4
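
A note on the physreg_t helpers added to ir3_ra.h: they flatten each register file into a single index space, where a full register occupies two consecutive half-register slots and shared registers are numbered from r48 upward in the flat "num" space. The standalone sketch below shows that round trip; the TOY_* flag values and main() are placeholders invented for illustration, and only the two conversion helpers mirror the patch.

/* Standalone sketch of the physreg mapping from ir3_ra.h.  TOY_HALF and
 * TOY_SHARED are placeholder flags (the real bits live in ir3.h); the two
 * helpers mirror ra_physreg_to_num()/ra_num_to_physreg() from the patch.
 */
#include <assert.h>
#include <stdint.h>

typedef uint16_t physreg_t;

#define TOY_HALF   0x1   /* stands in for IR3_REG_HALF */
#define TOY_SHARED 0x2   /* stands in for IR3_REG_SHARED */

static unsigned
physreg_to_num(physreg_t physreg, unsigned flags)
{
	if (!(flags & TOY_HALF))
		physreg /= 2;          /* a full reg covers two half slots */
	if (flags & TOY_SHARED)
		physreg += 48 * 4;     /* shared file starts at r48.x */
	return physreg;
}

static physreg_t
num_to_physreg(unsigned num, unsigned flags)
{
	if (flags & TOY_SHARED)
		num -= 48 * 4;
	if (!(flags & TOY_HALF))
		num *= 2;
	return num;
}

int
main(void)
{
	/* r1.y: num = (1 << 2) | 1 = 5, lands at half-slot 10 */
	assert(num_to_physreg(5, 0) == 10);
	assert(physreg_to_num(10, 0) == 5);

	/* hr1.y: same num, but half regs map 1:1 */
	assert(num_to_physreg(5, TOY_HALF) == 5);

	/* shared r48.x: num = 48 * 4, first slot of the shared file */
	assert(num_to_physreg(48 * 4, TOY_SHARED) == 0);
	assert(physreg_to_num(0, TOY_SHARED) == 48 * 4);
	return 0;
}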
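
ir3_spill.c tracks pressure through the interval forest described in ir3_ra.h: only top-level intervals contribute, since a split or collect coalesced into its parent vector occupies the same registers, and the interval_add/interval_delete/interval_readd callbacks fire only when the set of top-level intervals changes. The toy structs below are invented for the sketch and do not use util/rb_tree or the real ir3_reg_interval API; they only mimic that accounting.

/* Much-simplified stand-in for ir3_reg_ctx: pressure changes only when a
 * top-level interval is added or removed, and removing a parent re-adds its
 * children at the top level.  All toy_* types and the size units are
 * invented for the sketch.
 */
#include <assert.h>
#include <stddef.h>

struct toy_interval {
	unsigned size;                 /* size in allocation units (toy value) */
	struct toy_interval *parent;   /* NULL when top-level */
};

struct toy_ctx {
	unsigned pressure;
};

static void
toy_insert(struct toy_ctx *ctx, struct toy_interval *interval,
           struct toy_interval *parent)
{
	interval->parent = parent;
	if (!parent)
		ctx->pressure += interval->size;   /* like the interval_add callback */
}

static void
toy_remove(struct toy_ctx *ctx, struct toy_interval *interval,
           struct toy_interval **children, unsigned num_children)
{
	assert(!interval->parent);
	ctx->pressure -= interval->size;           /* like interval_delete */
	for (unsigned i = 0; i < num_children; i++) {
		children[i]->parent = NULL;
		ctx->pressure += children[i]->size;    /* like interval_readd */
	}
}

int
main(void)
{
	struct toy_ctx ctx = { 0 };
	struct toy_interval vec4  = { .size = 4 };   /* a vector value */
	struct toy_interval split = { .size = 1 };   /* one component of it */

	toy_insert(&ctx, &vec4, NULL);
	assert(ctx.pressure == 4);

	/* the split lives inside its parent: no extra pressure */
	toy_insert(&ctx, &split, &vec4);
	assert(ctx.pressure == 4);

	/* parent dies, the split survives and becomes top-level */
	struct toy_interval *kids[] = { &split };
	toy_remove(&ctx, &vec4, kids, 1);
	assert(ctx.pressure == 1);
	return 0;
}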
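
struct ir3_liveness keeps one live_in/live_out bitset of SSA definitions per block. ir3_calc_liveness itself is not part of the hunks above, so the following is only a generic backward-dataflow sketch of how such sets are typically computed, with uint64_t masks and toy_* types standing in for the real BITSET_WORD arrays and ir3 blocks.

/* Generic backward liveness over a tiny CFG: live_out[b] is the union of the
 * successors' live_in, and live_in[b] = (live_out[b] & ~def[b]) | use[b].
 * Everything here (uint64_t masks, the MAX_* limits) is invented for the
 * sketch and is not the ir3_liveness.c implementation.
 */
#include <assert.h>
#include <stdbool.h>
#include <stdint.h>

#define MAX_SUCCS 2

struct toy_block {
	uint64_t def, use;      /* defs / upward-exposed uses in the block */
	int succ[MAX_SUCCS];    /* successor indices, -1 if unused */
};

static void
calc_liveness(const struct toy_block *blocks, unsigned count,
              uint64_t *live_in, uint64_t *live_out)
{
	bool progress = true;
	while (progress) {
		progress = false;
		/* iterate in reverse to converge faster on typical CFGs */
		for (int b = (int)count - 1; b >= 0; b--) {
			uint64_t out = 0;
			for (unsigned s = 0; s < MAX_SUCCS; s++)
				if (blocks[b].succ[s] >= 0)
					out |= live_in[blocks[b].succ[s]];
			uint64_t in = (out & ~blocks[b].def) | blocks[b].use;
			if (in != live_in[b] || out != live_out[b]) {
				live_in[b] = in;
				live_out[b] = out;
				progress = true;
			}
		}
	}
}

int
main(void)
{
	/* block 0 defines value #0, its successor block 1 uses it */
	struct toy_block blocks[2] = {
		{ .def = 1ull << 0, .use = 0,         .succ = {  1, -1 } },
		{ .def = 0,         .use = 1ull << 0, .succ = { -1, -1 } },
	};
	uint64_t live_in[2] = { 0 }, live_out[2] = { 0 };

	calc_liveness(blocks, 2, live_in, live_out);
	assert(live_out[0] == (1ull << 0));   /* #0 is live across the edge */
	assert(live_in[1] == (1ull << 0));
	assert(live_in[0] == 0);              /* defined before any use */
	return 0;
}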