From 39aa8c4a5ac9243348b552d9055a926b77d98c29 Mon Sep 17 00:00:00 2001 From: Alyssa Rosenzweig Date: Wed, 30 Dec 2020 15:50:50 -0500 Subject: [PATCH] pan/bi: Switch to new IR Signed-off-by: Alyssa Rosenzweig Part-of: --- .gitlab-ci/deqp-panfrost-g52-fails.txt | 2 - src/panfrost/bifrost/bi_liveness.c | 10 +- src/panfrost/bifrost/bi_opt_dce.c | 13 +- src/panfrost/bifrost/bi_pack.c | 223 +++++++++++++++++++----------- src/panfrost/bifrost/bi_print.c | 4 +- src/panfrost/bifrost/bi_ra.c | 238 ++++++++++++++++----------------- src/panfrost/bifrost/bi_schedule.c | 4 +- src/panfrost/bifrost/bifrost_compile.c | 30 +++-- src/panfrost/bifrost/bir.c | 4 +- src/panfrost/bifrost/compiler.h | 4 +- 10 files changed, 301 insertions(+), 231 deletions(-) diff --git a/.gitlab-ci/deqp-panfrost-g52-fails.txt b/.gitlab-ci/deqp-panfrost-g52-fails.txt index 94b3837..36ebabf 100644 --- a/.gitlab-ci/deqp-panfrost-g52-fails.txt +++ b/.gitlab-ci/deqp-panfrost-g52-fails.txt @@ -14,7 +14,6 @@ dEQP-GLES2.functional.fbo.completeness.renderable.texture.color0.rgb_half_float_ dEQP-GLES2.functional.fbo.completeness.size.distinct,Fail dEQP-GLES2.functional.negative_api.shader.uniform_matrixfv_invalid_transpose,Fail dEQP-GLES2.functional.negative_api.texture.generatemipmap_zero_level_array_compressed,Fail -dEQP-GLES2.functional.shaders.random.all_features.fragment.88,Fail dEQP-GLES2.functional.shaders.texture_functions.vertex.texturecubelod,Fail dEQP-GLES2.functional.texture.mipmap.cube.basic.linear_linear,Fail dEQP-GLES2.functional.texture.mipmap.cube.basic.linear_nearest,Fail @@ -38,4 +37,3 @@ dEQP-GLES2.functional.texture.vertex.cube.wrap.clamp_repeat,Fail dEQP-GLES2.functional.texture.vertex.cube.wrap.mirror_clamp,Fail dEQP-GLES2.functional.texture.vertex.cube.wrap.mirror_mirror,Fail dEQP-GLES2.functional.texture.vertex.cube.wrap.mirror_repeat,Fail -dEQP-GLES2.functional.uniform_api.random.79,Fail diff --git a/src/panfrost/bifrost/bi_liveness.c b/src/panfrost/bifrost/bi_liveness.c index 94275d0..712e77a 100644 --- a/src/panfrost/bifrost/bi_liveness.c +++ b/src/panfrost/bifrost/bi_liveness.c @@ -25,15 +25,15 @@ #include "compiler.h" void -bi_liveness_ins_update(uint16_t *live, bi_instruction *ins, unsigned max) +bi_liveness_ins_update(uint16_t *live, bi_instr *ins, unsigned max) { /* live_in[s] = GEN[s] + (live_out[s] - KILL[s]) */ - pan_liveness_kill(live, ins->dest, max, bi_writemask(ins)); + pan_liveness_kill(live, bi_get_node(ins->dest[0]), max, bi_writemask_new(ins)); bi_foreach_src(ins, src) { - unsigned node = ins->src[src]; - unsigned bytemask = bi_bytemask_of_read_components(ins, node); + unsigned node = bi_get_node(ins->src[src]); + unsigned bytemask = bi_bytemask_of_read_components_new(ins, ins->src[src]); pan_liveness_gen(live, node, max, bytemask); } @@ -42,7 +42,7 @@ bi_liveness_ins_update(uint16_t *live, bi_instruction *ins, unsigned max) static void bi_liveness_ins_update_wrap(uint16_t *live, void *ins, unsigned max) { - bi_liveness_ins_update(live, (bi_instruction *) ins, max); + bi_liveness_ins_update(live, (bi_instr *) ins, max); } void diff --git a/src/panfrost/bifrost/bi_opt_dce.c b/src/panfrost/bifrost/bi_opt_dce.c index 9a0c039..7310705 100644 --- a/src/panfrost/bifrost/bi_opt_dce.c +++ b/src/panfrost/bifrost/bi_opt_dce.c @@ -36,12 +36,13 @@ bi_opt_dead_code_eliminate(bi_context *ctx, bi_block *block) uint16_t *live = mem_dup(block->base.live_out, temp_count * sizeof(uint16_t)); - bi_foreach_instr_in_block_safe_rev(block, ins) { - if (ins->dest && !(ins->dest & BIR_SPECIAL)) { - if (!live[ins->dest]) 
{ - bi_remove_instruction(ins); - progress |= true; - } + bi_foreach_instr_in_block_safe_rev(block, _ins) { + bi_instr *ins = (bi_instr *) _ins; + unsigned index = bi_get_node(ins->dest[0]); + + if (index < temp_count && !live[index]) { + bi_remove_instruction((bi_instruction *) ins); + progress |= true; } bi_liveness_ins_update(live, ins, temp_count); diff --git a/src/panfrost/bifrost/bi_pack.c b/src/panfrost/bifrost/bi_pack.c index 7d276c5..3b5d127 100644 --- a/src/panfrost/bifrost/bi_pack.c +++ b/src/panfrost/bifrost/bi_pack.c @@ -49,7 +49,7 @@ bi_pack_header(bi_clause *clause, bi_clause *next_1, bi_clause *next_2, bool tdd (next_1 == NULL) ? BIFROST_FLOW_END : clause->flow_control, .terminate_discarded_threads = tdd, - .next_clause_prefetch = clause->next_clause_prefetch, + .next_clause_prefetch = clause->next_clause_prefetch && next_1, .staging_barrier = clause->staging_barrier, .staging_register = clause->staging_register, .dependency_wait = dependency_wait, @@ -105,15 +105,16 @@ bi_constant_field(unsigned idx) static bool bi_assign_fau_idx_single(bi_registers *regs, bi_clause *clause, - bi_instruction *ins, + bi_instr *ins, bool assigned, bool fast_zero) { if (!ins) return assigned; - if (ins->type == BI_BRANCH && clause->branch_constant) { - /* By convention branch constant is last */ + if (ins->branch_target && clause->branch_constant) { + /* By convention branch constant is last XXX: this whole thing + * is a hack, FIXME */ unsigned idx = clause->constant_count - 1; /* We can only jump to clauses which are qword aligned so the @@ -126,18 +127,26 @@ bi_assign_fau_idx_single(bi_registers *regs, if (assigned && regs->fau_idx != C) unreachable("Mismatched fau_idx: branch"); + bi_foreach_src(ins, s) { + if (ins->src[s].type == BI_INDEX_CONSTANT) + ins->src[s] = bi_passthrough(BIFROST_SRC_FAU_HI); + } + regs->fau_idx = C; return true; } bi_foreach_src(ins, s) { - if (ins->src[s] & BIR_INDEX_CONSTANT) { + if (ins->src[s].type == BI_INDEX_CONSTANT) { bool hi = false; - uint32_t cons = bi_get_immediate(ins, s); + uint32_t cons = ins->src[s].value; + unsigned swizzle = ins->src[s].swizzle; /* FMA can encode zero for free */ if (cons == 0 && fast_zero) { - ins->src[s] = BIR_INDEX_PASS | BIFROST_SRC_STAGE; + assert(!ins->src[s].abs && !ins->src[s].neg); + ins->src[s] = bi_passthrough(BIFROST_SRC_STAGE); + ins->src[s].swizzle = swizzle; continue; } @@ -149,16 +158,17 @@ bi_assign_fau_idx_single(bi_registers *regs, unreachable("Mismatched uniform/const field: imm"); regs->fau_idx = f; - ins->src[s] = BIR_INDEX_PASS | (hi ? BIFROST_SRC_FAU_HI : BIFROST_SRC_FAU_LO); + ins->src[s] = bi_passthrough(hi ? BIFROST_SRC_FAU_HI : BIFROST_SRC_FAU_LO); + ins->src[s].swizzle = swizzle; assigned = true; - } else if (ins->src[s] & BIR_INDEX_FAU) { - unsigned index = ins->src[s] & BIR_FAU_TYPE_MASK; - bool hi = !!(ins->src[s] & BIR_FAU_HI); - - assert(!assigned || regs->fau_idx == index); - regs->fau_idx = index; - ins->src[s] = BIR_INDEX_PASS | - (hi ? BIFROST_SRC_FAU_HI : BIFROST_SRC_FAU_LO); + } else if (ins->src[s].type == BI_INDEX_FAU) { + bool hi = ins->src[s].offset > 0; + + assert(!assigned || regs->fau_idx == ins->src[s].value); + assert(ins->src[s].swizzle == BI_SWIZZLE_H01); + regs->fau_idx = ins->src[s].value; + ins->src[s] = bi_passthrough(hi ? 
BIFROST_SRC_FAU_HI : + BIFROST_SRC_FAU_LO); assigned = true; } } @@ -171,43 +181,41 @@ bi_assign_fau_idx(bi_clause *clause, bi_bundle *bundle) { bool assigned = - bi_assign_fau_idx_single(&bundle->regs, clause, bundle->fma, false, true); + bi_assign_fau_idx_single(&bundle->regs, clause, (bi_instr *) bundle->fma, false, true); - bi_assign_fau_idx_single(&bundle->regs, clause, bundle->add, assigned, false); + bi_assign_fau_idx_single(&bundle->regs, clause, (bi_instr *) bundle->add, assigned, false); } /* Assigns a slot for reading, before anything is written */ static void -bi_assign_slot_read(bi_registers *regs, unsigned src) +bi_assign_slot_read(bi_registers *regs, bi_index src) { /* We only assign for registers */ - if (!(src & BIR_INDEX_REGISTER)) + if (src.type != BI_INDEX_REGISTER) return; - unsigned reg = src & ~BIR_INDEX_REGISTER; - /* Check if we already assigned the slot */ for (unsigned i = 0; i <= 1; ++i) { - if (regs->slot[i] == reg && regs->enabled[i]) + if (regs->slot[i] == src.value && regs->enabled[i]) return; } - if (regs->slot[2] == reg && regs->slot23.slot2 == BIFROST_OP_READ) + if (regs->slot[2] == src.value && regs->slot23.slot2 == BIFROST_OP_READ) return; /* Assign it now */ for (unsigned i = 0; i <= 1; ++i) { if (!regs->enabled[i]) { - regs->slot[i] = reg; + regs->slot[i] = src.value; regs->enabled[i] = true; return; } } if (!regs->slot23.slot3) { - regs->slot[2] = reg; + regs->slot[2] = src.value; regs->slot23.slot2 = BIFROST_OP_READ; return; } @@ -223,44 +231,52 @@ bi_assign_slots(bi_bundle *now, bi_bundle *prev) * use the data registers, which has its own mechanism entirely * and thus gets skipped over here. */ - unsigned read_dreg = now->add && - bi_class_props[now->add->type] & BI_DATA_REG_SRC; + bool read_dreg = now->add && + bi_opcode_props[((bi_instr *) now->add)->op].sr_read; - unsigned write_dreg = prev->add && - bi_class_props[prev->add->type] & BI_DATA_REG_DEST; + bool write_dreg = now->add && + bi_opcode_props[((bi_instr *) now->add)->op].sr_write; /* First, assign reads */ if (now->fma) bi_foreach_src(now->fma, src) - bi_assign_slot_read(&now->regs, now->fma->src[src]); + bi_assign_slot_read(&now->regs, ((bi_instr *) now->fma)->src[src]); if (now->add) { bi_foreach_src(now->add, src) { if (!(src == 0 && read_dreg)) - bi_assign_slot_read(&now->regs, now->add->src[src]); + bi_assign_slot_read(&now->regs, ((bi_instr *) now->add)->src[src]); } } - /* Next, assign writes */ + /* Next, assign writes. 
Staging writes are assigned separately, but + * +ATEST wants its destination written to both a staging register + * _and_ a regular write, because it may not generate a message */ - if (prev->add && prev->add->dest & BIR_INDEX_REGISTER && !write_dreg) { - now->regs.slot[3] = prev->add->dest & ~BIR_INDEX_REGISTER; - now->regs.slot23.slot3 = BIFROST_OP_WRITE; - } + if (prev->add && (!write_dreg || ((bi_instr *) prev->add)->op == BI_OPCODE_ATEST)) { + bi_index idx = ((bi_instr *) prev->add)->dest[0]; - if (prev->fma && prev->fma->dest & BIR_INDEX_REGISTER) { - unsigned r = prev->fma->dest & ~BIR_INDEX_REGISTER; - - if (now->regs.slot23.slot3) { - /* Scheduler constraint: cannot read 3 and write 2 */ - assert(!now->regs.slot23.slot2); - now->regs.slot[2] = r; - now->regs.slot23.slot2 = BIFROST_OP_WRITE; - } else { - now->regs.slot[3] = r; + if (idx.type == BI_INDEX_REGISTER) { + now->regs.slot[3] = idx.value; now->regs.slot23.slot3 = BIFROST_OP_WRITE; - now->regs.slot23.slot3_fma = true; + } + } + + if (prev->fma) { + bi_index idx = ((bi_instr *) prev->fma)->dest[0]; + + if (idx.type == BI_INDEX_REGISTER) { + if (now->regs.slot23.slot3) { + /* Scheduler constraint: cannot read 3 and write 2 */ + assert(!now->regs.slot23.slot2); + now->regs.slot[2] = idx.value; + now->regs.slot23.slot2 = BIFROST_OP_WRITE; + } else { + now->regs.slot[3] = idx.value; + now->regs.slot23.slot3 = BIFROST_OP_WRITE; + now->regs.slot23.slot3_fma = true; + } } } @@ -934,39 +950,65 @@ bi_flip_slots(bi_registers *regs) static void bi_lower_cubeface2(bi_context *ctx, bi_bundle *bundle) { + bi_instr *old = (bi_instr *) bundle->add; + /* Filter for +CUBEFACE2 */ - if (!bundle->add || bundle->add->type != BI_SPECIAL_ADD - || bundle->add->op.special != BI_SPECIAL_CUBEFACE2) { + if (!old || old->op != BI_OPCODE_CUBEFACE2) return; - } /* This won't be used once we emit non-singletons, for now this is just * a fact of our scheduler and allows us to clobber FMA */ assert(!bundle->fma); /* Construct an FMA op */ - bi_instruction cubeface1 = { - .type = BI_SPECIAL_FMA, - .op.special = BI_SPECIAL_CUBEFACE1, - /* no dest, just to a temporary */ - .dest_type = nir_type_float32, - .src_types = { nir_type_float32, nir_type_float32, nir_type_float32 }, - }; - - /* Copy over the register allocated sources (coordinates). 
*/ - memcpy(&cubeface1.src, bundle->add->src, sizeof(cubeface1.src)); - - /* Zeroed by RA since this is all 32-bit */ - for (unsigned i = 0; i < 3; ++i) - assert(bundle->add->swizzle[i][0] == 0); + bi_instr *new = rzalloc(ctx, bi_instr); + new->op = BI_OPCODE_CUBEFACE1; + /* no dest, just a temporary */ + new->src[0] = old->src[0]; + new->src[1] = old->src[1]; + new->src[2] = old->src[2]; /* Emit the instruction */ - bundle->fma = bi_emit_before(ctx, bundle->add, cubeface1); + list_addtail(&new->link, &old->link); + bundle->fma = (bi_instruction *) new; /* Now replace the sources of the CUBEFACE2 with a single passthrough * from the CUBEFACE1 (and a side-channel) */ - bundle->add->src[0] = BIR_INDEX_PASS | BIFROST_SRC_STAGE; - bundle->add->src[1] = bundle->add->src[2] = 0; + old->src[0] = bi_passthrough(BIFROST_SRC_STAGE); + old->src[1] = old->src[2] = bi_null(); +} + +static inline enum bifrost_packed_src +bi_get_src_slot(bi_registers *regs, unsigned reg) +{ + if (regs->slot[0] == reg && regs->enabled[0]) + return BIFROST_SRC_PORT0; + else if (regs->slot[1] == reg && regs->enabled[1]) + return BIFROST_SRC_PORT1; + else if (regs->slot[2] == reg && regs->slot23.slot2 == BIFROST_OP_READ) + return BIFROST_SRC_PORT2; + else + unreachable("Tried to access register with no port"); +} + +static inline enum bifrost_packed_src +bi_get_src_new(bi_instr *ins, bi_registers *regs, unsigned s) +{ + if (!ins) + return 0; + + bi_index src = ins->src[s]; + + if (src.type == BI_INDEX_REGISTER) + return bi_get_src_slot(regs, src.value); + else if (src.type == BI_INDEX_PASS) + return src.value; + else if (bi_is_null(src) && ins->op == BI_OPCODE_ZS_EMIT && s < 2) + return BIFROST_SRC_STAGE; + else { + /* TODO make safer */ + return BIFROST_SRC_STAGE; + } } static struct bi_packed_bundle @@ -978,9 +1020,38 @@ bi_pack_bundle(bi_clause *clause, bi_bundle bundle, bi_bundle prev, bool first_b bi_flip_slots(&bundle.regs); + bool sr_read = bundle.add && + bi_opcode_props[((bi_instr *) bundle.add)->op].sr_read; + uint64_t reg = bi_pack_registers(bundle.regs); - uint64_t fma = pan_pack_fma(clause, bundle, &bundle.regs); - uint64_t add = pan_pack_add(clause, bundle, &bundle.regs, stage); + uint64_t fma = bi_pack_fma((bi_instr *) bundle.fma, + bi_get_src_new((bi_instr *) bundle.fma, &bundle.regs, 0), + bi_get_src_new((bi_instr *) bundle.fma, &bundle.regs, 1), + bi_get_src_new((bi_instr *) bundle.fma, &bundle.regs, 2), + bi_get_src_new((bi_instr *) bundle.fma, &bundle.regs, 3)); + + uint64_t add = bi_pack_add((bi_instr *) bundle.add, + bi_get_src_new((bi_instr *) bundle.add, &bundle.regs, sr_read + 0), + bi_get_src_new((bi_instr *) bundle.add, &bundle.regs, sr_read + 1), + bi_get_src_new((bi_instr *) bundle.add, &bundle.regs, sr_read + 2), + 0); + + if (bundle.add) { + bi_instr *add = (bi_instr *) bundle.add; + + bool sr_write = bi_opcode_props[add->op].sr_write; + + if (sr_read) { + assert(add->src[0].type == BI_INDEX_REGISTER); + clause->staging_register = add->src[0].value; + + if (sr_write) + assert(bi_is_equiv(add->src[0], add->dest[0])); + } else if (sr_write) { + assert(add->dest[0].type == BI_INDEX_REGISTER); + clause->staging_register = add->dest[0].value; + } + } struct bi_packed_bundle packed = { .lo = reg | (fma << 35) | ((add & 0b111111) << 58), @@ -1022,8 +1093,8 @@ bi_pack_constants(bi_context *ctx, bi_clause *clause, /* Compute branch offset instead of a dummy 0 */ if (branches) { - bi_instruction *br = clause->bundles[clause->bundle_count - 1].add; - assert(br && br->type == BI_BRANCH && 
br->branch_target); + bi_instr *br = (bi_instr *) clause->bundles[clause->bundle_count - 1].add; + assert(br && br->branch_target); /* Put it in the high place */ int32_t qwords = bi_block_offset(ctx, clause, br->branch_target); @@ -1074,7 +1145,7 @@ bi_pack_clause(bi_context *ctx, bi_clause *clause, struct util_dynarray *emission, gl_shader_stage stage, bool tdd) { - /* After the deadline lowering */ + /* TODO After the deadline lowering */ bi_lower_cubeface2(ctx, &clause->bundles[0]); struct bi_packed_bundle ins_1 = bi_pack_bundle(clause, clause->bundles[0], clause->bundles[0], true, stage); @@ -1148,9 +1219,9 @@ bi_collect_blend_ret_addr(bi_context *ctx, struct util_dynarray *emission, return; const bi_bundle *bundle = &clause->bundles[clause->bundle_count - 1]; - const bi_instruction *ins = bundle->add; + const bi_instr *ins = (bi_instr *) bundle->add; - if (!ins || ins->type != BI_BLEND) + if (!ins || ins->op != BI_OPCODE_BLEND) return; /* We don't support non-terminal blend instructions yet. @@ -1160,11 +1231,13 @@ bi_collect_blend_ret_addr(bi_context *ctx, struct util_dynarray *emission, */ assert(0); +#if 0 assert(ins->blend_location < ARRAY_SIZE(ctx->blend_ret_offsets)); assert(!ctx->blend_ret_offsets[ins->blend_location]); ctx->blend_ret_offsets[ins->blend_location] = util_dynarray_num_elements(emission, uint8_t); assert(!(ctx->blend_ret_offsets[ins->blend_location] & 0x7)); +#endif } void diff --git a/src/panfrost/bifrost/bi_print.c b/src/panfrost/bifrost/bi_print.c index 1785bf0..d897198 100644 --- a/src/panfrost/bifrost/bi_print.c +++ b/src/panfrost/bifrost/bi_print.c @@ -480,7 +480,7 @@ bi_print_bundle(bi_bundle *bundle, FILE *fp) for (unsigned i = 0; i < 2; ++i) { if (ins[i]) - bi_print_instruction(ins[i], fp); + bi_print_instr((bi_instr *) ins[i], fp); else fprintf(fp, "nop\n"); } @@ -536,7 +536,7 @@ bi_print_block(bi_block *block, FILE *fp) bi_print_clause(clause, fp); } else { bi_foreach_instr_in_block(block, ins) - bi_print_instruction(ins, fp); + bi_print_instr((bi_instr *) ins, fp); } fprintf(fp, "}"); diff --git a/src/panfrost/bifrost/bi_ra.c b/src/panfrost/bifrost/bi_ra.c index ab768aa..c14627b 100644 --- a/src/panfrost/bifrost/bi_ra.c +++ b/src/panfrost/bifrost/bi_ra.c @@ -26,6 +26,7 @@ #include "compiler.h" #include "bi_print.h" +#include "bi_builder.h" #include "panfrost/util/lcra.h" #include "util/u_memory.h" @@ -38,14 +39,18 @@ bi_compute_interference(bi_context *ctx, struct lcra_state *l) bi_block *blk = (bi_block *) _blk; uint16_t *live = mem_dup(_blk->live_out, l->node_count * sizeof(uint16_t)); - bi_foreach_instr_in_block_rev(blk, ins) { + bi_foreach_instr_in_block_rev(blk, _ins) { /* Mark all registers live after the instruction as * interfering with the destination */ - if (ins->dest && (ins->dest < l->node_count)) { + bi_instr *ins = (bi_instr *) _ins; + for (unsigned d = 0; d < ARRAY_SIZE(ins->dest); ++d) { + if (bi_get_node(ins->dest[d]) >= l->node_count) + continue; + for (unsigned i = 1; i < l->node_count; ++i) { if (live[i]) - lcra_add_node_interference(l, ins->dest, bi_writemask(ins), i, live[i]); + lcra_add_node_interference(l, bi_get_node(ins->dest[d]), bi_writemask_new(ins), i, live[i]); } } @@ -76,15 +81,19 @@ bi_allocate_registers(bi_context *ctx, bool *success) } else { /* R0 - R63, all 32-bit */ l->class_start[BI_REG_CLASS_WORK] = 0; - l->class_size[BI_REG_CLASS_WORK] = 63 * 4; + l->class_size[BI_REG_CLASS_WORK] = 59 * 4; } - bi_foreach_instr_global(ctx, ins) { - unsigned dest = ins->dest; + bi_foreach_instr_global(ctx, _ins) { + bi_instr 
*ins = (bi_instr *) _ins; + unsigned dest = bi_get_node(ins->dest[0]); /* Blend shaders expect the src colour to be in r0-r3 */ - if (ins->type == BI_BLEND && !ctx->is_blend) - l->solutions[ins->src[0]] = 0; + if (ins->op == BI_OPCODE_BLEND && !ctx->is_blend) { + unsigned node = bi_get_node(ins->src[0]); + assert(node < node_count); + l->solutions[node] = 0; + } if (!dest || (dest >= node_count)) continue; @@ -102,87 +111,61 @@ bi_allocate_registers(bi_context *ctx, bool *success) return l; } -static unsigned -bi_reg_from_index(struct lcra_state *l, unsigned index, unsigned offset) +static bi_index +bi_reg_from_index(struct lcra_state *l, bi_index index) { + /* Offsets can only be applied when we register allocated an index, or + * alternatively for FAU's encoding */ + + ASSERTED bool is_offset = (index.offset > 0) && + (index.type != BI_INDEX_FAU); + /* Did we run RA for this index at all */ - if (index >= l->node_count) + if (bi_get_node(index) >= l->node_count) { + assert(!is_offset); return index; + } /* LCRA didn't bother solving this index (how lazy!) */ - signed solution = l->solutions[index]; - if (solution < 0) + signed solution = l->solutions[bi_get_node(index)]; + if (solution < 0) { + assert(!is_offset); return index; + } assert((solution & 0x3) == 0); unsigned reg = solution / 4; - reg += offset; - - return BIR_INDEX_REGISTER | reg; -} - -static void -bi_adjust_src_ra(bi_instruction *ins, struct lcra_state *l, unsigned src) -{ - if (ins->src[src] >= l->node_count) - return; - - bool vector = (bi_class_props[ins->type] & BI_VECTOR) && src == 0; - unsigned offset = 0; - - if (vector) { - /* TODO: Do we do anything here? */ - } else { - /* Use the swizzle as component select */ - unsigned components = bi_get_component_count(ins, src); - - nir_alu_type T = ins->src_types[src]; - unsigned size = nir_alu_type_get_type_size(T); - unsigned components_per_word = MAX2(32 / size, 1); - - for (unsigned i = 0; i < components; ++i) { - unsigned off = ins->swizzle[src][i] / components_per_word; - - /* We can't cross register boundaries in a swizzle */ - if (i == 0) - offset = off; - else - assert(off == offset); - - ins->swizzle[src][i] %= components_per_word; - } - } - - ins->src[src] = bi_reg_from_index(l, ins->src[src], offset); -} - -static void -bi_adjust_dest_ra(bi_instruction *ins, struct lcra_state *l) -{ - if (ins->dest >= l->node_count) - return; - - ins->dest = bi_reg_from_index(l, ins->dest, ins->dest_offset); - ins->dest_offset = 0; + reg += index.offset; + + /* todo: do we want to compose with the subword swizzle? 
*/ + bi_index new_index = bi_register(reg); + new_index.swizzle = index.swizzle; + new_index.abs = index.abs; + new_index.neg = index.neg; + return new_index; } static void bi_install_registers(bi_context *ctx, struct lcra_state *l) { - bi_foreach_instr_global(ctx, ins) { - bi_adjust_dest_ra(ins, l); + bi_foreach_instr_global(ctx, _ins) { + bi_instr *ins = (bi_instr *) _ins; + ins->dest[0] = bi_reg_from_index(l, ins->dest[0]); bi_foreach_src(ins, s) - bi_adjust_src_ra(ins, l, s); + ins->src[s] = bi_reg_from_index(l, ins->src[s]); } } static void -bi_rewrite_index_src_single(bi_instruction *ins, unsigned old, unsigned new) +bi_rewrite_index_src_single(bi_instr *ins, bi_index old, bi_index new) { bi_foreach_src(ins, i) { - if (ins->src[i] == old) - ins->src[i] = new; + if (bi_is_equiv(ins->src[i], old)) { + ins->src[i].type = new.type; + ins->src[i].reg = new.reg; + ins->src[i].value = new.value; + } } } @@ -279,9 +262,12 @@ bi_choose_spill_node(bi_context *ctx, struct lcra_state *l) { /* Pick a node satisfying bi_spill_register's preconditions */ - bi_foreach_instr_global(ctx, ins) { - if (ins->no_spill) - lcra_set_node_spill_cost(l, ins->dest, -1); + bi_foreach_instr_global(ctx, _ins) { + bi_instr *ins = (bi_instr *) _ins; + if (ins->no_spill || ins->dest[0].offset || !bi_is_null(ins->dest[1])) { + for (unsigned d = 0; d < ARRAY_SIZE(ins->dest); ++d) + lcra_set_node_spill_cost(l, bi_get_node(ins->dest[0]), -1); + } } for (unsigned i = PAN_IS_REG; i < l->node_count; i += 2) @@ -290,54 +276,75 @@ bi_choose_spill_node(bi_context *ctx, struct lcra_state *l) return lcra_get_best_spill_node(l); } +static void +bi_spill_dest(bi_builder *b, bi_index index, uint32_t offset, + bi_clause *clause, bi_block *block, bi_instr *ins, + uint32_t *channels) +{ + ins->dest[0] = bi_temp(b->shader); + ins->no_spill = true; + + unsigned newc = util_last_bit(bi_writemask_new(ins)) >> 2; + *channels = MAX2(*channels, newc); + + b->cursor = bi_after_instr(ins); + + bi_instr *st = bi_store_to(b, (*channels) * 32, bi_null(), + ins->dest[0], bi_imm_u32(offset), bi_zero(), + BI_SEG_TL); + + bi_clause *singleton = bi_singleton(b->shader, st, block, 0, (1 << 0), + true); + + list_add(&singleton->link, &clause->link); + b->shader->spills++; +} + +static void +bi_fill_src(bi_builder *b, bi_index index, uint32_t offset, bi_clause *clause, + bi_block *block, bi_instr *ins, unsigned channels) +{ + bi_index temp = bi_temp(b->shader); + + b->cursor = bi_before_instr(ins); + bi_instr *ld = bi_load_to(b, channels * 32, temp, bi_imm_u32(offset), + bi_zero(), BI_SEG_TL); + ld->no_spill = true; + + bi_clause *singleton = bi_singleton(b->shader, ld, block, 0, + (1 << 0), true); + + list_addtail(&singleton->link, &clause->link); + + /* Rewrite to use */ + bi_rewrite_index_src_single((bi_instr *) ins, index, temp); + b->shader->fills++; +} + /* Once we've chosen a spill node, spill it. Precondition: node is a valid * SSA node in the non-optimized scheduled IR that was not already * spilled (enforced by bi_choose_spill_node). 
Returns bytes spilled */ static unsigned -bi_spill_register(bi_context *ctx, unsigned node, unsigned offset) +bi_spill_register(bi_context *ctx, bi_index index, uint32_t offset) { - assert(!(node & PAN_IS_REG)); + assert(!index.reg); + bi_builder _b = { .shader = ctx }; unsigned channels = 1; - /* Spill after every store */ + /* Spill after every store, fill before every load */ bi_foreach_block(ctx, _block) { bi_block *block = (bi_block *) _block; bi_foreach_clause_in_block_safe(block, clause) { - bi_instruction *ins = bi_unwrap_singleton(clause); - - if (ins->dest != node) continue; - - ins->dest = bi_make_temp(ctx); - ins->no_spill = true; - channels = MAX2(channels, ins->vector_channels); - - bi_instruction st = bi_spill(ins->dest, offset, channels); - bi_insert_singleton(ctx, clause, block, st, false); - ctx->spills++; - } - } - - /* Fill before every use */ - bi_foreach_block(ctx, _block) { - bi_block *block = (bi_block *) _block; - bi_foreach_clause_in_block_safe(block, clause) { - bi_instruction *ins = bi_unwrap_singleton(clause); - if (!bi_has_arg(ins, node)) continue; - - /* Don't rewrite spills themselves */ - if (ins->segment == BI_SEG_TL) continue; - - unsigned index = bi_make_temp(ctx); - - bi_instruction ld = bi_fill(index, offset, channels); - ld.no_spill = true; - bi_insert_singleton(ctx, clause, block, ld, true); + bi_instr *ins = (bi_instr *) bi_unwrap_singleton(clause); + if (bi_is_equiv(ins->dest[0], index)) { + bi_spill_dest(&_b, index, offset, clause, + block, ins, &channels); + } - /* Rewrite to use */ - bi_rewrite_index_src_single(ins, node, index); - ctx->fills++; + if (bi_has_arg(ins, index)) + bi_fill_src(&_b, index, offset, clause, block, ins, channels); } } @@ -350,36 +357,23 @@ bi_register_allocate(bi_context *ctx) struct lcra_state *l = NULL; bool success = false; - unsigned iter_count = 100; /* max iterations */ + unsigned iter_count = 1000; /* max iterations */ /* Number of bytes of memory we've spilled into */ unsigned spill_count = 0; - /* For instructions that both read and write from a data register, it's - * the *same* data register. We enforce that constraint by just doing a - * quick rewrite. TODO: are there cases where this causes RA to have no - * solutions due to copyprop? 
*/ - bi_foreach_instr_global(ctx, ins) { - unsigned props = bi_class_props[ins->type]; - unsigned both = BI_DATA_REG_SRC | BI_DATA_REG_DEST; - if ((props & both) != both) continue; - - assert(ins->src[0] & PAN_IS_REG); - bi_rewrite_uses(ctx, ins->dest, 0, ins->src[0], 0); - ins->dest = ins->src[0]; - } - do { if (l) { signed spill_node = bi_choose_spill_node(ctx, l); lcra_free(l); l = NULL; - if (spill_node == -1) unreachable("Failed to choose spill node\n"); - spill_count += bi_spill_register(ctx, spill_node, spill_count); + spill_count += bi_spill_register(ctx, + bi_node_to_index(spill_node, bi_max_temp(ctx)), + spill_count); } bi_invalidate_liveness(ctx); diff --git a/src/panfrost/bifrost/bi_schedule.c b/src/panfrost/bifrost/bi_schedule.c index 96613b6..03a0ca7 100644 --- a/src/panfrost/bifrost/bi_schedule.c +++ b/src/panfrost/bifrost/bi_schedule.c @@ -341,9 +341,9 @@ bi_schedule(bi_context *ctx) bi_foreach_instr_in_block(bblock, ins) { /* Convenient time to lower */ - bi_lower_fmov(ins); +// bi_lower_fmov(ins); - bi_clause *u = bi_make_singleton(ctx, ins, + bi_clause *u = bi_singleton(ctx, (bi_instr *) ins, bblock, 0, (1 << 0), !is_first); diff --git a/src/panfrost/bifrost/bifrost_compile.c b/src/panfrost/bifrost/bifrost_compile.c index dffbe32..54305f0 100644 --- a/src/panfrost/bifrost/bifrost_compile.c +++ b/src/panfrost/bifrost/bifrost_compile.c @@ -3703,8 +3703,10 @@ emit_block(bi_context *ctx, nir_block *block) list_addtail(&ctx->current_block->base.link, &ctx->blocks); list_inithead(&ctx->current_block->base.instructions); + bi_builder _b = bi_init_builder(ctx); + nir_foreach_instr(instr, block) { - emit_instr(ctx, instr); + bi_emit_instr(&_b, instr); ++ctx->instruction_count; } @@ -3767,16 +3769,13 @@ emit_if(bi_context *ctx, nir_if *nif) bi_block *before_block = ctx->current_block; /* Speculatively emit the branch, but we can't fill it in until later */ - bi_instruction *then_branch = bi_emit_branch(ctx); - bi_set_branch_cond(then_branch, &nif->condition, true); + bi_builder _b = bi_init_builder(ctx); + bi_instr *then_branch = bi_branch(&_b, &nif->condition, true); /* Emit the two subblocks. 
*/ bi_block *then_block = emit_cf_list(ctx, &nif->then_list); bi_block *end_then_block = ctx->current_block; - /* Emit a jump from the end of the then block to the end of the else */ - bi_instruction *then_exit = bi_emit_branch(ctx); - /* Emit second block, and check if it's empty */ int count_in = ctx->instruction_count; @@ -3790,13 +3789,15 @@ emit_if(bi_context *ctx, nir_if *nif) assert(else_block); if (ctx->instruction_count == count_in) { - /* The else block is empty, so don't emit an exit jump */ - bi_remove_instruction(then_exit); then_branch->branch_target = ctx->after_block; pan_block_add_successor(&end_then_block->base, &ctx->after_block->base); /* fallthrough */ } else { then_branch->branch_target = else_block; - then_exit->branch_target = ctx->after_block; + + /* Emit a jump from the end of the then block to the end of the else */ + _b.cursor = bi_after_block(end_then_block); + bi_instr *then_exit = bi_jump(&_b, ctx->after_block); + pan_block_add_successor(&end_then_block->base, &then_exit->branch_target->base); pan_block_add_successor(&end_else_block->base, &ctx->after_block->base); /* fallthrough */ } @@ -3822,8 +3823,8 @@ emit_loop(bi_context *ctx, nir_loop *nloop) emit_cf_list(ctx, &nloop->body); /* Branch back to loop back */ - bi_instruction *br_back = bi_emit_branch(ctx); - br_back->branch_target = ctx->continue_block; + bi_builder _b = bi_init_builder(ctx); + bi_jump(&_b, ctx->continue_block); pan_block_add_successor(&start_block->base, &ctx->continue_block->base); pan_block_add_successor(&ctx->current_block->base, &ctx->continue_block->base); @@ -4130,8 +4131,6 @@ bifrost_compile_shader_nir(void *mem_ctx, nir_shader *nir, /* Name blocks now that we're done emitting so the order is * consistent */ block->base.name = block_source_count++; - - bi_lower_combine(ctx, block); } bool progress = false; @@ -4145,6 +4144,11 @@ bifrost_compile_shader_nir(void *mem_ctx, nir_shader *nir, } } while(progress); + bi_foreach_block(ctx, _block) { + bi_block *block = (bi_block *) _block; + bi_lower_fau(ctx, block); + } + if (bifrost_debug & BIFROST_DBG_SHADERS && !nir->info.internal) bi_print_shader(ctx, stdout); bi_schedule(ctx); diff --git a/src/panfrost/bifrost/bir.c b/src/panfrost/bifrost/bir.c index 8fa89f6..2992a70 100644 --- a/src/panfrost/bifrost/bir.c +++ b/src/panfrost/bifrost/bir.c @@ -63,13 +63,13 @@ bi_is_src_swizzled(bi_instruction *ins, unsigned s) } bool -bi_has_arg(bi_instruction *ins, unsigned arg) +bi_has_arg(bi_instr *ins, bi_index arg) { if (!ins) return false; bi_foreach_src(ins, s) { - if (ins->src[s] == arg) + if (bi_is_equiv(ins->src[s], arg)) return true; } diff --git a/src/panfrost/bifrost/compiler.h b/src/panfrost/bifrost/compiler.h index 0debc16..b2d4f03 100644 --- a/src/panfrost/bifrost/compiler.h +++ b/src/panfrost/bifrost/compiler.h @@ -1070,7 +1070,7 @@ void bi_emit_deriv(bi_context *ctx, nir_alu_instr *instr); bool bi_has_clamp(bi_instruction *ins); bool bi_has_source_mods(bi_instruction *ins); bool bi_is_src_swizzled(bi_instruction *ins, unsigned s); -bool bi_has_arg(bi_instruction *ins, unsigned arg); +bool bi_has_arg(bi_instr *ins, bi_index arg); uint16_t bi_from_bytemask(uint16_t bytemask, unsigned bytes); unsigned bi_get_component_count(bi_instruction *ins, signed s); uint16_t bi_bytemask_of_read_components(bi_instruction *ins, unsigned node); @@ -1106,7 +1106,7 @@ bi_clause *bi_make_singleton(void *memctx, bi_instruction *ins, /* Liveness */ void bi_compute_liveness(bi_context *ctx); -void bi_liveness_ins_update(uint16_t *live, bi_instruction *ins, 
unsigned max); +void bi_liveness_ins_update(uint16_t *live, bi_instr *ins, unsigned max); void bi_invalidate_liveness(bi_context *ctx); /* Layout */ -- 2.7.4
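
Some notes on the new IR this patch switches to. The core change is replacing the old bit-packed `unsigned` node indices (`BIR_INDEX_REGISTER | reg`, tests like `src & BIR_INDEX_CONSTANT`) with a structured `bi_index` carrying an explicit type tag plus per-operand modifiers, as used by the bi_register(), bi_passthrough(), bi_null(), and bi_is_equiv() calls throughout the diff. Below is a minimal sketch of such a type; the layout and all names are assumptions for illustration, not Mesa's actual bi_index definition:

#include <assert.h>
#include <stdbool.h>
#include <stdint.h>

enum idx_type {
        IDX_NULL = 0,   /* no operand */
        IDX_NORMAL,     /* SSA temporary; value is the node number */
        IDX_REGISTER,   /* hardware register r<value>, post-RA */
        IDX_CONSTANT,   /* inline 32-bit constant; value is the bits */
        IDX_PASS,       /* passthrough of a pipeline lane */
        IDX_FAU,        /* fast-access uniform word */
};

typedef struct {
        uint32_t value;
        enum idx_type type : 4;
        unsigned swizzle : 4;   /* subword select, e.g. H01 */
        unsigned offset  : 3;   /* word offset within the node */
        bool abs : 1;
        bool neg : 1;
} idx_t;

static inline idx_t idx_register(unsigned reg)
{
        assert(reg < 64);
        return (idx_t) { .type = IDX_REGISTER, .value = reg };
}

static inline idx_t idx_passthrough(unsigned lane)
{
        return (idx_t) { .type = IDX_PASS, .value = lane };
}

static inline bool idx_is_null(idx_t i)
{
        return i.type == IDX_NULL;
}

/* Identity comparison that ignores abs/neg/swizzle, which is how
 * bi_is_equiv() appears to be used for source rewrites in the diff. */
static inline bool idx_is_equiv(idx_t a, idx_t b)
{
        return a.type == b.type && a.value == b.value;
}

The payoff is visible throughout the diff: bit-mask tests become explicit `type == BI_INDEX_CONSTANT` comparisons, and swizzle/abs/neg travel with the operand instead of living in parallel arrays on the instruction.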
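
bi_liveness_ins_update() is the standard backward transfer function, live_in = GEN + (live_out - KILL), evaluated at byte granularity, and the DCE hunk walks each block in reverse, deleting any instruction whose destination has no live bytes left. A sketch of that pairing, reusing the idx_t type from the first note plus a hypothetical instr_t; like the real pass (via its temp_count bound), it assumes side-effecting operations do not write plain SSA temporaries:

typedef struct instr {
        struct instr *prev, *next;   /* intrusive block list */
        idx_t dest;
        idx_t src[4];
        unsigned src_count;
        uint16_t write_bytemask;     /* bytes written to dest */
        uint16_t read_bytemask[4];   /* bytes read from each source */
} instr_t;

/* live_in = GEN + (live_out - KILL), one 16-bit bytemask per node */
static void liveness_step(uint16_t *live, const instr_t *I, unsigned max)
{
        if (I->dest.type == IDX_NORMAL && I->dest.value < max)
                live[I->dest.value] &= ~I->write_bytemask;        /* KILL */

        for (unsigned s = 0; s < I->src_count; ++s)
                if (I->src[s].type == IDX_NORMAL && I->src[s].value < max)
                        live[I->src[s].value] |= I->read_bytemask[s]; /* GEN */
}

/* Reverse walk from the block's last instruction, with 'live' seeded
 * from the block's live-out set. Returns true if anything died. */
static bool dce_block(instr_t *last, uint16_t *live, unsigned max)
{
        bool progress = false;

        for (instr_t *I = last, *prev; I; I = prev) {
                prev = I->prev;

                if (I->dest.type == IDX_NORMAL && I->dest.value < max &&
                    !live[I->dest.value]) {
                        if (I->prev) I->prev->next = I->next;
                        if (I->next) I->next->prev = I->prev;
                        progress = true;
                        continue;        /* dead: don't update liveness */
                }

                liveness_step(live, I, max);
        }

        return progress;
}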
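
In bi_assign_fau_idx_single(), a bundle can address a single 64-bit fast-access word, so a 32-bit constant source is rewritten into a passthrough of that word's low or high half (with fau_idx recording which word), while a zero on FMA uses the free stage passthrough instead. The sketch below shows that selection with made-up lane names and a simplified claiming scheme of my own; as in the diff, the operand's swizzle survives the rewrite:

enum lane { LANE_STAGE, LANE_FAU_LO, LANE_FAU_HI };

/* Fold one constant source into the bundle's FAU word. Returns false
 * if the word is already full with different bits. */
static bool fold_constant(idx_t *src, uint64_t *fau, bool *claimed,
                          bool fast_zero)
{
        if (src->type != IDX_CONSTANT)
                return true;

        uint32_t bits = src->value;
        unsigned swizzle = src->swizzle;     /* survives the rewrite */

        if (bits == 0 && fast_zero) {
                *src = idx_passthrough(LANE_STAGE);  /* FMA's free zero */
                src->swizzle = swizzle;
                return true;
        }

        if (!*claimed) {
                *fau = bits;                         /* claim low half */
                *claimed = true;
        }

        if ((uint32_t) *fau == bits)
                *src = idx_passthrough(LANE_FAU_LO);
        else if ((uint32_t) (*fau >> 32) == bits)
                *src = idx_passthrough(LANE_FAU_HI);
        else if ((uint32_t) (*fau >> 32) == 0) {
                *fau |= (uint64_t) bits << 32;       /* claim high half */
                *src = idx_passthrough(LANE_FAU_HI);
        } else {
                return false;                        /* word is full */
        }

        src->swizzle = swizzle;
        return true;
}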
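
bi_reg_from_index() converts an LCRA solution back into a register index: solutions are byte offsets into the 32-bit register file, so they must be 4-byte aligned, the register is solution / 4, any per-operand word offset is added after allocation, and the subword swizzle and modifiers ride along. A sketch of that mapping, again using the idx_t type from the first note:

/* Map an allocated node back to r<N>. Nodes outside RA's range, or
 * left unsolved, pass through untouched, which is why the install
 * pass can run this over every operand unconditionally. */
static idx_t reg_from_solution(const int32_t *solutions,
                               unsigned node_count, idx_t idx)
{
        if (idx.type != IDX_NORMAL || idx.value >= node_count)
                return idx;

        int32_t sol = solutions[idx.value];
        if (sol < 0)
                return idx;              /* spilled or unconstrained */

        assert((sol & 0x3) == 0);        /* 32-bit aligned solution */

        idx_t out = idx_register((sol / 4) + idx.offset);
        out.swizzle = idx.swizzle;       /* keep subword select */
        out.abs = idx.abs;
        out.neg = idx.neg;
        return out;
}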
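
When LCRA fails, bi_choose_spill_node() picks a victim and bi_spill_register() rewrites it: every definition gets a store to a fixed thread-local offset immediately after it (bi_spill_dest), and every use gets a load into a fresh temporary immediately before it (bi_fill_src), splitting one long live range into many short ones before allocation retries. A sketch under the same hypothetical types; the extern emit helpers and new_temp() stand in for the patch's bi_store_to()/bi_load_to() builder calls and clause surgery:

/* Stand-ins for builder emission into the instruction list */
extern void emit_store_tl(instr_t *after, idx_t val, uint32_t off,
                          unsigned bits);
extern void emit_load_tl(instr_t *before, idx_t dest, uint32_t off,
                         unsigned bits);
extern idx_t new_temp(void);

static unsigned last_byte(uint16_t mask)
{
        unsigned n = 0;
        while (mask) { mask >>= 1; ++n; }
        return n;
}

/* Spill 'node' to thread-local offset 'tls'. Returns bytes used, so
 * the caller can advance the offset for the next spilled node. */
static unsigned spill_node(instr_t *first, idx_t node, uint32_t tls)
{
        unsigned channels = 1;

        for (instr_t *I = first; I; I = I->next) {
                if (idx_is_equiv(I->dest, node)) {
                        /* Break the range: define a temp, store it */
                        unsigned c = (last_byte(I->write_bytemask) + 3) / 4;
                        channels = c > channels ? c : channels;

                        I->dest = new_temp();
                        emit_store_tl(I, I->dest, tls, channels * 32);
                }

                bool uses = false;
                for (unsigned s = 0; s < I->src_count; ++s)
                        uses |= idx_is_equiv(I->src[s], node);

                if (uses) {
                        idx_t tmp = new_temp();
                        emit_load_tl(I, tmp, tls, channels * 32);

                        for (unsigned s = 0; s < I->src_count; ++s)
                                if (idx_is_equiv(I->src[s], node))
                                        I->src[s] = tmp;
                }
        }

        return channels * 4;
}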
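
Finally, on staging registers: message-passing ADD ops stage their payloads through a register range, and bi_pack_bundle() derives clause->staging_register from the op's staging operand. An op that both reads and writes staging data (sr_read and sr_write in bi_opcode_props) must stage through the same register, which the diff asserts with bi_is_equiv(). A sketch of that extraction and validation under the same hypothetical types; the two booleans would come from an opcode-property table:

/* Validate and extract the clause staging register for a message op,
 * mirroring the constraints asserted in bi_pack_bundle(). */
static unsigned staging_register(const instr_t *add, bool sr_read,
                                 bool sr_write)
{
        if (sr_read) {
                /* Staging sources are register-allocated by now */
                assert(add->src[0].type == IDX_REGISTER);

                /* Read+write ops stage through one register range */
                if (sr_write)
                        assert(idx_is_equiv(add->src[0], add->dest));

                return add->src[0].value;
        }

        assert(sr_write && add->dest.type == IDX_REGISTER);
        return add->dest.value;
}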