From 1e28f2a6f2446ae4b27112e9302153fdcb70a408 Mon Sep 17 00:00:00 2001 From: Iago Toral Quiroga Date: Mon, 20 Mar 2023 11:15:40 +0100 Subject: [PATCH] broadcom/compiler: track pending ldtmu count with each TMU lookup MIME-Version: 1.0 Content-Type: text/plain; charset=utf8 Content-Transfer-Encoding: 8bit And use this information when scheduling QPU to avoid merging a new TMU request into a previous ldtmu instruction when doing so may cause TMU output fifo overflow due to a stalling ldtmu. Reviewed-by: Alejandro Piñeiro Part-of: --- src/broadcom/compiler/nir_to_vir.c | 5 ++- src/broadcom/compiler/qpu_schedule.c | 56 ++++++++++++++++++++++++++- src/broadcom/compiler/v3d40_tex.c | 36 ++++++++++------- src/broadcom/compiler/v3d_compiler.h | 5 +++ src/broadcom/compiler/vir_register_allocate.c | 1 + 5 files changed, 86 insertions(+), 17 deletions(-) diff --git a/src/broadcom/compiler/nir_to_vir.c b/src/broadcom/compiler/nir_to_vir.c index 1b9b568..b32cf16 100644 --- a/src/broadcom/compiler/nir_to_vir.c +++ b/src/broadcom/compiler/nir_to_vir.c @@ -449,6 +449,7 @@ emit_tmu_general_address_write(struct v3d_compile *c, int offset_src, struct qreg base_offset, uint32_t const_offset, + uint32_t dest_components, uint32_t *tmu_writes) { if (mode == MODE_COUNT) { @@ -494,6 +495,8 @@ emit_tmu_general_address_write(struct v3d_compile *c, if (vir_in_nonuniform_control_flow(c)) vir_set_cond(tmu, V3D_QPU_COND_IFA); + + tmu->ldtmu_count = dest_components; } /** @@ -684,7 +687,7 @@ ntq_emit_tmu_general(struct v3d_compile *c, nir_intrinsic_instr *instr, emit_tmu_general_address_write(c, mode, instr, config, dynamic_src, offset_src, base_offset, const_offset, - &tmu_writes); + dest_components, &tmu_writes); assert(tmu_writes > 0); if (mode == MODE_COUNT) { diff --git a/src/broadcom/compiler/qpu_schedule.c b/src/broadcom/compiler/qpu_schedule.c index 517c9eb..ae119cf 100644 --- a/src/broadcom/compiler/qpu_schedule.c +++ b/src/broadcom/compiler/qpu_schedule.c @@ -496,6 +496,8 @@ struct 
choose_scoreboard { bool last_thrsw_emitted; bool fixup_ldvary; int ldvary_count; + int pending_ldtmu_count; + bool first_ldtmu_after_thrsw; }; static bool @@ -1207,6 +1209,29 @@ retry: continue; } + /* We can emit a new tmu lookup with a previous ldtmu + * if doing this would free just enough space in the + * TMU output fifo so we don't overflow, however, this + * is only safe if the ldtmu cannot stall. + * + * A ldtmu can stall if it is not the first following a + * thread switch and corresponds to the first word of a + * read request. + * + * FIXME: For now we forbid pairing up a new lookup + * with a previous ldtmu that is not the first after a + * thrsw if that could overflow the TMU output fifo + * regardless of whether the ldtmu is reading the first + * word of a TMU result or not, since we don't track + * this aspect in the compiler yet. + */ + if (prev_inst->inst->qpu.sig.ldtmu && + !scoreboard->first_ldtmu_after_thrsw && + (scoreboard->pending_ldtmu_count + + n->inst->ldtmu_count > 16 / c->threads)) { + continue; + } + struct v3d_qpu_instr merged_inst; if (!qpu_merge_inst(c->devinfo, &merged_inst, &prev_inst->inst->qpu, inst)) { @@ -1295,10 +1320,31 @@ update_scoreboard_for_sfu_stall_waddr(struct choose_scoreboard *scoreboard, } static void +update_scoreboard_tmu_tracking(struct choose_scoreboard *scoreboard, + const struct qinst *inst) +{ + /* Track if we have seen any ldtmu after the last thread switch */ + if (scoreboard->tick == scoreboard->last_thrsw_tick + 2) + scoreboard->first_ldtmu_after_thrsw = true; + + /* Track the number of pending ldtmu instructions for outstanding + * TMU lookups.
+ */ + scoreboard->pending_ldtmu_count += inst->ldtmu_count; + if (inst->qpu.sig.ldtmu) { + assert(scoreboard->pending_ldtmu_count > 0); + scoreboard->pending_ldtmu_count--; + scoreboard->first_ldtmu_after_thrsw = false; + } +} + +static void update_scoreboard_for_chosen(struct choose_scoreboard *scoreboard, - const struct v3d_qpu_instr *inst, + const struct qinst *qinst, const struct v3d_device_info *devinfo) { + const struct v3d_qpu_instr *inst = &qinst->qpu; + if (inst->type == V3D_QPU_INSTR_TYPE_BRANCH) return; @@ -1334,6 +1380,8 @@ update_scoreboard_for_chosen(struct choose_scoreboard *scoreboard, if (inst->sig.ldvary) scoreboard->last_ldvary_tick = scoreboard->tick; + + update_scoreboard_tmu_tracking(scoreboard, qinst); } static void @@ -1495,7 +1543,7 @@ insert_scheduled_instruction(struct v3d_compile *c, { list_addtail(&inst->link, &block->instructions); - update_scoreboard_for_chosen(scoreboard, &inst->qpu, c->devinfo); + update_scoreboard_for_chosen(scoreboard, inst, c->devinfo); c->qpu_inst_count++; scoreboard->tick++; } @@ -2229,6 +2277,9 @@ schedule_instructions(struct v3d_compile *c, merge->inst->uniform; } + chosen->inst->ldtmu_count += + merge->inst->ldtmu_count; + if (debug) { fprintf(stderr, "t=%4d: merging: ", time); @@ -2478,6 +2529,7 @@ v3d_qpu_schedule_instructions(struct v3d_compile *c) scoreboard.last_branch_tick = -10; scoreboard.last_setmsf_tick = -10; scoreboard.last_stallable_sfu_tick = -10; + scoreboard.first_ldtmu_after_thrsw = true; if (debug) { fprintf(stderr, "Pre-schedule instructions\n"); diff --git a/src/broadcom/compiler/v3d40_tex.c b/src/broadcom/compiler/v3d40_tex.c index fbc1b36..db85ac8 100644 --- a/src/broadcom/compiler/v3d40_tex.c +++ b/src/broadcom/compiler/v3d40_tex.c @@ -30,25 +30,27 @@ #define __gen_emit_reloc(cl, reloc) #include "cle/v3d_packet_v41_pack.h" -static inline void +static inline struct qinst * vir_TMU_WRITE(struct v3d_compile *c, enum v3d_qpu_waddr waddr, struct qreg val) { /* XXX perf: We should figure 
out how to merge ALU operations * producing the val with this MOV, when possible. */ - vir_MOV_dest(c, vir_reg(QFILE_MAGIC, waddr), val); + return vir_MOV_dest(c, vir_reg(QFILE_MAGIC, waddr), val); } -static inline void +static inline struct qinst * vir_TMU_WRITE_or_count(struct v3d_compile *c, enum v3d_qpu_waddr waddr, struct qreg val, uint32_t *tmu_writes) { - if (tmu_writes) + if (tmu_writes) { (*tmu_writes)++; - else - vir_TMU_WRITE(c, waddr, val); + return NULL; + } else { + return vir_TMU_WRITE(c, waddr, val); + } } static void @@ -381,17 +383,19 @@ v3d40_vir_emit_tex(struct v3d_compile *c, nir_tex_instr *instr) vir_WRTMUC(c, QUNIFORM_CONSTANT, p2_packed); /* Emit retiring TMU write */ + struct qinst *retiring; if (instr->op == nir_texop_txf) { assert(instr->sampler_dim != GLSL_SAMPLER_DIM_CUBE); - vir_TMU_WRITE(c, V3D_QPU_WADDR_TMUSF, s); + retiring = vir_TMU_WRITE(c, V3D_QPU_WADDR_TMUSF, s); } else if (instr->sampler_dim == GLSL_SAMPLER_DIM_CUBE) { - vir_TMU_WRITE(c, V3D_QPU_WADDR_TMUSCM, s); + retiring = vir_TMU_WRITE(c, V3D_QPU_WADDR_TMUSCM, s); } else if (instr->op == nir_texop_txl) { - vir_TMU_WRITE(c, V3D_QPU_WADDR_TMUSLOD, s); + retiring = vir_TMU_WRITE(c, V3D_QPU_WADDR_TMUSLOD, s); } else { - vir_TMU_WRITE(c, V3D_QPU_WADDR_TMUS, s); + retiring = vir_TMU_WRITE(c, V3D_QPU_WADDR_TMUS, s); } + retiring->ldtmu_count = p0_unpacked.return_words_of_texture_data; ntq_add_pending_tmu_flush(c, &instr->dest, p0_unpacked.return_words_of_texture_data); } @@ -440,7 +444,7 @@ v3d40_image_load_store_tmu_op(nir_intrinsic_instr *instr) * which is why we always call ntq_get_src() even if we are only interested in * register write counts. 
*/ -static void +static struct qinst * vir_image_emit_register_writes(struct v3d_compile *c, nir_intrinsic_instr *instr, bool atomic_add_replaced, @@ -507,7 +511,8 @@ vir_image_emit_register_writes(struct v3d_compile *c, V3D_QPU_PF_PUSHZ); } - vir_TMU_WRITE_or_count(c, V3D_QPU_WADDR_TMUSF, src_1_0, tmu_writes); + struct qinst *retiring = + vir_TMU_WRITE_or_count(c, V3D_QPU_WADDR_TMUSF, src_1_0, tmu_writes); if (!tmu_writes && vir_in_nonuniform_control_flow(c) && instr->intrinsic != nir_intrinsic_image_load) { @@ -515,6 +520,8 @@ vir_image_emit_register_writes(struct v3d_compile *c, (struct qinst *)c->cur_block->instructions.prev; vir_set_cond(last_inst, V3D_QPU_COND_IFA); } + + return retiring; } static unsigned @@ -612,8 +619,9 @@ v3d40_vir_emit_image_load_store(struct v3d_compile *c, if (memcmp(&p2_unpacked, &p2_unpacked_default, sizeof(p2_unpacked))) vir_WRTMUC(c, QUNIFORM_CONSTANT, p2_packed); - vir_image_emit_register_writes(c, instr, atomic_add_replaced, NULL); - + struct qinst *retiring = + vir_image_emit_register_writes(c, instr, atomic_add_replaced, NULL); + retiring->ldtmu_count = p0_unpacked.return_words_of_texture_data; ntq_add_pending_tmu_flush(c, &instr->dest, p0_unpacked.return_words_of_texture_data); } diff --git a/src/broadcom/compiler/v3d_compiler.h b/src/broadcom/compiler/v3d_compiler.h index f9b3902..41cce95 100644 --- a/src/broadcom/compiler/v3d_compiler.h +++ b/src/broadcom/compiler/v3d_compiler.h @@ -173,6 +173,11 @@ struct qinst { /* If this is a a TLB Z write */ bool is_tlb_z_write; + /* If this is a retiring TMU instruction (the last in a lookup sequence), + * how many ldtmu instructions are required to read the results. + */ + uint32_t ldtmu_count; + /* Position of this instruction in the program. Filled in during * register allocation. 
*/ diff --git a/src/broadcom/compiler/vir_register_allocate.c b/src/broadcom/compiler/vir_register_allocate.c index ca1428b..66fe353 100644 --- a/src/broadcom/compiler/vir_register_allocate.c +++ b/src/broadcom/compiler/vir_register_allocate.c @@ -443,6 +443,7 @@ v3d_emit_spill_tmua(struct v3d_compile *c, struct qreg tmua = vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_TMUAU); struct qinst *inst = vir_ADD_dest(c, tmua, c->spill_base, offset); inst->qpu.flags.ac = cond; + inst->ldtmu_count = 1; inst->uniform = vir_get_uniform_index(c, QUNIFORM_CONSTANT, 0xffffff7f); /* per-quad */ -- 2.7.4