broadcom/compiler: track pending ldtmu count with each TMU lookup
author    Iago Toral Quiroga <itoral@igalia.com>
Mon, 20 Mar 2023 10:15:40 +0000 (11:15 +0100)
committer Marge Bot <emma+marge@anholt.net>
Tue, 21 Mar 2023 11:29:05 +0000 (11:29 +0000)
And use this information when scheduling QPU instructions to avoid
merging a new TMU request into a previous ldtmu instruction when doing
so could overflow the TMU output fifo because of a stalling ldtmu.

Reviewed-by: Alejandro Piñeiro <apinheiro@igalia.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/22044>
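
The scheduling rule this introduces can be summarized with a small
standalone check. The sketch below is illustrative only, not code from
the patch: the helper name and parameters are invented, but the logic
and the 16 / threads bound mirror the new constraint added to
qpu_schedule.c further down.

  #include <stdbool.h>

  /* Sketch: may a new TMU lookup be merged into an instruction that
   * already carries an ldtmu signal?  The patch refuses the pairing
   * when that ldtmu could stall (it is not the first one after a
   * thrsw) and the outstanding ldtmu reads plus the new lookup's
   * results would exceed the 16 / threads limit it uses for the TMU
   * output fifo.
   */
  static bool
  can_pair_lookup_with_prev_ldtmu(bool prev_is_ldtmu,
                                  bool first_ldtmu_after_thrsw,
                                  int pending_ldtmu_count,
                                  int new_ldtmu_count,
                                  int threads)
  {
          if (!prev_is_ldtmu)
                  return true;

          /* The first ldtmu after a thread switch cannot stall. */
          if (first_ldtmu_after_thrsw)
                  return true;

          return pending_ldtmu_count + new_ldtmu_count <= 16 / threads;
  }

This is effectively the condition evaluated before qpu_merge_inst() in
the qpu_schedule.c hunk below.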

src/broadcom/compiler/nir_to_vir.c
src/broadcom/compiler/qpu_schedule.c
src/broadcom/compiler/v3d40_tex.c
src/broadcom/compiler/v3d_compiler.h
src/broadcom/compiler/vir_register_allocate.c

diff --git a/src/broadcom/compiler/nir_to_vir.c b/src/broadcom/compiler/nir_to_vir.c
index 1b9b568..b32cf16 100644
@@ -449,6 +449,7 @@ emit_tmu_general_address_write(struct v3d_compile *c,
                                int offset_src,
                                struct qreg base_offset,
                                uint32_t const_offset,
+                               uint32_t dest_components,
                                uint32_t *tmu_writes)
 {
         if (mode == MODE_COUNT) {
@@ -494,6 +495,8 @@ emit_tmu_general_address_write(struct v3d_compile *c,
 
         if (vir_in_nonuniform_control_flow(c))
                 vir_set_cond(tmu, V3D_QPU_COND_IFA);
+
+        tmu->ldtmu_count = dest_components;
 }
 
 /**
@@ -684,7 +687,7 @@ ntq_emit_tmu_general(struct v3d_compile *c, nir_intrinsic_instr *instr,
                 emit_tmu_general_address_write(c, mode, instr, config,
                                                dynamic_src, offset_src,
                                                base_offset, const_offset,
-                                               &tmu_writes);
+                                               dest_components, &tmu_writes);
 
                 assert(tmu_writes > 0);
                 if (mode == MODE_COUNT) {
diff --git a/src/broadcom/compiler/qpu_schedule.c b/src/broadcom/compiler/qpu_schedule.c
index 517c9eb..ae119cf 100644
@@ -496,6 +496,8 @@ struct choose_scoreboard {
         bool last_thrsw_emitted;
         bool fixup_ldvary;
         int ldvary_count;
+        int pending_ldtmu_count;
+        bool first_ldtmu_after_thrsw;
 };
 
 static bool
@@ -1207,6 +1209,29 @@ retry:
                                 continue;
                         }
 
+                        /* We can emit a new tmu lookup with a previous ldtmu
+                         * if doing this would free just enough space in the
+                         * TMU output fifo so we don't overflow; however, this
+                         * is only safe if the ldtmu cannot stall.
+                         *
+                         * A ldtmu can stall if it is not the first following a
+                         * thread switch and corresponds to the first word of a
+                         * read request.
+                         *
+                         * FIXME: For now we forbid pairing up a new lookup
+                         * with a previous ldtmu that is not the first after a
+                         * thrsw if that could overflow the TMU output fifo
+                         * regardless of whether the ldtmu is reading the first
+                         * word of a TMU result or not, since we don't track
+                         * this aspect in the compiler yet.
+                         */
+                        if (prev_inst->inst->qpu.sig.ldtmu &&
+                            !scoreboard->first_ldtmu_after_thrsw &&
+                            (scoreboard->pending_ldtmu_count +
+                             n->inst->ldtmu_count > 16 / c->threads)) {
+                                continue;
+                        }
+
                         struct v3d_qpu_instr merged_inst;
                         if (!qpu_merge_inst(c->devinfo, &merged_inst,
                                             &prev_inst->inst->qpu, inst)) {
@@ -1295,10 +1320,31 @@ update_scoreboard_for_sfu_stall_waddr(struct choose_scoreboard *scoreboard,
 }
 
 static void
+update_scoreboard_tmu_tracking(struct choose_scoreboard *scoreboard,
+                               const struct qinst *inst)
+{
+        /* Track if we have seen any ldtmu after the last thread switch */
+        if (scoreboard->tick == scoreboard->last_thrsw_tick + 2)
+                scoreboard->first_ldtmu_after_thrsw = true;
+
+        /* Track the number of pending ldtmu instructions for outstanding
+         * TMU lookups.
+         */
+        scoreboard->pending_ldtmu_count += inst->ldtmu_count;
+        if (inst->qpu.sig.ldtmu) {
+                assert(scoreboard->pending_ldtmu_count > 0);
+                scoreboard->pending_ldtmu_count--;
+                scoreboard->first_ldtmu_after_thrsw = false;
+        }
+}
+
+static void
 update_scoreboard_for_chosen(struct choose_scoreboard *scoreboard,
-                             const struct v3d_qpu_instr *inst,
+                             const struct qinst *qinst,
                              const struct v3d_device_info *devinfo)
 {
+        const struct v3d_qpu_instr *inst = &qinst->qpu;
+
         if (inst->type == V3D_QPU_INSTR_TYPE_BRANCH)
                 return;
 
@@ -1334,6 +1380,8 @@ update_scoreboard_for_chosen(struct choose_scoreboard *scoreboard,
 
         if (inst->sig.ldvary)
                 scoreboard->last_ldvary_tick = scoreboard->tick;
+
+        update_scoreboard_tmu_tracking(scoreboard, qinst);
 }
 
 static void
@@ -1495,7 +1543,7 @@ insert_scheduled_instruction(struct v3d_compile *c,
 {
         list_addtail(&inst->link, &block->instructions);
 
-        update_scoreboard_for_chosen(scoreboard, &inst->qpu, c->devinfo);
+        update_scoreboard_for_chosen(scoreboard, inst, c->devinfo);
         c->qpu_inst_count++;
         scoreboard->tick++;
 }
@@ -2229,6 +2277,9 @@ schedule_instructions(struct v3d_compile *c,
                                                 merge->inst->uniform;
                                 }
 
+                                chosen->inst->ldtmu_count +=
+                                        merge->inst->ldtmu_count;
+
                                 if (debug) {
                                         fprintf(stderr, "t=%4d: merging: ",
                                                 time);
@@ -2478,6 +2529,7 @@ v3d_qpu_schedule_instructions(struct v3d_compile *c)
         scoreboard.last_branch_tick = -10;
         scoreboard.last_setmsf_tick = -10;
         scoreboard.last_stallable_sfu_tick = -10;
+        scoreboard.first_ldtmu_after_thrsw = true;
 
         if (debug) {
                 fprintf(stderr, "Pre-schedule instructions\n");
diff --git a/src/broadcom/compiler/v3d40_tex.c b/src/broadcom/compiler/v3d40_tex.c
index fbc1b36..db85ac8 100644
 #define __gen_emit_reloc(cl, reloc)
 #include "cle/v3d_packet_v41_pack.h"
 
-static inline void
+static inline struct qinst *
 vir_TMU_WRITE(struct v3d_compile *c, enum v3d_qpu_waddr waddr, struct qreg val)
 {
         /* XXX perf: We should figure out how to merge ALU operations
          * producing the val with this MOV, when possible.
          */
-        vir_MOV_dest(c, vir_reg(QFILE_MAGIC, waddr), val);
+        return vir_MOV_dest(c, vir_reg(QFILE_MAGIC, waddr), val);
 }
 
-static inline void
+static inline struct qinst *
 vir_TMU_WRITE_or_count(struct v3d_compile *c,
                        enum v3d_qpu_waddr waddr,
                        struct qreg val,
                        uint32_t *tmu_writes)
 {
-        if (tmu_writes)
+        if (tmu_writes) {
                 (*tmu_writes)++;
-        else
-                vir_TMU_WRITE(c, waddr, val);
+                return NULL;
+        } else {
+                return vir_TMU_WRITE(c, waddr, val);
+        }
 }
 
 static void
@@ -381,17 +383,19 @@ v3d40_vir_emit_tex(struct v3d_compile *c, nir_tex_instr *instr)
                 vir_WRTMUC(c, QUNIFORM_CONSTANT, p2_packed);
 
         /* Emit retiring TMU write */
+        struct qinst *retiring;
         if (instr->op == nir_texop_txf) {
                 assert(instr->sampler_dim != GLSL_SAMPLER_DIM_CUBE);
-                vir_TMU_WRITE(c, V3D_QPU_WADDR_TMUSF, s);
+                retiring = vir_TMU_WRITE(c, V3D_QPU_WADDR_TMUSF, s);
         } else if (instr->sampler_dim == GLSL_SAMPLER_DIM_CUBE) {
-                vir_TMU_WRITE(c, V3D_QPU_WADDR_TMUSCM, s);
+                retiring = vir_TMU_WRITE(c, V3D_QPU_WADDR_TMUSCM, s);
         } else if (instr->op == nir_texop_txl) {
-                vir_TMU_WRITE(c, V3D_QPU_WADDR_TMUSLOD, s);
+                retiring = vir_TMU_WRITE(c, V3D_QPU_WADDR_TMUSLOD, s);
         } else {
-                vir_TMU_WRITE(c, V3D_QPU_WADDR_TMUS, s);
+                retiring = vir_TMU_WRITE(c, V3D_QPU_WADDR_TMUS, s);
         }
 
+        retiring->ldtmu_count = p0_unpacked.return_words_of_texture_data;
         ntq_add_pending_tmu_flush(c, &instr->dest,
                                   p0_unpacked.return_words_of_texture_data);
 }
@@ -440,7 +444,7 @@ v3d40_image_load_store_tmu_op(nir_intrinsic_instr *instr)
  * which is why we always call ntq_get_src() even if we are only interested in
  * register write counts.
  */
-static void
+static struct qinst *
 vir_image_emit_register_writes(struct v3d_compile *c,
                                nir_intrinsic_instr *instr,
                                bool atomic_add_replaced,
@@ -507,7 +511,8 @@ vir_image_emit_register_writes(struct v3d_compile *c,
                            V3D_QPU_PF_PUSHZ);
         }
 
-        vir_TMU_WRITE_or_count(c, V3D_QPU_WADDR_TMUSF, src_1_0, tmu_writes);
+        struct qinst *retiring =
+                vir_TMU_WRITE_or_count(c, V3D_QPU_WADDR_TMUSF, src_1_0, tmu_writes);
 
         if (!tmu_writes && vir_in_nonuniform_control_flow(c) &&
             instr->intrinsic != nir_intrinsic_image_load) {
@@ -515,6 +520,8 @@ vir_image_emit_register_writes(struct v3d_compile *c,
                         (struct  qinst *)c->cur_block->instructions.prev;
                 vir_set_cond(last_inst, V3D_QPU_COND_IFA);
         }
+
+        return retiring;
 }
 
 static unsigned
@@ -612,8 +619,9 @@ v3d40_vir_emit_image_load_store(struct v3d_compile *c,
         if (memcmp(&p2_unpacked, &p2_unpacked_default, sizeof(p2_unpacked)))
                    vir_WRTMUC(c, QUNIFORM_CONSTANT, p2_packed);
 
-        vir_image_emit_register_writes(c, instr, atomic_add_replaced, NULL);
-
+        struct qinst *retiring =
+                vir_image_emit_register_writes(c, instr, atomic_add_replaced, NULL);
+        retiring->ldtmu_count = p0_unpacked.return_words_of_texture_data;
         ntq_add_pending_tmu_flush(c, &instr->dest,
                                   p0_unpacked.return_words_of_texture_data);
 }
diff --git a/src/broadcom/compiler/v3d_compiler.h b/src/broadcom/compiler/v3d_compiler.h
index f9b3902..41cce95 100644
@@ -173,6 +173,11 @@ struct qinst {
         /* If this is a TLB Z write */
         bool is_tlb_z_write;
 
+        /* If this is a retiring TMU instruction (the last in a lookup sequence),
+         * how many ldtmu instructions are required to read the results.
+         */
+        uint32_t ldtmu_count;
+
         /* Position of this instruction in the program. Filled in during
          * register allocation.
          */
diff --git a/src/broadcom/compiler/vir_register_allocate.c b/src/broadcom/compiler/vir_register_allocate.c
index ca1428b..66fe353 100644
@@ -443,6 +443,7 @@ v3d_emit_spill_tmua(struct v3d_compile *c,
         struct qreg tmua = vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_TMUAU);
         struct qinst *inst = vir_ADD_dest(c, tmua, c->spill_base, offset);
         inst->qpu.flags.ac = cond;
+        inst->ldtmu_count = 1;
         inst->uniform = vir_get_uniform_index(c, QUNIFORM_CONSTANT,
                                               0xffffff7f); /* per-quad */