int offset_src,
struct qreg base_offset,
uint32_t const_offset,
+ uint32_t dest_components,
uint32_t *tmu_writes)
{
if (mode == MODE_COUNT) {
if (vir_in_nonuniform_control_flow(c))
vir_set_cond(tmu, V3D_QPU_COND_IFA);
+
+ tmu->ldtmu_count = dest_components;
}
/**
emit_tmu_general_address_write(c, mode, instr, config,
dynamic_src, offset_src,
base_offset, const_offset,
- &tmu_writes);
+ dest_components, &tmu_writes);
assert(tmu_writes > 0);
if (mode == MODE_COUNT) {
bool last_thrsw_emitted;
bool fixup_ldvary;
int ldvary_count;
+ int pending_ldtmu_count;
+ bool first_ldtmu_after_thrsw;
};
static bool
continue;
}
+ /* We can emit a new tmu lookup with a previous ldtmu
+ * if doing this would free just enough space in the
+ * TMU output fifo so we don't overflow, however, this
+ * is only safe if the ldtmu cannot stall.
+ *
+ * A ldtmu can stall if it is not the first following a
+ * thread switch and corresponds to the first word of a
+ * read request.
+ *
+ * FIXME: For now we forbid pairing up a new lookup
+ * with a previous ldtmu that is not the first after a
+ * thrsw if that could overflow the TMU output fifo
+ * regardless of whether the ldtmu is reading the first
+ * word of a TMU result or not, since we don't track
+ * this aspect in the compiler yet.
+ */
+ if (prev_inst->inst->qpu.sig.ldtmu &&
+ !scoreboard->first_ldtmu_after_thrsw &&
+ (scoreboard->pending_ldtmu_count +
+ n->inst->ldtmu_count > 16 / c->threads)) {
+ continue;
+ }
+
struct v3d_qpu_instr merged_inst;
if (!qpu_merge_inst(c->devinfo, &merged_inst,
&prev_inst->inst->qpu, inst)) {
}
static void
+update_scoreboard_tmu_tracking(struct choose_scoreboard *scoreboard,
+ const struct qinst *inst)
+{
+ /* Track if we have seen any ldtmu after the last thread switch */
+ if (scoreboard->tick == scoreboard->last_thrsw_tick + 2)
+ scoreboard->first_ldtmu_after_thrsw = true;
+
+ /* Track the number of pending ldtmu instructions for outstanding
+ * TMU lookups.
+ */
+ scoreboard->pending_ldtmu_count += inst->ldtmu_count;
+ if (inst->qpu.sig.ldtmu) {
+ assert(scoreboard->pending_ldtmu_count > 0);
+ scoreboard->pending_ldtmu_count--;
+ /* This ldtmu consumed a pending result, so any later ldtmu
+ * before the next thrsw is no longer the first one after it.
+ */
+ scoreboard->first_ldtmu_after_thrsw = false;
+ }
+}
+
+static void
update_scoreboard_for_chosen(struct choose_scoreboard *scoreboard,
- const struct v3d_qpu_instr *inst,
+ const struct qinst *qinst,
const struct v3d_device_info *devinfo)
{
+ const struct v3d_qpu_instr *inst = &qinst->qpu;
+
if (inst->type == V3D_QPU_INSTR_TYPE_BRANCH)
return;
if (inst->sig.ldvary)
scoreboard->last_ldvary_tick = scoreboard->tick;
+
+ update_scoreboard_tmu_tracking(scoreboard, qinst);
}
static void
{
list_addtail(&inst->link, &block->instructions);
- update_scoreboard_for_chosen(scoreboard, &inst->qpu, c->devinfo);
+ update_scoreboard_for_chosen(scoreboard, inst, c->devinfo);
c->qpu_inst_count++;
scoreboard->tick++;
}
merge->inst->uniform;
}
+ chosen->inst->ldtmu_count +=
+ merge->inst->ldtmu_count;
+
if (debug) {
fprintf(stderr, "t=%4d: merging: ",
time);
scoreboard.last_branch_tick = -10;
scoreboard.last_setmsf_tick = -10;
scoreboard.last_stallable_sfu_tick = -10;
+ scoreboard.first_ldtmu_after_thrsw = true;
if (debug) {
fprintf(stderr, "Pre-schedule instructions\n");
#define __gen_emit_reloc(cl, reloc)
#include "cle/v3d_packet_v41_pack.h"
+/* Emits a MOV of 'val' into the magic TMU register 'waddr' and returns
+ * the emitted instruction so callers can annotate it (e.g. setting
+ * ldtmu_count on the retiring TMU write).
+ */
-static inline void
+static inline struct qinst *
vir_TMU_WRITE(struct v3d_compile *c, enum v3d_qpu_waddr waddr, struct qreg val)
{
/* XXX perf: We should figure out how to merge ALU operations
* producing the val with this MOV, when possible.
*/
- vir_MOV_dest(c, vir_reg(QFILE_MAGIC, waddr), val);
+ return vir_MOV_dest(c, vir_reg(QFILE_MAGIC, waddr), val);
}
+/* In counting mode (tmu_writes != NULL) only increments the write count
+ * and returns NULL; otherwise emits the TMU register write and returns
+ * the emitted instruction.
+ */
-static inline void
+static inline struct qinst *
vir_TMU_WRITE_or_count(struct v3d_compile *c,
enum v3d_qpu_waddr waddr,
struct qreg val,
uint32_t *tmu_writes)
{
- if (tmu_writes)
+ if (tmu_writes) {
(*tmu_writes)++;
- else
- vir_TMU_WRITE(c, waddr, val);
+ return NULL;
+ } else {
+ return vir_TMU_WRITE(c, waddr, val);
+ }
}
static void
vir_WRTMUC(c, QUNIFORM_CONSTANT, p2_packed);
/* Emit retiring TMU write */
+ struct qinst *retiring;
if (instr->op == nir_texop_txf) {
assert(instr->sampler_dim != GLSL_SAMPLER_DIM_CUBE);
- vir_TMU_WRITE(c, V3D_QPU_WADDR_TMUSF, s);
+ retiring = vir_TMU_WRITE(c, V3D_QPU_WADDR_TMUSF, s);
} else if (instr->sampler_dim == GLSL_SAMPLER_DIM_CUBE) {
- vir_TMU_WRITE(c, V3D_QPU_WADDR_TMUSCM, s);
+ retiring = vir_TMU_WRITE(c, V3D_QPU_WADDR_TMUSCM, s);
} else if (instr->op == nir_texop_txl) {
- vir_TMU_WRITE(c, V3D_QPU_WADDR_TMUSLOD, s);
+ retiring = vir_TMU_WRITE(c, V3D_QPU_WADDR_TMUSLOD, s);
} else {
- vir_TMU_WRITE(c, V3D_QPU_WADDR_TMUS, s);
+ retiring = vir_TMU_WRITE(c, V3D_QPU_WADDR_TMUS, s);
}
+ retiring->ldtmu_count = p0_unpacked.return_words_of_texture_data;
ntq_add_pending_tmu_flush(c, &instr->dest,
p0_unpacked.return_words_of_texture_data);
}
* which is why we always call ntq_get_src() even if we are only interested in
* register write counts.
*/
-static void
+static struct qinst *
vir_image_emit_register_writes(struct v3d_compile *c,
nir_intrinsic_instr *instr,
bool atomic_add_replaced,
V3D_QPU_PF_PUSHZ);
}
- vir_TMU_WRITE_or_count(c, V3D_QPU_WADDR_TMUSF, src_1_0, tmu_writes);
+ struct qinst *retiring =
+ vir_TMU_WRITE_or_count(c, V3D_QPU_WADDR_TMUSF, src_1_0, tmu_writes);
if (!tmu_writes && vir_in_nonuniform_control_flow(c) &&
instr->intrinsic != nir_intrinsic_image_load) {
(struct qinst *)c->cur_block->instructions.prev;
vir_set_cond(last_inst, V3D_QPU_COND_IFA);
}
+
+ return retiring;
}
static unsigned
if (memcmp(&p2_unpacked, &p2_unpacked_default, sizeof(p2_unpacked)))
vir_WRTMUC(c, QUNIFORM_CONSTANT, p2_packed);
- vir_image_emit_register_writes(c, instr, atomic_add_replaced, NULL);
-
+ struct qinst *retiring =
+ vir_image_emit_register_writes(c, instr, atomic_add_replaced, NULL);
+ retiring->ldtmu_count = p0_unpacked.return_words_of_texture_data;
ntq_add_pending_tmu_flush(c, &instr->dest,
p0_unpacked.return_words_of_texture_data);
}
/* If this is a a TLB Z write */
bool is_tlb_z_write;
+ /* If this is a retiring TMU instruction (the last in a lookup sequence),
+ * how many ldtmu instructions are required to read the results.
+ */
+ uint32_t ldtmu_count;
+
/* Position of this instruction in the program. Filled in during
* register allocation.
*/
struct qreg tmua = vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_TMUAU);
struct qinst *inst = vir_ADD_dest(c, tmua, c->spill_base, offset);
inst->qpu.flags.ac = cond;
+ inst->ldtmu_count = 1;
inst->uniform = vir_get_uniform_index(c, QUNIFORM_CONSTANT,
0xffffff7f); /* per-quad */