From f85fcaa494cf1bbc9337d4675069c7921356d7e9 Mon Sep 17 00:00:00 2001 From: Iago Toral Quiroga Date: Thu, 11 Feb 2021 11:29:00 +0100 Subject: [PATCH] broadcom/compiler: pass a devinfo to check if an instruction writes to TMU MIME-Version: 1.0 Content-Type: text/plain; charset=utf8 Content-Transfer-Encoding: 8bit V3D 3.x has V3D_QPU_WADDR_TMU which in V3D 4.x is V3D_QPU_WADDR_UNIFA (which isn't a TMU write address). This change passes a devinfo to any functions that need to do these checks so we can account for the target V3D version correctly. Reviewed-by: Alejandro Piñeiro Part-of: --- src/broadcom/compiler/qpu_schedule.c | 71 ++++++++++++++++----------- src/broadcom/compiler/qpu_validate.c | 8 ++- src/broadcom/compiler/v3d_compiler.h | 2 +- src/broadcom/compiler/vir.c | 4 +- src/broadcom/compiler/vir_register_allocate.c | 20 +++++--- src/broadcom/qpu/qpu_instr.c | 33 ++++++++----- src/broadcom/qpu/qpu_instr.h | 9 ++-- 7 files changed, 91 insertions(+), 56 deletions(-) diff --git a/src/broadcom/compiler/qpu_schedule.c b/src/broadcom/compiler/qpu_schedule.c index b75d565..2d563d0 100644 --- a/src/broadcom/compiler/qpu_schedule.c +++ b/src/broadcom/compiler/qpu_schedule.c @@ -174,7 +174,7 @@ process_waddr_deps(struct schedule_state *state, struct schedule_node *n, { if (!magic) { add_write_dep(state, &state->last_rf[waddr], n); - } else if (v3d_qpu_magic_waddr_is_tmu(waddr)) { + } else if (v3d_qpu_magic_waddr_is_tmu(state->devinfo, waddr)) { /* XXX perf: For V3D 4.x, we could reorder TMU writes other * than the TMUS/TMUD/TMUA to improve scheduling flexibility. */ @@ -568,7 +568,8 @@ mux_read_stalls(struct choose_scoreboard *scoreboard, #define MAX_SCHEDULE_PRIORITY 16 static int -get_instruction_priority(const struct v3d_qpu_instr *inst) +get_instruction_priority(const struct v3d_device_info *devinfo, + const struct v3d_qpu_instr *inst) { uint32_t baseline_score; uint32_t next_score = 0; @@ -590,7 +591,7 @@ get_instruction_priority(const struct v3d_qpu_instr *inst) next_score++; /* Schedule texture read setup early to hide their latency better. */ - if (v3d_qpu_writes_tmu(inst)) + if (v3d_qpu_writes_tmu(devinfo, inst)) return next_score; next_score++; @@ -601,9 +602,10 @@ get_instruction_priority(const struct v3d_qpu_instr *inst) } static bool -qpu_magic_waddr_is_periph(enum v3d_qpu_waddr waddr) +qpu_magic_waddr_is_periph(const struct v3d_device_info *devinfo, + enum v3d_qpu_waddr waddr) { - return (v3d_qpu_magic_waddr_is_tmu(waddr) || + return (v3d_qpu_magic_waddr_is_tmu(devinfo, waddr) || v3d_qpu_magic_waddr_is_sfu(waddr) || v3d_qpu_magic_waddr_is_tlb(waddr) || v3d_qpu_magic_waddr_is_vpm(waddr) || @@ -611,7 +613,8 @@ qpu_magic_waddr_is_periph(enum v3d_qpu_waddr waddr) } static bool -qpu_accesses_peripheral(const struct v3d_qpu_instr *inst) +qpu_accesses_peripheral(const struct v3d_device_info *devinfo, + const struct v3d_qpu_instr *inst) { if (v3d_qpu_uses_vpm(inst)) return true; @@ -621,7 +624,7 @@ qpu_accesses_peripheral(const struct v3d_qpu_instr *inst) if (inst->type == V3D_QPU_INSTR_TYPE_ALU) { if (inst->alu.add.op != V3D_QPU_A_NOP && inst->alu.add.magic_write && - qpu_magic_waddr_is_periph(inst->alu.add.waddr)) { + qpu_magic_waddr_is_periph(devinfo, inst->alu.add.waddr)) { return true; } @@ -630,7 +633,7 @@ qpu_accesses_peripheral(const struct v3d_qpu_instr *inst) if (inst->alu.mul.op != V3D_QPU_M_NOP && inst->alu.mul.magic_write && - qpu_magic_waddr_is_periph(inst->alu.mul.waddr)) { + qpu_magic_waddr_is_periph(devinfo, inst->alu.mul.waddr)) { return true; } } @@ -647,8 +650,8 @@ qpu_compatible_peripheral_access(const struct v3d_device_info *devinfo, const struct v3d_qpu_instr *a, const struct v3d_qpu_instr *b) { - const bool a_uses_peripheral = qpu_accesses_peripheral(a); - const bool b_uses_peripheral = qpu_accesses_peripheral(b); + const bool a_uses_peripheral = qpu_accesses_peripheral(devinfo, a); + const bool b_uses_peripheral = qpu_accesses_peripheral(devinfo, b); /* We can always do one peripheral access per instruction. */ if (!a_uses_peripheral || !b_uses_peripheral) @@ -665,8 +668,8 @@ qpu_compatible_peripheral_access(const struct v3d_device_info *devinfo, return true; } - if ((a->sig.wrtmuc && v3d_qpu_writes_tmu_not_tmuc(b)) || - (b->sig.wrtmuc && v3d_qpu_writes_tmu_not_tmuc(a))) { + if ((a->sig.wrtmuc && v3d_qpu_writes_tmu_not_tmuc(devinfo, b)) || + (b->sig.wrtmuc && v3d_qpu_writes_tmu_not_tmuc(devinfo, a))) { return true; } @@ -849,7 +852,7 @@ choose_instruction_to_schedule(const struct v3d_device_info *devinfo, } } - int prio = get_instruction_priority(inst); + int prio = get_instruction_priority(devinfo, inst); if (mux_read_stalls(scoreboard, inst)) { /* Don't merge an instruction that stalls */ @@ -910,7 +913,8 @@ update_scoreboard_for_sfu_stall_waddr(struct choose_scoreboard *scoreboard, static void update_scoreboard_for_chosen(struct choose_scoreboard *scoreboard, - const struct v3d_qpu_instr *inst) + const struct v3d_qpu_instr *inst, + const struct v3d_device_info *devinfo) { if (inst->type == V3D_QPU_INSTR_TYPE_BRANCH) return; @@ -920,7 +924,8 @@ update_scoreboard_for_chosen(struct choose_scoreboard *scoreboard, if (inst->alu.add.op != V3D_QPU_A_NOP) { if (inst->alu.add.magic_write) { update_scoreboard_for_magic_waddr(scoreboard, - inst->alu.add.waddr); + inst->alu.add.waddr, + devinfo); } else { update_scoreboard_for_sfu_stall_waddr(scoreboard, inst); @@ -930,7 +935,8 @@ update_scoreboard_for_chosen(struct choose_scoreboard *scoreboard, if (inst->alu.mul.op != V3D_QPU_M_NOP) { if (inst->alu.mul.magic_write) { update_scoreboard_for_magic_waddr(scoreboard, - inst->alu.mul.waddr); + inst->alu.mul.waddr, + devinfo); } } @@ -964,7 +970,8 @@ dump_state(const struct v3d_device_info *devinfo, struct dag *dag) } } -static uint32_t magic_waddr_latency(enum v3d_qpu_waddr waddr, +static uint32_t magic_waddr_latency(const struct v3d_device_info *devinfo, + enum v3d_qpu_waddr waddr, const struct v3d_qpu_instr *after) { /* Apply some huge latency between texture fetch requests and getting @@ -990,8 +997,10 @@ static uint32_t magic_waddr_latency(enum v3d_qpu_waddr waddr, * * because we associate the first load_tmu0 with the *second* tmu0_s. */ - if (v3d_qpu_magic_waddr_is_tmu(waddr) && v3d_qpu_waits_on_tmu(after)) + if (v3d_qpu_magic_waddr_is_tmu(devinfo, waddr) && + v3d_qpu_waits_on_tmu(after)) { return 100; + } /* Assume that anything depending on us is consuming the SFU result. */ if (v3d_qpu_magic_waddr_is_sfu(waddr)) @@ -1001,7 +1010,8 @@ static uint32_t magic_waddr_latency(enum v3d_qpu_waddr waddr, } static uint32_t -instruction_latency(struct schedule_node *before, struct schedule_node *after) +instruction_latency(const struct v3d_device_info *devinfo, + struct schedule_node *before, struct schedule_node *after) { const struct v3d_qpu_instr *before_inst = &before->inst->qpu; const struct v3d_qpu_instr *after_inst = &after->inst->qpu; @@ -1013,13 +1023,15 @@ instruction_latency(struct schedule_node *before, struct schedule_node *after) if (before_inst->alu.add.magic_write) { latency = MAX2(latency, - magic_waddr_latency(before_inst->alu.add.waddr, + magic_waddr_latency(devinfo, + before_inst->alu.add.waddr, after_inst)); } if (before_inst->alu.mul.magic_write) { latency = MAX2(latency, - magic_waddr_latency(before_inst->alu.mul.waddr, + magic_waddr_latency(devinfo, + before_inst->alu.mul.waddr, after_inst)); } @@ -1034,6 +1046,7 @@ static void compute_delay(struct dag_node *node, void *state) { struct schedule_node *n = (struct schedule_node *)node; + struct v3d_compile *c = (struct v3d_compile *) state; n->delay = 1; @@ -1042,7 +1055,8 @@ compute_delay(struct dag_node *node, void *state) (struct schedule_node *)edge->child; n->delay = MAX2(n->delay, (child->delay + - instruction_latency(n, child))); + instruction_latency(c->devinfo, n, + child))); } } @@ -1061,7 +1075,8 @@ pre_remove_head(struct dag *dag, struct schedule_node *n) } static void -mark_instruction_scheduled(struct dag *dag, +mark_instruction_scheduled(const struct v3d_device_info *devinfo, + struct dag *dag, uint32_t time, struct schedule_node *node) { @@ -1075,7 +1090,7 @@ mark_instruction_scheduled(struct dag *dag, if (!child) continue; - uint32_t latency = instruction_latency(node, child); + uint32_t latency = instruction_latency(devinfo, node, child); child->unblocked_time = MAX2(child->unblocked_time, time + latency); @@ -1091,7 +1106,7 @@ insert_scheduled_instruction(struct v3d_compile *c, { list_addtail(&inst->link, &block->instructions); - update_scoreboard_for_chosen(scoreboard, &inst->qpu); + update_scoreboard_for_chosen(scoreboard, &inst->qpu, c->devinfo); c->qpu_inst_count++; scoreboard->tick++; } @@ -1390,10 +1405,10 @@ schedule_instructions(struct v3d_compile *c, * be scheduled. Update the children's unblocked time for this * DAG edge as we do so. */ - mark_instruction_scheduled(scoreboard->dag, time, chosen); + mark_instruction_scheduled(devinfo, scoreboard->dag, time, chosen); list_for_each_entry(struct schedule_node, merge, &merged_list, link) { - mark_instruction_scheduled(scoreboard->dag, time, merge); + mark_instruction_scheduled(devinfo, scoreboard->dag, time, merge); /* The merged VIR instruction doesn't get re-added to the * block, so free it now. @@ -1456,7 +1471,7 @@ qpu_schedule_instructions_block(struct v3d_compile *c, calculate_forward_deps(c, scoreboard->dag, &setup_list); calculate_reverse_deps(c, scoreboard->dag, &setup_list); - dag_traverse_bottom_up(scoreboard->dag, compute_delay, NULL); + dag_traverse_bottom_up(scoreboard->dag, compute_delay, c); uint32_t cycles = schedule_instructions(c, scoreboard, block, orig_uniform_contents, diff --git a/src/broadcom/compiler/qpu_validate.c b/src/broadcom/compiler/qpu_validate.c index 24be4fd..a2de9f5 100644 --- a/src/broadcom/compiler/qpu_validate.c +++ b/src/broadcom/compiler/qpu_validate.c @@ -145,8 +145,10 @@ qpu_validate_inst(struct v3d_qpu_validate_state *state, struct qinst *qinst) if (inst->alu.add.op != V3D_QPU_A_NOP) { if (inst->alu.add.magic_write) { - if (v3d_qpu_magic_waddr_is_tmu(inst->alu.add.waddr)) + if (v3d_qpu_magic_waddr_is_tmu(state->c->devinfo, + inst->alu.add.waddr)) { tmu_writes++; + } if (v3d_qpu_magic_waddr_is_sfu(inst->alu.add.waddr)) sfu_writes++; if (v3d_qpu_magic_waddr_is_vpm(inst->alu.add.waddr)) @@ -160,8 +162,10 @@ qpu_validate_inst(struct v3d_qpu_validate_state *state, struct qinst *qinst) if (inst->alu.mul.op != V3D_QPU_M_NOP) { if (inst->alu.mul.magic_write) { - if (v3d_qpu_magic_waddr_is_tmu(inst->alu.mul.waddr)) + if (v3d_qpu_magic_waddr_is_tmu(state->c->devinfo, + inst->alu.mul.waddr)) { tmu_writes++; + } if (v3d_qpu_magic_waddr_is_sfu(inst->alu.mul.waddr)) sfu_writes++; if (v3d_qpu_magic_waddr_is_vpm(inst->alu.mul.waddr)) diff --git a/src/broadcom/compiler/v3d_compiler.h b/src/broadcom/compiler/v3d_compiler.h index 548940c..e4f7ab4 100644 --- a/src/broadcom/compiler/v3d_compiler.h +++ b/src/broadcom/compiler/v3d_compiler.h @@ -950,7 +950,7 @@ bool vir_has_side_effects(struct v3d_compile *c, struct qinst *inst); bool vir_get_add_op(struct qinst *inst, enum v3d_qpu_add_op *op); bool vir_get_mul_op(struct qinst *inst, enum v3d_qpu_mul_op *op); bool vir_is_raw_mov(struct qinst *inst); -bool vir_is_tex(struct qinst *inst); +bool vir_is_tex(const struct v3d_device_info *devinfo, struct qinst *inst); bool vir_is_add(struct qinst *inst); bool vir_is_mul(struct qinst *inst); bool vir_writes_r3(const struct v3d_device_info *devinfo, struct qinst *inst); diff --git a/src/broadcom/compiler/vir.c b/src/broadcom/compiler/vir.c index e6cf729..6c79b2d 100644 --- a/src/broadcom/compiler/vir.c +++ b/src/broadcom/compiler/vir.c @@ -130,10 +130,10 @@ vir_is_mul(struct qinst *inst) } bool -vir_is_tex(struct qinst *inst) +vir_is_tex(const struct v3d_device_info *devinfo, struct qinst *inst) { if (inst->dst.file == QFILE_MAGIC) - return v3d_qpu_magic_waddr_is_tmu(inst->dst.index); + return v3d_qpu_magic_waddr_is_tmu(devinfo, inst->dst.index); if (inst->qpu.type == V3D_QPU_INSTR_TYPE_ALU && inst->qpu.alu.add.op == V3D_QPU_A_TMUWT) { diff --git a/src/broadcom/compiler/vir_register_allocate.c b/src/broadcom/compiler/vir_register_allocate.c index 6b4e947..b29960a 100644 --- a/src/broadcom/compiler/vir_register_allocate.c +++ b/src/broadcom/compiler/vir_register_allocate.c @@ -34,15 +34,17 @@ #define PHYS_COUNT 64 static inline bool -qinst_writes_tmu(struct qinst *inst) +qinst_writes_tmu(const struct v3d_device_info *devinfo, + struct qinst *inst) { return (inst->dst.file == QFILE_MAGIC && - v3d_qpu_magic_waddr_is_tmu(inst->dst.index)) || + v3d_qpu_magic_waddr_is_tmu(devinfo, inst->dst.index)) || inst->qpu.sig.wrtmuc; } static bool -is_end_of_tmu_sequence(struct qinst *inst, struct qblock *block) +is_end_of_tmu_sequence(const struct v3d_device_info *devinfo, + struct qinst *inst, struct qblock *block) { if (!inst->qpu.sig.ldtmu && !(inst->qpu.type == V3D_QPU_INSTR_TYPE_ALU && @@ -58,7 +60,7 @@ is_end_of_tmu_sequence(struct qinst *inst, struct qblock *block) return false; } - if (qinst_writes_tmu(scan_inst)) + if (qinst_writes_tmu(devinfo, scan_inst)) return true; } @@ -149,10 +151,10 @@ v3d_choose_spill_node(struct v3d_compile *c, struct ra_graph *g, * final LDTMU or TMUWT from that TMU setup. We * penalize spills during that time. */ - if (is_end_of_tmu_sequence(inst, block)) + if (is_end_of_tmu_sequence(c->devinfo, inst, block)) in_tmu_operation = false; - if (qinst_writes_tmu(inst)) + if (qinst_writes_tmu(c->devinfo, inst)) in_tmu_operation = true; } } @@ -268,7 +270,7 @@ v3d_spill_reg(struct v3d_compile *c, int spill_temp) * move the fill up to not intrude in the middle of the TMU * sequence. */ - if (is_end_of_tmu_sequence(inst, block)) { + if (is_end_of_tmu_sequence(c->devinfo, inst, block)) { if (postponed_spill) { v3d_emit_tmu_spill(c, postponed_spill, inst, spill_offset); @@ -278,8 +280,10 @@ v3d_spill_reg(struct v3d_compile *c, int spill_temp) postponed_spill = NULL; } - if (!start_of_tmu_sequence && qinst_writes_tmu(inst)) + if (!start_of_tmu_sequence && + qinst_writes_tmu(c->devinfo, inst)) { start_of_tmu_sequence = inst; + } /* fills */ for (int i = 0; i < vir_get_nsrc(inst); i++) { diff --git a/src/broadcom/qpu/qpu_instr.c b/src/broadcom/qpu/qpu_instr.c index 5c86dae..2a041cd 100644 --- a/src/broadcom/qpu/qpu_instr.c +++ b/src/broadcom/qpu/qpu_instr.c @@ -533,13 +533,20 @@ v3d_qpu_magic_waddr_is_sfu(enum v3d_qpu_waddr waddr) } bool -v3d_qpu_magic_waddr_is_tmu(enum v3d_qpu_waddr waddr) -{ - /* XXX: WADDR_TMU changed to UNIFA on 4.x */ - return ((waddr >= V3D_QPU_WADDR_TMU && - waddr <= V3D_QPU_WADDR_TMUAU) || - (waddr >= V3D_QPU_WADDR_TMUC && - waddr <= V3D_QPU_WADDR_TMUHSLOD)); +v3d_qpu_magic_waddr_is_tmu(const struct v3d_device_info *devinfo, + enum v3d_qpu_waddr waddr) +{ + if (devinfo->ver >= 40) { + return ((waddr >= V3D_QPU_WADDR_TMUD && + waddr <= V3D_QPU_WADDR_TMUAU) || + (waddr >= V3D_QPU_WADDR_TMUC && + waddr <= V3D_QPU_WADDR_TMUHSLOD)); + } else { + return ((waddr >= V3D_QPU_WADDR_TMU && + waddr <= V3D_QPU_WADDR_TMUAU) || + (waddr >= V3D_QPU_WADDR_TMUC && + waddr <= V3D_QPU_WADDR_TMUHSLOD)); + } } bool @@ -681,19 +688,21 @@ v3d_qpu_instr_is_sfu(const struct v3d_qpu_instr *inst) } bool -v3d_qpu_writes_tmu(const struct v3d_qpu_instr *inst) +v3d_qpu_writes_tmu(const struct v3d_device_info *devinfo, + const struct v3d_qpu_instr *inst) { return (inst->type == V3D_QPU_INSTR_TYPE_ALU && ((inst->alu.add.magic_write && - v3d_qpu_magic_waddr_is_tmu(inst->alu.add.waddr)) || + v3d_qpu_magic_waddr_is_tmu(devinfo, inst->alu.add.waddr)) || (inst->alu.mul.magic_write && - v3d_qpu_magic_waddr_is_tmu(inst->alu.mul.waddr)))); + v3d_qpu_magic_waddr_is_tmu(devinfo, inst->alu.mul.waddr)))); } bool -v3d_qpu_writes_tmu_not_tmuc(const struct v3d_qpu_instr *inst) +v3d_qpu_writes_tmu_not_tmuc(const struct v3d_device_info *devinfo, + const struct v3d_qpu_instr *inst) { - return v3d_qpu_writes_tmu(inst) && + return v3d_qpu_writes_tmu(devinfo, inst) && (!inst->alu.add.magic_write || inst->alu.add.waddr != V3D_QPU_WADDR_TMUC) && (!inst->alu.mul.magic_write || diff --git a/src/broadcom/qpu/qpu_instr.h b/src/broadcom/qpu/qpu_instr.h index c0374c1..86dde9d 100644 --- a/src/broadcom/qpu/qpu_instr.h +++ b/src/broadcom/qpu/qpu_instr.h @@ -442,7 +442,8 @@ v3d_qpu_instr_unpack(const struct v3d_device_info *devinfo, struct v3d_qpu_instr *instr); bool v3d_qpu_magic_waddr_is_sfu(enum v3d_qpu_waddr waddr) ATTRIBUTE_CONST; -bool v3d_qpu_magic_waddr_is_tmu(enum v3d_qpu_waddr waddr) ATTRIBUTE_CONST; +bool v3d_qpu_magic_waddr_is_tmu(const struct v3d_device_info *devinfo, + enum v3d_qpu_waddr waddr) ATTRIBUTE_CONST; bool v3d_qpu_magic_waddr_is_tlb(enum v3d_qpu_waddr waddr) ATTRIBUTE_CONST; bool v3d_qpu_magic_waddr_is_vpm(enum v3d_qpu_waddr waddr) ATTRIBUTE_CONST; bool v3d_qpu_magic_waddr_is_tsy(enum v3d_qpu_waddr waddr) ATTRIBUTE_CONST; @@ -450,8 +451,10 @@ bool v3d_qpu_magic_waddr_loads_unif(enum v3d_qpu_waddr waddr) ATTRIBUTE_CONST; bool v3d_qpu_uses_tlb(const struct v3d_qpu_instr *inst) ATTRIBUTE_CONST; bool v3d_qpu_instr_is_sfu(const struct v3d_qpu_instr *inst) ATTRIBUTE_CONST; bool v3d_qpu_uses_sfu(const struct v3d_qpu_instr *inst) ATTRIBUTE_CONST; -bool v3d_qpu_writes_tmu(const struct v3d_qpu_instr *inst) ATTRIBUTE_CONST; -bool v3d_qpu_writes_tmu_not_tmuc(const struct v3d_qpu_instr *inst) ATTRIBUTE_CONST; +bool v3d_qpu_writes_tmu(const struct v3d_device_info *devinfo, + const struct v3d_qpu_instr *inst) ATTRIBUTE_CONST; +bool v3d_qpu_writes_tmu_not_tmuc(const struct v3d_device_info *devinfo, + const struct v3d_qpu_instr *inst) ATTRIBUTE_CONST; bool v3d_qpu_writes_r3(const struct v3d_device_info *devinfo, const struct v3d_qpu_instr *instr) ATTRIBUTE_CONST; bool v3d_qpu_writes_r4(const struct v3d_device_info *devinfo, -- 2.7.4