From 2630c8f546584948214ce846e99298f19d6f20d7 Mon Sep 17 00:00:00 2001 From: Iago Toral Quiroga Date: Mon, 13 Dec 2021 08:56:43 +0100 Subject: [PATCH] broadcom/compiler: improve thrsw merge MIME-Version: 1.0 Content-Type: text/plain; charset=utf8 Content-Transfer-Encoding: 8bit Instead of stopping the merge process when we find an instruction with an incompatible signal (such as a small immediate), keep going and see if we can merge the thrsw in a previous instruction that is compatible. total instructions in shared programs: 13409835 -> 13356648 (-0.40%) instructions in affected programs: 3556860 -> 3503673 (-1.50%) helped: 17457 HURT: 18 Instructions are helped. total max-temps in shared programs: 2353971 -> 2352956 (-0.04%) max-temps in affected programs: 13960 -> 12945 (-7.27%) helped: 703 HURT: 0 Max-temps are helped. total spills in shared programs: 12301 -> 12301 (0.00%) total sfu-stalls in shared programs: 32596 -> 32499 (-0.30%) sfu-stalls in affected programs: 225 -> 128 (-43.11%) helped: 79 HURT: 3 Sfu-stalls are helped. total nops in shared programs: 347204 -> 325234 (-6.33%) nops in affected programs: 99834 -> 77864 (-22.01%) helped: 11515 HURT: 158 Nops are helped. Reviewed-by: Alejandro Piñeiro Part-of: --- src/broadcom/compiler/qpu_schedule.c | 45 ++++++++++++++++++++++++++++++------ 1 file changed, 38 insertions(+), 7 deletions(-) diff --git a/src/broadcom/compiler/qpu_schedule.c b/src/broadcom/compiler/qpu_schedule.c index ff24016..112573d 100644 --- a/src/broadcom/compiler/qpu_schedule.c +++ b/src/broadcom/compiler/qpu_schedule.c @@ -1763,26 +1763,57 @@ emit_thrsw(struct v3d_compile *c, /* Find how far back into previous instructions we can put the THRSW. 
*/ int slots_filled = 0; + int invalid_sig_count = 0; + bool last_thrsw_after_invalid_ok = false; struct qinst *merge_inst = NULL; vir_for_each_inst_rev(prev_inst, block) { - struct v3d_qpu_sig sig = prev_inst->qpu.sig; - sig.thrsw = true; - uint32_t packed_sig; - - if (!v3d_qpu_sig_pack(c->devinfo, &sig, &packed_sig)) - break; - if (!valid_thrsw_sequence(c, scoreboard, prev_inst, slots_filled + 1, is_thrend)) { break; } + struct v3d_qpu_sig sig = prev_inst->qpu.sig; + sig.thrsw = true; + uint32_t packed_sig; + if (!v3d_qpu_sig_pack(c->devinfo, &sig, &packed_sig)) { + /* If we can't merge the thrsw here because of signal + * incompatibility, keep going, we might be able to + * merge it in an earlier instruction. + */ + invalid_sig_count++; + goto cont_block; + } + + /* For last thrsw we need 2 consecutive slots that are + * thrsw compatible, so if we have previously jumped over + * an incompatible signal, flag that we have found the first + * valid slot here and keep going. + */ + if (inst->is_last_thrsw && invalid_sig_count > 0 && + !last_thrsw_after_invalid_ok) { + last_thrsw_after_invalid_ok = true; + invalid_sig_count++; + goto cont_block; + } + + last_thrsw_after_invalid_ok = false; + invalid_sig_count = 0; merge_inst = prev_inst; + +cont_block: if (++slots_filled == 3) break; } + /* If we jumped over a signal incompatibility and did not manage to + * merge the thrsw in the end, we need to adjust slots filled to match + * the last valid merge point. + */ + assert(invalid_sig_count == 0 || slots_filled >= invalid_sig_count); + if (invalid_sig_count > 0) + slots_filled -= invalid_sig_count; + bool needs_free = false; if (merge_inst) { merge_inst->qpu.sig.thrsw = true; -- 2.7.4