broadcom/compiler: improve thrsw merge
authorIago Toral Quiroga <itoral@igalia.com>
Mon, 13 Dec 2021 07:56:43 +0000 (08:56 +0100)
committerMarge Bot <emma+marge@anholt.net>
Tue, 14 Dec 2021 09:50:17 +0000 (09:50 +0000)
Instead of stopping the merge process when we find an instruction
with an incompatible signal (such as an small immediate), keep
going and see if we can merge the thrsw in a previous instruction
that is compatible.

total instructions in shared programs: 13409835 -> 13356648 (-0.40%)
instructions in affected programs: 3556860 -> 3503673 (-1.50%)
helped: 17457
HURT: 18
Instructions are helped.

total max-temps in shared programs: 2353971 -> 2352956 (-0.04%)
max-temps in affected programs: 13960 -> 12945 (-7.27%)
helped: 703
HURT: 0
Max-temps are helped.

total spills in shared programs: 12301 -> 12301 (0.00%)
total sfu-stalls in shared programs: 32596 -> 32499 (-0.30%)
sfu-stalls in affected programs: 225 -> 128 (-43.11%)
helped: 79
HURT: 3
Sfu-stalls are helped.

total nops in shared programs: 347204 -> 325234 (-6.33%)
nops in affected programs: 99834 -> 77864 (-22.01%)
helped: 11515
HURT: 158
Nops are helped.

Reviewed-by: Alejandro PiƱeiro <apinheiro@igalia.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/14172>

src/broadcom/compiler/qpu_schedule.c

index ff24016..112573d 100644 (file)
@@ -1763,26 +1763,57 @@ emit_thrsw(struct v3d_compile *c,
 
         /* Find how far back into previous instructions we can put the THRSW. */
         int slots_filled = 0;
+        int invalid_sig_count = 0;
+        bool last_thrsw_after_invalid_ok = false;
         struct qinst *merge_inst = NULL;
         vir_for_each_inst_rev(prev_inst, block) {
-                struct v3d_qpu_sig sig = prev_inst->qpu.sig;
-                sig.thrsw = true;
-                uint32_t packed_sig;
-
-                if (!v3d_qpu_sig_pack(c->devinfo, &sig, &packed_sig))
-                        break;
-
                 if (!valid_thrsw_sequence(c, scoreboard,
                                           prev_inst, slots_filled + 1,
                                           is_thrend)) {
                         break;
                 }
 
+                struct v3d_qpu_sig sig = prev_inst->qpu.sig;
+                sig.thrsw = true;
+                uint32_t packed_sig;
+                if (!v3d_qpu_sig_pack(c->devinfo, &sig, &packed_sig)) {
+                        /* If we can't merge the thrsw here because of signal
+                         * incompatibility, keep going, we might be able to
+                         * merge it in an earlier instruction.
+                         */
+                        invalid_sig_count++;
+                        goto cont_block;
+                }
+
+                /* For last thrsw we need 2 consecutive slots that are
+                 * thrsw compatible, so if we have previously jumped over
+                 * an incompatible signal, flag that we have found the first
+                 * valid slot here and keep going.
+                 */
+                if (inst->is_last_thrsw && invalid_sig_count > 0 &&
+                    !last_thrsw_after_invalid_ok) {
+                        last_thrsw_after_invalid_ok = true;
+                        invalid_sig_count++;
+                        goto cont_block;
+                }
+
+                last_thrsw_after_invalid_ok = false;
+                invalid_sig_count = 0;
                 merge_inst = prev_inst;
+
+cont_block:
                 if (++slots_filled == 3)
                         break;
         }
 
+        /* If we jumped over a signal incompatibility and did not manage to
+         * merge the thrsw in the end, we need to adjust slots filled to match
+         * the last valid merge point.
+         */
+        assert(invalid_sig_count == 0 || slots_filled >= invalid_sig_count);
+        if (invalid_sig_count > 0)
+                slots_filled -= invalid_sig_count;
+
         bool needs_free = false;
         if (merge_inst) {
                 merge_inst->qpu.sig.thrsw = true;