broadcom/compiler: try harder to merge thread switch earlier
author: Iago Toral Quiroga <itoral@igalia.com>
Wed, 26 Apr 2023 07:45:03 +0000 (09:45 +0200)
committer: Marge Bot <emma+marge@anholt.net>
Thu, 27 Apr 2023 08:43:29 +0000 (08:43 +0000)
We have been stopping as soon as we find a conflict, but that doesn't
mean we can't merge the thrsw in an earlier slot, so keep going. Going
by shader-db, this sometimes allows us to merge the final thrsw a bit
earlier and avoid emitting NOP instructions at the program end to
make up for its delay slots. I have not observed cases where this
helps with regular thrsw, but it doesn't hurt to try with those too.

total instructions in shared programs: 11526876 -> 11526354 (<.01%)
instructions in affected programs: 10760 -> 10238 (-4.85%)
helped: 236
HURT: 0
Instructions are helped.

total max-temps in shared programs: 2231705 -> 2231677 (<.01%)
max-temps in affected programs: 276 -> 248 (-10.14%)
helped: 27
HURT: 0
Max-temps are helped.

total inst-and-stalls in shared programs: 11545177 -> 11544655 (<.01%)
inst-and-stalls in affected programs: 10777 -> 10255 (-4.84%)
helped: 236
HURT: 0
Inst-and-stalls are helped.

total nops in shared programs: 321624 -> 321152 (-0.15%)
nops in affected programs: 751 -> 279 (-62.85%)
helped: 236
HURT: 0
Nops are helped.

Reviewed-by: Alejandro Piñeiro <apinheiro@igalia.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/22679>

src/broadcom/compiler/qpu_schedule.c

index 1b0d21463c4d721102dad2cb3238625a69c8b978..3b32b48f86f09668147b6a57136972944d96c792 100644 (file)
@@ -1766,12 +1766,6 @@ valid_thrsw_sequence(struct v3d_compile *c, struct choose_scoreboard *scoreboard
                      struct qinst *qinst, int instructions_in_sequence,
                      bool is_thrend)
 {
-        /* No emitting our thrsw while the previous thrsw hasn't happened yet. */
-        if (scoreboard->last_thrsw_tick + 3 >
-            scoreboard->tick - instructions_in_sequence) {
-                return false;
-        }
-
         for (int slot = 0; slot < instructions_in_sequence; slot++) {
                 if (!qpu_inst_before_thrsw_valid_in_delay_slot(c, qinst, slot))
                         return false;
@@ -1825,13 +1819,28 @@ emit_thrsw(struct v3d_compile *c,
         /* Find how far back into previous instructions we can put the THRSW. */
         int slots_filled = 0;
         int invalid_sig_count = 0;
+        int invalid_seq_count = 0;
         bool last_thrsw_after_invalid_ok = false;
         struct qinst *merge_inst = NULL;
         vir_for_each_inst_rev(prev_inst, block) {
+                /* No emitting our thrsw while the previous thrsw hasn't
+                 * happened yet.
+                 */
+                if (scoreboard->last_thrsw_tick + 3 >
+                    scoreboard->tick - (slots_filled + 1)) {
+                        break;
+                }
+
+
                 if (!valid_thrsw_sequence(c, scoreboard,
                                           prev_inst, slots_filled + 1,
                                           is_thrend)) {
-                        break;
+                        /* Even if the current sequence isn't valid, we may
+                         * be able to get a valid sequence by trying to move the
+                         * thrsw earlier, so keep going.
+                         */
+                        invalid_seq_count++;
+                        goto cont_block;
                 }
 
                 struct v3d_qpu_sig sig = prev_inst->qpu.sig;
@@ -1858,8 +1867,10 @@ emit_thrsw(struct v3d_compile *c,
                         goto cont_block;
                 }
 
+                /* We can merge the thrsw in this instruction */
                 last_thrsw_after_invalid_ok = false;
                 invalid_sig_count = 0;
+                invalid_seq_count = 0;
                 merge_inst = prev_inst;
 
 cont_block:
@@ -1871,9 +1882,12 @@ cont_block:
          * merge the thrsw in the end, we need to adjust slots filled to match
          * the last valid merge point.
          */
-        assert(invalid_sig_count == 0 || slots_filled >= invalid_sig_count);
+        assert((invalid_sig_count == 0 && invalid_seq_count == 0) ||
+                slots_filled >= invalid_sig_count + invalid_seq_count);
         if (invalid_sig_count > 0)
                 slots_filled -= invalid_sig_count;
+        if (invalid_seq_count > 0)
+                slots_filled -= invalid_seq_count;
 
         bool needs_free = false;
         if (merge_inst) {