broadcom/compiler: skip jumps in non-uniform if/then when block cost is small
authorIago Toral Quiroga <itoral@igalia.com>
Tue, 16 May 2023 09:34:51 +0000 (11:34 +0200)
committerMarge Bot <emma+marge@anholt.net>
Mon, 22 May 2023 09:23:41 +0000 (09:23 +0000)
We have an optimization for non-uniform if/else where if all channels meet the
jump condition we emit a branch to jump straight to the ELSE block. Similarly,
if at the end of the THEN block we don't have any channels that would execute
the ELSE block, we emit a branch to jump straight to the AFTER block.

This optimization has a cost though: we need to emit the condition for the
branch and a branch instruction (which also comes with 3 delay slots), so for
very small blocks (just a couple of ALU instructions, for example) emitting the
branch instruction is typically worse. Further, if the condition for the branch
is not met, we still pay the cost for no benefit at all.

Here is an example:

nop                           ; fmul.ifa rf26, 0x3e800000, rf54
xor.pushz -, rf52, 2          ; nop
bu.alla  32, r:unif (0x00000000 / 0.000000)
nop                           ; nop
nop                           ; nop
nop                           ; nop
xor.pushz -, rf52, 3          ; nop
nop                           ; mov.ifa rf52, 0
nop                           ; mov.pushz -, rf52
nop                           ; mov.ifa rf26, 0x3f800000

The bu instruction here is set up to jump over the following 4 instructions
(the last 4 instructions in there). To do this, we pay the price of the xor
to generate the condition, the bu instruction, and the 3 delay slots right
after it, so we end up paying 6 instructions to skip over 4, and we always pay
them, even if the branch is not taken and we still have to execute those
4 instructions. With this change, we produce:

nop                           ; fmul.ifa rf56, 0x3e800000, rf28
xor.pushz -, rf9, 3           ; nop
nop                           ; mov.ifa rf9, 0
nop                           ; mov.pushz -, rf9
nop                           ; mov.ifa rf56, 0x3f800000

Now we don't try to skip the small block, ever. At worst, if all channels
would have met the branch condition, we only pay the cost of the 4
instructions instead of 6; at best, if any channel wouldn't take the
branch, we save ourselves 5 cycles for the branch condition, the branch
instruction and its 3 delay slots.

Reviewed-by: Alejandro Piñeiro <apinheiro@igalia.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/23161>

src/broadcom/compiler/nir_to_vir.c

index 53973e5..ea18d87 100644 (file)
@@ -3819,6 +3819,25 @@ ntq_activate_execute_for_block(struct v3d_compile *c)
         vir_MOV_cond(c, V3D_QPU_COND_IFA, c->execute, vir_uniform_ui(c, 0));
 }
 
+static bool /* true when `block` is small enough that branching over it won't pay off */
+is_cheap_block(nir_block *block)
+{
+        int32_t cost = 3; /* budget: runs out on the 3rd counted instruction, so at most 2 are allowed */
+        nir_foreach_instr(instr, block) {
+                switch (instr->type) {
+                case nir_instr_type_alu:
+                case nir_instr_type_ssa_undef:
+                case nir_instr_type_load_const:
+                        if (--cost <= 0) /* each ALU/undef/load_const instruction uses one unit of budget */
+                                return false;
+                break;
+                default: /* any other instruction type makes the block not cheap */
+                        return false;
+                }
+        }
+        return true; /* also reached for an empty block */
+}
+
 static void
 ntq_emit_uniform_if(struct v3d_compile *c, nir_if *if_stmt)
 {
@@ -3963,12 +3982,16 @@ ntq_emit_nonuniform_if(struct v3d_compile *c, nir_if *if_stmt)
                      c->execute,
                      vir_uniform_ui(c, else_block->index));
 
-        /* Jump to ELSE if nothing is active for THEN, otherwise fall
-         * through.
+        /* Jump to ELSE if nothing is active for THEN (unless THEN block is
+         * so small it won't pay off), otherwise fall through.
          */
-        vir_set_pf(c, vir_MOV_dest(c, vir_nop_reg(), c->execute), V3D_QPU_PF_PUSHZ);
-        vir_BRANCH(c, V3D_QPU_BRANCH_COND_ALLNA);
-        vir_link_blocks(c->cur_block, else_block);
+        bool is_cheap = exec_list_is_singular(&if_stmt->then_list) &&
+                        is_cheap_block(nir_if_first_then_block(if_stmt));
+        if (!is_cheap) {
+                vir_set_pf(c, vir_MOV_dest(c, vir_nop_reg(), c->execute), V3D_QPU_PF_PUSHZ);
+                vir_BRANCH(c, V3D_QPU_BRANCH_COND_ALLNA);
+                vir_link_blocks(c->cur_block, else_block);
+        }
         vir_link_blocks(c->cur_block, then_block);
 
         /* Process the THEN block. */
@@ -3985,13 +4008,19 @@ ntq_emit_nonuniform_if(struct v3d_compile *c, nir_if *if_stmt)
                 vir_MOV_cond(c, V3D_QPU_COND_IFA, c->execute,
                              vir_uniform_ui(c, after_block->index));
 
-                /* If everything points at ENDIF, then jump there immediately. */
-                vir_set_pf(c, vir_XOR_dest(c, vir_nop_reg(),
-                                        c->execute,
-                                        vir_uniform_ui(c, after_block->index)),
-                           V3D_QPU_PF_PUSHZ);
-                vir_BRANCH(c, V3D_QPU_BRANCH_COND_ALLA);
-                vir_link_blocks(c->cur_block, after_block);
+                /* If everything points at ENDIF, then jump there immediately
+                 * (unless ELSE block is so small it won't pay off).
+                 */
+                bool is_cheap = exec_list_is_singular(&if_stmt->else_list) &&
+                                is_cheap_block(nir_else_block);
+                if (!is_cheap) {
+                        vir_set_pf(c, vir_XOR_dest(c, vir_nop_reg(),
+                                                   c->execute,
+                                                   vir_uniform_ui(c, after_block->index)),
+                                   V3D_QPU_PF_PUSHZ);
+                        vir_BRANCH(c, V3D_QPU_BRANCH_COND_ALLA);
+                        vir_link_blocks(c->cur_block, after_block);
+                }
                 vir_link_blocks(c->cur_block, else_block);
 
                 vir_set_emit_block(c, else_block);