From e401add741f33d113fe1496298e35ad00ce6a878 Mon Sep 17 00:00:00 2001 From: Iago Toral Quiroga Date: Tue, 16 May 2023 11:34:51 +0200 Subject: [PATCH] broadcom/compiler: skip jumps in non-uniform if/then when block cost is small MIME-Version: 1.0 Content-Type: text/plain; charset=utf8 Content-Transfer-Encoding: 8bit We have an optimization for non-uniform if/else where if all channels meet the jump condition we emit a branch to jump straight to the ELSE block. Similarly, if at the end of the THEN block we don't have any channels that would execute the ELSE block, we emit a branch to jump straight to the AFTER block. This optimization has a cost though: we need to emit the condition for the branch and a branch instruction (which also comes with 3 delay slots), so for very small blocks (just a couple of ALU instructions, for example) emitting the branch instruction is typically worse. Further, if the condition for the branch is not met, we still pay the cost for no benefit at all. Here is an example: nop ; fmul.ifa rf26, 0x3e800000, rf54 xor.pushz -, rf52, 2 ; nop bu.alla 32, r:unif (0x00000000 / 0.000000) nop ; nop nop ; nop nop ; nop xor.pushz -, rf52, 3 ; nop nop ; mov.ifa rf52, 0 nop ; mov.pushz -, rf52 nop ; mov.ifa rf26, 0x3f800000 The bu instruction here is set up to jump over the following 4 instructions (the last 4 instructions in there). To do this, we pay the price of the xor to generate the condition, the bu instruction, and the 3 delay slots right after it, so we end up paying 6 instructions to skip over 4, a cost we always pay, even if the branch is not taken and we still have to execute those 4 instructions. With this change, we produce: nop ; fmul.ifa rf56, 0x3e800000, rf28 xor.pushz -, rf9, 3 ; nop nop ; mov.ifa rf9, 0 nop ; mov.pushz -, rf9 nop ; mov.ifa rf56, 0x3f800000 Now we don't try to skip the small block, ever. 
At worst, if all channels would have met the branch condition, we only pay the cost of the 4 instructions instead of 6; at best, if any channel wouldn't take the branch, we save ourselves 5 cycles for the branch condition, the branch instruction and its 3 delay slots. Reviewed-by: Alejandro Piñeiro Part-of: --- src/broadcom/compiler/nir_to_vir.c | 53 +++++++++++++++++++++++++++++--------- 1 file changed, 41 insertions(+), 12 deletions(-) diff --git a/src/broadcom/compiler/nir_to_vir.c b/src/broadcom/compiler/nir_to_vir.c index 53973e5..ea18d87 100644 --- a/src/broadcom/compiler/nir_to_vir.c +++ b/src/broadcom/compiler/nir_to_vir.c @@ -3819,6 +3819,25 @@ ntq_activate_execute_for_block(struct v3d_compile *c) vir_MOV_cond(c, V3D_QPU_COND_IFA, c->execute, vir_uniform_ui(c, 0)); } +static bool +is_cheap_block(nir_block *block) +{ + int32_t cost = 3; + nir_foreach_instr(instr, block) { + switch (instr->type) { + case nir_instr_type_alu: + case nir_instr_type_ssa_undef: + case nir_instr_type_load_const: + if (--cost <= 0) + return false; + break; + default: + return false; + } + } + return true; +} + static void ntq_emit_uniform_if(struct v3d_compile *c, nir_if *if_stmt) { @@ -3963,12 +3982,16 @@ ntq_emit_nonuniform_if(struct v3d_compile *c, nir_if *if_stmt) c->execute, vir_uniform_ui(c, else_block->index)); - /* Jump to ELSE if nothing is active for THEN, otherwise fall - * through. + /* Jump to ELSE if nothing is active for THEN (unless THEN block is + * so small it won't pay off), otherwise fall through. 
*/ - vir_set_pf(c, vir_MOV_dest(c, vir_nop_reg(), c->execute), V3D_QPU_PF_PUSHZ); - vir_BRANCH(c, V3D_QPU_BRANCH_COND_ALLNA); - vir_link_blocks(c->cur_block, else_block); + bool is_cheap = exec_list_is_singular(&if_stmt->then_list) && + is_cheap_block(nir_if_first_then_block(if_stmt)); + if (!is_cheap) { + vir_set_pf(c, vir_MOV_dest(c, vir_nop_reg(), c->execute), V3D_QPU_PF_PUSHZ); + vir_BRANCH(c, V3D_QPU_BRANCH_COND_ALLNA); + vir_link_blocks(c->cur_block, else_block); + } vir_link_blocks(c->cur_block, then_block); /* Process the THEN block. */ @@ -3985,13 +4008,19 @@ ntq_emit_nonuniform_if(struct v3d_compile *c, nir_if *if_stmt) vir_MOV_cond(c, V3D_QPU_COND_IFA, c->execute, vir_uniform_ui(c, after_block->index)); - /* If everything points at ENDIF, then jump there immediately. */ - vir_set_pf(c, vir_XOR_dest(c, vir_nop_reg(), - c->execute, - vir_uniform_ui(c, after_block->index)), - V3D_QPU_PF_PUSHZ); - vir_BRANCH(c, V3D_QPU_BRANCH_COND_ALLA); - vir_link_blocks(c->cur_block, after_block); + /* If everything points at ENDIF, then jump there immediately + * (unless ELSE block is so small it won't pay off). + */ + bool is_cheap = exec_list_is_singular(&if_stmt->else_list) && + is_cheap_block(nir_else_block); + if (!is_cheap) { + vir_set_pf(c, vir_XOR_dest(c, vir_nop_reg(), + c->execute, + vir_uniform_ui(c, after_block->index)), + V3D_QPU_PF_PUSHZ); + vir_BRANCH(c, V3D_QPU_BRANCH_COND_ALLA); + vir_link_blocks(c->cur_block, after_block); + } vir_link_blocks(c->cur_block, else_block); vir_set_emit_block(c, else_block); -- 2.7.4