From 51a263530f4de29473a277ebf3a77bf4cf83b327 Mon Sep 17 00:00:00 2001 From: Iago Toral Quiroga Date: Mon, 15 Mar 2021 13:39:50 +0100 Subject: [PATCH] broadcom/compiler: use nir_opt_load_store_vectorize MIME-Version: 1.0 Content-Type: text/plain; charset=utf8 Content-Transfer-Encoding: 8bit This will make it so we pack consecutive scalar operations into a vector operation, reducing the number of load/store operations in the NIR program. Our backend can handle vector load/stores, and doing so may be more efficient since we don't need to set up individual load/stores all the time. A pathological case is: dEQP-VK.spirv_assembly.instruction.compute.opcopymemory.array which goes from 862 instructions to only 573 by converting all scalar SSBO load/store operations to vec4 operations. total instructions in shared programs: 13752607 -> 13733627 (-0.14%) instructions in affected programs: 367117 -> 348137 (-5.17%) helped: 1168 HURT: 371 Instructions are helped. total threads in shared programs: 412230 -> 412272 (0.01%) threads in affected programs: 54 -> 96 (77.78%) helped: 23 HURT: 2 Threads are helped. total uniforms in shared programs: 3790248 -> 3784601 (-0.15%) uniforms in affected programs: 57417 -> 51770 (-9.84%) helped: 1420 HURT: 19 Uniforms are helped. total max-temps in shared programs: 2322170 -> 2322714 (0.02%) max-temps in affected programs: 14353 -> 14897 (3.79%) helped: 185 HURT: 306 Max-temps are HURT. total spills in shared programs: 5940 -> 6010 (1.18%) spills in affected programs: 65 -> 135 (107.69%) helped: 0 HURT: 11 total fills in shared programs: 13372 -> 13494 (0.91%) fills in affected programs: 75 -> 197 (162.67%) helped: 0 HURT: 11 total sfu-stalls in shared programs: 31505 -> 31521 (0.05%) sfu-stalls in affected programs: 751 -> 767 (2.13%) helped: 210 HURT: 246 Inconclusive result (value mean confidence interval includes 0). 
total inst-and-stalls in shared programs: 13784112 -> 13765148 (-0.14%) inst-and-stalls in affected programs: 360283 -> 341319 (-5.26%) helped: 1125 HURT: 366 Inst-and-stalls are helped. Reviewed-by: Alejandro Piñeiro Part-of: --- src/broadcom/compiler/nir_to_vir.c | 36 ++++++++++++++++++++++++++++++++++++ 1 file changed, 36 insertions(+) diff --git a/src/broadcom/compiler/nir_to_vir.c b/src/broadcom/compiler/nir_to_vir.c index 96bfd86..9133aab 100644 --- a/src/broadcom/compiler/nir_to_vir.c +++ b/src/broadcom/compiler/nir_to_vir.c @@ -1745,6 +1745,33 @@ emit_geom_end(struct v3d_compile *c) vir_VPMWT(c); } +static bool +mem_vectorize_callback(unsigned align_mul, unsigned align_offset, + unsigned bit_size, + unsigned num_components, + nir_intrinsic_instr *low, + nir_intrinsic_instr *high, + void *data) +{ + /* Our backend is 32-bit only at present */ + if (bit_size != 32) + return false; + + if (align_mul % 4 != 0 || align_offset % 4 != 0) + return false; + + /* Vector accesses wrap at 16-byte boundaries so we can't vectorize + * if the resulting vector crosses a 16-byte boundary. + */ + assert(util_is_power_of_two_nonzero(align_mul)); + align_mul = MIN2(align_mul, 16); + align_offset &= 0xf; + if (16 - align_mul + align_offset + num_components * 4 > 16) + return false; + + return true; +} + void v3d_optimize_nir(struct nir_shader *s) { @@ -1769,6 +1796,15 @@ v3d_optimize_nir(struct nir_shader *s) NIR_PASS(progress, s, nir_opt_algebraic); NIR_PASS(progress, s, nir_opt_constant_folding); + nir_load_store_vectorize_options vectorize_opts = { + .modes = nir_var_mem_ssbo | nir_var_mem_ubo | + nir_var_mem_push_const | nir_var_mem_shared | + nir_var_mem_global, + .callback = mem_vectorize_callback, + .robust_modes = 0, + }; + NIR_PASS(progress, s, nir_opt_load_store_vectorize, &vectorize_opts); + if (lower_flrp != 0) { bool lower_flrp_progress = false; -- 2.7.4