From f783bd0d2a75a1244fdbb4a1754f072891e87af6 Mon Sep 17 00:00:00 2001 From: Iago Toral Quiroga Date: Mon, 7 Mar 2022 14:04:19 +0100 Subject: [PATCH] broadcom/compiler: define v3d-specific delays for NIR instructions MIME-Version: 1.0 Content-Type: text/plain; charset=utf8 Content-Transfer-Encoding: 8bit We make a few changes over NIR's defaults: 1. Lower delay for texture reads. Empirically, we don't observe any benefits with delays over 50, and since this delay value is still used by the scheduler in the "favor register pressure" case, it is beneficial to avoid overestimating it too much. 2. Adjust delay for non-filtered TMU reads to the delay selected for texture reads. 3. In our case, UBO reads from dynamically uniform addresses don't use the TMU and have a latency of 1 instruction in the best case scenario or 4 at worst, so we go with 1 so we don't try to move these reads early. This helps us get back some of what we lost when updating the default scheduler configuration to add a delay for non-filtered memory reads: total instructions in shared programs: 13126587 -> 12671765 (-3.46%) instructions in affected programs: 3764097 -> 3309275 (-12.08%) helped: 14664 HURT: 4244 total threads in shared programs: 407208 -> 415522 (2.04%) threads in affected programs: 8716 -> 17030 (95.39%) helped: 4224 HURT: 67 total uniforms in shared programs: 3812698 -> 3711224 (-2.66%) uniforms in affected programs: 335170 -> 233696 (-30.28%) helped: 2816 HURT: 3551 total max-temps in shared programs: 2318430 -> 2159345 (-6.86%) max-temps in affected programs: 539991 -> 380906 (-29.46%) helped: 13173 HURT: 1440 total spills in shared programs: 49086 -> 5966 (-87.85%) spills in affected programs: 48306 -> 5186 (-89.26%) helped: 1655 HURT: 28 total fills in shared programs: 55810 -> 9328 (-83.29%) fills in affected programs: 54821 -> 8339 (-84.79%) helped: 1659 HURT: 22 LOST: 0 GAINED: 3 Reviewed-by: Alejandro Piñeiro Part-of: --- src/broadcom/compiler/vir.c | 42 
++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 42 insertions(+) diff --git a/src/broadcom/compiler/vir.c b/src/broadcom/compiler/vir.c index d649989..48405d0 100644 --- a/src/broadcom/compiler/vir.c +++ b/src/broadcom/compiler/vir.c @@ -1119,6 +1119,45 @@ v3d_intrinsic_dependency_cb(nir_intrinsic_instr *intr, return false; } +static unsigned +v3d_instr_delay_cb(nir_instr *instr, void *data) +{ + switch (instr->type) { + case nir_instr_type_ssa_undef: + case nir_instr_type_load_const: + case nir_instr_type_alu: + case nir_instr_type_deref: + case nir_instr_type_jump: + case nir_instr_type_parallel_copy: + case nir_instr_type_call: + case nir_instr_type_phi: + return 1; + + case nir_instr_type_intrinsic: { + nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr); + switch (intr->intrinsic) { + case nir_intrinsic_load_ssbo: + case nir_intrinsic_load_scratch: + case nir_intrinsic_load_shared: + case nir_intrinsic_image_load: + return 30; + case nir_intrinsic_load_ubo: + if (nir_src_is_divergent(intr->src[1])) + return 30; + FALLTHROUGH; + default: + return 1; + } + break; + } + + case nir_instr_type_tex: + return 50; + } + + return 0; +} + static bool should_split_wrmask(const nir_instr *instr, const void *data) { @@ -1562,6 +1601,9 @@ v3d_attempt_compile(struct v3d_compile *c) .intrinsic_cb = v3d_intrinsic_dependency_cb, .intrinsic_cb_data = c, + + .instr_delay_cb = v3d_instr_delay_cb, + .instr_delay_cb_data = c, }; NIR_PASS_V(c->s, nir_schedule, &schedule_options); -- 2.7.4