From 9ff086052ab7bff3cb55c06365543190a3afe188 Mon Sep 17 00:00:00 2001
From: =?utf8?q?Marek=20Ol=C5=A1=C3=A1k?=
Date: Sun, 28 Nov 2021 04:55:47 -0500
Subject: [PATCH] radeonsi: unroll loops of up to 128 iterations

It's not exactly 128 because longer loop bodies scale the number down.

This improves perf for VP13/Creo and Piano. Most other tests either didn't
show any difference or are CPU-bound.

v2:
- The lowering passes had to be moved to the optimization loop because
  unrolling creates lowerable variables.
- Piano has some pattern that looks like corruption and the pattern
  changed with loop unrolling. The pattern is present on other drivers
  as well.

v3:
- I removed the Piano test from CI traces because the image is random.
  The output was wrong even before this MR, and now it's randomly wrong.

| PERCENTAGE DELTAS      | Shaders  |  SGPRs   |  VGPRs   |SpillSGPR |SpillVGPR | PrivVGPR | Scratch  | CodeSize | MaxWaves |
|------------------------|----------|----------|----------|----------|----------|----------|----------|----------|----------|
| alien_isolation        |      2936|    .     |    0.02 %|    .     |    .     |    .     |    .     |    0.83 %|    .     |
| deadcore               |        76|   18.47 %|    .     |    .     |    .     |    .     |    .     |  167.69 %|    .     |
| deus_ex_mankind_div..  |      1410|    0.10 %|    0.15 %|    .     |    .     |    .     |    .     |    1.70 %|    .     |
| f1-2015                |       775|    0.37 %|    0.16 %|    .     |    .     |    .     |    .     |    3.25 %|   -0.07 %|
| hitman                 |      1413|    0.10 %|   -0.03 %|    6.45 %|    .     |    .     |    .     |    0.61 %|    0.03 %|
| metro_2033_redux       |      2670|    .     |    .     |    .     |    .     |    .     |    .     |    0.13 %|    0.01 %|
| pixmark-piano-0.7.0    |         2|    .     |   14.29 %| -100.00 %|    .     |    .     |    .     |   78.07 %|   -4.76 %|
| reflections_subway     |        98|   -0.53 %|    .     |    .     |    .     |    .     |    .     |    7.64 %|    .     |
| thea                   |       172|    0.12 %|   -0.81 %|    .     |    .     |    .     |    .     |    0.65 %|    0.15 %|
| ubershaders            |        54|    .     |    .     |    .     |    .     |    .     |    .     |   61.13 %|    .     |
| ue4_effects_cave       |       290|    0.05 %|    .     |    .     |    .     |    .     |    .     |    2.62 %|    .     |
| vp13-creo              |        26|   -3.38 %|   -4.20 %|    .     |    .     |    .     |    .     |   88.56 %|    2.62 %|
| vp13-sw                |       100|   -0.36 %|   -9.14 %|    .     | -100.00 %|    .     | -100.00 %|  -17.97 %|    0.39 %|
| vp20-creo              |        22|   -0.82 %|   -3.33 %|    .     |    .     |    .     |    .     |   81.59 %|    1.51 %|
| vp20-sw                |       296|   -4.51 %|   -0.63 %|    .     |    .     |    .     |    .     |   58.93 %|    0.20 %|
|------------------------|----------|----------|----------|----------|----------|----------|----------|----------|----------|
| All affected           |       189|    3.05 %|   -2.87 %|  500.00 %| -100.00 %|    .     | -100.00 %|  135.61 %|    1.32 %|
|------------------------|----------|----------|----------|----------|----------|----------|----------|----------|----------|
| Total                  |     57794|    0.01 %|   -0.02 %|    0.27 %|   -3.13 %|    .     |   -2.89 %|    1.73 %|    .     |

Reviewed-by: Pierre-Eric Pelloux-Prayer (v1)
Part-of:
---
 src/gallium/drivers/radeonsi/ci/traces-radeonsi.yml | 4 ----
 src/gallium/drivers/radeonsi/si_get.c               | 2 +-
 src/gallium/drivers/radeonsi/si_shader_nir.c        | 8 ++++----
 3 files changed, 5 insertions(+), 9 deletions(-)

diff --git a/src/gallium/drivers/radeonsi/ci/traces-radeonsi.yml b/src/gallium/drivers/radeonsi/ci/traces-radeonsi.yml
index bd80fe7..dc025be 100644
--- a/src/gallium/drivers/radeonsi/ci/traces-radeonsi.yml
+++ b/src/gallium/drivers/radeonsi/ci/traces-radeonsi.yml
@@ -34,10 +34,6 @@ traces:
     expectations:
       - device: gl-radeonsi-stoney
         checksum: 84c499203944cdc59e70450c324bb8df
-  - path: gputest/pixmark-piano.trace
-    expectations:
-      - device: gl-radeonsi-stoney
-        checksum: a7317d54d452d19ce630c7f554f2279b
   - path: gputest/triangle.trace
     expectations:
       - device: gl-radeonsi-stoney
diff --git a/src/gallium/drivers/radeonsi/si_get.c b/src/gallium/drivers/radeonsi/si_get.c
index e4a32c3..b4064cd 100644
--- a/src/gallium/drivers/radeonsi/si_get.c
+++ b/src/gallium/drivers/radeonsi/si_get.c
@@ -1054,7 +1054,7 @@ void si_init_screen_get_functions(struct si_screen *sscreen)
       .has_dot_4x8 = sscreen->info.has_accelerated_dot_product,
       .has_dot_2x16 = sscreen->info.has_accelerated_dot_product,
       .optimize_sample_mask_in = true,
-      .max_unroll_iterations = 32,
+      .max_unroll_iterations = 128,
       .max_unroll_iterations_aggressive = 128,
       .use_interpolated_input_intrinsics = true,
       .lower_uniforms_to_ubo = true,
diff --git a/src/gallium/drivers/radeonsi/si_shader_nir.c b/src/gallium/drivers/radeonsi/si_shader_nir.c
index f51909c..a3e49d8 100644
--- a/src/gallium/drivers/radeonsi/si_shader_nir.c
+++ b/src/gallium/drivers/radeonsi/si_shader_nir.c
@@ -597,15 +597,15 @@ void si_nir_opts(struct si_screen *sscreen, struct nir_shader *nir, bool first)
 {
    bool progress;
 
-   NIR_PASS_V(nir, nir_lower_vars_to_ssa);
-   NIR_PASS_V(nir, nir_lower_alu_to_scalar, si_alu_to_scalar_filter, sscreen);
-   NIR_PASS_V(nir, nir_lower_phis_to_scalar, false);
-
    do {
       progress = false;
       bool lower_alu_to_scalar = false;
       bool lower_phis_to_scalar = false;
 
+      NIR_PASS(progress, nir, nir_lower_vars_to_ssa);
+      NIR_PASS(progress, nir, nir_lower_alu_to_scalar, si_alu_to_scalar_filter, sscreen);
+      NIR_PASS(progress, nir, nir_lower_phis_to_scalar, false);
+
       if (first) {
          NIR_PASS(progress, nir, nir_split_array_vars, nir_var_function_temp);
          NIR_PASS(lower_alu_to_scalar, nir, nir_shrink_vec_array_vars, nir_var_function_temp);
-- 
2.7.4