radeonsi: force the MSAA resolve shader to use 1 clause for MSAA loads
authorMarek Olšák <marek.olsak@amd.com>
Sat, 22 Oct 2022 00:30:41 +0000 (20:30 -0400)
committerMarge Bot <emma+marge@anholt.net>
Sat, 29 Oct 2022 18:38:33 +0000 (18:38 +0000)
LLVM can't keep consecutive loads in a clause.

Using the optimization barrier for sample indices produces the optimal code.
Deduced by trial and error.

Acked-by: Pierre-Eric Pelloux-Prayer <pierre-eric.pelloux-prayer@amd.com>
Reviewed-by: Mihai Preda <mhpreda@gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/19243>

src/gallium/drivers/radeonsi/si_shaderlib_nir.c

index 0c01f01..3192b73 100644 (file)
@@ -347,12 +347,26 @@ static nir_ssa_def *image_resolve_msaa(nir_builder *b, nir_variable *img, unsign
       nir_push_else(b, NULL);
    }
 
+   /* We need to hide the constant sample indices behind the optimization barrier, otherwise
+    * LLVM doesn't put loads into the same clause.
+    *
+    * TODO: nir_group_loads could do this.
+    */
+   nir_ssa_def *sample_index[16];
+   for (unsigned i = 0; i < num_samples; i++)
+      sample_index[i] = nir_optimization_barrier_vgpr_amd(b, 32, nir_imm_int(b, i));
+
+   /* Load all samples. */
+   nir_ssa_def *samples[16];
+   for (unsigned i = 0; i < num_samples; i++) {
+      samples[i] = nir_image_deref_load(b, 4, 32, deref_ssa(b, img),
+                                        coord, sample_index[i], zero);
+   }
+
    /* Average all samples. (the only options on gfx11) */
    result = NULL;
    for (unsigned i = 0; i < num_samples; i++) {
-      nir_ssa_def *sample = nir_image_deref_load(b, 4, 32, deref_ssa(b, img),
-                                                 coord, nir_imm_int(b, i), zero);
-      result = result ? nir_fadd(b, result, sample) : sample;
+      result = i ? nir_fadd(b, result, samples[i]) : samples[i];
    }
    result = nir_fmul_imm(b, result, 1.0 / num_samples); /* average the sum */