From 74a40cc4b6ed9440a0820c6f4a9cee296a8e191a Mon Sep 17 00:00:00 2001
From: Lionel Landwerlin <lionel.g.landwerlin@intel.com>
Date: Thu, 24 Aug 2023 01:23:00 +0300
Subject: [PATCH] intel/fs: move lower of non-uniform at_sample barycentric to
 NIR

We use a non-uniform lowering loop in the backend which we can do
better in NIR because we can also use divergence analysis there.

This change also limits VGRF usage to a single VGRF to hold the sample
ID in the backend.

Signed-off-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com>
Reviewed-by: Emma Anholt <emma@anholt.net>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/24716>
---
 src/intel/compiler/brw_fs_nir.cpp                  | 67 ++++---------------
 src/intel/compiler/brw_nir.c                       |  4 ++
 src/intel/compiler/brw_nir.h                       |  2 +
 ...w_nir_lower_non_uniform_barycentric_at_sample.c | 78 ++++++++++++++++++++++
 src/intel/compiler/meson.build                     |  1 +
 5 files changed, 97 insertions(+), 55 deletions(-)
 create mode 100644 src/intel/compiler/brw_nir_lower_non_uniform_barycentric_at_sample.c

diff --git a/src/intel/compiler/brw_fs_nir.cpp b/src/intel/compiler/brw_fs_nir.cpp
index 3d3a6b6..058e184 100644
--- a/src/intel/compiler/brw_fs_nir.cpp
+++ b/src/intel/compiler/brw_fs_nir.cpp
@@ -3536,66 +3536,23 @@ fs_visitor::nir_emit_fs_intrinsic(const fs_builder &bld,
       const glsl_interp_mode interpolation =
          (enum glsl_interp_mode) nir_intrinsic_interp_mode(instr);
 
+      fs_reg msg_data;
       if (nir_src_is_const(instr->src[0])) {
-         unsigned msg_data = nir_src_as_uint(instr->src[0]) << 4;
-
-         emit_pixel_interpolater_send(bld,
-                                      FS_OPCODE_INTERPOLATE_AT_SAMPLE,
-                                      dest,
-                                      fs_reg(), /* src */
-                                      brw_imm_ud(msg_data),
-                                      interpolation);
+         msg_data = brw_imm_ud(nir_src_as_uint(instr->src[0]) << 4);
       } else {
          const fs_reg sample_src = retype(get_nir_src(instr->src[0]),
                                           BRW_REGISTER_TYPE_UD);
-
-         if (nir_src_is_always_uniform(instr->src[0])) {
-            const fs_reg sample_id = bld.emit_uniformize(sample_src);
-            const fs_reg msg_data = vgrf(glsl_type::uint_type);
-            bld.exec_all().group(1, 0)
-               .SHL(msg_data, sample_id, brw_imm_ud(4u));
-            emit_pixel_interpolater_send(bld,
-                                         FS_OPCODE_INTERPOLATE_AT_SAMPLE,
-                                         dest,
-                                         fs_reg(), /* src */
-                                         component(msg_data, 0),
-                                         interpolation);
-         } else {
-            /* Make a loop that sends a message to the pixel interpolater
-             * for the sample number in each live channel. If there are
-             * multiple channels with the same sample number then these
-             * will be handled simultaneously with a single iteration of
-             * the loop.
-             */
-            bld.emit(BRW_OPCODE_DO);
-
-            /* Get the next live sample number into sample_id_reg */
-            const fs_reg sample_id = bld.emit_uniformize(sample_src);
-
-            /* Set the flag register so that we can perform the send
-             * message on all channels that have the same sample number
-             */
-            bld.CMP(bld.null_reg_ud(),
-                    sample_src, sample_id,
-                    BRW_CONDITIONAL_EQ);
-            const fs_reg msg_data = vgrf(glsl_type::uint_type);
-            bld.exec_all().group(1, 0)
-               .SHL(msg_data, sample_id, brw_imm_ud(4u));
-            fs_inst *inst =
-               emit_pixel_interpolater_send(bld,
-                                            FS_OPCODE_INTERPOLATE_AT_SAMPLE,
-                                            dest,
-                                            fs_reg(), /* src */
-                                            component(msg_data, 0),
-                                            interpolation);
-            set_predicate(BRW_PREDICATE_NORMAL, inst);
-
-            /* Continue the loop if there are any live channels left */
-            set_predicate_inv(BRW_PREDICATE_NORMAL,
-                              true, /* inverse */
-                              bld.emit(BRW_OPCODE_WHILE));
-         }
+         const fs_reg sample_id = bld.emit_uniformize(sample_src);
+         msg_data = component(bld.group(8, 0).vgrf(BRW_REGISTER_TYPE_UD), 0);
+         bld.exec_all().group(1, 0).SHL(msg_data, sample_id, brw_imm_ud(4u));
       }
+
+      emit_pixel_interpolater_send(bld,
+                                   FS_OPCODE_INTERPOLATE_AT_SAMPLE,
+                                   dest,
+                                   fs_reg(), /* src */
+                                   msg_data,
+                                   interpolation);
       break;
    }
 
diff --git a/src/intel/compiler/brw_nir.c b/src/intel/compiler/brw_nir.c
index 7cbac9c..27a9d74 100644
--- a/src/intel/compiler/brw_nir.c
+++ b/src/intel/compiler/brw_nir.c
@@ -1685,6 +1685,10 @@ brw_postprocess_nir(nir_shader *nir, const struct brw_compiler *compiler,
          brw_nir_optimize(nir, compiler);
    }
 
+   /* Do this only after the last opt_gcm. GCM will undo this lowering. */
+   if (nir->info.stage == MESA_SHADER_FRAGMENT)
+      OPT(brw_nir_lower_non_uniform_barycentric_at_sample);
+
    /* Clean up LCSSA phis */
    OPT(nir_opt_remove_phis);
 
diff --git a/src/intel/compiler/brw_nir.h b/src/intel/compiler/brw_nir.h
index 8d0b786..505e147 100644
--- a/src/intel/compiler/brw_nir.h
+++ b/src/intel/compiler/brw_nir.h
@@ -229,6 +229,8 @@ bool brw_nir_limit_trig_input_range_workaround(nir_shader *nir);
 
 void brw_nir_apply_tcs_quads_workaround(nir_shader *nir);
 
+bool brw_nir_lower_non_uniform_barycentric_at_sample(nir_shader *nir);
+
 void brw_nir_apply_key(nir_shader *nir,
                        const struct brw_compiler *compiler,
                        const struct brw_base_prog_key *key,
diff --git a/src/intel/compiler/brw_nir_lower_non_uniform_barycentric_at_sample.c b/src/intel/compiler/brw_nir_lower_non_uniform_barycentric_at_sample.c
new file mode 100644
index 0000000..700d54f
--- /dev/null
+++ b/src/intel/compiler/brw_nir_lower_non_uniform_barycentric_at_sample.c
@@ -0,0 +1,78 @@
+/*
+ * Copyright © 2023 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+/*
+ * Lower non uniform at sample messages to the interpolator.
+ *
+ * This is pretty much identical to what nir_lower_non_uniform_access() does.
+ * We do it here because otherwise GCM would undo this optimization. Also we
+ * can assume divergence analysis here.
+ */
+
+#include "brw_nir.h"
+#include "compiler/nir/nir_builder.h"
+
+static bool
+brw_nir_lower_non_uniform_barycentric_at_sample_instr(nir_builder *b,
+                                                      nir_instr *instr,
+                                                      void *cb_data)
+{
+   if (instr->type != nir_instr_type_intrinsic)
+      return false;
+   /* Only load_barycentric_at_sample intrinsics are handled here. */
+   nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
+   if (intrin->intrinsic != nir_intrinsic_load_barycentric_at_sample)
+      return false;
+   /* A sample ID that is uniform (or not divergent) needs no loop. */
+   if (nir_src_is_always_uniform(intrin->src[0]) ||
+       !nir_src_is_divergent(intrin->src[0]))
+      return false;
+   /* Keep the divergent sample ID; src[0] is rewritten further down. */
+   nir_def *sample_id = intrin->src[0].ssa;
+   /* Detach the intrinsic; it is re-inserted inside the loop built below. */
+   b->cursor = nir_instr_remove(&intrin->instr);
+   /* Each iteration serves the invocations that share one sample ID. */
+   nir_push_loop(b);
+   /* Sample ID as read from the first invocation. */
+   nir_def *first_sample_id = nir_read_first_invocation(b, sample_id);
+   /* Invocations whose own sample ID matches it take the branch. */
+   nir_push_if(b, nir_ieq(b, sample_id, first_sample_id));
+   /* Inside the branch: put the original intrinsic back ... */
+   nir_builder_instr_insert(b, &intrin->instr);
+   /* ... now sourcing first_sample_id instead of the divergent value. */
+   nir_src_rewrite(&intrin->src[0], first_sample_id);
+   /* Invocations that matched are done and leave the loop. */
+   nir_jump(b, nir_jump_break);
+
+   return true;
+}
+
+bool
+brw_nir_lower_non_uniform_barycentric_at_sample(nir_shader *nir)
+{
+   return nir_shader_instructions_pass(
+      nir,
+      brw_nir_lower_non_uniform_barycentric_at_sample_instr,
+      nir_metadata_none, /* the callback inserts a loop and an if */
+      NULL); /* the callback does not use cb_data */
+}
diff --git a/src/intel/compiler/meson.build b/src/intel/compiler/meson.build
index 5d42d7f..6898bfa 100644
--- a/src/intel/compiler/meson.build
+++ b/src/intel/compiler/meson.build
@@ -92,6 +92,7 @@ libintel_compiler_files = files(
   'brw_nir_lower_alpha_to_coverage.c',
   'brw_nir_lower_intersection_shader.c',
   'brw_nir_lower_non_uniform_resource_intel.c',
+  'brw_nir_lower_non_uniform_barycentric_at_sample.c',
   'brw_nir_lower_ray_queries.c',
   'brw_nir_lower_rt_intrinsics.c',
   'brw_nir_lower_shader_calls.c',
-- 
2.7.4