From 429ef02f83e6516ec984caefba5046c939c6b8ee Mon Sep 17 00:00:00 2001
From: Lionel Landwerlin <lionel.g.landwerlin@intel.com>
Date: Sat, 8 Apr 2023 21:34:35 +0300
Subject: [PATCH] intel/fs: make tcs input_vertices dynamic

We need to do 3 things to accomplish this :

   1. make all the register access consider the maximal case when
      unknown at compile time

   2. move the clamping of load_per_vertex_input prior to lowering
      nir_intrinsic_load_patch_vertices_in (in the dynamic cases, the
      clamping will use the nir_intrinsic_load_patch_vertices_in to
      clamp), meaning clamping using derefs rather than lowered
      nir_intrinsic_load_per_vertex_input

   3. in the known cases, lower nir_intrinsic_load_patch_vertices_in
      in NIR (so that the clamped elements still be vectorized to the
      smallest number of URB read messages)

Signed-off-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com>
Reviewed-by: Emma Anholt <emma@anholt.net>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/22378>
---
 src/intel/compiler/brw_compiler.h                  | 10 +++
 src/intel/compiler/brw_fs_nir.cpp                  |  8 +--
 src/intel/compiler/brw_fs_thread_payload.cpp       |  4 +-
 src/intel/compiler/brw_nir.c                       | 10 +++
 src/intel/compiler/brw_nir.h                       |  8 ++-
 .../compiler/brw_nir_clamp_per_vertex_loads.c      | 75 +++++++++++++++++++---
 src/intel/compiler/brw_vec4_tcs.cpp                |  4 +-
 7 files changed, 97 insertions(+), 22 deletions(-)

diff --git a/src/intel/compiler/brw_compiler.h b/src/intel/compiler/brw_compiler.h
index f456acc..2981bd9 100644
--- a/src/intel/compiler/brw_compiler.h
+++ b/src/intel/compiler/brw_compiler.h
@@ -322,6 +322,7 @@ struct brw_tcs_prog_key
 
    enum tess_primitive_mode _tes_primitive_mode;
 
+   /** Number of input vertices, 0 means dynamic */
    unsigned input_vertices;
 
    /** A bitfield of per-patch outputs written. */
@@ -331,6 +332,15 @@ struct brw_tcs_prog_key
    uint32_t padding:24;
 };
 
+#define BRW_MAX_TCS_INPUT_VERTICES (32)
+
+static inline uint32_t
+brw_tcs_prog_key_input_vertices(const struct brw_tcs_prog_key *key)
+{
+   return key->input_vertices != 0 ?
+          key->input_vertices : BRW_MAX_TCS_INPUT_VERTICES;
+}
+
 /** The program key for Tessellation Evaluation Shaders. */
 struct brw_tes_prog_key
 {
diff --git a/src/intel/compiler/brw_fs_nir.cpp b/src/intel/compiler/brw_fs_nir.cpp
index fa98909..be24207 100644
--- a/src/intel/compiler/brw_fs_nir.cpp
+++ b/src/intel/compiler/brw_fs_nir.cpp
@@ -2665,7 +2665,8 @@ fs_visitor::get_tcs_multi_patch_icp_handle(const fs_builder &bld,
     * we might read up to nir->info.gs.vertices_in registers.
     */
    bld.emit(SHADER_OPCODE_MOV_INDIRECT, icp_handle, start,
-            icp_offset_bytes, brw_imm_ud(tcs_key->input_vertices * REG_SIZE));
+            icp_offset_bytes,
+            brw_imm_ud(brw_tcs_prog_key_input_vertices(tcs_key) * REG_SIZE));
 
    return icp_handle;
 }
@@ -2675,7 +2676,6 @@ fs_visitor::nir_emit_tcs_intrinsic(const fs_builder &bld,
                                    nir_intrinsic_instr *instr)
 {
    assert(stage == MESA_SHADER_TESS_CTRL);
-   struct brw_tcs_prog_key *tcs_key = (struct brw_tcs_prog_key *) key;
    struct brw_tcs_prog_data *tcs_prog_data = brw_tcs_prog_data(prog_data);
    struct brw_vue_prog_data *vue_prog_data = &tcs_prog_data->base;
 
@@ -2690,10 +2690,6 @@ fs_visitor::nir_emit_tcs_intrinsic(const fs_builder &bld,
    case nir_intrinsic_load_invocation_id:
       bld.MOV(retype(dst, invocation_id.type), invocation_id);
       break;
-   case nir_intrinsic_load_patch_vertices_in:
-      bld.MOV(retype(dst, BRW_REGISTER_TYPE_D),
-              brw_imm_d(tcs_key->input_vertices));
-      break;
 
    case nir_intrinsic_scoped_barrier:
       if (nir_intrinsic_memory_scope(instr) != NIR_SCOPE_NONE)
diff --git a/src/intel/compiler/brw_fs_thread_payload.cpp b/src/intel/compiler/brw_fs_thread_payload.cpp
index 8b78114..0901d10 100644
--- a/src/intel/compiler/brw_fs_thread_payload.cpp
+++ b/src/intel/compiler/brw_fs_thread_payload.cpp
@@ -48,7 +48,7 @@ tcs_thread_payload::tcs_thread_payload(const fs_visitor &v)
       num_regs = 5;
    } else {
       assert(vue_prog_data->dispatch_mode == DISPATCH_MODE_TCS_MULTI_PATCH);
-      assert(tcs_key->input_vertices > 0);
+      assert(tcs_key->input_vertices <= BRW_MAX_TCS_INPUT_VERTICES);
 
       patch_urb_output = brw_ud8_grf(1, 0);
 
@@ -59,7 +59,7 @@ tcs_thread_payload::tcs_thread_payload(const fs_visitor &v)
 
       /* ICP handles occupy the next 1-32 registers. */
       icp_handle_start = brw_ud8_grf(r, 0);
-      r += tcs_key->input_vertices;
+      r += brw_tcs_prog_key_input_vertices(tcs_key);
 
       num_regs = r;
    }
diff --git a/src/intel/compiler/brw_nir.c b/src/intel/compiler/brw_nir.c
index 31168b4..436a088 100644
--- a/src/intel/compiler/brw_nir.c
+++ b/src/intel/compiler/brw_nir.c
@@ -1049,6 +1049,14 @@ brw_preprocess_nir(const struct brw_compiler *compiler, nir_shader *nir,
        nir_var_mem_ubo | nir_var_mem_ssbo,
        nir_lower_direct_array_deref_of_vec_load);
 
+   /* Clamp load_per_vertex_input of the TCS stage so that we do not generate
+    * loads reading out of bounds. We can do this here because we called
+    * nir_lower_system_values above.
+    */
+   if (nir->info.stage == MESA_SHADER_TESS_CTRL &&
+       compiler->use_tcs_multi_patch)
+      OPT(brw_nir_clamp_per_vertex_loads);
+
    /* Get rid of split copies */
    brw_nir_optimize(nir, compiler);
 }
@@ -1985,6 +1993,8 @@ nir_shader *
 brw_nir_create_passthrough_tcs(void *mem_ctx, const struct brw_compiler *compiler,
                                const struct brw_tcs_prog_key *key)
 {
+   assert(key->input_vertices > 0);
+
    const nir_shader_compiler_options *options =
       compiler->nir_options[MESA_SHADER_TESS_CTRL];
 
diff --git a/src/intel/compiler/brw_nir.h b/src/intel/compiler/brw_nir.h
index 6700453..e19c3c9 100644
--- a/src/intel/compiler/brw_nir.h
+++ b/src/intel/compiler/brw_nir.h
@@ -98,6 +98,9 @@ struct brw_nir_compiler_opts {
 
    /* Whether robust image access is enabled */
    bool robust_image_access;
+
+   /* Input vertices for TCS stage (0 means dynamic) */
+   unsigned input_vertices;
 };
 
 void brw_preprocess_nir(const struct brw_compiler *compiler,
@@ -191,8 +194,9 @@ bool brw_nir_opt_peephole_ffma(nir_shader *shader);
 
 bool brw_nir_opt_peephole_imul32x16(nir_shader *shader);
 
-bool brw_nir_clamp_per_vertex_loads(nir_shader *shader,
-                                    unsigned input_vertices);
+bool brw_nir_clamp_per_vertex_loads(nir_shader *shader);
+
+bool brw_nir_lower_patch_vertices_in(nir_shader *shader, unsigned input_vertices);
 
 bool brw_nir_blockify_uniform_loads(nir_shader *shader,
                                     const struct intel_device_info *devinfo);
diff --git a/src/intel/compiler/brw_nir_clamp_per_vertex_loads.c b/src/intel/compiler/brw_nir_clamp_per_vertex_loads.c
index 599de48..afabe3f 100644
--- a/src/intel/compiler/brw_nir_clamp_per_vertex_loads.c
+++ b/src/intel/compiler/brw_nir_clamp_per_vertex_loads.c
@@ -24,38 +24,93 @@
 /*
  * Limit input per vertex input accesses. This is useful for the tesselation stages.
  * On Gfx12.5+ out of bound accesses generate hangs.
+ *
+ * This pass operates on derefs, it must be called before shader inputs are
+ * lowered.
  */
 
 #include "brw_nir.h"
 #include "compiler/nir/nir_builder.h"
+#include "compiler/nir/nir_deref.h"
 
 static bool
-lower_instr(nir_builder *b, nir_instr *instr, void *cb_data)
+clamp_per_vertex_loads_instr(nir_builder *b, nir_instr *instr, void *cb_data)
 {
-   unsigned *input_vertices = cb_data;
+   if (instr->type != nir_instr_type_intrinsic)
+      return false;
+
+   nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
+   if (intrin->intrinsic != nir_intrinsic_load_deref)
+      return false;
+
+   nir_deref_instr *deref = nir_instr_as_deref(intrin->src[0].ssa->parent_instr);
+   nir_variable *var = nir_deref_instr_get_variable(deref);
+   if (var == NULL || (var->data.mode & nir_var_shader_in) == 0)
+      return false;
+
+   nir_deref_path path;
+   nir_deref_path_init(&path, deref, cb_data);
+
+   bool progress = false;
+   for (unsigned i = 0; path.path[i]; i++) {
+      if (path.path[i]->deref_type != nir_deref_type_array)
+         continue;
+
+      b->cursor = nir_before_instr(&path.path[i]->instr);
+
+      nir_instr_rewrite_src_ssa(&path.path[i]->instr,
+                                &path.path[i]->arr.index,
+                                nir_umin(b,
+                                         path.path[i]->arr.index.ssa,
+                                         nir_iadd_imm(b, nir_load_patch_vertices_in(b), -1)));
+
+      progress = true;
+      break;
+   }
 
+   nir_deref_path_finish(&path);
+
+   return progress;
+}
+
+bool
+brw_nir_clamp_per_vertex_loads(nir_shader *shader)
+{
+   void *mem_ctx = ralloc_context(NULL);
+
+   bool ret = nir_shader_instructions_pass(shader, clamp_per_vertex_loads_instr,
+                                           nir_metadata_block_index |
+                                           nir_metadata_dominance,
+                                           mem_ctx);
+
+   ralloc_free(mem_ctx);
+
+   return ret;
+}
+
+static bool
+lower_patch_vertices_instr(nir_builder *b, nir_instr *instr, void *cb_data)
+{
    if (instr->type != nir_instr_type_intrinsic)
       return false;
 
    nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
-   if (intrin->intrinsic != nir_intrinsic_load_per_vertex_input)
+   if (intrin->intrinsic != nir_intrinsic_load_patch_vertices_in)
       return false;
 
+   unsigned *input_vertices = cb_data;
+
    b->cursor = nir_before_instr(instr);
 
-   nir_instr_rewrite_src_ssa(instr,
-                             &intrin->src[0],
-                             nir_imin(b, intrin->src[0].ssa,
-                                         nir_imm_int(b, *input_vertices - 1)));
+   nir_ssa_def_rewrite_uses(&intrin->dest.ssa, nir_imm_int(b, *input_vertices));
 
    return true;
 }
 
 bool
-brw_nir_clamp_per_vertex_loads(nir_shader *shader,
-                               unsigned input_vertices)
+brw_nir_lower_patch_vertices_in(nir_shader *shader, unsigned input_vertices)
 {
-   return nir_shader_instructions_pass(shader, lower_instr,
+   return nir_shader_instructions_pass(shader, lower_patch_vertices_instr,
                                        nir_metadata_block_index |
                                        nir_metadata_dominance,
                                        &input_vertices);
diff --git a/src/intel/compiler/brw_vec4_tcs.cpp b/src/intel/compiler/brw_vec4_tcs.cpp
index 8d60eee..9e9c147 100644
--- a/src/intel/compiler/brw_vec4_tcs.cpp
+++ b/src/intel/compiler/brw_vec4_tcs.cpp
@@ -385,8 +385,8 @@ brw_compile_tcs(const struct brw_compiler *compiler,
                              key->_tes_primitive_mode);
    if (key->quads_workaround)
       brw_nir_apply_tcs_quads_workaround(nir);
-   if (compiler->use_tcs_multi_patch)
-      brw_nir_clamp_per_vertex_loads(nir, key->input_vertices);
+   if (key->input_vertices > 0)
+      brw_nir_lower_patch_vertices_in(nir, key->input_vertices);
 
    brw_postprocess_nir(nir, compiler, debug_enabled,
                        key->base.robust_buffer_access);
-- 
2.7.4