From 0f2491cbddf80a4c95df5d57ae438b130288f013 Mon Sep 17 00:00:00 2001
From: =?utf8?q?Marek=20Ol=C5=A1=C3=A1k?= <marek.olsak@amd.com>
Date: Mon, 13 Mar 2023 00:18:47 -0400
Subject: [PATCH] nir: add dual-slot input information into load_input
 intrinsics

This is necessary to allow optimizing VS inputs after nir_lower_io, which
is currently impossible because the loss of dual-slot information in NIR
would break VS inputs. With this, driver locations can be recomputed by
calling nir_recompute_io_bases. It's a prerequisite for optimizing varyings
with lowered IO.

When this is used, we will for the first time be able to eliminate unused
dual-slot VS inputs, as well as unused low or high halves of dual-slot VS
inputs. Inputs can become unused due to optimizations of varyings. Without
this, st/mesa binds vertex buffers for dual-slot inputs that are fully or
partially unused in the shader.

Reviewed-By: Mike Blumenkrantz <michael.blumenkrantz@gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/25394>
---
 src/compiler/nir/nir.h          | 21 ++++++++++++++++++-
 src/compiler/nir/nir_lower_io.c | 46 ++++++++++++++++++++++++++++++++++-------
 src/compiler/nir/nir_print.c    |  3 +++
 3 files changed, 61 insertions(+), 9 deletions(-)

diff --git a/src/compiler/nir/nir.h b/src/compiler/nir/nir.h
index 2d17343..6012d20 100644
--- a/src/compiler/nir/nir.h
+++ b/src/compiler/nir/nir.h
@@ -1847,6 +1847,7 @@ typedef struct nir_io_semantics {
    unsigned per_view : 1;
    unsigned high_16bits : 1; /* whether accessing low or high half of the slot */
    unsigned invariant : 1;   /* The variable has the invariant flag set */
+   unsigned high_dvec2 : 1; /* whether accessing the high half of dvec3/dvec4 */
    /* CLIP_DISTn, LAYER, VIEWPORT, and TESS_LEVEL_* have up to 3 uses:
     * - an output consumed by the next stage
     * - a system value output affecting fixed-func hardware, e.g. the clipper
@@ -1857,7 +1858,7 @@ typedef struct nir_io_semantics {
    unsigned no_varying : 1;       /* whether this output isn't consumed by the next stage */
    unsigned no_sysval_output : 1; /* whether this system value output has no
                                      effect due to current pipeline states */
-   unsigned _pad : 3;
+   unsigned _pad : 2;
 } nir_io_semantics;
 
 /* Transform feedback info for 2 outputs. nir_intrinsic_store_output contains
@@ -4997,13 +4998,31 @@ typedef enum {
    /* If set, this causes all 64-bit IO operations to be lowered on-the-fly
     * to 32-bit operations.  This is only valid for nir_var_shader_in/out
     * modes.
+    *
+    * Note that this destroys dual-slot information, i.e. whether an input
+    * occupies the low or high half of a dvec4. Instead, it adds an offset of
+    * 1 to the load (which is ambiguous) and expects driver locations of
+    * inputs to be final, which prevents any further optimizations.
+    *
+    * TODO: remove this in favor of nir_lower_io_lower_64bit_to_32_new.
     */
    nir_lower_io_lower_64bit_to_32 = (1 << 0),
+
    /* If set, this causes the subset of 64-bit IO operations involving floats to be lowered on-the-fly
     * to 32-bit operations.  This is only valid for nir_var_shader_in/out
     * modes.
     */
    nir_lower_io_lower_64bit_float_to_32 = (1 << 1),
+
+   /* This causes all 64-bit IO operations to be lowered to 32-bit operations.
+    * This is only valid for nir_var_shader_in/out modes.
+    *
+    * VS inputs only: dual-slot information is preserved as nir_io_semantics::
+    * high_dvec2 and gathered into shader_info::dual_slot_inputs, so that
+    * the shader can be arbitrarily optimized and the low or high half of
+    * dvec4 can be DCE'd independently without affecting the other half.
+    */
+   nir_lower_io_lower_64bit_to_32_new = (1 << 2),
 } nir_lower_io_options;
 bool nir_lower_io(nir_shader *shader,
                   nir_variable_mode modes,
diff --git a/src/compiler/nir/nir_lower_io.c b/src/compiler/nir/nir_lower_io.c
index 8ab6e51..107cb87 100644
--- a/src/compiler/nir/nir_lower_io.c
+++ b/src/compiler/nir/nir_lower_io.c
@@ -158,6 +158,16 @@ nir_is_arrayed_io(const nir_variable *var, gl_shader_stage stage)
    return false;
 }
 
+static bool
+uses_high_dvec2_semantic(struct lower_io_state *state, const nir_variable *var)
+{
+   /* Only dual-slot VS inputs lowered with the "_new" option qualify. */
+   const bool vs_input = var->data.mode == nir_var_shader_in &&
+      state->builder.shader->info.stage == MESA_SHADER_VERTEX;
+   return vs_input && (state->options & nir_lower_io_lower_64bit_to_32_new) &&
+          glsl_type_is_dual_slot(glsl_without_array(var->type));
+}
+
 static unsigned
 get_number_of_slots(struct lower_io_state *state,
                     const nir_variable *var)
@@ -181,7 +191,8 @@ get_number_of_slots(struct lower_io_state *state,
        !nir_is_arrayed_io(var, state->builder.shader->info.stage))
       return 1;
 
-   return state->type_size(type, var->data.bindless);
+   return state->type_size(type, var->data.bindless) /
+          (uses_high_dvec2_semantic(state, var) ? 2 : 1);
 }
 
 static nir_def *
@@ -251,7 +262,7 @@ static nir_def *
 emit_load(struct lower_io_state *state,
           nir_def *array_index, nir_variable *var, nir_def *offset,
           unsigned component, unsigned num_components, unsigned bit_size,
-          nir_alu_type dest_type)
+          nir_alu_type dest_type, bool high_dvec2)
 {
    nir_builder *b = &state->builder;
    const nir_shader *nir = b->shader;
@@ -324,6 +335,7 @@ emit_load(struct lower_io_state *state,
       semantics.medium_precision =
          var->data.precision == GLSL_PRECISION_MEDIUM ||
          var->data.precision == GLSL_PRECISION_LOW;
+      semantics.high_dvec2 = high_dvec2;
       nir_intrinsic_set_io_semantics(load, semantics);
    }
 
@@ -350,14 +362,23 @@ lower_load(nir_intrinsic_instr *intrin, struct lower_io_state *state,
 {
    const bool lower_double = !glsl_type_is_integer(type) && state->options & nir_lower_io_lower_64bit_float_to_32;
    if (intrin->def.bit_size == 64 &&
-       (lower_double || (state->options & nir_lower_io_lower_64bit_to_32))) {
+       (lower_double || (state->options & (nir_lower_io_lower_64bit_to_32_new |
+                                           nir_lower_io_lower_64bit_to_32)))) {
       nir_builder *b = &state->builder;
+      bool use_high_dvec2_semantic = uses_high_dvec2_semantic(state, var);
+
+      /* Each slot is a dual slot, so divide the offset within the variable
+       * by 2.
+       */
+      if (use_high_dvec2_semantic)
+         offset = nir_ushr_imm(b, offset, 1);
 
       const unsigned slot_size = state->type_size(glsl_dvec_type(2), false);
 
       nir_def *comp64[4];
       assert(component == 0 || component == 2);
       unsigned dest_comp = 0;
+      bool high_dvec2 = false;
       while (dest_comp < intrin->def.num_components) {
          const unsigned num_comps =
             MIN2(intrin->def.num_components - dest_comp,
@@ -365,7 +386,7 @@ lower_load(nir_intrinsic_instr *intrin, struct lower_io_state *state,
 
          nir_def *data32 =
             emit_load(state, array_index, var, offset, component,
-                      num_comps * 2, 32, nir_type_uint32);
+                      num_comps * 2, 32, nir_type_uint32, high_dvec2);
          for (unsigned i = 0; i < num_comps; i++) {
             comp64[dest_comp + i] =
                nir_pack_64_2x32(b, nir_channels(b, data32, 3 << (i * 2)));
@@ -374,7 +395,15 @@ lower_load(nir_intrinsic_instr *intrin, struct lower_io_state *state,
          /* Only the first store has a component offset */
          component = 0;
          dest_comp += num_comps;
-         offset = nir_iadd_imm(b, offset, slot_size);
+
+         if (use_high_dvec2_semantic) {
+            /* Increment the offset when we wrap around the dual slot. */
+            if (high_dvec2)
+               offset = nir_iadd_imm(b, offset, slot_size);
+            high_dvec2 = !high_dvec2;
+         } else {
+            offset = nir_iadd_imm(b, offset, slot_size);
+         }
       }
 
       return nir_vec(b, comp64, intrin->def.num_components);
@@ -384,12 +413,12 @@ lower_load(nir_intrinsic_instr *intrin, struct lower_io_state *state,
       return nir_b2b1(&state->builder,
                       emit_load(state, array_index, var, offset, component,
                                 intrin->def.num_components, 32,
-                                nir_type_bool32));
+                                nir_type_bool32, false));
    } else {
       return emit_load(state, array_index, var, offset, component,
                        intrin->def.num_components,
                        intrin->def.bit_size,
-                       nir_get_nir_type_for_glsl_type(type));
+                       nir_get_nir_type_for_glsl_type(type), false);
    }
 }
 
@@ -461,7 +490,8 @@ lower_store(nir_intrinsic_instr *intrin, struct lower_io_state *state,
 {
    const bool lower_double = !glsl_type_is_integer(type) && state->options & nir_lower_io_lower_64bit_float_to_32;
    if (intrin->src[1].ssa->bit_size == 64 &&
-       (lower_double || (state->options & nir_lower_io_lower_64bit_to_32))) {
+       (lower_double || (state->options & (nir_lower_io_lower_64bit_to_32 |
+                                           nir_lower_io_lower_64bit_to_32_new)))) {
       nir_builder *b = &state->builder;
 
       const unsigned slot_size = state->type_size(glsl_dvec_type(2), false);
diff --git a/src/compiler/nir/nir_print.c b/src/compiler/nir/nir_print.c
index e87e8b5..301b98a 100644
--- a/src/compiler/nir/nir_print.c
+++ b/src/compiler/nir/nir_print.c
@@ -1378,6 +1378,9 @@ print_intrinsic_instr(nir_intrinsic_instr *instr, print_state *state)
          if (io.high_16bits)
             fprintf(fp, " high_16bits");
 
+         if (io.high_dvec2)
+            fprintf(fp, " high_dvec2");
+
          if (io.no_varying)
             fprintf(fp, " no_varying");
 
-- 
2.7.4