zink: use lowered io (kinda) for i/o vars
Author: Mike Blumenkrantz <michael.blumenkrantz@gmail.com>
Thu, 27 Jul 2023 17:47:13 +0000 (13:47 -0400)
Committer: Marge Bot <emma+marge@anholt.net>
Tue, 15 Aug 2023 11:54:05 +0000 (11:54 +0000)
this runs io lowering on shader create, which is a huge pita
and waste of time since it requires then re-creating all the deref io later,
but it also makes the variables simpler by eliminating struct awfulness
(which was already eliminated by the split_blocks pass, but who's keeping
track) and will enable future use of some bizarro inter-stage linker thing

Reviewed-by: Dave Airlie <airlied@redhat.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/24634>

src/gallium/drivers/zink/zink_compiler.c

index 86460b4..3a3b81b 100644 (file)
@@ -1366,6 +1366,7 @@ zink_screen_init_compiler(struct zink_screen *screen)
       .lower_mul_2x32_64 = true,
       .support_16bit_alu = true, /* not quite what it sounds like */
       .max_unroll_iterations = 0,
+      .use_interpolated_input_intrinsics = true,
    };
 
    screen->nir_options = default_options;
@@ -1771,36 +1772,16 @@ find_var_with_location_frac(nir_shader *nir, unsigned location, unsigned locatio
 {
    assert((int)location >= 0);
 
-   unsigned found = 0;
-   if (!location_frac && location != VARYING_SLOT_PSIZ) {
-      nir_foreach_variable_with_modes(var, nir, mode) {
-         if (var->data.location == location)
-            found++;
-      }
-   }
-   if (found) {
-      /* multiple variables found for this location: find the biggest one */
-      nir_variable *out = NULL;
-      unsigned slots = 0;
-      nir_foreach_variable_with_modes(var, nir, mode) {
-         if (var->data.location == location) {
-            unsigned count_slots = glsl_count_vec4_slots(var->type, false, false);
-            if (count_slots > slots) {
-               slots = count_slots;
-               out = var;
-            }
-         }
-      }
-      return out;
-   } else {
-      /* only one variable found or this is location_frac */
-      nir_foreach_variable_with_modes(var, nir, mode) {
-         if (var->data.location == location &&
-             (var->data.location_frac == location_frac ||
-              (glsl_type_is_array(var->type) ? glsl_array_size(var->type) : glsl_get_vector_elements(var->type)) >= location_frac + 1)) {
-            if (location != VARYING_SLOT_PSIZ || !have_psiz || var->data.explicit_location)
-               return var;
-         }
+   nir_foreach_variable_with_modes(var, nir, mode) {
+      if (var->data.location == location && (location != VARYING_SLOT_PSIZ || !have_psiz || var->data.explicit_location)) {
+         unsigned num_components = glsl_get_vector_elements(var->type);
+         if (glsl_type_is_64bit(glsl_without_array(var->type)))
+            num_components *= 2;
+         if (var->data.location == VARYING_SLOT_CLIP_DIST0 || var->data.location == VARYING_SLOT_CULL_DIST0)
+            num_components = glsl_get_aoa_size(var->type);
+         if (var->data.location_frac <= location_frac &&
+               var->data.location_frac + num_components > location_frac)
+            return var;
       }
    }
    return NULL;
@@ -1898,6 +1879,38 @@ get_slot_components(nir_variable *var, unsigned slot, unsigned so_slot)
    return num_components;
 }
 
+static bool
+is_var_type_bindless(nir_variable *var)
+{
+   switch (glsl_get_base_type(glsl_without_array(var->type))) {
+   case GLSL_TYPE_SAMPLER:
+   case GLSL_TYPE_TEXTURE:
+   case GLSL_TYPE_IMAGE:
+      return true;
+   default:
+      break;
+   }
+   return false;
+}
+
+static unsigned
+get_var_slot_count(nir_shader *nir, nir_variable *var)
+{
+   assert(var->data.mode == nir_var_shader_in || var->data.mode == nir_var_shader_out);
+   const struct glsl_type *type = var->type;
+   if (nir_is_arrayed_io(var, nir->info.stage))
+      type = glsl_get_array_element(type);
+   unsigned slot_count = 0;
+   if (var->data.location >= VARYING_SLOT_VAR0)
+      slot_count = glsl_count_vec4_slots(type, false, is_var_type_bindless(var));
+   else if (glsl_type_is_array(type))
+      slot_count = DIV_ROUND_UP(glsl_get_aoa_size(type), 4);
+   else
+      slot_count = 1;
+   return slot_count;
+}
+
+
 static const struct pipe_stream_output *
 find_packed_output(const struct pipe_stream_output_info *so_info, uint8_t *reverse_map, unsigned slot)
 {
@@ -1939,46 +1952,59 @@ update_so_info(struct zink_shader *zs, nir_shader *nir, const struct pipe_stream
    nir_variable *psiz = NULL;
    for (unsigned i = 0; i < so_info->num_outputs; i++) {
       const struct pipe_stream_output *output = &so_info->output[i];
-      unsigned slot = reverse_map[output->register_index];
       /* always set stride to be used during draw */
       zs->sinfo.so_info.stride[output->output_buffer] = so_info->stride[output->output_buffer];
       if (zs->info.stage != MESA_SHADER_GEOMETRY || util_bitcount(zs->info.gs.active_stream_mask) == 1) {
-         nir_variable *var = NULL;
-         unsigned so_slot;
-         while (!var)
-            var = find_var_with_location_frac(nir, slot--, output->start_component, have_psiz, nir_var_shader_out);
-         if (var->data.location == VARYING_SLOT_PSIZ)
-            psiz = var;
-         so_slot = slot + 1;
-         slot = reverse_map[output->register_index];
-         if (var->data.explicit_xfb_buffer) {
-            /* handle dvec3 where gallium splits streamout over 2 registers */
-            for (unsigned j = 0; j < output->num_components; j++)
-               inlined[slot][output->start_component + j] = true;
-         }
-         if (is_inlined(inlined[slot], output))
-            continue;
-         bool is_struct = glsl_type_is_struct_or_ifc(glsl_without_array(var->type));
-         unsigned num_components = get_slot_components(var, slot, so_slot);
-         /* if this is the entire variable, try to blast it out during the initial declaration
-          * structs must be handled later to ensure accurate analysis
-          */
-         if (!is_struct && (num_components == output->num_components || (num_components > output->num_components && output->num_components == 4))) {
-            var->data.explicit_xfb_buffer = 1;
-            var->data.xfb.buffer = output->output_buffer;
-            var->data.xfb.stride = so_info->stride[output->output_buffer] * 4;
-            var->data.offset = output->dst_offset * 4;
-            var->data.stream = output->stream;
-            for (unsigned j = 0; j < output->num_components; j++)
-               inlined[slot][output->start_component + j] = true;
-         } else {
-            /* otherwise store some metadata for later */
-            packed |= BITFIELD64_BIT(slot);
-            packed_components[slot] += output->num_components;
-            packed_streams[slot] |= BITFIELD_BIT(output->stream);
-            packed_buffers[slot] |= BITFIELD_BIT(output->output_buffer);
-            for (unsigned j = 0; j < output->num_components; j++)
-               packed_offsets[output->register_index][j + output->start_component] = output->dst_offset + j;
+         for (unsigned c = 0; !is_inlined(inlined[reverse_map[output->register_index]], output) && c < output->num_components; c++) {
+            unsigned slot = reverse_map[output->register_index];
+            if (inlined[slot][output->start_component + c])
+               continue;
+            nir_variable *var = NULL;
+            while (!var && slot < VARYING_SLOT_TESS_MAX)
+               var = find_var_with_location_frac(nir, slot--, output->start_component + c, have_psiz, nir_var_shader_out);
+            slot = reverse_map[output->register_index];
+            unsigned slot_count = var ? get_var_slot_count(nir, var) : 0;
+            if (!var || var->data.location > slot || var->data.location + slot_count <= slot) {
+               /* if no variable is found for the xfb output, no output exists */
+               inlined[slot][c + output->start_component] = true;
+               continue;
+            }
+            if (var->data.location == VARYING_SLOT_PSIZ)
+               psiz = var;
+            if (var->data.explicit_xfb_buffer) {
+               /* handle dvec3 where gallium splits streamout over 2 registers */
+               for (unsigned j = 0; j < output->num_components; j++)
+                  inlined[slot][c + output->start_component + j] = true;
+            }
+            if (is_inlined(inlined[slot], output))
+               continue;
+            assert(!glsl_type_is_array(var->type) || var->data.location == VARYING_SLOT_CLIP_DIST0 || var->data.location == VARYING_SLOT_CULL_DIST0);
+            assert(!glsl_type_is_struct_or_ifc(var->type));
+            unsigned num_components = glsl_type_is_array(var->type) ? glsl_get_aoa_size(var->type) : glsl_get_vector_elements(var->type);
+            if (glsl_type_is_64bit(glsl_without_array(var->type)))
+               num_components *= 2;
+            /* if this is the entire variable, try to blast it out during the initial declaration
+            * structs must be handled later to ensure accurate analysis
+            */
+            if ((num_components == output->num_components ||
+                 num_components < output->num_components ||
+                 (num_components > output->num_components && output->num_components == 4))) {
+               var->data.explicit_xfb_buffer = 1;
+               var->data.xfb.buffer = output->output_buffer;
+               var->data.xfb.stride = so_info->stride[output->output_buffer] * 4;
+               var->data.offset = (c + output->dst_offset) * 4;
+               var->data.stream = output->stream;
+               for (unsigned j = 0; j < MIN2(num_components, output->num_components); j++)
+                  inlined[slot][c + output->start_component + j] = true;
+            } else {
+               /* otherwise store some metadata for later */
+               packed |= BITFIELD64_BIT(slot);
+               packed_components[slot] += output->num_components;
+               packed_streams[slot] |= BITFIELD_BIT(output->stream);
+               packed_buffers[slot] |= BITFIELD_BIT(output->output_buffer);
+               for (unsigned j = 0; j < output->num_components; j++)
+                  packed_offsets[output->register_index][j + output->start_component + c] = output->dst_offset + j;
+            }
          }
       }
    }
@@ -1996,6 +2022,10 @@ update_so_info(struct zink_shader *zs, nir_shader *nir, const struct pipe_stream
          nir_variable *var = NULL;
          while (!var)
             var = find_var_with_location_frac(nir, slot--, output->start_component, have_psiz, nir_var_shader_out);
+         slot = reverse_map[output->register_index];
+         unsigned slot_count = var ? get_var_slot_count(nir, var) : 0;
+         if (!var || var->data.location > slot || var->data.location + slot_count <= slot)
+            continue;
          /* this is a lowered 64bit variable that can't be exported due to packing */
          if (var->data.is_xfb)
             goto out;
@@ -2423,6 +2453,32 @@ remove_bo_access(nir_shader *shader, struct zink_shader *zs)
 }
 
 static bool
+filter_io_instr(nir_intrinsic_instr *intr, bool *is_load, bool *is_input, bool *is_interp)
+{
+   switch (intr->intrinsic) {
+   case nir_intrinsic_load_interpolated_input:
+      *is_interp = true;
+      FALLTHROUGH;
+   case nir_intrinsic_load_input:
+   case nir_intrinsic_load_per_vertex_input:
+      *is_input = true;
+      FALLTHROUGH;
+   case nir_intrinsic_load_output:
+   case nir_intrinsic_load_per_vertex_output:
+   case nir_intrinsic_load_per_primitive_output:
+      *is_load = true;
+      FALLTHROUGH;
+   case nir_intrinsic_store_output:
+   case nir_intrinsic_store_per_primitive_output:
+   case nir_intrinsic_store_per_vertex_output:
+      break;
+   default:
+      return false;
+   }
+   return true;
+}
+
+static bool
 find_var_deref(nir_shader *nir, nir_variable *var)
 {
    nir_foreach_function_impl(impl, nir) {
@@ -2439,6 +2495,47 @@ find_var_deref(nir_shader *nir, nir_variable *var)
    return false;
 }
 
+static bool
+find_var_io(nir_shader *nir, nir_variable *var)
+{
+   nir_foreach_function(function, nir) {
+      if (!function->impl)
+         continue;
+
+      nir_foreach_block(block, function->impl) {
+         nir_foreach_instr(instr, block) {
+            if (instr->type != nir_instr_type_intrinsic)
+               continue;
+            nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);
+            bool is_load = false;
+            bool is_input = false;
+            bool is_interp = false;
+            if (!filter_io_instr(intr, &is_load, &is_input, &is_interp))
+               continue;
+            if (var->data.mode == nir_var_shader_in && !is_input)
+               continue;
+            if (var->data.mode == nir_var_shader_out && is_input)
+               continue;
+            unsigned slot_offset = 0;
+            if (var->data.fb_fetch_output && !is_load)
+               continue;
+            if (nir->info.stage == MESA_SHADER_FRAGMENT && !is_load && !is_input && nir_intrinsic_io_semantics(intr).dual_source_blend_index != var->data.index)
+               continue;
+            nir_src *src_offset = nir_get_io_offset_src(intr);
+            if (src_offset && nir_src_is_const(*src_offset))
+               slot_offset = nir_src_as_uint(*src_offset);
+            unsigned slot_count = get_var_slot_count(nir, var);
+            if (var->data.mode & (nir_var_shader_out | nir_var_shader_in) &&
+                var->data.fb_fetch_output == nir_intrinsic_io_semantics(intr).fb_fetch_output &&
+                var->data.location <= nir_intrinsic_io_semantics(intr).location + slot_offset &&
+                var->data.location + slot_count > nir_intrinsic_io_semantics(intr).location + slot_offset)
+               return true;
+         }
+      }
+   }
+   return false;
+}
+
 struct clamp_layer_output_state {
    nir_variable *original;
    nir_variable *clamped;
@@ -2488,7 +2585,7 @@ clamp_layer_output(nir_shader *vs, nir_shader *fs, unsigned *next_location)
    }
    struct clamp_layer_output_state state = {0};
    state.original = nir_find_variable_with_location(vs, nir_var_shader_out, VARYING_SLOT_LAYER);
-   if (!state.original || !find_var_deref(vs, state.original))
+   if (!state.original || (!find_var_deref(vs, state.original) && !find_var_io(vs, state.original)))
       return false;
    state.clamped = nir_variable_create(vs, nir_var_shader_out, glsl_int_type(), "layer_clamped");
    state.clamped->data.location = VARYING_SLOT_LAYER;
@@ -2636,16 +2733,16 @@ rewrite_read_as_0(nir_builder *b, nir_instr *instr, void *data)
       return false;
 
    nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);
-   if (intr->intrinsic != nir_intrinsic_load_deref)
+   if (intr->intrinsic != nir_intrinsic_load_input)
       return false;
-   nir_variable *deref_var = nir_intrinsic_get_var(intr, 0);
-   if (deref_var != var)
+   unsigned location = nir_intrinsic_io_semantics(intr).location;
+   if (location != var->data.location)
       return false;
    b->cursor = nir_before_instr(instr);
    nir_def *zero = nir_imm_zero(b, intr->def.num_components,
                                 intr->def.bit_size);
    if (b->shader->info.stage == MESA_SHADER_FRAGMENT) {
-      switch (var->data.location) {
+      switch (location) {
       case VARYING_SLOT_COL0:
       case VARYING_SLOT_COL1:
       case VARYING_SLOT_BFC0:
@@ -2663,6 +2760,34 @@ rewrite_read_as_0(nir_builder *b, nir_instr *instr, void *data)
    return true;
 }
 
+
+
+static bool
+delete_psiz_store_instr(nir_builder *b, nir_instr *instr, void *data)
+{
+   if (instr->type != nir_instr_type_intrinsic)
+      return false;
+   nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);
+   switch (intr->intrinsic) {
+   case nir_intrinsic_store_output:
+   case nir_intrinsic_store_per_primitive_output:
+   case nir_intrinsic_store_per_vertex_output:
+      break;
+   default:
+      return false;
+   }
+   if (nir_intrinsic_io_semantics(intr).location != VARYING_SLOT_PSIZ)
+      return false;
+   nir_instr_remove(instr);
+   return true;
+}
+
+static bool
+delete_psiz_store(nir_shader *nir)
+{
+   return nir_shader_instructions_pass(nir, delete_psiz_store_instr, nir_metadata_dominance, NULL);
+}
+
 void
 zink_compiler_assign_io(struct zink_screen *screen, nir_shader *producer, nir_shader *consumer)
 {
@@ -2677,6 +2802,7 @@ zink_compiler_assign_io(struct zink_screen *screen, nir_shader *producer, nir_sh
       if (var && !var->data.explicit_location && !nir_find_variable_with_location(consumer, nir_var_shader_in, VARYING_SLOT_PSIZ)) {
          var->data.mode = nir_var_shader_temp;
          nir_fixup_deref_modes(producer);
+         delete_psiz_store(producer);
          NIR_PASS_V(producer, nir_remove_dead_variables, nir_var_shader_temp, NULL);
          optimize_nir(producer, NULL);
       }
@@ -3032,8 +3158,6 @@ lower_64bit_vars(nir_shader *shader, bool doubles_only)
    bool progress = false;
    struct hash_table *derefs = _mesa_hash_table_create(NULL, _mesa_hash_pointer, _mesa_key_pointer_equal);
    struct set *deletes = _mesa_set_create(NULL, _mesa_hash_pointer, _mesa_key_pointer_equal);
-   nir_foreach_variable_with_modes(var, shader, nir_var_shader_in | nir_var_shader_out)
-      progress |= lower_64bit_vars_loop(shader, var, derefs, deletes, doubles_only);
    nir_foreach_function_impl(impl, shader) {
       nir_foreach_function_temp_variable(var, impl) {
          if (!glsl_type_contains_64bit(var->type) || (doubles_only && !glsl_contains_double(var->type)))
@@ -3285,11 +3409,11 @@ static void
 prune_io(nir_shader *nir)
 {
    nir_foreach_shader_in_variable_safe(var, nir) {
-      if (!find_var_deref(nir, var))
+      if (!find_var_deref(nir, var) && !find_var_io(nir, var))
          var->data.mode = nir_var_shader_temp;
    }
    nir_foreach_shader_out_variable_safe(var, nir) {
-      if (!find_var_deref(nir, var))
+      if (!find_var_deref(nir, var) && !find_var_io(nir, var))
          var->data.mode = nir_var_shader_temp;
    }
    NIR_PASS_V(nir, nir_remove_dead_variables, nir_var_shader_temp, NULL);
@@ -3501,6 +3625,172 @@ invert_point_coord(nir_shader *nir)
    return nir_shader_instructions_pass(nir, invert_point_coord_instr, nir_metadata_dominance, NULL);
 }
 
+static bool
+add_derefs_instr(nir_builder *b, nir_instr *instr, void *data)
+{
+   if (instr->type != nir_instr_type_intrinsic)
+      return false;
+   nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);
+   bool is_load = false;
+   bool is_input = false;
+   bool is_interp = false;
+   if (!filter_io_instr(intr, &is_load, &is_input, &is_interp))
+      return false;
+   unsigned loc = nir_intrinsic_io_semantics(intr).location;
+   /* loop over all the variables and rewrite corresponding access */
+   nir_foreach_variable_with_modes(var, b->shader, is_input ? nir_var_shader_in : nir_var_shader_out) {
+      nir_src *src_offset = nir_get_io_offset_src(intr);
+      const unsigned slot_offset = src_offset && nir_src_is_const(*src_offset) ? nir_src_as_uint(*src_offset) : 0;
+      const struct glsl_type *type = var->type;
+      if (nir_is_arrayed_io(var, b->shader->info.stage))
+         type = glsl_get_array_element(type);
+      unsigned slot_count = get_var_slot_count(b->shader, var);
+      unsigned location = loc + slot_offset;
+      /* filter access that isn't specific to this variable */
+      if (var->data.location > location || var->data.location + slot_count <= location)
+         continue;
+      if (var->data.fb_fetch_output != nir_intrinsic_io_semantics(intr).fb_fetch_output)
+         continue;
+      if (b->shader->info.stage == MESA_SHADER_FRAGMENT && !is_load && nir_intrinsic_io_semantics(intr).dual_source_blend_index != var->data.index)
+         continue;
+      unsigned frac = nir_intrinsic_component(intr);
+      unsigned bit_size = is_load ? intr->def.bit_size : nir_src_bit_size(intr->src[0]);
+      /* set c aligned/rounded down to dword */
+      unsigned c = frac;
+      if (frac && bit_size < 32)
+         c = frac * bit_size / 32;
+
+      unsigned size = 0;
+      bool is_struct = glsl_type_is_struct(glsl_without_array(type));
+      if (is_struct)
+         size = get_slot_components(var, var->data.location + slot_offset, var->data.location);
+      else if ((var->data.mode == nir_var_shader_out && var->data.location < VARYING_SLOT_VAR0) ||
+          (var->data.mode == nir_var_shader_in && var->data.location < (b->shader->info.stage == MESA_SHADER_VERTEX ? VERT_ATTRIB_GENERIC0 : VARYING_SLOT_VAR0)))
+         size = glsl_type_is_array(type) ? glsl_get_aoa_size(type) : glsl_get_vector_elements(type);
+      else
+         size = glsl_get_vector_elements(glsl_without_array(type));
+      assert(size);
+      if (glsl_type_is_64bit(glsl_without_array(var->type)))
+         size *= 2;
+      if (var->data.location != location && size > 4 && size % 4 && !is_struct) {
+         /* adjust for dvec3-type slot overflow */
+         assert(location > var->data.location);
+         size -= (location - var->data.location) * 4;
+      }
+      assert(size);
+      if (var->data.location_frac + size <= c || var->data.location_frac > c)
+         continue;
+
+      b->cursor = nir_before_instr(instr);
+      nir_deref_instr *deref = nir_build_deref_var(b, var);
+      if (nir_is_arrayed_io(var, b->shader->info.stage)) {
+         assert(intr->intrinsic != nir_intrinsic_store_output);
+         deref = nir_build_deref_array(b, deref, intr->src[!is_load].ssa);
+      }
+      if (glsl_type_is_array(type)) {
+         /* unroll array derefs */
+         unsigned idx = frac - var->data.location_frac;
+         assert(src_offset);
+         if (var->data.location < VARYING_SLOT_VAR0) {
+            if (src_offset) {
+               /* clip/cull dist use different array offset semantics */
+               bool is_clipdist = (b->shader->info.stage != MESA_SHADER_VERTEX || var->data.mode == nir_var_shader_out) &&
+                                  var->data.location >= VARYING_SLOT_CLIP_DIST0 && var->data.location <= VARYING_SLOT_CULL_DIST1;
+               /* this is explicit for ease of debugging but could be collapsed at some point in the future*/
+               if (nir_src_is_const(*src_offset)) {
+                  unsigned offset = slot_offset;
+                  if (is_clipdist)
+                     offset *= 4;
+                  deref = nir_build_deref_array_imm(b, deref, offset + idx);
+               } else {
+                  nir_def *offset = src_offset->ssa;
+                  if (is_clipdist)
+                     nir_imul_imm(b, offset, 4);
+                  deref = nir_build_deref_array(b, deref, idx ? nir_iadd_imm(b, offset, idx) : src_offset->ssa);
+               }
+            } else {
+               deref = nir_build_deref_array_imm(b, deref, idx);
+            }
+            type = glsl_get_array_element(type);
+         } else {
+            /* need to convert possible N*M to [N][M] */
+            nir_def *nm = idx ? nir_iadd_imm(b, src_offset->ssa, idx) : src_offset->ssa;
+            while (glsl_type_is_array(type)) {
+               const struct glsl_type *elem = glsl_get_array_element(type);
+               unsigned type_size = glsl_count_vec4_slots(elem, false, false);
+               nir_def *n = glsl_type_is_array(elem) ? nir_udiv_imm(b, nm, type_size) : nm;
+               if (glsl_type_is_vector_or_scalar(elem) && glsl_type_is_64bit(elem) && glsl_get_vector_elements(elem) > 2)
+                  n = nir_udiv_imm(b, n, 2);
+               deref = nir_build_deref_array(b, deref, n);
+               nm = nir_umod_imm(b, nm, type_size);
+               type = glsl_get_array_element(type);
+            }
+         }
+      } else if (glsl_type_is_struct(type)) {
+         deref = nir_build_deref_struct(b, deref, slot_offset);
+      }
+      if (is_load) {
+         nir_def *load;
+         if (is_interp) {
+            nir_def *interp = intr->src[0].ssa;
+            nir_intrinsic_instr *interp_intr = nir_instr_as_intrinsic(interp->parent_instr);
+            assert(interp_intr);
+            switch (interp_intr->intrinsic) {
+            case nir_intrinsic_load_barycentric_centroid:
+               load = nir_interp_deref_at_centroid(b, intr->num_components, bit_size, &deref->def);
+               break;
+            case nir_intrinsic_load_barycentric_pixel:
+            case nir_intrinsic_load_barycentric_sample:
+               load = nir_load_deref(b, deref);
+               break;
+            case nir_intrinsic_load_barycentric_at_sample:
+               load = nir_interp_deref_at_sample(b, intr->num_components, bit_size, &deref->def, interp_intr->src[0].ssa);
+               break;
+            case nir_intrinsic_load_barycentric_at_offset:
+               load = nir_interp_deref_at_offset(b, intr->num_components, bit_size, &deref->def, interp_intr->src[0].ssa);
+               break;
+            default:
+               unreachable("unhandled interp!");
+            }
+         } else {
+            load = nir_load_deref(b, deref);
+         }
+         nir_def_rewrite_uses(&intr->def, load);
+      } else {
+         nir_def *store = intr->src[0].ssa;
+         assert(!glsl_type_is_array(type));
+         unsigned num_components = glsl_get_vector_elements(type);
+         /* pad/filter components to match deref type */
+         if (intr->num_components < num_components) {
+            nir_def *zero = nir_imm_zero(b, 1, bit_size);
+            nir_def *vec[4] = {zero, zero, zero, zero};
+            u_foreach_bit(i, nir_intrinsic_write_mask(intr))
+               vec[c - var->data.location_frac + i] = nir_channel(b, store, i);
+            store = nir_vec(b, vec, num_components);
+         } if (store->num_components > num_components) {
+            store = nir_channels(b, store, nir_intrinsic_write_mask(intr));
+         }
+         if (store->bit_size != glsl_get_bit_size(type)) {
+            /* this should be some weird bindless io conversion */
+            assert(store->bit_size == 64 && glsl_get_bit_size(type) == 32);
+            assert(num_components != store->num_components);
+            store = nir_unpack_64_2x32(b, store);
+         }
+         nir_store_deref(b, deref, store, BITFIELD_RANGE(c - var->data.location_frac, intr->num_components));
+      }
+      nir_instr_remove(instr);
+      return true;
+   }
+   unreachable("failed to find variable for explicit io!");
+   return true;
+}
+
+static bool
+add_derefs(nir_shader *nir)
+{
+   return nir_shader_instructions_pass(nir, add_derefs_instr, nir_metadata_dominance, NULL);
+}
+
 static struct zink_shader_object
 compile_module(struct zink_screen *screen, struct zink_shader *zs, nir_shader *nir, bool can_shobj, struct zink_program *pg)
 {
@@ -3535,9 +3825,10 @@ zink_shader_compile(struct zink_screen *screen, bool can_shobj, struct zink_shad
                     nir_shader *nir, const struct zink_shader_key *key, const void *extra_data, struct zink_program *pg)
 {
    struct zink_shader_info *sinfo = &zs->sinfo;
-   bool need_optimize = false;
+   bool need_optimize = true;
    bool inlined_uniforms = false;
 
+   NIR_PASS_V(nir, add_derefs);
    NIR_PASS_V(nir, nir_lower_fragcolor, nir->info.fs.color_is_dual_source ? 1 : 8);
    if (key) {
       if (key->inline_uniforms) {
@@ -3758,6 +4049,7 @@ zink_shader_compile_separate(struct zink_screen *screen, struct zink_shader *zs)
       default: break;
       }
    }
+   NIR_PASS_V(nir, add_derefs);
    NIR_PASS_V(nir, nir_lower_fragcolor, nir->info.fs.color_is_dual_source ? 1 : 8);
    if (screen->driconf.inline_uniforms) {
       NIR_PASS_V(nir, nir_lower_io_to_scalar, nir_var_mem_global | nir_var_mem_ubo | nir_var_mem_ssbo | nir_var_mem_shared, NULL, NULL);
@@ -4125,12 +4417,13 @@ lower_bindless_io_instr(nir_builder *b, nir_instr *in, void *data)
    if (in->type != nir_instr_type_intrinsic)
       return false;
    nir_intrinsic_instr *instr = nir_instr_as_intrinsic(in);
-   if (instr->intrinsic != nir_intrinsic_load_deref &&
-       instr->intrinsic != nir_intrinsic_store_deref)
+   bool is_load = false;
+   bool is_input = false;
+   bool is_interp = false;
+   if (!filter_io_instr(instr, &is_load, &is_input, &is_interp))
       return false;
 
-   nir_deref_instr *src_deref = nir_src_as_deref(instr->src[0]);
-   nir_variable *var = nir_deref_instr_get_variable(src_deref);
+   nir_variable *var = find_var_with_location_frac(b->shader, nir_intrinsic_io_semantics(instr).location, nir_intrinsic_component(instr), false, is_input ? nir_var_shader_in : nir_var_shader_out);
    if (var->data.bindless)
       return false;
    if (var->data.mode != nir_var_shader_in && var->data.mode != nir_var_shader_out)
@@ -4140,17 +4433,7 @@ lower_bindless_io_instr(nir_builder *b, nir_instr *in, void *data)
 
    var->type = glsl_int64_t_type();
    var->data.bindless = 1;
-   b->cursor = nir_before_instr(in);
-   nir_deref_instr *deref = nir_build_deref_var(b, var);
-   if (instr->intrinsic == nir_intrinsic_load_deref) {
-       nir_def *def = nir_load_deref(b, deref);
-       nir_instr_rewrite_src_ssa(in, &instr->src[0], def);
-       nir_def_rewrite_uses(&instr->def, def);
-   } else {
-      nir_store_deref(b, deref, instr->src[1].ssa, nir_intrinsic_write_mask(instr));
-   }
-   nir_instr_remove(in);
-   nir_instr_remove(&src_deref->instr);
+   nir_intrinsic_set_dest_type(instr, nir_type_int64);
    return true;
 }
 
@@ -4823,6 +5106,292 @@ zink_flat_flags(struct nir_shader *shader)
    return flat_flags;
 }
 
+/* record 'var' into the per-location/per-component lookup table 'vars':
+ * every (slot, component) cell the variable covers is pointed at the var,
+ * but cells already claimed by an earlier var are left alone so that
+ * partially-overlapping variables can coexist
+ */
+static void
+store_location_var(nir_variable *vars[VARYING_SLOT_TESS_MAX][4], nir_variable *var, nir_shader *nir)
+{
+   unsigned slot_count;
+   const struct glsl_type *type;
+   bool is_bindless = is_var_type_bindless(var);
+   if (nir_is_arrayed_io(var, nir->info.stage)) {
+      /* arrayed io (per-vertex tess/geom arrays): size by the element type */
+      type = glsl_get_array_element(var->type);
+      slot_count = glsl_count_vec4_slots(type, false, is_bindless);
+   } else {
+      type = glsl_without_array(var->type);
+      slot_count = glsl_count_vec4_slots(var->type, false, is_bindless);
+   }
+   unsigned num_components = glsl_get_vector_elements(glsl_without_array(type));
+   /* 64bit components each take two 32bit component slots */
+   if (glsl_type_is_64bit(glsl_without_array(var->type)))
+      num_components *= 2;
+   if (!num_components)
+      num_components = 4; //this is a struct
+   for (unsigned i = 0; i < slot_count; i++) {
+      for (unsigned j = 0; j < MIN2(num_components, 4); j++) {
+         /* allow partial overlap */
+         if (!vars[var->data.location + i][var->data.location_frac + j])
+            vars[var->data.location + i][var->data.location_frac + j] = var;
+      }
+      /* components beyond a vec4 spill over into the next slot */
+      if (num_components > 4)
+         num_components -= 4;
+   }
+}
+
+/* rebuild i/o variables for one mode after nir_lower_io has replaced deref
+ * i/o with load/store intrinsics: the old (possibly struct/array-typed) user
+ * varyings are demoted to shader_temp and deleted, then fresh simple
+ * vector-typed variables are created per location/component from the actual
+ * intrinsics seen in the shader; for outputs, components that exist in the
+ * old interface but are never written get a 1-component filler var so the
+ * cross-stage interface is preserved
+ */
+static void
+rework_io_vars(nir_shader *nir, nir_variable_mode mode)
+{
+   assert(mode == nir_var_shader_out || mode == nir_var_shader_in);
+   assert(util_bitcount(mode) == 1);
+   nir_variable *old_vars[VARYING_SLOT_TESS_MAX][4] = {{NULL}};
+   nir_variable *vars[VARYING_SLOT_TESS_MAX][4] = {{NULL}};
+   bool found = false;
+   /* store old vars */
+   nir_foreach_variable_with_modes_safe(var, nir, mode) {
+      /* builtins (below VAR0 / GENERIC0) are left untouched */
+      if ((mode == nir_var_shader_out && var->data.location < VARYING_SLOT_VAR0) ||
+          (mode == nir_var_shader_in && var->data.location < (nir->info.stage == MESA_SHADER_VERTEX ? VERT_ATTRIB_GENERIC0 : VARYING_SLOT_VAR0)))
+         continue;
+      /* account for vertex attr aliasing and bindless io */
+      if (nir->info.stage != MESA_SHADER_VERTEX || mode == nir_var_shader_out ||
+          (mode == nir_var_shader_in && nir->info.stage == MESA_SHADER_VERTEX && !old_vars[var->data.location][var->data.location_frac]) ||
+          is_var_type_bindless(var))
+         store_location_var(old_vars, var, nir);
+      /* skip interpolated inputs and bindless io */
+      if (is_var_type_bindless(var) ||
+          (mode == nir_var_shader_out && nir->info.stage == MESA_SHADER_FRAGMENT)) {
+         store_location_var(vars, var, nir);
+      } else {
+         /* demote so the var can be removed below and recreated from intrinsics */
+         var->data.mode = nir_var_shader_temp;
+         found = true;
+      }
+   }
+   if (!found)
+      return;
+   nir_fixup_deref_modes(nir);
+   NIR_PASS_V(nir, nir_remove_dead_variables, nir_var_shader_temp, NULL);
+   /* scan for vars using indirect array access */
+   BITSET_DECLARE(indirect_access, 128);
+   BITSET_ZERO(indirect_access);
+   nir_foreach_function_impl(impl, nir) {
+      nir_foreach_block(block, impl) {
+         nir_foreach_instr(instr, block) {
+            if (instr->type != nir_instr_type_intrinsic)
+               continue;
+            nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);
+            bool is_load = false;
+            bool is_input = false;
+            bool is_interp = false;
+            if (!filter_io_instr(intr, &is_load, &is_input, &is_interp))
+               continue;
+            nir_src *src_offset = nir_get_io_offset_src(intr);
+            if (!is_input && !src_offset)
+               continue;
+            if (mode == nir_var_shader_in && !is_input)
+               continue;
+            if (mode == nir_var_shader_out && is_input)
+               continue;
+            nir_io_semantics s = nir_intrinsic_io_semantics(intr);
+            if ((mode == nir_var_shader_out && s.location < VARYING_SLOT_VAR0) ||
+                (mode == nir_var_shader_in && s.location < (nir->info.stage == MESA_SHADER_VERTEX ? VERT_ATTRIB_GENERIC0 : VARYING_SLOT_VAR0)))
+               continue;
+            if (!nir_src_is_const(*src_offset))
+               BITSET_SET(indirect_access, s.location);
+         }
+      }
+   }
+   /* loop and create vars */
+   nir_foreach_function_impl(impl, nir) {
+      nir_foreach_block(block, impl) {
+         nir_foreach_instr(instr, block) {
+            if (instr->type != nir_instr_type_intrinsic)
+               continue;
+            nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);
+            bool is_load = false;
+            bool is_input = false;
+            bool is_interp = false;
+            if (!filter_io_instr(intr, &is_load, &is_input, &is_interp))
+               continue;
+            if (mode == nir_var_shader_in && !is_input)
+               continue;
+            if (mode == nir_var_shader_out && is_input)
+               continue;
+            nir_io_semantics s = nir_intrinsic_io_semantics(intr);
+            if ((mode == nir_var_shader_out && s.location < VARYING_SLOT_VAR0) ||
+                (mode == nir_var_shader_in && s.location < (nir->info.stage == MESA_SHADER_VERTEX ? VERT_ATTRIB_GENERIC0 : VARYING_SLOT_VAR0)))
+               continue;
+            /* fold constant offsets into the location so each slot gets its own var;
+             * indirectly-accessed locations instead keep the whole array in one var
+             */
+            unsigned slot_offset = 0;
+            bool is_indirect = BITSET_TEST(indirect_access, s.location);
+            nir_src *src_offset = nir_get_io_offset_src(intr);
+            if (src_offset && !is_indirect) {
+               assert(nir_src_is_const(*src_offset));
+               slot_offset = nir_src_as_uint(*src_offset);
+            }
+            unsigned location = s.location + slot_offset;
+            unsigned frac = nir_intrinsic_component(intr);
+            unsigned bit_size = is_load ? intr->def.bit_size : nir_src_bit_size(intr->src[0]);
+            /* set c aligned/rounded down to dword */
+            unsigned c = frac;
+            if (frac && bit_size < 32)
+               c = frac * bit_size / 32;
+            nir_alu_type type = is_load ? nir_intrinsic_dest_type(intr) : nir_intrinsic_src_type(intr);
+            nir_variable *old_var = old_vars[location][c];
+            assert(old_var);
+            if (is_var_type_bindless(old_var))
+               continue;
+            /* ensure dword is filled with like-sized components */
+            unsigned max_components = intr->num_components;
+            if (bit_size == 16)
+               max_components = align(max_components, 2);
+            else if (bit_size == 8)
+               max_components = align(max_components, 4);
+            if (c + (bit_size == 64 ? max_components * 2 : max_components) > 4)
+               c = 0;
+            const struct glsl_type *vec_type = glsl_vector_type(nir_get_glsl_base_type_for_nir_type(type), max_components);
+            /* reset the mode for nir_is_arrayed_io to work */
+            nir_variable_mode oldmode = old_var->data.mode;
+            old_var->data.mode = mode;
+            bool is_arrayed = nir_is_arrayed_io(old_var, nir->info.stage);
+            old_var->data.mode = oldmode;
+            if (is_indirect) {
+               /* indirect array access requires the full array in a single variable */
+               unsigned slot_count = 0;
+               if (is_arrayed)
+                  slot_count = glsl_count_vec4_slots(glsl_get_array_element(old_var->type), false, false);
+               else
+                  slot_count = glsl_count_vec4_slots(old_var->type, false, false);
+               if (bit_size == 64 && slot_count > 1)
+                  slot_count /= 2;
+               if (slot_count > 1)
+                  vec_type = glsl_array_type(vec_type, slot_count, glsl_get_explicit_stride(vec_type));
+            }
+            if (is_arrayed)
+               vec_type = glsl_array_type(vec_type, glsl_array_size(old_var->type), glsl_get_explicit_stride(old_var->type));
+            if (vars[location][c]) {
+               if (glsl_get_vector_elements(glsl_without_array(vars[location][c]->type)) < glsl_get_vector_elements(glsl_without_array(vec_type))) {
+                  /* enlarge existing vars if necessary */
+                  vars[location][c]->type = vec_type;
+                  store_location_var(vars, vars[location][c], nir);
+               }
+               continue;
+            }
+
+            /* NOTE(review): this assert is trivially true — the non-NULL case
+             * was handled by the 'continue' just above; kept for documentation
+             */
+            assert(!vars[location][c] ||
+                   (nir_get_nir_type_for_glsl_base_type(glsl_get_base_type(glsl_without_array(vars[location][c]->type))) == type &&
+                    glsl_get_vector_elements(glsl_without_array(vars[location][c]->type)) >= intr->num_components));
+            /* clone the old var so interpolation/precision/etc are carried over */
+            nir_variable *var = nir_variable_clone(old_var, nir);
+            var->data.mode = mode;
+            var->type = vec_type;
+            var->data.driver_location = nir_intrinsic_base(intr) + slot_offset;
+            var->data.location_frac = c;
+            var->data.location = location;
+            nir_shader_add_variable(nir, var);
+            store_location_var(vars, var, nir);
+         }
+      }
+   }
+   if (mode != nir_var_shader_out)
+      return;
+   /* scan for missing components which would break shader interfaces */
+   for (unsigned i = 0; i < VARYING_SLOT_TESS_MAX; i++) {
+      for (unsigned j = 0; j < 4; j++) {
+         if (!old_vars[i][j] || vars[i][j] || glsl_type_is_struct(glsl_without_array(old_vars[i][j]->type)))
+            continue;
+
+         /* pick any new var on this slot as a template and any old var as a type reference */
+         nir_variable *copy = NULL;
+         nir_variable *ref = NULL;
+         for (unsigned k = 0; k < 4; k++) {
+            if (!copy)
+               copy = vars[i][k];
+            if (!ref)
+               ref = old_vars[i][k];
+         }
+         assert(copy);
+         /* add a 1 component variable to fill the hole */
+         nir_variable *var = nir_variable_clone(copy, nir);
+         var->data.mode = mode;
+         const struct glsl_type *type = glsl_without_array_or_matrix(var->type);
+         if (glsl_type_is_vector_or_scalar(type))
+            var->type = glsl_vector_type(glsl_get_base_type(type), 1);
+         else
+            var->type = glsl_vector_type(GLSL_TYPE_FLOAT, 1);
+         var->data.location_frac = j;
+         assert(j % 2 == 0 || !glsl_type_is_64bit(glsl_without_array(var->type)));
+         nir_shader_add_variable(nir, var);
+         store_location_var(vars, var, nir);
+         /* write zero so it doesn't get pruned */
+         nir_builder b = nir_builder_at(nir_after_block(nir_impl_last_block(nir_shader_get_entrypoint(nir))));
+         nir_def *store = nir_imm_intN_t(&b, j == 3 ? 1 : 0, glsl_type_is_64bit(glsl_without_array(var->type)) ? 64 : 32);
+         if (nir_is_arrayed_io(copy, nir->info.stage)) {
+            var->type = glsl_array_type(var->type, glsl_array_size(ref->type), glsl_get_explicit_stride(ref->type));
+            nir_deref_instr *deref = nir_build_deref_var(&b, var);
+            deref = nir_build_deref_array(&b, deref, nir_load_invocation_id(&b));
+            nir_store_deref(&b, deref, store, 0x1);
+         } else {
+            nir_store_var(&b, var, store, 0x1);
+         }
+      }
+   }
+}
+
+
+/* instruction filter for nir_lower_io_to_scalar: returns true for output
+ * stores that must be scalarized, i.e. stores with a sparse writemask or
+ * stores whose component counts don't line up with the shader's xfb outputs
+ * ('data' is the nir_shader, passed through the filter callback)
+ */
+static bool
+eliminate_io_wrmasks_instr(const nir_instr *instr, const void *data)
+{
+   const nir_shader *nir = data;
+   if (instr->type != nir_instr_type_intrinsic)
+      return false;
+
+   nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);
+   switch (intr->intrinsic) {
+   case nir_intrinsic_store_output:
+   case nir_intrinsic_store_per_primitive_output:
+   case nir_intrinsic_store_per_vertex_output:
+      break;
+   default:
+      return false;
+   }
+   unsigned src_components = nir_intrinsic_src_components(intr, 0);
+   unsigned wrmask = nir_intrinsic_write_mask(intr);
+   unsigned num_components = util_bitcount(wrmask);
+   /* sparse writemask: must scalarize */
+   if (num_components != src_components)
+      return true;
+   /* 64bit components count double against the vec4 slot budget */
+   if ((nir_intrinsic_instr_src_type(intr, 0) & NIR_ALU_TYPE_SIZE_MASK) == 64)
+      num_components *= 2;
+   if (nir->xfb_info) {
+      nir_io_semantics s = nir_intrinsic_io_semantics(intr);
+      nir_src *src_offset = nir_get_io_offset_src(intr);
+      if (nir_src_is_const(*src_offset)) {
+         /* direct access: match xfb outputs against the exact slot written */
+         unsigned slot_offset = nir_src_as_uint(*src_offset);
+         for (unsigned i = 0; i < nir->xfb_info->output_count; i++) {
+            if (nir->xfb_info->outputs[i].location == s.location + slot_offset) {
+               unsigned xfb_components = util_bitcount(nir->xfb_info->outputs[i].component_mask);
+               if (xfb_components != MIN2(4, num_components))
+                  return true;
+               num_components -= xfb_components;
+               if (!num_components)
+                  break;
+            }
+         }
+      } else {
+         /* indirect access: check every slot the store may touch */
+         for (unsigned i = 0; i < nir->xfb_info->output_count; i++) {
+            if (nir->xfb_info->outputs[i].location >= s.location &&
+                nir->xfb_info->outputs[i].location < s.location + s.num_slots) {
+               unsigned xfb_components = util_bitcount(nir->xfb_info->outputs[i].component_mask);
+               if (xfb_components < MIN2(num_components, 4))
+                  return true;
+               num_components -= xfb_components;
+               if (!num_components)
+                  break;
+            }
+         }
+      }
+   }
+   return false;
+}
+
+/* glsl type-size callback for nir_lower_io: one unit per vec4 attribute slot;
+ * the 'bindless' parameter is unused — slots are always counted as non-bindless
+ */
+static int
+zink_type_size(const struct glsl_type *type, bool bindless)
+{
+   return glsl_count_attribute_slots(type, false);
+}
+
 struct zink_shader *
 zink_shader_create(struct zink_screen *screen, struct nir_shader *nir,
                    const struct pipe_stream_output_info *so_info)
@@ -4849,8 +5418,32 @@ zink_shader_create(struct zink_screen *screen, struct nir_shader *nir,
        nir->info.stage == MESA_SHADER_TESS_EVAL)
       indirect_derefs_modes |= nir_var_shader_in | nir_var_shader_out;
 
-   NIR_PASS_V(nir, nir_lower_indirect_derefs, indirect_derefs_modes,
-              UINT32_MAX);
+   NIR_PASS_V(nir, nir_lower_indirect_derefs, indirect_derefs_modes, UINT32_MAX);
+
+   nir_lower_io_options lower_io_flags = 0;
+   if (!screen->info.feats.features.shaderInt64 || !screen->info.feats.features.shaderFloat64)
+      lower_io_flags = nir_lower_io_lower_64bit_to_32;
+   else if (!screen->info.feats.features.shaderFloat64)
+      lower_io_flags = nir_lower_io_lower_64bit_float_to_32;
+   bool temp_inputs = nir->info.stage != MESA_SHADER_VERTEX && nir->info.inputs_read & BITFIELD_RANGE(VARYING_SLOT_CLIP_DIST0, 4);
+   bool temp_outputs = nir->info.stage != MESA_SHADER_FRAGMENT && (nir->info.outputs_read | nir->info.outputs_written) & BITFIELD_RANGE(VARYING_SLOT_CLIP_DIST0, 4);
+   if (temp_inputs || temp_outputs) {
+      NIR_PASS_V(nir, nir_lower_io_to_temporaries, nir_shader_get_entrypoint(nir), temp_outputs, temp_inputs);
+      NIR_PASS_V(nir, nir_lower_global_vars_to_local);
+      NIR_PASS_V(nir, nir_split_var_copies);
+      NIR_PASS_V(nir, nir_lower_var_copies);
+   }
+   NIR_PASS_V(nir, nir_lower_io, nir_var_shader_out, zink_type_size, lower_io_flags);
+   if (nir->info.stage == MESA_SHADER_VERTEX)
+      lower_io_flags |= nir_lower_io_lower_64bit_to_32;
+   NIR_PASS_V(nir, nir_lower_io, nir_var_shader_in, zink_type_size, lower_io_flags);
+   optimize_nir(nir, NULL);
+   nir_gather_xfb_info_from_intrinsics(nir);
+   NIR_PASS_V(nir, nir_lower_io_to_scalar, nir_var_shader_in | nir_var_shader_out, eliminate_io_wrmasks_instr, nir);
+   /* clean up io to improve direct access */
+   optimize_nir(nir, NULL);
+   rework_io_vars(nir, nir_var_shader_in);
+   rework_io_vars(nir, nir_var_shader_out);
 
    if (nir->info.stage < MESA_SHADER_COMPUTE)
       create_gfx_pushconst(nir);