From 778fc176b15b65e5814278f22fae1881a8118b82 Mon Sep 17 00:00:00 2001
From: Rhys Perry
Date: Fri, 12 Nov 2021 10:27:13 +0000
Subject: [PATCH] nir/opt_load_store_vectorize: create
 load_shared2_amd/store_shared2_amd
MIME-Version: 1.0
Content-Type: text/plain; charset=utf8
Content-Transfer-Encoding: 8bit

Signed-off-by: Rhys Perry
Reviewed-by: Timur Kristóf
Part-of:
---
 src/compiler/nir/nir.h                          |   1 +
 src/compiler/nir/nir_opt_load_store_vectorize.c | 120 +++++++++++++++++++++---
 2 files changed, 106 insertions(+), 15 deletions(-)

diff --git a/src/compiler/nir/nir.h b/src/compiler/nir/nir.h
index 16a4c16..45ca578 100644
--- a/src/compiler/nir/nir.h
+++ b/src/compiler/nir/nir.h
@@ -5426,6 +5426,7 @@ typedef struct {
    nir_variable_mode modes;
    nir_variable_mode robust_modes;
    void *cb_data;
+   bool has_shared2_amd;
 } nir_load_store_vectorize_options;
 
 bool nir_opt_load_store_vectorize(nir_shader *shader, const nir_load_store_vectorize_options *options);
diff --git a/src/compiler/nir/nir_opt_load_store_vectorize.c b/src/compiler/nir/nir_opt_load_store_vectorize.c
index b2e0e5b..81844b8 100644
--- a/src/compiler/nir/nir_opt_load_store_vectorize.c
+++ b/src/compiler/nir/nir_opt_load_store_vectorize.c
@@ -1104,9 +1104,7 @@ is_strided_vector(const struct glsl_type *type)
 }
 
 static bool
-try_vectorize(nir_function_impl *impl, struct vectorize_ctx *ctx,
-              struct entry *low, struct entry *high,
-              struct entry *first, struct entry *second)
+can_vectorize(struct vectorize_ctx *ctx, struct entry *first, struct entry *second)
 {
    if (!(get_variable_mode(first) & ctx->options->modes) ||
        !(get_variable_mode(second) & ctx->options->modes))
       return false;
@@ -1115,16 +1113,27 @@ try_vectorize(nir_function_impl *impl, struct vectorize_ctx *ctx,
    if (check_for_aliasing(ctx, first, second))
       return false;
 
-   uint64_t diff = high->offset_signed - low->offset_signed;
-   if (check_for_robustness(ctx, low, diff))
-      return false;
-
    /* we can only vectorize non-volatile loads/stores of the same type and with
    * the same access */
    if (first->info != second->info || first->access != second->access ||
       (first->access & ACCESS_VOLATILE) || first->info->is_atomic)
       return false;
 
+   return true;
+}
+
+static bool
+try_vectorize(nir_function_impl *impl, struct vectorize_ctx *ctx,
+              struct entry *low, struct entry *high,
+              struct entry *first, struct entry *second)
+{
+   if (!can_vectorize(ctx, first, second))
+      return false;
+
+   uint64_t diff = high->offset_signed - low->offset_signed;
+   if (check_for_robustness(ctx, low, diff))
+      return false;
+
    /* don't attempt to vectorize accesses of row-major matrix columns */
    if (first->deref) {
       const struct glsl_type *first_type = first->deref->type;
@@ -1176,6 +1185,76 @@ try_vectorize(nir_function_impl *impl, struct vectorize_ctx *ctx,
 }
 
 static bool
+try_vectorize_shared2(nir_function_impl *impl, struct vectorize_ctx *ctx,
+                      struct entry *low, struct entry *high,
+                      struct entry *first, struct entry *second)
+{
+   if (!can_vectorize(ctx, first, second) || first->deref)
+      return false;
+
+   unsigned low_bit_size = get_bit_size(low);
+   unsigned high_bit_size = get_bit_size(high);
+   unsigned low_size = low->intrin->num_components * low_bit_size / 8;
+   unsigned high_size = high->intrin->num_components * high_bit_size / 8;
+   if ((low_size != 4 && low_size != 8) || (high_size != 4 && high_size != 8))
+      return false;
+   if (low_size != high_size)
+      return false;
+   if (low->align_mul % low_size || low->align_offset % low_size)
+      return false;
+   if (high->align_mul % low_size || high->align_offset % low_size)
+      return false;
+
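+   /* The limits below match AMD's LDS ds_{read,write}2(st64)_b{32,64}
+    * instructions: offset0/offset1 are 8-bit immediates in units of the
+    * element size (4 or 8 bytes), and the st64 variants scale the offsets
+    * by a further factor of 64. */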
+   uint64_t diff = high->offset_signed - low->offset_signed;
+   bool st64 = diff % (64 * low_size) == 0;
+   unsigned stride = st64 ? 64 * low_size : low_size;
+   if (diff % stride || diff > 255 * stride)
+      return false;
+
+   /* try to avoid creating accesses we can't combine additions/offsets into */
+   if (high->offset > 255 * stride || (st64 && high->offset % stride))
+      return false;
+
+   if (first->is_store) {
+      if (nir_intrinsic_write_mask(low->intrin) != BITFIELD_MASK(low->intrin->num_components))
+         return false;
+      if (nir_intrinsic_write_mask(high->intrin) != BITFIELD_MASK(high->intrin->num_components))
+         return false;
+   }
+
+   /* vectorize the accesses */
+   nir_builder b;
+   nir_builder_init(&b, impl);
+
+   b.cursor = nir_after_instr(first->is_store ? second->instr : first->instr);
+
+   nir_ssa_def *offset = first->intrin->src[first->is_store].ssa;
+   offset = nir_iadd_imm(&b, offset, nir_intrinsic_base(first->intrin));
+   if (first != low)
+      offset = nir_iadd_imm(&b, offset, -(int)diff);
+
+   if (first->is_store) {
+      nir_ssa_def *low_val = low->intrin->src[low->info->value_src].ssa;
+      nir_ssa_def *high_val = high->intrin->src[high->info->value_src].ssa;
+      nir_ssa_def *val = nir_vec2(&b, nir_bitcast_vector(&b, low_val, low_size * 8u),
+                                  nir_bitcast_vector(&b, high_val, low_size * 8u));
+      nir_store_shared2_amd(&b, val, offset, .offset1=diff/stride, .st64=st64);
+   } else {
+      nir_ssa_def *new_def = nir_load_shared2_amd(&b, low_size * 8u, offset, .offset1=diff/stride,
+                                                  .st64=st64);
+      nir_ssa_def_rewrite_uses(&low->intrin->dest.ssa,
+                               nir_bitcast_vector(&b, nir_channel(&b, new_def, 0), low_bit_size));
+      nir_ssa_def_rewrite_uses(&high->intrin->dest.ssa,
+                               nir_bitcast_vector(&b, nir_channel(&b, new_def, 1), high_bit_size));
+   }
+
+   nir_instr_remove(first->instr);
+   nir_instr_remove(second->instr);
+
+   return true;
+}
+
+static bool
 update_align(struct entry *entry)
 {
    if (nir_intrinsic_has_align_mul(entry->intrin) &&
@@ -1204,17 +1283,28 @@ vectorize_sorted_entries(struct vectorize_ctx *ctx, nir_function_impl *impl,
       if (!high)
          continue;
 
-      uint64_t diff = high->offset_signed - low->offset_signed;
-      if (diff > get_bit_size(low) / 8u * low->intrin->num_components)
-         break;
-
       struct entry *first = low->index < high->index ? low : high;
       struct entry *second = low->index < high->index ? high : low;
 
-      if (try_vectorize(impl, ctx, low, high, first, second)) {
-         low = low->is_store ? second : first;
-         *util_dynarray_element(arr, struct entry *, second_idx) = NULL;
-         progress = true;
+      uint64_t diff = high->offset_signed - low->offset_signed;
+      bool separate = diff > get_bit_size(low) / 8u * low->intrin->num_components;
+      if (separate) {
+         if (!ctx->options->has_shared2_amd ||
+             get_variable_mode(first) != nir_var_mem_shared)
+            break;
+
+         if (try_vectorize_shared2(impl, ctx, low, high, first, second)) {
+            low = NULL;
+            *util_dynarray_element(arr, struct entry *, second_idx) = NULL;
+            progress = true;
+            break;
+         }
+      } else {
+         if (try_vectorize(impl, ctx, low, high, first, second)) {
+            low = low->is_store ? second : first;
+            *util_dynarray_element(arr, struct entry *, second_idx) = NULL;
+            progress = true;
+         }
       }
    }
 
-- 
2.7.4