From 778fc176b15b65e5814278f22fae1881a8118b82 Mon Sep 17 00:00:00 2001
From: Rhys Perry
Date: Fri, 12 Nov 2021 10:27:13 +0000
Subject: [PATCH] nir/opt_load_store_vectorize: create
 load_shared2_amd/store_shared2_amd
MIME-Version: 1.0
Content-Type: text/plain; charset=utf8
Content-Transfer-Encoding: 8bit

Signed-off-by: Rhys Perry
Reviewed-by: Timur Kristóf
Part-of:
---
 src/compiler/nir/nir.h                          |   1 +
 src/compiler/nir/nir_opt_load_store_vectorize.c | 120 +++++++++++++++++++++---
 2 files changed, 106 insertions(+), 15 deletions(-)

diff --git a/src/compiler/nir/nir.h b/src/compiler/nir/nir.h
index 16a4c16..45ca578 100644
--- a/src/compiler/nir/nir.h
+++ b/src/compiler/nir/nir.h
@@ -5426,6 +5426,7 @@ typedef struct {
    nir_variable_mode modes;
    nir_variable_mode robust_modes;
    void *cb_data;
+   bool has_shared2_amd;
 } nir_load_store_vectorize_options;
 
 bool nir_opt_load_store_vectorize(nir_shader *shader, const nir_load_store_vectorize_options *options);
diff --git a/src/compiler/nir/nir_opt_load_store_vectorize.c b/src/compiler/nir/nir_opt_load_store_vectorize.c
index b2e0e5b..81844b8 100644
--- a/src/compiler/nir/nir_opt_load_store_vectorize.c
+++ b/src/compiler/nir/nir_opt_load_store_vectorize.c
@@ -1104,9 +1104,7 @@ is_strided_vector(const struct glsl_type *type)
 }
 
 static bool
-try_vectorize(nir_function_impl *impl, struct vectorize_ctx *ctx,
-              struct entry *low, struct entry *high,
-              struct entry *first, struct entry *second)
+can_vectorize(struct vectorize_ctx *ctx, struct entry *first, struct entry *second)
 {
    if (!(get_variable_mode(first) & ctx->options->modes) ||
        !(get_variable_mode(second) & ctx->options->modes))
       return false;
@@ -1115,16 +1113,27 @@ try_vectorize(nir_function_impl *impl, struct vectorize_ctx *ctx,
    if (check_for_aliasing(ctx, first, second))
       return false;
 
-   uint64_t diff = high->offset_signed - low->offset_signed;
-   if (check_for_robustness(ctx, low, diff))
-      return false;
-
    /* we can only vectorize non-volatile loads/stores of the same type and with
    * the same access */
    if (first->info != second->info || first->access != second->access ||
       (first->access & ACCESS_VOLATILE) || first->info->is_atomic)
       return false;
 
+   return true;
+}
+
+static bool
+try_vectorize(nir_function_impl *impl, struct vectorize_ctx *ctx,
+              struct entry *low, struct entry *high,
+              struct entry *first, struct entry *second)
+{
+   if (!can_vectorize(ctx, first, second))
+      return false;
+
+   uint64_t diff = high->offset_signed - low->offset_signed;
+   if (check_for_robustness(ctx, low, diff))
+      return false;
+
    /* don't attempt to vectorize accesses of row-major matrix columns */
    if (first->deref) {
       const struct glsl_type *first_type = first->deref->type;
@@ -1176,6 +1185,76 @@ try_vectorize(nir_function_impl *impl, struct vectorize_ctx *ctx,
 }
 
 static bool
+try_vectorize_shared2(nir_function_impl *impl, struct vectorize_ctx *ctx,
+                      struct entry *low, struct entry *high,
+                      struct entry *first, struct entry *second)
+{
+   if (!can_vectorize(ctx, first, second) || first->deref)
+      return false;
+
+   unsigned low_bit_size = get_bit_size(low);
+   unsigned high_bit_size = get_bit_size(high);
+   unsigned low_size = low->intrin->num_components * low_bit_size / 8;
+   unsigned high_size = high->intrin->num_components * high_bit_size / 8;
+   if ((low_size != 4 && low_size != 8) || (high_size != 4 && high_size != 8))
+      return false;
+   if (low_size != high_size)
+      return false;
+   if (low->align_mul % low_size || low->align_offset % low_size)
+      return false;
+   if (high->align_mul % low_size || high->align_offset % low_size)
+      return false;
+
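+   /* The limits below match AMD's LDS ds_{read,write}2(st64)_b{32,64}
+    * instructions: offset0/offset1 are 8-bit immediates in units of the
+    * element size (4 or 8 bytes), and the st64 variants scale the offsets
+    * by a further factor of 64. */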
+   uint64_t diff = high->offset_signed - low->offset_signed;
+   bool st64 = diff % (64 * low_size) == 0;
+   unsigned stride = st64 ? 64 * low_size : low_size;
+   if (diff % stride || diff > 255 * stride)
+      return false;
+
+   /* try to avoid creating accesses we can't combine additions/offsets into */
+   if (high->offset > 255 * stride || (st64 && high->offset % stride))
+      return false;
+
+   if (first->is_store) {
+      if (nir_intrinsic_write_mask(low->intrin) != BITFIELD_MASK(low->intrin->num_components))
+         return false;
+      if (nir_intrinsic_write_mask(high->intrin) != BITFIELD_MASK(high->intrin->num_components))
+         return false;
+   }
+
+   /* vectorize the accesses */
+   nir_builder b;
+   nir_builder_init(&b, impl);
+
+   b.cursor = nir_after_instr(first->is_store ? second->instr : first->instr);
+
+   nir_ssa_def *offset = first->intrin->src[first->is_store].ssa;
+   offset = nir_iadd_imm(&b, offset, nir_intrinsic_base(first->intrin));
+   if (first != low)
+      offset = nir_iadd_imm(&b, offset, -(int)diff);
+
+   if (first->is_store) {
+      nir_ssa_def *low_val = low->intrin->src[low->info->value_src].ssa;
+      nir_ssa_def *high_val = high->intrin->src[high->info->value_src].ssa;
+      nir_ssa_def *val = nir_vec2(&b, nir_bitcast_vector(&b, low_val, low_size * 8u),
+                                  nir_bitcast_vector(&b, high_val, low_size * 8u));
+      nir_store_shared2_amd(&b, val, offset, .offset1=diff/stride, .st64=st64);
+   } else {
+      nir_ssa_def *new_def = nir_load_shared2_amd(&b, low_size * 8u, offset, .offset1=diff/stride,
+                                                  .st64=st64);
+      nir_ssa_def_rewrite_uses(&low->intrin->dest.ssa,
+                               nir_bitcast_vector(&b, nir_channel(&b, new_def, 0), low_bit_size));
+      nir_ssa_def_rewrite_uses(&high->intrin->dest.ssa,
+                               nir_bitcast_vector(&b, nir_channel(&b, new_def, 1), high_bit_size));
+   }
+
+   nir_instr_remove(first->instr);
+   nir_instr_remove(second->instr);
+
+   return true;
+}
+
+static bool
 update_align(struct entry *entry)
 {
    if (nir_intrinsic_has_align_mul(entry->intrin) &&
@@ -1204,17 +1283,28 @@ vectorize_sorted_entries(struct vectorize_ctx *ctx, nir_function_impl *impl,
       if (!high)
          continue;
 
-      uint64_t diff = high->offset_signed - low->offset_signed;
-      if (diff > get_bit_size(low) / 8u * low->intrin->num_components)
-         break;
-
       struct entry *first = low->index < high->index ? low : high;
       struct entry *second = low->index < high->index ? high : low;
 
-      if (try_vectorize(impl, ctx, low, high, first, second)) {
-         low = low->is_store ? second : first;
-         *util_dynarray_element(arr, struct entry *, second_idx) = NULL;
-         progress = true;
+      uint64_t diff = high->offset_signed - low->offset_signed;
+      bool separate = diff > get_bit_size(low) / 8u * low->intrin->num_components;
+      if (separate) {
+         if (!ctx->options->has_shared2_amd ||
+             get_variable_mode(first) != nir_var_mem_shared)
+            break;
+
+         if (try_vectorize_shared2(impl, ctx, low, high, first, second)) {
+            low = NULL;
+            *util_dynarray_element(arr, struct entry *, second_idx) = NULL;
+            progress = true;
+            break;
+         }
+      } else {
+         if (try_vectorize(impl, ctx, low, high, first, second)) {
+            low = low->is_store ? second : first;
+            *util_dynarray_element(arr, struct entry *, second_idx) = NULL;
+            progress = true;
+         }
       }
    }
 
-- 
2.7.4