From 765d9feb468d4047cb30ba35acef235659a6e991 Mon Sep 17 00:00:00 2001
From: Iago Toral Quiroga
Date: Tue, 18 Jan 2022 10:59:42 +0100
Subject: [PATCH] broadcom/compiler: add lowering pass to scalarize non-32-bit
 general load/store
MIME-Version: 1.0
Content-Type: text/plain; charset=utf8
Content-Transfer-Encoding: 8bit

V3D hardware doesn't support non-32-bit vector access for general TMU
load/store operations, like the ones we use for UBO and SSBO, so we need
to split these into scalar operations.

It should be noted that we also have a vectorization pass (which runs
later, during optimization) that may reconstruct some of these into
32-bit operations when possible (i.e. when the resulting operation is
32-bit aligned).

Reviewed-by: Alejandro Piñeiro
Part-of:
---
 src/broadcom/compiler/meson.build                |   1 +
 src/broadcom/compiler/v3d_compiler.h             |   1 +
 .../compiler/v3d_nir_lower_load_store_bitsize.c  | 251 +++++++++++++++++++++
 src/broadcom/compiler/vir.c                      |   2 +
 4 files changed, 255 insertions(+)
 create mode 100644 src/broadcom/compiler/v3d_nir_lower_load_store_bitsize.c

diff --git a/src/broadcom/compiler/meson.build b/src/broadcom/compiler/meson.build
index 9515614..311da12 100644
--- a/src/broadcom/compiler/meson.build
+++ b/src/broadcom/compiler/meson.build
@@ -39,6 +39,7 @@ libbroadcom_compiler_files = files(
   'v3d_nir_lower_io.c',
   'v3d_nir_lower_image_load_store.c',
   'v3d_nir_lower_line_smooth.c',
+  'v3d_nir_lower_load_store_bitsize.c',
   'v3d_nir_lower_logic_ops.c',
   'v3d_nir_lower_robust_buffer_access.c',
   'v3d_nir_lower_scratch.c',
diff --git a/src/broadcom/compiler/v3d_compiler.h b/src/broadcom/compiler/v3d_compiler.h
index 0d4f54a..3cf6fdc 100644
--- a/src/broadcom/compiler/v3d_compiler.h
+++ b/src/broadcom/compiler/v3d_compiler.h
@@ -1109,6 +1109,7 @@ void v3d_nir_lower_robust_buffer_access(nir_shader *shader, struct v3d_compile *
 void v3d_nir_lower_scratch(nir_shader *s);
 void v3d_nir_lower_txf_ms(nir_shader *s, struct v3d_compile *c);
 void v3d_nir_lower_image_load_store(nir_shader *s);
+void v3d_nir_lower_load_store_bitsize(nir_shader *s, struct v3d_compile *c);
 
 void v3d33_vir_vpm_read_setup(struct v3d_compile *c, int num_components);
 void v3d33_vir_vpm_write_setup(struct v3d_compile *c);
diff --git a/src/broadcom/compiler/v3d_nir_lower_load_store_bitsize.c b/src/broadcom/compiler/v3d_nir_lower_load_store_bitsize.c
new file mode 100644
index 0000000..8f9729c
--- /dev/null
+++ b/src/broadcom/compiler/v3d_nir_lower_load_store_bitsize.c
@@ -0,0 +1,251 @@
+/*
+ * Copyright © 2021 Raspberry Pi
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#include "compiler/v3d_compiler.h"
+#include "compiler/nir/nir_builder.h"
+
+/**
+ * The V3D TMU can only do 32-bit general vector access, so for anything
+ * else we need to split vector load/store instructions into scalar ones.
+ *
+ * Note that a vectorization pass that runs after this lowering may be able
+ * to re-vectorize some of these using 32-bit load/store instructions
+ * instead, which we do support.
+ */
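+
+/*
+ * As an illustration of the transform (a sketch, not literal NIR syntax):
+ * a 16-bit vec2 load from byte offset O
+ *
+ *    result = load(O)             (vec2, 16-bit)
+ *
+ * is rewritten as two scalar 16-bit loads whose results are packed back
+ * into a vector
+ *
+ *    c0 = load(O)                 (scalar, 16-bit)
+ *    c1 = load(O + 2)             (scalar, 16-bit)
+ *    result = vec2(c0, c1)
+ *
+ * and a 16-bit vec2 store likewise becomes one scalar store per enabled
+ * write-mask component. The per-component byte stride is bit_size / 8.
+ */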
+
+static int
+value_src(nir_intrinsic_op intrinsic)
+{
+        switch (intrinsic) {
+        case nir_intrinsic_store_ssbo:
+        case nir_intrinsic_store_scratch:
+                return 0;
+        default:
+                unreachable("Unsupported intrinsic");
+        }
+}
+
+static int
+offset_src(nir_intrinsic_op intrinsic)
+{
+        switch (intrinsic) {
+        case nir_intrinsic_load_uniform:
+        case nir_intrinsic_load_shared:
+        case nir_intrinsic_load_scratch:
+                return 0;
+        case nir_intrinsic_load_ubo:
+        case nir_intrinsic_load_ssbo:
+        case nir_intrinsic_store_scratch:
+                return 1;
+        case nir_intrinsic_store_ssbo:
+                return 2;
+        default:
+                unreachable("Unsupported intrinsic");
+        }
+}
+
+static nir_intrinsic_instr *
+init_scalar_intrinsic(nir_builder *b,
+                      nir_intrinsic_instr *intr,
+                      uint32_t component,
+                      nir_ssa_def *offset,
+                      uint32_t bit_size,
+                      nir_ssa_def **scalar_offset)
+{
+        nir_intrinsic_instr *new_intr =
+                nir_intrinsic_instr_create(b->shader, intr->intrinsic);
+
+        nir_intrinsic_copy_const_indices(new_intr, intr);
+
+        const int offset_units = bit_size / 8;
+        assert(offset_units >= 1);
+
+        if (nir_intrinsic_has_align_mul(intr)) {
+                assert(nir_intrinsic_has_align_offset(intr));
+                unsigned align_mul = nir_intrinsic_align_mul(intr);
+                unsigned align_off = nir_intrinsic_align_offset(intr);
+
+                align_off += offset_units * component;
+                align_off = align_off % align_mul;
+
+                nir_intrinsic_set_align(new_intr, align_mul, align_off);
+        }
+
+        *scalar_offset = offset;
+        unsigned offset_adj = offset_units * component;
+        if (nir_intrinsic_has_base(intr)) {
+                nir_intrinsic_set_base(
+                        new_intr, nir_intrinsic_base(intr) + offset_adj);
+        } else {
+                *scalar_offset =
+                        nir_iadd(b, offset,
+                                 nir_imm_intN_t(b, offset_adj,
+                                                offset->bit_size));
+        }
+
+        new_intr->num_components = 1;
+
+        return new_intr;
+}
+
+static bool
+lower_load_bitsize(struct v3d_compile *c,
+                   nir_builder *b,
+                   nir_intrinsic_instr *intr)
+{
+        uint32_t bit_size = nir_dest_bit_size(intr->dest);
+        if (bit_size == 32)
+                return false;
+
+        /* No need to split if it is already scalar */
+        int num_comp = nir_intrinsic_dest_components(intr);
+        if (num_comp <= 1)
+                return false;
+
+        b->cursor = nir_before_instr(&intr->instr);
+
+        unsigned offset_idx = offset_src(intr->intrinsic);
+        nir_ssa_def *offset = nir_ssa_for_src(b, intr->src[offset_idx], 1);
+
+        /* Split the vector load into multiple scalar loads */
+        nir_ssa_def *dest_components[4] = { NULL };
+        const nir_intrinsic_info *info = &nir_intrinsic_infos[intr->intrinsic];
+        for (int component = 0; component < num_comp; component++) {
+                nir_ssa_def *scalar_offset;
+                nir_intrinsic_instr *new_intr =
+                        init_scalar_intrinsic(b, intr, component, offset,
+                                              bit_size, &scalar_offset);
+
+                for (unsigned i = 0; i < info->num_srcs; i++) {
+                        if (i == offset_idx) {
+                                new_intr->src[i] = nir_src_for_ssa(scalar_offset);
+                        } else {
+                                new_intr->src[i] = intr->src[i];
+                        }
+                }
+
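+                /* Each scalar load yields a single component of the
+                 * original bit size; stash it so we can recombine all
+                 * components into a vector once the loop completes.
+                 */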
+                nir_ssa_dest_init(&new_intr->instr, &new_intr->dest,
+                                  1, bit_size, NULL);
+                dest_components[component] = &new_intr->dest.ssa;
+
+                nir_builder_instr_insert(b, &new_intr->instr);
+        }
+
+        nir_ssa_def *new_dst = nir_vec(b, dest_components, num_comp);
+        nir_ssa_def_rewrite_uses(&intr->dest.ssa, new_dst);
+
+        nir_instr_remove(&intr->instr);
+        return true;
+}
+
+static bool
+lower_store_bitsize(struct v3d_compile *c,
+                    nir_builder *b,
+                    nir_intrinsic_instr *intr)
+{
+        /* No need to split if it is already scalar */
+        int value_idx = value_src(intr->intrinsic);
+        int num_comp = nir_intrinsic_src_components(intr, value_idx);
+        if (num_comp <= 1)
+                return false;
+
+        /* No need to split if it is 32-bit */
+        if (nir_src_bit_size(intr->src[value_idx]) == 32)
+                return false;
+
+        b->cursor = nir_before_instr(&intr->instr);
+
+        nir_ssa_def *value = nir_ssa_for_src(b, intr->src[value_idx], num_comp);
+
+        unsigned offset_idx = offset_src(intr->intrinsic);
+        nir_ssa_def *offset = nir_ssa_for_src(b, intr->src[offset_idx], 1);
+
+        /* Split the vector store into multiple scalar stores */
+        const nir_intrinsic_info *info = &nir_intrinsic_infos[intr->intrinsic];
+        unsigned wrmask = nir_intrinsic_write_mask(intr);
+        while (wrmask) {
+                unsigned component = ffs(wrmask) - 1;
+
+                nir_ssa_def *scalar_offset;
+                nir_intrinsic_instr *new_intr =
+                        init_scalar_intrinsic(b, intr, component, offset,
+                                              value->bit_size, &scalar_offset);
+
+                nir_intrinsic_set_write_mask(new_intr, 0x1);
+
+                for (unsigned i = 0; i < info->num_srcs; i++) {
+                        if (i == value_idx) {
+                                nir_ssa_def *scalar_value =
+                                        nir_channels(b, value, 1 << component);
+                                new_intr->src[i] = nir_src_for_ssa(scalar_value);
+                        } else if (i == offset_idx) {
+                                new_intr->src[i] = nir_src_for_ssa(scalar_offset);
+                        } else {
+                                new_intr->src[i] = intr->src[i];
+                        }
+                }
+
+                nir_builder_instr_insert(b, &new_intr->instr);
+
+                wrmask &= ~(1 << component);
+        }
+
+        nir_instr_remove(&intr->instr);
+        return true;
+}
+
+static bool
+lower_load_store_bitsize(nir_builder *b, nir_instr *instr, void *data)
+{
+        struct v3d_compile *c = (struct v3d_compile *) data;
+
+        if (instr->type != nir_instr_type_intrinsic)
+                return false;
+        nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);
+
+        switch (intr->intrinsic) {
+        case nir_intrinsic_load_ssbo:
+        case nir_intrinsic_load_ubo:
+        case nir_intrinsic_load_uniform:
+        case nir_intrinsic_load_scratch:
+                return lower_load_bitsize(c, b, intr);
+
+        case nir_intrinsic_store_ssbo:
+        case nir_intrinsic_store_scratch:
+                return lower_store_bitsize(c, b, intr);
+
+        default:
+                return false;
+        }
+}
+
+void
+v3d_nir_lower_load_store_bitsize(nir_shader *s, struct v3d_compile *c)
+{
+        nir_shader_instructions_pass(s,
+                                     lower_load_store_bitsize,
+                                     nir_metadata_block_index |
+                                     nir_metadata_dominance,
+                                     c);
+}
diff --git a/src/broadcom/compiler/vir.c b/src/broadcom/compiler/vir.c
index 52e1785..61a40dc 100644
--- a/src/broadcom/compiler/vir.c
+++ b/src/broadcom/compiler/vir.c
@@ -1510,6 +1510,8 @@ v3d_attempt_compile(struct v3d_compile *c)
         NIR_PASS_V(c->s, nir_lower_wrmasks, should_split_wrmask, c->s);
 
+        NIR_PASS_V(c->s, v3d_nir_lower_load_store_bitsize, c);
+
         NIR_PASS_V(c->s, v3d_nir_lower_subgroup_intrinsics, c);
 
         v3d_optimize_nir(c, c->s);
-- 
2.7.4