agx: Lower image atomics
author    Alyssa Rosenzweig <alyssa@rosenzweig.io>
          Tue, 6 Jun 2023 23:25:18 +0000 (19:25 -0400)
committer Marge Bot <emma+marge@anholt.net>
          Thu, 20 Jul 2023 15:33:28 +0000 (15:33 +0000)
Lower image atomics to global atomics on computed texel addresses, and
lower the texel address intrinsics to arithmetic and descriptor reads.
Together, these two lowerings implement image atomics.
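
To make the addressing math easier to follow, here is a rough CPU-side C
model of the texel index calculation emitted for twiddled (non-linear)
images. It is an illustrative sketch only: the helper names
(twiddled_texel_index, interleave_u16) are invented for this message, and it
mirrors the NIR built below rather than documenting the hardware.

    #include <stdint.h>

    /* Morton-interleave the low 16 bits of x and y; models nir_interleave_agx,
     * with x assumed to land in the even bits.
     */
    static uint32_t
    interleave_u16(uint32_t x, uint32_t y)
    {
       uint32_t out = 0;
       for (unsigned i = 0; i < 16; ++i)
          out |= (((x >> i) & 1u) << (2 * i)) | (((y >> i) & 1u) << (2 * i + 1));
       return out;
    }

    /* Element index of pixel (x, y): tiles stored in row-major order, pixels
     * Morton-interleaved within each tile.
     */
    static uint32_t
    twiddled_texel_index(uint32_t x_px, uint32_t y_px, unsigned tile_w_px_log2,
                         unsigned tile_h_px_log2, uint32_t width_tl)
    {
       uint32_t tile_w_px = 1u << tile_w_px_log2;
       uint32_t tile_h_px = 1u << tile_h_px_log2;

       /* Offset within the tile, and the coordinate of the tile's corner */
       uint32_t offs_x_px = x_px & (tile_w_px - 1);
       uint32_t offs_y_px = y_px & (tile_h_px - 1);
       uint32_t tile_x_px = x_px - offs_x_px;
       uint32_t tile_y_px = y_px - offs_y_px;

       return (tile_y_px * width_tl * tile_w_px) + (tile_x_px * tile_h_px) +
              interleave_u16(offs_x_px, offs_y_px);
    }

The byte address fed to the global atomic is then just
base + bytes_per_pixel * index, as in the lowering below.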

Signed-off-by: Alyssa Rosenzweig <alyssa@rosenzweig.io>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/24258>

src/asahi/compiler/agx_nir_lower_texture.c

index e903135..a9dd514 100644 (file)
@@ -10,6 +10,9 @@
 #include "compiler/nir/nir_builtin_builder.h"
 #include "agx_compiler.h"
 #include "agx_internal_formats.h"
+#include "nir_builder_opcodes.h"
+#include "nir_intrinsics.h"
+#include "nir_intrinsics_indices.h"
 
 #define AGX_TEXTURE_DESC_STRIDE   24
 #define AGX_FORMAT_RGB32_EMULATED 0x36
@@ -550,6 +553,148 @@ txs_for_image(nir_builder *b, nir_intrinsic_instr *intr,
    return &tex->dest.ssa;
 }
 
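+/* Build (1 << x) - 1, a mask of the low x bits, at the bit size of x */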
+static nir_ssa_def *
+nir_bitfield_mask(nir_builder *b, nir_ssa_def *x)
+{
+   nir_ssa_def *one = nir_imm_intN_t(b, 1, x->bit_size);
+   return nir_iadd_imm(b, nir_ishl(b, one, nir_u2u32(b, x)), -1);
+}
+
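+/* Map an (x, y[, layer]) pixel coordinate to an element index in a twiddled
+ * image: tiles are laid out in row-major order and pixels are
+ * Morton-interleaved within each tile.
+ */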
+static nir_ssa_def *
+calculate_twiddled_coordinates(nir_builder *b, nir_ssa_def *coord,
+                               nir_ssa_def *tile_w_px_log2,
+                               nir_ssa_def *tile_h_px_log2,
+                               nir_ssa_def *width_tl,
+                               nir_ssa_def *layer_stride_el)
+{
+   /* SIMD-within-a-register */
+   nir_ssa_def *coord_px = nir_pack_32_2x16(b, nir_u2u16(b, coord));
+   nir_ssa_def *tile_mask =
+      nir_pack_32_2x16_split(b, nir_bitfield_mask(b, tile_w_px_log2),
+                             nir_bitfield_mask(b, tile_h_px_log2));
+
+   /* Modulo by the tile width/height to get the offsets within the tile */
+   nir_ssa_def *offs_xy_px = nir_iand(b, coord_px, tile_mask);
+
+   /* Get the coordinates of the corner of the tile */
+   nir_ssa_def *tile_xy_px = nir_isub(b, coord_px, offs_xy_px);
+
+   /* Unpack SIMD-within-a-register */
+   nir_ssa_def *offs_x_px = nir_unpack_32_2x16_split_x(b, offs_xy_px);
+   nir_ssa_def *offs_y_px = nir_unpack_32_2x16_split_y(b, offs_xy_px);
+   nir_ssa_def *tile_x_px =
+      nir_u2u32(b, nir_unpack_32_2x16_split_x(b, tile_xy_px));
+   nir_ssa_def *tile_y_px =
+      nir_u2u32(b, nir_unpack_32_2x16_split_y(b, tile_xy_px));
+
+   /* Get the tile size */
+   nir_ssa_def *one_32 = nir_imm_int(b, 1);
+   nir_ssa_def *tile_w_px = nir_ishl(b, one_32, nir_u2u32(b, tile_w_px_log2));
+   nir_ssa_def *tile_h_px = nir_ishl(b, one_32, nir_u2u32(b, tile_h_px_log2));
+
+   /* tile row start (px) =
+    *   (y // tile height) * (# of tiles/row) * (# of pix/tile) =
+    *   align_down(y, tile height) / tile height * width_tl * tile width *
+    *        tile height =
+    *   align_down(y, tile height) * width_tl * tile width
+    */
+   nir_ssa_def *tile_row_start_px =
+      nir_imul(b, nir_u2u32(b, tile_y_px), nir_imul(b, width_tl, tile_w_px));
+
+   /* tile column start (px) =
+    *   (x // tile width) * (# of pix/tile) =
+    *   align_down(x, tile width) / tile width * tile width * tile height =
+    *   align_down(x, tile width) * tile height
+    */
+   nir_ssa_def *tile_col_start_px = nir_imul(b, tile_x_px, tile_h_px);
+
+   /* The pixel at which the tile starts is thus... */
+   nir_ssa_def *tile_offset_px =
+      nir_iadd(b, tile_row_start_px, tile_col_start_px);
+
+   /* Get the total offset */
+   nir_ssa_def *offs_px = nir_interleave_agx(b, offs_x_px, offs_y_px);
+   nir_ssa_def *total_px = nir_iadd(b, tile_offset_px, nir_u2u32(b, offs_px));
+
+   if (layer_stride_el) {
+      nir_ssa_def *layer = nir_channel(b, coord, 2);
+      nir_ssa_def *layer_offset_px = nir_imul(b, layer, layer_stride_el);
+      total_px = nir_iadd(b, total_px, layer_offset_px);
+   }
+
+   return total_px;
+}
+
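+/* Lower a (bindless_)image_texel_address intrinsic to a 64-bit global
+ * address by reading the software-defined atomic descriptor stashed at the
+ * end of the PBE descriptor and doing the linear/twiddled indexing in ALU.
+ */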
+static nir_ssa_def *
+image_texel_address(nir_builder *b, nir_intrinsic_instr *intr)
+{
+   /* First, calculate the address of the PBE descriptor */
+   nir_ssa_def *desc_address;
+   if (intr->intrinsic == nir_intrinsic_bindless_image_texel_address)
+      desc_address = texture_descriptor_ptr_for_handle(b, intr->src[0].ssa);
+   else
+      desc_address = texture_descriptor_ptr_for_index(b, intr->src[0].ssa);
+
+   nir_ssa_def *coord = intr->src[1].ssa;
+
+   enum glsl_sampler_dim dim = nir_intrinsic_image_dim(intr);
+   bool layered = nir_intrinsic_image_array(intr) ||
+                  (nir_intrinsic_image_dim(intr) == GLSL_SAMPLER_DIM_CUBE) ||
+                  (nir_intrinsic_image_dim(intr) == GLSL_SAMPLER_DIM_3D);
+
+   /* The last 8 bytes of the 24-byte PBE descriptor contain either the
+    * software-defined atomic descriptor, or (for layered images: arrays,
+    * cubes, and 3D) a pointer to that descriptor. Grab it.
+    */
+   nir_ssa_def *meta_ptr = nir_iadd_imm(b, desc_address, 16);
+   nir_ssa_def *meta = nir_load_global_constant(b, meta_ptr, 8, 1, 64);
+   nir_ssa_def *layer_stride_el = NULL;
+
+   if (layered) {
+      nir_ssa_def *desc = nir_load_global_constant(b, meta, 8, 3, 32);
+      meta = nir_pack_64_2x32(b, nir_trim_vector(b, desc, 2));
+      layer_stride_el = nir_channel(b, desc, 2);
+   }
+
+   nir_ssa_def *meta_hi = nir_unpack_64_2x32_split_y(b, meta);
+
+   /* See the GenXML definitions of the software-defined atomic descriptors */
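+   /* For non-buffer images, the fields extracted below are: bits [0, 32]
+    * the base address shifted right by 7, [33, 35] log2(tile width in
+    * pixels), [36, 38] log2(tile height in pixels), [39, 52] the image
+    * width in tiles.
+    */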
+   nir_ssa_def *base;
+
+   if (dim == GLSL_SAMPLER_DIM_BUF)
+      base = meta;
+   else
+      base = nir_ishl_imm(b, nir_iand_imm(b, meta, BITFIELD64_MASK(33)), 7);
+
+   nir_ssa_def *tile_w_px_log2 =
+      nir_u2u16(b, nir_ubitfield_extract_imm(b, meta_hi, 33 - 32, 3));
+   nir_ssa_def *tile_h_px_log2 =
+      nir_u2u16(b, nir_ubitfield_extract_imm(b, meta_hi, 36 - 32, 3));
+   nir_ssa_def *width_tl = nir_ubitfield_extract_imm(b, meta_hi, 39 - 32, 14);
+
+   /* We do not allow atomics on linear 2D or linear 2D arrays, as there are no
+    * known use cases. So, we're linear if buffer or 1D, and twiddled otherwise.
+    */
+   nir_ssa_def *total_px;
+   if (dim == GLSL_SAMPLER_DIM_BUF || dim == GLSL_SAMPLER_DIM_1D) {
+      /* 1D linear is indexed directly */
+      total_px = nir_channel(b, coord, 0);
+   } else {
+      total_px = calculate_twiddled_coordinates(
+         b, coord, tile_w_px_log2, tile_h_px_log2, width_tl, layer_stride_el);
+   }
+
+   /* Calculate the full texel address. This sequence is written carefully to
+    * ensure it will be entirely folded into the atomic's addressing arithmetic.
+    */
+   enum pipe_format format = nir_intrinsic_format(intr);
+   unsigned bytes_per_pixel_B = util_format_get_blocksize(format);
+
+   nir_ssa_def *total_B = nir_imul_imm(b, total_px, bytes_per_pixel_B);
+   return nir_iadd(b, base, nir_u2u64(b, total_B));
+}
+
 static bool
 lower_images(nir_builder *b, nir_instr *instr, UNUSED void *data)
 {
@@ -568,6 +713,11 @@ lower_images(nir_builder *b, nir_instr *instr, UNUSED void *data)
                        nir_dest_bit_size(intr->dest)));
       return true;
 
+   case nir_intrinsic_image_texel_address:
+   case nir_intrinsic_bindless_image_texel_address:
+      nir_ssa_def_rewrite_uses(&intr->dest.ssa, image_texel_address(b, intr));
+      return true;
+
    default:
       return false;
    }
@@ -599,6 +749,7 @@ agx_nir_lower_texture(nir_shader *s, bool support_lod_bias)
    };
 
    NIR_PASS(progress, s, nir_lower_tex, &lower_tex_options);
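+   /* Lower image atomics to global atomics on image_texel_address
+    * intrinsics, which lower_images then lowers to arithmetic and
+    * descriptor reads.
+    */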
+   NIR_PASS(progress, s, nir_lower_image_atomics_to_global);
 
    /* Lower bias after nir_lower_tex (to get rid of txd) but before
     * lower_regular_texture (which will shuffle around the sources)