From 2f2738dc9020d2e46dd0154ee9eb7f1f38481803 Mon Sep 17 00:00:00 2001
From: Alyssa Rosenzweig
Date: Thu, 30 Mar 2023 18:00:27 -0400
Subject: [PATCH] pan/bi: Use nir_lower_mem_access_bit_sizes

OpenCL can generate large loads and stores that we can't support, so we need
to lower them. We can load/store up to 128 bits in a single go. We currently
handle only components of up to 32 bits and no more than vec4 per load, so
we split up accordingly.

It's not clear to me what the alignment requirements are on Valhall, so we
conservatively generate aligned access; at worst there's a performance
penalty in those cases. I think unaligned access is supported, but likely
with a performance penalty of its own. So in the absence of hard data
otherwise, let's just use natural alignment.

Oddly, this shaves off a tiny bit of ALU in a few compute shaders on
Valhall, all in gfxbench. Seems to just be noise from the RA lottery.

   total instructions in shared programs: 2686768 -> 2686756 (<.01%)
   instructions in affected programs: 584 -> 572 (-2.05%)
   helped: 6
   HURT: 0
   Instructions are helped.

   total cvt in shared programs: 14644.33 -> 14644.14 (<.01%)
   cvt in affected programs: 5.77 -> 5.58 (-3.25%)
   helped: 6
   HURT: 0

   total quadwords in shared programs: 1455320 -> 1455312 (<.01%)
   quadwords in affected programs: 56 -> 48 (-14.29%)
   helped: 1
   HURT: 0

Signed-off-by: Alyssa Rosenzweig
Part-of:
---
 src/panfrost/compiler/bifrost_compile.c | 36 ++++++++++++++++++++++++++++++++++++
 1 file changed, 36 insertions(+)

diff --git a/src/panfrost/compiler/bifrost_compile.c b/src/panfrost/compiler/bifrost_compile.c
index d9e1f29..5512bbe 100644
--- a/src/panfrost/compiler/bifrost_compile.c
+++ b/src/panfrost/compiler/bifrost_compile.c
@@ -4393,6 +4393,35 @@ bifrost_nir_lower_blend_components(struct nir_builder *b, nir_instr *instr,
    return true;
 }
 
+static nir_mem_access_size_align
+mem_access_size_align_cb(nir_intrinsic_op intrin, uint8_t bytes, uint32_t align,
+                         uint32_t align_offset, bool offset_is_const,
+                         const void *cb_data)
+{
+   align = nir_combined_align(align, align_offset);
+   assert(util_is_power_of_two_nonzero(align));
+
+   /* If the number of bytes is a multiple of 4, use 32-bit loads. Else if
+    * it's a multiple of 2, use 16-bit loads. Else use 8-bit loads.
+    */
+   unsigned bit_size = (bytes & 1) ? 8 : (bytes & 2) ? 16 : 32;
+
+   /* But if we're only aligned to 1 byte, use 8-bit loads. If we're only
+    * aligned to 2 bytes, use 16-bit loads, unless we needed 8-bit loads due
+    * to the size.
+    */
+   if (align == 1)
+      bit_size = 8;
+   else if (align == 2)
+      bit_size = MIN2(bit_size, 16);
+
+   return (nir_mem_access_size_align){
+      .num_components = MIN2(bytes / (bit_size / 8), 4),
+      .bit_size = bit_size,
+      .align = bit_size / 8,
+   };
+}
+
 static void
 bi_optimize_nir(nir_shader *nir, unsigned gpu_id, bool is_blend)
 {
@@ -4760,6 +4789,13 @@ bifrost_preprocess_nir(nir_shader *nir, unsigned gpu_id)
       NIR_PASS_V(nir, pan_nir_lower_store_component);
    }
 
+   NIR_PASS_V(nir, nir_lower_mem_access_bit_sizes,
+              nir_var_mem_ubo | nir_var_mem_ssbo | nir_var_mem_constant |
+                 nir_var_mem_task_payload | nir_var_shader_temp |
+                 nir_var_function_temp | nir_var_mem_global |
+                 nir_var_mem_shared,
+              mem_access_size_align_cb, NULL);
+
    NIR_PASS_V(nir, nir_lower_ssbo);
    NIR_PASS_V(nir, pan_lower_sample_pos);
    NIR_PASS_V(nir, nir_lower_bit_size, bi_lower_bit_size, NULL);
-- 
2.7.4
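
For illustration, here is a minimal standalone C model of the size/alignment
selection implemented by mem_access_size_align_cb above. This is a sketch,
not part of the patch: pick_access and the sample (bytes, align) pairs are
hypothetical, and a printf stands in for the nir_mem_access_size_align
return value so it builds without Mesa (e.g. cc pick.c -o pick).

   /* Standalone model of the callback's decision logic (illustrative). */
   #include <stdio.h>

   #define MIN2(a, b) ((a) < (b) ? (a) : (b))

   static void
   pick_access(unsigned bytes, unsigned align)
   {
      /* Multiple of 4 -> 32-bit, multiple of 2 -> 16-bit, else 8-bit. */
      unsigned bit_size = (bytes & 1) ? 8 : (bytes & 2) ? 16 : 32;

      /* Clamp to what the (power-of-two) alignment allows. */
      if (align == 1)
         bit_size = 8;
      else if (align == 2)
         bit_size = MIN2(bit_size, 16);

      /* At most vec4 per access; the pass splits off whatever remains. */
      printf("%2u bytes @ align %2u -> %u x %2u-bit\n", bytes, align,
             MIN2(bytes / (bit_size / 8), 4), bit_size);
   }

   int
   main(void)
   {
      pick_access(16, 16); /* 4 x 32-bit: one full 128-bit access */
      pick_access(12, 4);  /* 3 x 32-bit */
      pick_access(6, 2);   /* 3 x 16-bit: size and alignment both allow 16 */
      pick_access(8, 1);   /* 4 x 8-bit: 1-byte alignment forces 8-bit */
      return 0;
   }

Note that when bytes / (bit_size / 8) exceeds 4, as in the last case (8
bytes at 8-bit granularity), the callback clamps num_components to 4 and
nir_lower_mem_access_bit_sizes emits multiple accesses to cover the rest.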