From 477332a347799e08d884f8fcc31816291575a955 Mon Sep 17 00:00:00 2001
From: Jesse Natalie
Date: Thu, 23 Mar 2023 08:30:58 -0700
Subject: [PATCH] microsoft/compiler: Fix 8-bit loads and stores when
 supporting 16-bit DXIL

Shifts should always use 32-bit shift values, and when lowering to
masked stores, we need to use 32-bit atomics. That means we should also
treat 24-bit stores as a single masked op rather than one 16-bit
unmasked store and one 8-bit masked store.

Part-of:
---
 src/microsoft/compiler/dxil_nir.c | 37 ++++++++++++++++++++++++++-----------
 1 file changed, 26 insertions(+), 11 deletions(-)

diff --git a/src/microsoft/compiler/dxil_nir.c b/src/microsoft/compiler/dxil_nir.c
index c7acafa..b671bc2 100644
--- a/src/microsoft/compiler/dxil_nir.c
+++ b/src/microsoft/compiler/dxil_nir.c
@@ -257,7 +257,7 @@ lower_load_ssbo(nir_builder *b, nir_intrinsic_instr *intr, unsigned min_bit_size
    if (nir_intrinsic_align(intr) < store_bit_size / 8) {
       nir_ssa_def *shift = nir_imul(b, nir_iand(b, intr->src[1].ssa, nir_imm_int(b, offset_mask)),
                                     nir_imm_int(b, 8));
-      result = nir_ushr(b, result, nir_u2uN(b, shift, store_bit_size));
+      result = nir_ushr(b, result, shift);
    }
 
    /* And now comes the pack/unpack step to match the original type. */
@@ -287,11 +287,14 @@ lower_store_ssbo(nir_builder *b, nir_intrinsic_instr *intr, unsigned min_bit_siz
 
    unsigned src_bit_size = val->bit_size;
    unsigned store_bit_size = CLAMP(src_bit_size, min_bit_size, 32);
+   unsigned masked_store_bit_size = 32;
    unsigned num_components = val->num_components;
    unsigned num_bits = num_components * src_bit_size;
 
    unsigned offset_mask = store_bit_size / 8 - 1;
+   unsigned masked_store_offset_mask = masked_store_bit_size / 8 - 1;
    nir_ssa_def *offset = nir_iand(b, intr->src[2].ssa, nir_imm_int(b, ~offset_mask));
+   nir_ssa_def *masked_offset = nir_iand(b, intr->src[2].ssa, nir_imm_int(b, ~masked_store_offset_mask));
    nir_ssa_def *comps[NIR_MAX_VEC_COMPONENTS] = { 0 };
    unsigned comp_idx = 0;
 
@@ -325,32 +328,41 @@ lower_store_ssbo(nir_builder *b, nir_intrinsic_instr *intr, unsigned min_bit_siz
          ++num_src_comps_stored;
          substore_num_bits += src_bit_size;
       }
+      bool force_masked = false;
       if (substore_num_bits > store_bit_size &&
           substore_num_bits % store_bit_size != 0) {
          /* Split this into two, one unmasked store of the first bits,
           * and then the second loop iteration will handle a masked store
           * for the rest. */
          assert(num_src_comps_stored == 3);
-         --num_src_comps_stored;
-         substore_num_bits = store_bit_size;
+         if (store_bit_size == 16) {
+            assert(substore_num_bits < 32);
+            /* If we're already doing atomics to store, just do one
+             * 32bit masked store instead of a 16bit store and a masked
+             * store for the other 8 bits. */
+            force_masked = true;
+         } else {
+            --num_src_comps_stored;
+            substore_num_bits = store_bit_size;
+         }
       }
 
-      nir_ssa_def *local_offset = nir_iadd(b, offset, nir_imm_int(b, bit_offset / 8));
-      nir_ssa_def *store_vec = load_comps_to_vec(b, src_bit_size, &comps[comp_idx],
-                                                 num_src_comps_stored, store_bit_size);
       nir_intrinsic_instr *store;
-      if (substore_num_bits < store_bit_size) {
-         nir_ssa_def *mask = nir_imm_intN_t(b, (1 << substore_num_bits) - 1, store_bit_size);
+      if (substore_num_bits < store_bit_size || force_masked) {
+         nir_ssa_def *store_vec = load_comps_to_vec(b, src_bit_size, &comps[comp_idx],
+                                                    num_src_comps_stored, masked_store_bit_size);
+         nir_ssa_def *mask = nir_imm_intN_t(b, (1 << substore_num_bits) - 1, masked_store_bit_size);
 
          /* If we have small alignments we need to place them correctly in the component.
           */
-         if (nir_intrinsic_align(intr) <= store_bit_size / 8) {
-            nir_ssa_def *pos = nir_iand(b, intr->src[2].ssa, nir_imm_int(b, offset_mask));
-            nir_ssa_def *shift = nir_u2uN(b, nir_imul_imm(b, pos, 8), store_bit_size);
+         if (nir_intrinsic_align(intr) <= masked_store_bit_size / 8) {
+            nir_ssa_def *pos = nir_iand(b, intr->src[2].ssa, nir_imm_int(b, masked_store_offset_mask));
+            nir_ssa_def *shift = nir_imul_imm(b, pos, 8);
             store_vec = nir_ishl(b, store_vec, shift);
             mask = nir_ishl(b, mask, shift);
          }
 
+         nir_ssa_def *local_offset = nir_iadd(b, masked_offset, nir_imm_int(b, bit_offset / 8));
          store = nir_intrinsic_instr_create(b->shader,
                                             nir_intrinsic_store_ssbo_masked_dxil);
          store->src[0] = nir_src_for_ssa(store_vec);
@@ -358,6 +370,9 @@ lower_store_ssbo(nir_builder *b, nir_intrinsic_instr *intr, unsigned min_bit_siz
          store->src[2] = nir_src_for_ssa(buffer);
          store->src[3] = nir_src_for_ssa(local_offset);
       } else {
+         nir_ssa_def *local_offset = nir_iadd(b, offset, nir_imm_int(b, bit_offset / 8));
+         nir_ssa_def *store_vec = load_comps_to_vec(b, src_bit_size, &comps[comp_idx],
+                                                    num_src_comps_stored, store_bit_size);
          store = nir_intrinsic_instr_create(b->shader,
                                             nir_intrinsic_store_ssbo);
          store->src[0] = nir_src_for_ssa(store_vec);
-- 
2.7.4
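
For readers who want to check the bit math the lowered code ends up doing, here is a
small CPU-side sketch. It is not part of the patch or of Mesa: the function and
variable names (emulate_masked_store_u8, emulate_unaligned_load_u8, buf) are invented
for illustration, and a plain C read-modify-write stands in for the 32-bit atomics
that the masked-store intrinsic is emitted as on the GPU (typically an atomic AND
with the inverted mask followed by an atomic OR).

    /* Illustrative sketch only -- not Mesa code.  Models, on the CPU, the
     * arithmetic the lowered NIR performs: an 8-bit store becomes a masked
     * merge into the containing 32-bit word, and an 8-bit load becomes a
     * 32-bit load followed by a 32-bit right shift. */
    #include <assert.h>
    #include <stdint.h>
    #include <stdio.h>

    /* byte_offset / 4 picks the containing dword; (byte_offset & 3) * 8 is the
     * 32-bit shift amount; 0xff << shift is the write mask.  A plain
     * read-modify-write stands in for the atomic AND/OR the GPU would use. */
    static void emulate_masked_store_u8(uint32_t *buf, unsigned byte_offset, uint8_t val)
    {
       unsigned word  = byte_offset / 4;
       uint32_t shift = (byte_offset & 3) * 8;
       uint32_t mask  = 0xffu << shift;

       buf[word] = (buf[word] & ~mask) | ((uint32_t)val << shift);
    }

    /* Mirrors the lower_load_ssbo change: the shift stays a 32-bit value
     * rather than being converted to the (possibly 16-bit) load size. */
    static uint8_t emulate_unaligned_load_u8(const uint32_t *buf, unsigned byte_offset)
    {
       uint32_t shift = (byte_offset & 3) * 8;
       return (uint8_t)(buf[byte_offset / 4] >> shift);
    }

    int main(void)
    {
       uint32_t buf[2] = { 0x11223344u, 0x55667788u };

       /* Byte offset 6 lands in buf[1]; shift = (6 & 3) * 8 = 16. */
       emulate_masked_store_u8(buf, 6, 0xab);
       assert(buf[1] == 0x55ab7788u);
       assert(emulate_unaligned_load_u8(buf, 6) == 0xab);

       printf("buf[1] = 0x%08x\n", buf[1]);
       return 0;
    }

The shift is computed and applied as a 32-bit value throughout, which is what the
patch keeps instead of narrowing it to the store/load bit size, and the same merge
with a (1 << 24) - 1 mask is what a 24-bit store now becomes: one masked op rather
than a 16-bit unmasked store plus an 8-bit masked store.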