From a8fbfcfbd3ff2c828f99ffc57f9dd4acc340d902 Mon Sep 17 00:00:00 2001
From: Boris Brezillon <boris.brezillon@collabora.com>
Date: Fri, 4 Jun 2021 14:44:56 +0200
Subject: [PATCH] pan/midg: Support 8/16 bit load/store

Needed for panvk copy shaders to support 8-bit and 16-bit formats.

Signed-off-by: Boris Brezillon <boris.brezillon@collabora.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/14885>
---
 src/panfrost/midgard/midgard_compile.c |  64 ++++++++++++++++----
 src/panfrost/midgard/midgard_emit.c    | 103 +++++++++++++++------------------
 src/panfrost/midgard/midgard_opt_dce.c |   2 +-
 src/panfrost/midgard/midgard_ra.c      |   2 +-
 4 files changed, 101 insertions(+), 70 deletions(-)

diff --git a/src/panfrost/midgard/midgard_compile.c b/src/panfrost/midgard/midgard_compile.c
index 7e979df..f63e0ae 100644
--- a/src/panfrost/midgard/midgard_compile.c
+++ b/src/panfrost/midgard/midgard_compile.c
@@ -134,9 +134,13 @@ M_LOAD(ld_vary_32, nir_type_uint32);
 M_LOAD(ld_ubo_32, nir_type_uint32);
 M_LOAD(ld_ubo_64, nir_type_uint32);
 M_LOAD(ld_ubo_128, nir_type_uint32);
+M_LOAD(ld_u8, nir_type_uint8);
+M_LOAD(ld_u16, nir_type_uint16);
 M_LOAD(ld_32, nir_type_uint32);
 M_LOAD(ld_64, nir_type_uint32);
 M_LOAD(ld_128, nir_type_uint32);
+M_STORE(st_u8, nir_type_uint8);
+M_STORE(st_u16, nir_type_uint16);
 M_STORE(st_32, nir_type_uint32);
 M_STORE(st_64, nir_type_uint32);
 M_STORE(st_128, nir_type_uint32);
@@ -1198,7 +1202,8 @@ mir_set_intr_mask(nir_instr *instr, midgard_instruction *ins, bool is_read)
                 dsize = nir_dest_bit_size(intr->dest);
         } else {
                 nir_mask = nir_intrinsic_write_mask(intr);
-                dsize = 32;
+                dsize = OP_IS_COMMON_STORE(ins->op) ?
+                        nir_src_bit_size(intr->src[0]) : 32;
         }
 
         /* Once we have the NIR mask, we need to normalize to work in 32-bit space */
@@ -1283,19 +1288,55 @@ emit_global(
                 unsigned bitsize = nir_dest_bit_size(intr->dest) *
                         nir_dest_num_components(intr->dest);
 
-                if (bitsize <= 32)
-                        ins = m_ld_32(srcdest, 0);
-                else if (bitsize <= 64)
-                        ins = m_ld_64(srcdest, 0);
-                else if (bitsize <= 128)
-                        ins = m_ld_128(srcdest, 0);
-                else
-                        unreachable("Invalid global read size");
+                switch (bitsize) {
+                case 8: ins = m_ld_u8(srcdest, 0); break;
+                case 16: ins = m_ld_u16(srcdest, 0); break;
+                case 32: ins = m_ld_32(srcdest, 0); break;
+                case 64: ins = m_ld_64(srcdest, 0); break;
+                case 128: ins = m_ld_128(srcdest, 0); break;
+                default: unreachable("Invalid global read size");
+                }
+
+                mir_set_intr_mask(instr, &ins, is_read);
+
+                /* For anything not aligned on 32 bits, make sure we write
+                 * full 32-bit registers. */
+                if (bitsize & 31) {
+                        unsigned comps_per_32b = 32 / nir_dest_bit_size(intr->dest);
+
+                        for (unsigned c = 0; c < 4 * comps_per_32b; c += comps_per_32b) {
+                                if (!(ins.mask & BITFIELD_RANGE(c, comps_per_32b)))
+                                        continue;
+
+                                unsigned base = ~0;
+                                for (unsigned i = 0; i < comps_per_32b; i++) {
+                                        if (ins.mask & BITFIELD_BIT(c + i)) {
+                                                base = ins.swizzle[0][c + i];
+                                                break;
+                                        }
+                                }
+
+                                assert(base != ~0);
+
+                                for (unsigned i = 0; i < comps_per_32b; i++) {
+                                        if (!(ins.mask & BITFIELD_BIT(c + i))) {
+                                                ins.swizzle[0][c + i] = base + i;
+                                                ins.mask |= BITFIELD_BIT(c + i);
+                                        }
+                                        assert(ins.swizzle[0][c + i] == base + i);
+                                }
+                        }
+
+                }
         } else {
                 unsigned bitsize = nir_src_bit_size(intr->src[0]) *
                         nir_src_num_components(intr->src[0]);
 
-                if (bitsize <= 32)
+                if (bitsize == 8)
+                        ins = m_st_u8(srcdest, 0);
+                else if (bitsize == 16)
+                        ins = m_st_u16(srcdest, 0);
+                else if (bitsize <= 32)
                         ins = m_st_32(srcdest, 0);
                 else if (bitsize <= 64)
                         ins = m_st_64(srcdest, 0);
@@ -1303,10 +1344,11 @@ emit_global(
                         ins = m_st_128(srcdest, 0);
                 else
                         unreachable("Invalid global store size");
+
+                mir_set_intr_mask(instr, &ins, is_read);
         }
 
         mir_set_offset(ctx, &ins, offset, seg);
-        mir_set_intr_mask(instr, &ins, is_read);
 
         /* Set a valid swizzle for masked out components */
         assert(ins.mask);
diff --git a/src/panfrost/midgard/midgard_emit.c b/src/panfrost/midgard/midgard_emit.c
index 7c6fbd2..e613aa0 100644
--- a/src/panfrost/midgard/midgard_emit.c
+++ b/src/panfrost/midgard/midgard_emit.c
@@ -368,14 +368,24 @@ mir_pack_vector_srcs(midgard_instruction *ins, midgard_vector_alu *alu)
 static void
 mir_pack_swizzle_ldst(midgard_instruction *ins)
 {
-        /* TODO: non-32-bit, non-vec4 */
-        for (unsigned c = 0; c < 4; ++c) {
+        unsigned compsz = OP_IS_STORE(ins->op) ?
+                          nir_alu_type_get_type_size(ins->src_types[0]) :
+                          nir_alu_type_get_type_size(ins->dest_type);
+        unsigned maxcomps = 128 / compsz;
+        unsigned step = DIV_ROUND_UP(32, compsz);
+
+        for (unsigned c = 0; c < maxcomps; c += step) {
                 unsigned v = ins->swizzle[0][c];
 
-                /* Check vec4 */
-                assert(v <= 3);
+                /* Make sure the component index doesn't exceed the maximum
+                 * number of components. */
+                assert(v <= maxcomps);
 
-                ins->load_store.swizzle |= v << (2 * c);
+                if (compsz <= 32)
+                        ins->load_store.swizzle |= (v / step) << (2 * (c / step));
+                else
+                        ins->load_store.swizzle |= ((v / step) << (4 * c)) |
+                                                   (((v / step) + 1) << ((4 * c) + 2));
         }
 
         /* TODO: arg_1/2 */
@@ -458,57 +468,34 @@ mir_pack_tex_ooo(midgard_block *block, midgard_bundle *bundle, midgard_instructi
 
 static unsigned
 midgard_pack_common_store_mask(midgard_instruction *ins) {
-        unsigned comp_sz = nir_alu_type_get_type_size(ins->dest_type);
-        unsigned mask = ins->mask;
+        ASSERTED unsigned comp_sz = nir_alu_type_get_type_size(ins->src_types[0]);
+        unsigned bytemask = mir_bytemask(ins);
         unsigned packed = 0;
-        unsigned nr_comp;
 
         switch (ins->op) {
-                case midgard_op_st_u8:
-                        packed |= mask & 1;
-                        break;
-                case midgard_op_st_u16:
-                        nr_comp = 16 / comp_sz;
-                        for (int i = 0; i < nr_comp; i++) {
-                                if (mask & (1 << i)) {
-                                        if (comp_sz == 16)
-                                                packed |= 0x3;
-                                        else if (comp_sz == 8)
-                                                packed |= 1 << i;
-                                }
-                        }
-                        break;
-                case midgard_op_st_32:
-                case midgard_op_st_64:
-                case midgard_op_st_128: {
-                        unsigned total_sz = 32;
-                        if (ins->op == midgard_op_st_128)
-                                total_sz = 128;
-                        else if (ins->op == midgard_op_st_64)
-                                total_sz = 64;
-
-                        nr_comp = total_sz / comp_sz;
-
-                        /* Each writemask bit masks 1/4th of the value to be stored. */
-                        assert(comp_sz >= total_sz / 4);
-
-                        for (int i = 0; i < nr_comp; i++) {
-                                if (mask & (1 << i)) {
-                                        if (comp_sz == total_sz)
-                                                packed |= 0xF;
-                                        else if (comp_sz == total_sz / 2)
-                                                packed |= 0x3 << (i * 2);
-                                        else if (comp_sz == total_sz / 4)
-                                                packed |= 0x1 << i;
-                                }
-                        }
-                        break;
+        case midgard_op_st_u8:
+                return mir_bytemask(ins) & 1;
+        case midgard_op_st_u16:
+                return mir_bytemask(ins) & 3;
+        case midgard_op_st_32:
+                return mir_bytemask(ins);
+        case midgard_op_st_64:
+                assert(comp_sz >= 16);
+                for (unsigned i = 0; i < 4; i++) {
+                        if (bytemask & (3 << (i * 2)))
+                                packed |= 1 << i;
+                }
+                return packed;
+        case midgard_op_st_128:
+                assert(comp_sz >= 32);
+                for (unsigned i = 0; i < 4; i++) {
+                        if (bytemask & (0xf << (i * 4)))
+                                packed |= 1 << i;
                 }
-                default:
-                        unreachable("unexpected ldst opcode");
+                return packed;
+        default:
+                unreachable("unexpected ldst opcode");
         }
-
-        return packed;
 }
 
 static void
@@ -523,16 +510,18 @@ mir_pack_ldst_mask(midgard_instruction *ins)
                 if (sz == 64) {
                         packed = ((ins->mask & 0x2) ? (0x8 | 0x4) : 0) |
                                 ((ins->mask & 0x1) ? (0x2 | 0x1) : 0);
-                } else if (sz == 16) {
+                } else if (sz < 32) {
+                        unsigned comps_per_32b = 32 / sz;
+
                         packed = 0;
 
                         for (unsigned i = 0; i < 4; ++i) {
-                                /* Make sure we're duplicated */
-                                bool u = (ins->mask & (1 << (2*i + 0))) != 0;
-                                ASSERTED bool v = (ins->mask & (1 << (2*i + 1))) != 0;
-                                assert(u == v);
+                                unsigned submask = (ins->mask >> (i * comps_per_32b)) &
+                                                   BITFIELD_MASK(comps_per_32b);
 
-                                packed |= (u << i);
+                                /* Make sure we're duplicated */
+                                assert(submask == 0 || submask == BITFIELD_MASK(comps_per_32b));
+                                packed |= (submask != 0) << i;
                         }
                 } else {
                         assert(sz == 32);
diff --git a/src/panfrost/midgard/midgard_opt_dce.c b/src/panfrost/midgard/midgard_opt_dce.c
index c43c4eb..f08972f 100644
--- a/src/panfrost/midgard/midgard_opt_dce.c
+++ b/src/panfrost/midgard/midgard_opt_dce.c
@@ -80,7 +80,7 @@ midgard_opt_dead_code_eliminate_block(compiler_context *ctx, midgard_block *bloc
                         unsigned oldmask = ins->mask;
 
                         /* Make sure we're packable */
-                        if (type_size == 16 && ins->type == TAG_LOAD_STORE_4)
+                        if (type_size < 32 && ins->type == TAG_LOAD_STORE_4)
                                 round_size = 32;
 
                         unsigned rounded = mir_round_bytemask_up(live[ins->dest], round_size);
diff --git a/src/panfrost/midgard/midgard_ra.c b/src/panfrost/midgard/midgard_ra.c
index 40605df..af208bb 100644
--- a/src/panfrost/midgard/midgard_ra.c
+++ b/src/panfrost/midgard/midgard_ra.c
@@ -781,7 +781,7 @@ install_registers_instr(
                         struct phys_reg dst = index_to_reg(ctx, l, ins->dest, dest_shift);
 
                         ins->dest = SSA_FIXED_REGISTER(dst.reg);
-                        offset_swizzle(ins->swizzle[0], 0, 2, 2, dst.offset);
+                        offset_swizzle(ins->swizzle[0], 0, 2, dest_shift, dst.offset);
                         mir_set_bytemask(ins, mir_bytemask(ins) << dst.offset);
                 }
 
-- 
2.7.4