pan/bi: Use write masks on Valhall texture instrs
authorAlyssa Rosenzweig <alyssa@collabora.com>
Thu, 1 Dec 2022 02:29:43 +0000 (21:29 -0500)
committerMarge Bot <emma+marge@anholt.net>
Fri, 23 Dec 2022 19:05:10 +0000 (19:05 +0000)
I noticed a sequence like the following in a scheduled SuperTuxKart shader:

   TEX_SINGLE.slot0 @r0:r1, ..
   LD_VAR.wait0 @r2, ...
   FMA r1, ...

Why do we stall waiting for the TEX_SINGLE instruction when it's not actually
read? Because its upper channels are *never* read, leading to a
write-after-write dependency when the register allocator puts some unrelated ALU
destination in there. By appropriately masking the texture instruction's write,
that false dependency disappears, avoiding the stall.

Signed-off-by: Alyssa Rosenzweig <alyssa@collabora.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/20426>

src/panfrost/bifrost/bifrost_compile.c

index f5af591..5aa152e 100644 (file)
@@ -3780,8 +3780,12 @@ bi_emit_tex_valhall(bi_builder *b, nir_tex_instr *instr)
         image_src = bi_lshift_or_i32(b, sampler, image_src, bi_imm_u8(0));
         image_src = bi_lshift_or_i32(b, texture, image_src, bi_imm_u8(16));
 
-        unsigned mask = BI_WRITE_MASK_RGBA;
-        unsigned res_size = nir_dest_bit_size(instr->dest) == 16 ? 2 : 4;
+
+        /* Only write the components that we actually read */
+        unsigned mask = nir_ssa_def_components_read(&instr->dest.ssa);
+        unsigned comps_per_reg = nir_dest_bit_size(instr->dest) == 16 ? 2 : 1;
+        unsigned res_size = DIV_ROUND_UP(util_bitcount(mask), comps_per_reg);
+
         enum bi_register_format regfmt = bi_reg_fmt_for_nir(instr->dest_type);
         enum bi_dimension dim = valhall_tex_dimension(instr->sampler_dim);
         bi_index dest = bi_temp(b->shader);
@@ -3810,10 +3814,32 @@ bi_emit_tex_valhall(bi_builder *b, nir_tex_instr *instr)
                 unreachable("Unhandled Valhall texture op");
         }
 
-        bi_index w[4] = { bi_null(), bi_null(), bi_null(), bi_null() };
-        bi_emit_split_i32(b, w, dest, res_size);
-        bi_emit_collect_to(b, bi_dest_index(&instr->dest), w,
-                        DIV_ROUND_UP(nir_dest_num_components(instr->dest) * res_size, 4));
+        /* The hardware will write only what we read, and it will into
+         * contiguous registers without gaps (different from Bifrost). NIR
+         * expects the gaps, so fill in the holes (they'll be copypropped and
+         * DCE'd away later).
+         */
+        bi_index unpacked[4] = { bi_null(), bi_null(), bi_null(), bi_null() };
+
+        bi_emit_cached_split_i32(b, dest, res_size);
+
+        /* Index into the packed component array */
+        unsigned j = 0;
+        unsigned comps[4] = { 0 };
+        unsigned nr_components = nir_dest_num_components(instr->dest);
+
+        for (unsigned i = 0; i < nr_components; ++i) {
+                if (mask & BITFIELD_BIT(i)) {
+                        unpacked[i] = dest;
+                        comps[i] = j++;
+                } else {
+                        unpacked[i] = bi_zero();
+                }
+        }
+
+        bi_make_vec_to(b, bi_dest_index(&instr->dest), unpacked,
+                        comps, nir_dest_num_components(instr->dest),
+                        nir_dest_bit_size(instr->dest));
 }
 
 /* Simple textures ops correspond to NIR tex or txl with LOD = 0 on 2D/cube