panfrost: Preprocess shaders in the driver
author: Alyssa Rosenzweig <alyssa@collabora.com>
Mon, 6 Feb 2023 22:23:19 +0000 (17:23 -0500)
committer: Marge Bot <emma+marge@anholt.net>
Thu, 23 Mar 2023 23:53:46 +0000 (23:53 +0000)
This is a flag-day change to how we compile. We split preprocessing NIR into a
separate step from compiling, giving the driver a chance to apply its own
lowerings on the preprocessed NIR before the final optimization loop. During
that time, the different producers of NIR (panfrost, panvk, blend shaders, blit
shaders...) will be able to (differently) lower system values.

Signed-off-by: Alyssa Rosenzweig <alyssa@collabora.com>
Reviewed-by: Boris Brezillon <boris.brezillon@collabora.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/20906>

15 files changed:
src/gallium/drivers/panfrost/pan_shader.c
src/panfrost/compiler/bifrost_compile.c
src/panfrost/compiler/bifrost_compile.h
src/panfrost/lib/pan_blend.c
src/panfrost/lib/pan_blend.h
src/panfrost/lib/pan_blitter.c
src/panfrost/lib/pan_indirect_dispatch.c
src/panfrost/lib/pan_shader.c
src/panfrost/lib/pan_shader.h
src/panfrost/midgard/midgard_compile.c
src/panfrost/midgard/midgard_compile.h
src/panfrost/util/pan_ir.h
src/panfrost/vulkan/panvk_vX_meta_clear.c
src/panfrost/vulkan/panvk_vX_meta_copy.c
src/panfrost/vulkan/panvk_vX_shader.c

diff --git a/src/gallium/drivers/panfrost/pan_shader.c b/src/gallium/drivers/panfrost/pan_shader.c
index e393565..08b5676 100644
@@ -77,6 +77,15 @@ panfrost_shader_compile(struct panfrost_screen *screen, const nir_shader *ir,
 
    nir_shader *s = nir_shader_clone(NULL, ir);
 
+   /* While graphics shaders are preprocessed at CSO create time, compute
+    * kernels are not preprocessed until they're cloned since the driver does
+    * not get ownership of the NIR from compute CSOs. Do this preprocessing now.
+    * Compute CSOs call this function during create time, so preprocessing
+    * happens at CSO create time regardless.
+    */
+   if (gl_shader_stage_is_compute(s->info.stage))
+      pan_shader_preprocess(s, dev->gpu_id);
+
    struct panfrost_compile_inputs inputs = {
       .debug = dbg,
       .gpu_id = dev->gpu_id,
@@ -109,6 +118,14 @@ panfrost_shader_compile(struct panfrost_screen *screen, const nir_shader *ir,
    }
 
    util_dynarray_init(&out->binary, NULL);
+   pan_shader_preprocess(s, inputs.gpu_id);
+
+   if (dev->arch <= 5 && s->info.stage == MESA_SHADER_FRAGMENT) {
+      NIR_PASS_V(s, pan_lower_framebuffer, key->fs.rt_formats,
+                 pan_raw_format_mask_midgard(key->fs.rt_formats), false,
+                 dev->gpu_id < 0x700);
+   }
+
    screen->vtbl.compile_shader(s, &inputs, &out->binary, &out->info);
 
    assert(req_local_mem >= out->info.wls_size);
diff --git a/src/panfrost/compiler/bifrost_compile.c b/src/panfrost/compiler/bifrost_compile.c
index e34172a..3edba4e 100644
@@ -4716,7 +4716,7 @@ bi_lower_sample_mask_writes(nir_builder *b, nir_instr *instr, void *data)
 }
 
 static bool
-bi_lower_load_output(nir_builder *b, nir_instr *instr, void *data)
+bi_lower_load_output(nir_builder *b, nir_instr *instr, UNUSED void *data)
 {
    if (instr->type != nir_instr_type_intrinsic)
       return false;
@@ -4734,15 +4734,6 @@ bi_lower_load_output(nir_builder *b, nir_instr *instr, void *data)
    nir_ssa_def *conversion = nir_load_rt_conversion_pan(
       b, .base = rt, .src_type = nir_intrinsic_dest_type(intr));
 
-   /* TODO: This should be optimized/lowered by the driver */
-   const struct panfrost_compile_inputs *inputs = data;
-
-   if (inputs->is_blend) {
-      conversion = nir_imm_int(b, inputs->blend.bifrost_blend_desc >> 32);
-   } else if (inputs->bifrost.static_rt_conv) {
-      conversion = nir_imm_int(b, inputs->bifrost.rt_conv[rt]);
-   }
-
    nir_ssa_def *lowered = nir_load_converted_output_pan(
       b, nir_dest_num_components(intr->dest), nir_dest_bit_size(intr->dest),
       conversion, .dest_type = nir_intrinsic_dest_type(intr),
@@ -4753,8 +4744,7 @@ bi_lower_load_output(nir_builder *b, nir_instr *instr, void *data)
 }
 
 void
-bifrost_preprocess_nir(nir_shader *nir,
-                       const struct panfrost_compile_inputs *inputs)
+bifrost_preprocess_nir(nir_shader *nir, unsigned gpu_id)
 {
    /* Lower gl_Position pre-optimisation, but after lowering vars to ssa
     * (so we don't accidentally duplicate the epilogue since mesa/st has
@@ -4781,7 +4771,7 @@ bifrost_preprocess_nir(nir_shader *nir,
     * (currently unconditional for Valhall), we force vec4 alignment for
     * scratch access.
     */
-   bool packed_tls = (inputs->gpu_id >= 0x9000);
+   bool packed_tls = (gpu_id >= 0x9000);
 
    /* Lower large arrays to scratch and small arrays to bcsel */
    NIR_PASS_V(nir, nir_lower_vars_to_scratch, nir_var_function_temp, 256,
@@ -4810,10 +4800,9 @@ bifrost_preprocess_nir(nir_shader *nir,
                  nir_metadata_block_index | nir_metadata_dominance, NULL);
 
       NIR_PASS_V(nir, nir_shader_instructions_pass, bi_lower_load_output,
-                 nir_metadata_block_index | nir_metadata_dominance,
-                 (void *)inputs);
+                 nir_metadata_block_index | nir_metadata_dominance, NULL);
    } else if (nir->info.stage == MESA_SHADER_VERTEX) {
-      if (inputs->gpu_id >= 0x9000) {
+      if (gpu_id >= 0x9000) {
          NIR_PASS_V(nir, nir_lower_mediump_io, nir_var_shader_out,
                     BITFIELD64_BIT(VARYING_SLOT_PSIZ), false);
       }
@@ -5251,7 +5240,6 @@ bifrost_compile_shader_nir(nir_shader *nir,
 {
    bifrost_debug = debug_get_option_bifrost_debug();
 
-   bifrost_preprocess_nir(nir, inputs);
    bi_optimize_nir(nir, inputs->gpu_id, inputs->is_blend);
 
    struct hash_table_u64 *sysval_to_id =
diff --git a/src/panfrost/compiler/bifrost_compile.h b/src/panfrost/compiler/bifrost_compile.h
index a746703..dca1aba 100644
@@ -28,8 +28,7 @@
 #include "panfrost/util/pan_ir.h"
 #include "util/u_dynarray.h"
 
-void bifrost_preprocess_nir(nir_shader *nir,
-                            const struct panfrost_compile_inputs *inputs);
+void bifrost_preprocess_nir(nir_shader *nir, unsigned gpu_id);
 
 void bifrost_compile_shader_nir(nir_shader *nir,
                                 const struct panfrost_compile_inputs *inputs,
diff --git a/src/panfrost/lib/pan_blend.c b/src/panfrost/lib/pan_blend.c
index 9967590..e36506a 100644
@@ -763,6 +763,42 @@ GENX(pan_blend_get_internal_desc)(const struct panfrost_device *dev,
 
    return res;
 }
+
+struct rt_conversion_inputs {
+   const struct panfrost_device *dev;
+   enum pipe_format *formats;
+};
+
+static bool
+inline_rt_conversion(nir_builder *b, nir_instr *instr, void *data)
+{
+   if (instr->type != nir_instr_type_intrinsic)
+      return false;
+
+   nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);
+   if (intr->intrinsic != nir_intrinsic_load_rt_conversion_pan)
+      return false;
+
+   struct rt_conversion_inputs *inputs = data;
+   unsigned rt = nir_intrinsic_base(intr);
+   unsigned size = nir_alu_type_get_type_size(nir_intrinsic_src_type(intr));
+   uint64_t conversion = GENX(pan_blend_get_internal_desc)(
+      inputs->dev, inputs->formats[rt], rt, size, false);
+
+   b->cursor = nir_after_instr(instr);
+   nir_ssa_def_rewrite_uses(&intr->dest.ssa, nir_imm_int(b, conversion >> 32));
+   return true;
+}
+
+bool
+GENX(pan_inline_rt_conversion)(nir_shader *s, const struct panfrost_device *dev,
+                               enum pipe_format *formats)
+{
+   return nir_shader_instructions_pass(
+      s, inline_rt_conversion,
+      nir_metadata_block_index | nir_metadata_dominance,
+      &(struct rt_conversion_inputs){.dev = dev, .formats = formats});
+}
 #endif
 
 struct pan_blend_shader_variant *
@@ -843,6 +879,11 @@ GENX(pan_blend_get_shader_locked)(const struct panfrost_device *dev,
 #endif
 
    struct pan_shader_info info;
+   pan_shader_preprocess(nir, inputs.gpu_id);
+
+#if PAN_ARCH >= 6
+   NIR_PASS_V(nir, GENX(pan_inline_rt_conversion), dev, inputs.rt_formats);
+#endif
 
    GENX(pan_shader_compile)(nir, &inputs, &variant->binary, &info);
 
diff --git a/src/panfrost/lib/pan_blend.h b/src/panfrost/lib/pan_blend.h
index 8b826d4..914b9a1 100644
@@ -161,6 +161,10 @@ nir_shader *GENX(pan_blend_create_shader)(const struct panfrost_device *dev,
 uint64_t GENX(pan_blend_get_internal_desc)(const struct panfrost_device *dev,
                                            enum pipe_format fmt, unsigned rt,
                                            unsigned force_size, bool dithered);
+
+bool GENX(pan_inline_rt_conversion)(nir_shader *s,
+                                    const struct panfrost_device *dev,
+                                    enum pipe_format *formats);
 #endif
 
 /* Take blend_shaders.lock before calling this function and release it when
diff --git a/src/panfrost/lib/pan_blitter.c b/src/panfrost/lib/pan_blitter.c
index 44e59af..205b67e 100644
@@ -631,6 +631,8 @@ pan_blitter_get_blit_shader(struct panfrost_device *dev,
    for (unsigned i = 0; i < active_count; ++i)
       BITSET_SET(b.shader->info.textures_used, i);
 
+   pan_shader_preprocess(b.shader, inputs.gpu_id);
+
    if (PAN_ARCH == 4) {
       NIR_PASS_V(b.shader, nir_shader_instructions_pass,
                  lower_sampler_parameters,
diff --git a/src/panfrost/lib/pan_indirect_dispatch.c b/src/panfrost/lib/pan_indirect_dispatch.c
index 8f7e75e..019db47 100644
@@ -130,6 +130,7 @@ pan_indirect_dispatch_init(struct panfrost_device *dev)
    struct util_dynarray binary;
 
    util_dynarray_init(&binary, NULL);
+   pan_shader_preprocess(b.shader, inputs.gpu_id);
    GENX(pan_shader_compile)(b.shader, &inputs, &binary, &shader_info);
 
    ralloc_free(b.shader);
diff --git a/src/panfrost/lib/pan_shader.c b/src/panfrost/lib/pan_shader.c
index eb683de..afe1ccf 100644
@@ -83,6 +83,27 @@ GENX(pan_fixup_blend_type)(nir_alu_type T_size, enum pipe_format format)
 #endif
 #endif
 
+/* This is only needed on Midgard. It's the same on both v4 and v5, so only
+ * compile once to avoid the GenXML dependency for calls.
+ */
+#if PAN_ARCH == 5
+uint8_t
+pan_raw_format_mask_midgard(enum pipe_format *formats)
+{
+   uint8_t out = 0;
+
+   for (unsigned i = 0; i < 8; i++) {
+      enum pipe_format fmt = formats[i];
+      unsigned wb_fmt = panfrost_blendable_formats_v6[fmt].writeback;
+
+      if (wb_fmt < MALI_COLOR_FORMAT_R8)
+         out |= BITFIELD_BIT(i);
+   }
+
+   return out;
+}
+#endif
+
 void
 GENX(pan_shader_compile)(nir_shader *s, struct panfrost_compile_inputs *inputs,
                          struct util_dynarray *binary,
@@ -93,14 +114,6 @@ GENX(pan_shader_compile)(nir_shader *s, struct panfrost_compile_inputs *inputs,
 #if PAN_ARCH >= 6
    bifrost_compile_shader_nir(s, inputs, binary, info);
 #else
-   for (unsigned i = 0; i < ARRAY_SIZE(inputs->rt_formats); i++) {
-      enum pipe_format fmt = inputs->rt_formats[i];
-      unsigned wb_fmt = panfrost_blendable_formats_v6[fmt].writeback;
-
-      if (wb_fmt < MALI_COLOR_FORMAT_R8)
-         inputs->raw_fmt_mask |= BITFIELD_BIT(i);
-   }
-
    midgard_compile_shader_nir(s, inputs, binary, info);
 #endif
 
diff --git a/src/panfrost/lib/pan_shader.h b/src/panfrost/lib/pan_shader.h
index 406db3d..df955af 100644
 
 struct panfrost_device;
 
+void bifrost_preprocess_nir(nir_shader *nir, unsigned gpu_id);
+void midgard_preprocess_nir(nir_shader *nir, unsigned gpu_id);
+
+static inline void
+pan_shader_preprocess(nir_shader *nir, unsigned gpu_id)
+{
+   if (pan_arch(gpu_id) >= 6)
+      bifrost_preprocess_nir(nir, gpu_id);
+   else
+      midgard_preprocess_nir(nir, gpu_id);
+}
+
+uint8_t pan_raw_format_mask_midgard(enum pipe_format *formats);
+
 #ifdef PAN_ARCH
 const nir_shader_compiler_options *GENX(pan_shader_get_compiler_options)(void);
 
diff --git a/src/panfrost/midgard/midgard_compile.c b/src/panfrost/midgard/midgard_compile.c
index 35e40ac..2081c40 100644
@@ -40,7 +40,6 @@
 #include "util/u_dynarray.h"
 #include "util/u_math.h"
 
-#include "panfrost/util/pan_lower_framebuffer.h"
 #include "compiler.h"
 #include "helpers.h"
 #include "midgard.h"
@@ -330,10 +329,9 @@ midgard_vectorize_filter(const nir_instr *instr, const void *data)
 }
 
 void
-midgard_preprocess_nir(nir_shader *nir,
-                       const struct panfrost_compile_inputs *inputs)
+midgard_preprocess_nir(nir_shader *nir, unsigned gpu_id)
 {
-   unsigned quirks = midgard_get_quirks(inputs->gpu_id);
+   unsigned quirks = midgard_get_quirks(gpu_id);
 
    /* Lower gl_Position pre-optimisation, but after lowering vars to ssa
     * (so we don't accidentally duplicate the epilogue since mesa/st has
@@ -391,10 +389,9 @@ midgard_preprocess_nir(nir_shader *nir,
    NIR_PASS_V(nir, nir_lower_tex, &lower_tex_options);
 
    /* TEX_GRAD fails to apply sampler descriptor settings on some
-    * implementations, requiring a lowering. However, blit shaders do not
-    * use the affected settings and should skip the workaround.
+    * implementations, requiring a lowering.
     */
-   if ((quirks & MIDGARD_BROKEN_LOD) && !inputs->is_blit)
+   if (quirks & MIDGARD_BROKEN_LOD)
       NIR_PASS_V(nir, midgard_nir_lod_errata);
 
    /* Midgard image ops coordinates are 16-bit instead of 32-bit */
@@ -417,12 +414,6 @@ midgard_preprocess_nir(nir_shader *nir,
    NIR_PASS_V(nir, nir_lower_alu_to_scalar, mdg_should_scalarize, NULL);
    NIR_PASS_V(nir, nir_lower_flrp, 16 | 32 | 64, false /* always_precise */);
    NIR_PASS_V(nir, nir_lower_var_copies);
-
-   if (nir->info.stage == MESA_SHADER_FRAGMENT) {
-      NIR_PASS_V(nir, pan_lower_framebuffer, inputs->rt_formats,
-                 inputs->raw_fmt_mask, inputs->is_blend,
-                 quirks & MIDGARD_BROKEN_BLEND_LOADS);
-   }
 }
 
 static void
@@ -3177,8 +3168,6 @@ midgard_compile_shader_nir(nir_shader *nir,
 
    ctx->ssa_constants = _mesa_hash_table_u64_create(ctx);
 
-   midgard_preprocess_nir(nir, inputs);
-
    /* Collect varyings after lowering I/O */
    pan_nir_collect_varyings(nir, info);
 
diff --git a/src/panfrost/midgard/midgard_compile.h b/src/panfrost/midgard/midgard_compile.h
index 0ea6ead..d2a1cd0 100644
@@ -29,8 +29,7 @@
 #include "panfrost/util/pan_ir.h"
 #include "util/u_dynarray.h"
 
-void midgard_preprocess_nir(nir_shader *nir,
-                            const struct panfrost_compile_inputs *inputs);
+void midgard_preprocess_nir(nir_shader *nir, unsigned gpu_id);
 
 void midgard_compile_shader_nir(nir_shader *nir,
                                 const struct panfrost_compile_inputs *inputs,
diff --git a/src/panfrost/util/pan_ir.h b/src/panfrost/util/pan_ir.h
index 41fdcd1..97df6eb 100644
@@ -184,7 +184,6 @@ struct panfrost_compile_inputs {
    bool no_ubo_to_push;
 
    enum pipe_format rt_formats[8];
-   uint8_t raw_fmt_mask;
 
    /* Used on Valhall.
     *
@@ -198,7 +197,6 @@ struct panfrost_compile_inputs {
 
    union {
       struct {
-         bool static_rt_conv;
          uint32_t rt_conv[8];
       } bifrost;
    };
diff --git a/src/panfrost/vulkan/panvk_vX_meta_clear.c b/src/panfrost/vulkan/panvk_vX_meta_clear.c
index cc28748..32f8e64 100644
@@ -61,6 +61,7 @@ panvk_meta_clear_color_attachment_shader(struct panfrost_device *pdev,
    struct util_dynarray binary;
 
    util_dynarray_init(&binary, NULL);
+   pan_shader_preprocess(b.shader, inputs.gpu_id);
    GENX(pan_shader_compile)(b.shader, &inputs, &binary, shader_info);
 
    shader_info->push.count = 4;
diff --git a/src/panfrost/vulkan/panvk_vX_meta_copy.c b/src/panfrost/vulkan/panvk_vX_meta_copy.c
index 3bb2218..3aec1f9 100644
@@ -449,17 +449,11 @@ panvk_meta_copy_img2img_shader(struct panfrost_device *pdev,
       .no_ubo_to_push = true,
    };
 
-   pan_pack(&inputs.bifrost.rt_conv[0], INTERNAL_CONVERSION, cfg) {
-      cfg.memory_format = (dstcompsz == 2 ? MALI_RG16UI : MALI_RG32UI) << 12;
-      cfg.register_format = dstcompsz == 2 ?
-                            MALI_REGISTER_FILE_FORMAT_U16 :
-                            MALI_REGISTER_FILE_FORMAT_U32;
-   }
-   inputs.bifrost.static_rt_conv = true;
-
    struct util_dynarray binary;
 
    util_dynarray_init(&binary, NULL);
+   pan_shader_preprocess(b.shader, inputs.gpu_id);
+   NIR_PASS_V(b.shader, GENX(pan_inline_rt_conversion), pdev, &dstfmt);
    GENX(pan_shader_compile)(b.shader, &inputs, &binary, shader_info);
 
    shader_info->fs.sample_shading = is_ms;
@@ -984,17 +978,14 @@ panvk_meta_copy_buf2img_shader(struct panfrost_device *pdev,
       .no_ubo_to_push = true,
    };
 
-   pan_pack(&inputs.bifrost.rt_conv[0], INTERNAL_CONVERSION, cfg) {
-      cfg.memory_format = (imgcompsz == 2 ? MALI_RG16UI : MALI_RG32UI) << 12;
-      cfg.register_format = imgcompsz == 2 ?
-                            MALI_REGISTER_FILE_FORMAT_U16 :
-                            MALI_REGISTER_FILE_FORMAT_U32;
-   }
-   inputs.bifrost.static_rt_conv = true;
-
    struct util_dynarray binary;
 
    util_dynarray_init(&binary, NULL);
+   pan_shader_preprocess(b.shader, inputs.gpu_id);
+
+   enum pipe_format rt_formats[8] = {key.imgfmt};
+   NIR_PASS_V(b.shader, GENX(pan_inline_rt_conversion), pdev, rt_formats);
+
    GENX(pan_shader_compile)(b.shader, &inputs, &binary, shader_info);
    shader_info->push.count = DIV_ROUND_UP(sizeof(struct panvk_meta_copy_buf2img_info), 4);
 
@@ -1434,6 +1425,7 @@ panvk_meta_copy_img2buf_shader(struct panfrost_device *pdev,
    struct util_dynarray binary;
 
    util_dynarray_init(&binary, NULL);
+   pan_shader_preprocess(b.shader, inputs.gpu_id);
    GENX(pan_shader_compile)(b.shader, &inputs, &binary, shader_info);
 
    shader_info->push.count = DIV_ROUND_UP(sizeof(struct panvk_meta_copy_img2buf_info), 4);
@@ -1662,6 +1654,7 @@ panvk_meta_copy_buf2buf_shader(struct panfrost_device *pdev,
    struct util_dynarray binary;
 
    util_dynarray_init(&binary, NULL);
+   pan_shader_preprocess(b.shader, inputs.gpu_id);
    GENX(pan_shader_compile)(b.shader, &inputs, &binary, shader_info);
 
    shader_info->push.count = DIV_ROUND_UP(sizeof(struct panvk_meta_copy_buf2buf_info), 4);
@@ -1791,6 +1784,7 @@ panvk_meta_fill_buf_shader(struct panfrost_device *pdev,
    struct util_dynarray binary;
 
    util_dynarray_init(&binary, NULL);
+   pan_shader_preprocess(b.shader, inputs.gpu_id);
    GENX(pan_shader_compile)(b.shader, &inputs, &binary, shader_info);
 
    shader_info->push.count = DIV_ROUND_UP(sizeof(struct panvk_meta_fill_buf_info), 4);
diff --git a/src/panfrost/vulkan/panvk_vX_shader.c b/src/panfrost/vulkan/panvk_vX_shader.c
index fde8445..6061cd1 100644
@@ -153,10 +153,6 @@ panvk_lower_blend(struct panfrost_device *pdev,
       rt_state->equation.alpha_dst_factor = BLEND_FACTOR_ZERO;
       rt_state->equation.alpha_invert_dst_factor = false;
       lower_blend = true;
-
-      inputs->bifrost.static_rt_conv = true;
-      inputs->bifrost.rt_conv[rt] =
-         GENX(pan_blend_get_internal_desc)(pdev, fmt, rt, 32, false) >> 32;
    }
 
    if (lower_blend) {
@@ -371,6 +367,17 @@ panvk_per_arch(shader_create)(struct panvk_device *dev,
       nir_print_shader(nir, stderr);
    }
 
+   pan_shader_preprocess(nir, inputs.gpu_id);
+
+   if (stage == MESA_SHADER_FRAGMENT) {
+      enum pipe_format rt_formats[MAX_RTS] = {PIPE_FORMAT_NONE};
+
+      for (unsigned rt = 0; rt < MAX_RTS; ++rt)
+         rt_formats[rt] = blend_state->rts[rt].format;
+
+      NIR_PASS_V(nir, GENX(pan_inline_rt_conversion), pdev, rt_formats);
+   }
+
    GENX(pan_shader_compile)(nir, &inputs, &shader->binary, &shader->info);
 
    /* System values shouldn't have changed */