asahi: Arrange VS varyings in the correct order
authorAsahi Lina <lina@asahilina.net>
Wed, 28 Jun 2023 10:15:28 +0000 (19:15 +0900)
committerMarge Bot <emma+marge@anholt.net>
Wed, 5 Jul 2023 05:11:49 +0000 (05:11 +0000)
The GPU ABI requires varyings to be grouped as follows:

- Position
- Smooth shaded fp32
- Flat shaded fp32
- Linear shaded fp32
- Smooth shaded fp16
- Flat shaded fp16
- Linear shaded fp16
- Point size

Use the flat shaded mask info we now have in the vertex shader key to
sort things properly, and pass the counts to the hardware.
FP16 is still TODO.

Signed-off-by: Asahi Lina <lina@asahilina.net>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/23998>

src/asahi/compiler/agx_compile.c
src/asahi/compiler/agx_compile.h
src/gallium/drivers/asahi/agx_state.c

index 43b536c..6ab3fed 100644 (file)
@@ -2018,7 +2018,8 @@ agx_optimize_nir(nir_shader *nir, unsigned *preamble_size)
 
 /* ABI: position first, then user, then psiz */
 static void
-agx_remap_varyings_vs(nir_shader *nir, struct agx_varyings_vs *varyings)
+agx_remap_varyings_vs(nir_shader *nir, struct agx_varyings_vs *varyings,
+                      struct agx_shader_key *key)
 {
    unsigned base = 0;
 
@@ -2033,16 +2034,47 @@ agx_remap_varyings_vs(nir_shader *nir, struct agx_varyings_vs *varyings)
    varyings->slots[VARYING_SLOT_POS] = base;
    base += 4;
 
-   u_foreach_bit64(loc, nir->info.outputs_written) {
+   assert(!(key->vs.outputs_flat_shaded & key->vs.outputs_linear_shaded));
+
+   /* Smooth 32-bit user bindings go next */
+   u_foreach_bit64(loc, nir->info.outputs_written &
+                           ~key->vs.outputs_flat_shaded &
+                           ~key->vs.outputs_linear_shaded) {
+      if (loc == VARYING_SLOT_POS || loc == VARYING_SLOT_PSIZ)
+         continue;
+
+      varyings->slots[loc] = base;
+      base += 4;
+      varyings->num_32_smooth += 4;
+   }
+
+   /* Flat 32-bit user bindings go next */
+   u_foreach_bit64(loc,
+                   nir->info.outputs_written & key->vs.outputs_flat_shaded) {
+      if (loc == VARYING_SLOT_POS || loc == VARYING_SLOT_PSIZ)
+         continue;
+
+      varyings->slots[loc] = base;
+      base += 4;
+      varyings->num_32_flat += 4;
+   }
+
+   /* Linear 32-bit user bindings go next */
+   u_foreach_bit64(loc,
+                   nir->info.outputs_written & key->vs.outputs_linear_shaded) {
       if (loc == VARYING_SLOT_POS || loc == VARYING_SLOT_PSIZ)
          continue;
 
       varyings->slots[loc] = base;
       base += 4;
+      varyings->num_32_linear += 4;
    }
 
    /* TODO: Link FP16 varyings */
    varyings->base_index_fp16 = base;
+   varyings->num_16_smooth = 0;
+   varyings->num_16_flat = 0;
+   varyings->num_16_linear = 0;
 
    if (nir->info.outputs_written & VARYING_BIT_PSIZ) {
       varyings->slots[VARYING_SLOT_PSIZ] = base;
@@ -2471,7 +2503,7 @@ agx_compile_shader_nir(nir_shader *nir, struct agx_shader_key *key,
 
    /* Must be last since NIR passes can remap driver_location freely */
    if (nir->info.stage == MESA_SHADER_VERTEX)
-      agx_remap_varyings_vs(nir, &out->varyings.vs);
+      agx_remap_varyings_vs(nir, &out->varyings.vs, key);
 
    if (agx_should_dump(nir, AGX_DBG_SHADERS))
       nir_print_shader(nir, stdout);
index a38aa09..0ef4b93 100644 (file)
 #include "util/u_dynarray.h"
 
 struct agx_varyings_vs {
+   /* The number of user varyings of each type. The varyings must be allocated
+    * in this order ({smooth, flat, linear} × {32, 16}), which may require
+    * remapping.
+    */
+   unsigned num_32_smooth;
+   unsigned num_32_flat;
+   unsigned num_32_linear;
+   unsigned num_16_smooth;
+   unsigned num_16_flat;
+   unsigned num_16_linear;
+
    /* The first index used for FP16 varyings. Indices less than this are treated
     * as FP32. This may require remapping slots to guarantee.
     */
index 4ae696e..592ec62 100644 (file)
@@ -2212,7 +2212,6 @@ agx_batch_init_state(struct agx_batch *batch)
    struct agx_ppp_update ppp =
       agx_new_ppp_update(&batch->pool, (struct AGX_PPP_HEADER){
                                           .w_clamp = true,
-                                          .varying_counts_16 = true,
                                           .cull_2 = true,
                                           .occlusion_query_2 = true,
                                           .output_unknown = true,
@@ -2221,7 +2220,6 @@ agx_batch_init_state(struct agx_batch *batch)
 
    /* clang-format off */
    agx_ppp_push(&ppp, W_CLAMP, cfg) cfg.w_clamp = 1e-10;
-   agx_ppp_push(&ppp, VARYING_COUNTS, cfg);
    agx_ppp_push(&ppp, CULL_2, cfg);
    agx_ppp_push(&ppp, FRAGMENT_OCCLUSION_QUERY_2, cfg);
    agx_ppp_push(&ppp, OUTPUT_UNKNOWN, cfg);
@@ -2416,6 +2414,7 @@ agx_encode_state(struct agx_batch *batch, uint8_t *out, bool is_lines,
       .fragment_back_stencil = IS_DIRTY(ZS),
       .output_select = IS_DIRTY(VS_PROG) || IS_DIRTY(FS_PROG),
       .varying_counts_32 = IS_DIRTY(VS_PROG),
+      .varying_counts_16 = IS_DIRTY(VS_PROG),
       .cull = IS_DIRTY(RS),
       .fragment_shader =
          IS_DIRTY(FS) || varyings_dirty || IS_DIRTY(SAMPLE_MASK),
@@ -2506,9 +2505,19 @@ agx_encode_state(struct agx_batch *batch, uint8_t *out, bool is_lines,
       }
    }
 
+   assert(dirty.varying_counts_32 == dirty.varying_counts_16);
+
    if (dirty.varying_counts_32) {
       agx_ppp_push(&ppp, VARYING_COUNTS, cfg) {
-         cfg.smooth = agx_num_general_outputs(&ctx->vs->info.varyings.vs);
+         cfg.smooth = vs->info.varyings.vs.num_32_smooth;
+         cfg.flat = vs->info.varyings.vs.num_32_flat;
+         cfg.linear = vs->info.varyings.vs.num_32_linear;
+      }
+
+      agx_ppp_push(&ppp, VARYING_COUNTS, cfg) {
+         cfg.smooth = vs->info.varyings.vs.num_16_smooth;
+         cfg.flat = vs->info.varyings.vs.num_16_flat;
+         cfg.linear = vs->info.varyings.vs.num_16_linear;
       }
    }