intel/fs: handle URB setup for fast linked mesh pipelines
authorIván Briano <ivan.briano@intel.com>
Tue, 5 Sep 2023 05:31:17 +0000 (22:31 -0700)
committerMarge Bot <emma+marge@anholt.net>
Tue, 12 Sep 2023 02:51:31 +0000 (02:51 +0000)
Up until now, the mesh pipeline assumed it would be always linked to the
fragment shader, and so the calculated MUE map would always be
available.
That is not the case for fast linked pipeline libraries, so the URB
setup needs to account for this. We do this by replicating what's done
for non-mesh pipelines, defining the URB based on the FS inputs, and
always assuming they will be laid out in order of varying number, except
that we also account for per-primitive attributes.

Fixes all GPL using tests under dEQP-VK.mesh_shader.ext.smoke.*

Reviewed-by: Caio Oliveira <caio.oliveira@intel.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/25047>

src/intel/compiler/brw_fs.cpp

index a886be7..f7507da 100644 (file)
@@ -1809,7 +1809,7 @@ calculate_urb_setup(const struct intel_device_info *devinfo,
       nir->info.inputs_read & ~nir->info.per_primitive_inputs;
 
    /* Figure out where each of the incoming setup attributes lands. */
-   if (mue_map) {
+   if (key->mesh_input != BRW_NEVER) {
       /* Per-Primitive Attributes are laid out by Hardware before the regular
        * attributes, so order them like this to make easy later to map setup
        * into real HW registers.
@@ -1818,9 +1818,6 @@ calculate_urb_setup(const struct intel_device_info *devinfo,
          uint64_t per_prim_inputs_read =
                nir->info.inputs_read & nir->info.per_primitive_inputs;
 
-         unsigned per_prim_start_dw = mue_map->per_primitive_start_dw;
-         unsigned per_prim_size_dw = mue_map->per_primitive_pitch_dw;
-
          /* In Mesh, PRIMITIVE_SHADING_RATE, VIEWPORT and LAYER slots
           * are always at the beginning, because they come from MUE
           * Primitive Header, not Per-Primitive Attributes.
@@ -1828,46 +1825,67 @@ calculate_urb_setup(const struct intel_device_info *devinfo,
          const uint64_t primitive_header_bits = VARYING_BIT_VIEWPORT |
                                                 VARYING_BIT_LAYER |
                                                 VARYING_BIT_PRIMITIVE_SHADING_RATE;
-         bool reads_header = (per_prim_inputs_read & primitive_header_bits) != 0;
 
-         if (reads_header || mue_map->user_data_in_primitive_header) {
-            /* Primitive Shading Rate, Layer and Viewport live in the same
-             * 4-dwords slot (psr is dword 0, layer is dword 1, and viewport
-             * is dword 2).
-             */
-            if (per_prim_inputs_read & VARYING_BIT_PRIMITIVE_SHADING_RATE)
-               prog_data->urb_setup[VARYING_SLOT_PRIMITIVE_SHADING_RATE] = 0;
+         if (mue_map) {
+            unsigned per_prim_start_dw = mue_map->per_primitive_start_dw;
+            unsigned per_prim_size_dw = mue_map->per_primitive_pitch_dw;
 
-            if (per_prim_inputs_read & VARYING_BIT_LAYER)
-               prog_data->urb_setup[VARYING_SLOT_LAYER] = 0;
+            bool reads_header = (per_prim_inputs_read & primitive_header_bits) != 0;
 
-            if (per_prim_inputs_read & VARYING_BIT_VIEWPORT)
-               prog_data->urb_setup[VARYING_SLOT_VIEWPORT] = 0;
+            if (reads_header || mue_map->user_data_in_primitive_header) {
+               /* Primitive Shading Rate, Layer and Viewport live in the same
+                * 4-dwords slot (psr is dword 0, layer is dword 1, and viewport
+                * is dword 2).
+                */
+               if (per_prim_inputs_read & VARYING_BIT_PRIMITIVE_SHADING_RATE)
+                  prog_data->urb_setup[VARYING_SLOT_PRIMITIVE_SHADING_RATE] = 0;
 
-            per_prim_inputs_read &= ~primitive_header_bits;
-         } else {
-            /* If fs doesn't need primitive header, then it won't be made
-             * available through SBE_MESH, so we have to skip them when
-             * calculating offset from start of per-prim data.
-             */
-            per_prim_start_dw += mue_map->per_primitive_header_size_dw;
-            per_prim_size_dw -= mue_map->per_primitive_header_size_dw;
-         }
+               if (per_prim_inputs_read & VARYING_BIT_LAYER)
+                  prog_data->urb_setup[VARYING_SLOT_LAYER] = 0;
 
-         u_foreach_bit64(i, per_prim_inputs_read) {
-            int start = mue_map->start_dw[i];
+               if (per_prim_inputs_read & VARYING_BIT_VIEWPORT)
+                  prog_data->urb_setup[VARYING_SLOT_VIEWPORT] = 0;
 
-            assert(start >= 0);
-            assert(mue_map->len_dw[i] > 0);
+               per_prim_inputs_read &= ~primitive_header_bits;
+            } else {
+               /* If fs doesn't need primitive header, then it won't be made
+                * available through SBE_MESH, so we have to skip them when
+                * calculating offset from start of per-prim data.
+                */
+               per_prim_start_dw += mue_map->per_primitive_header_size_dw;
+               per_prim_size_dw -= mue_map->per_primitive_header_size_dw;
+            }
 
-            assert(unsigned(start) >= per_prim_start_dw);
-            unsigned pos_dw = unsigned(start) - per_prim_start_dw;
+            u_foreach_bit64(i, per_prim_inputs_read) {
+               int start = mue_map->start_dw[i];
 
-            prog_data->urb_setup[i] = urb_next + pos_dw / 4;
-            prog_data->urb_setup_channel[i] = pos_dw % 4;
-         }
+               assert(start >= 0);
+               assert(mue_map->len_dw[i] > 0);
 
-         urb_next = per_prim_size_dw / 4;
+               assert(unsigned(start) >= per_prim_start_dw);
+               unsigned pos_dw = unsigned(start) - per_prim_start_dw;
+
+               prog_data->urb_setup[i] = urb_next + pos_dw / 4;
+               prog_data->urb_setup_channel[i] = pos_dw % 4;
+            }
+
+            urb_next = per_prim_size_dw / 4;
+         } else {
+            /* With no MUE map, we never read the primitive header, and
+             * per-primitive attributes won't be packed either, so just lay
+             * them in varying order.
+             */
+            per_prim_inputs_read &= ~primitive_header_bits;
+
+            for (unsigned i = 0; i < VARYING_SLOT_MAX; i++) {
+               if (per_prim_inputs_read & BITFIELD64_BIT(i)) {
+                  prog_data->urb_setup[i] = urb_next++;
+               }
+            }
+
+            /* The actual setup attributes later must be aligned to a full GRF. */
+            urb_next = ALIGN(urb_next, 2);
+         }
 
          prog_data->num_per_primitive_inputs = urb_next;
       }
@@ -1878,52 +1896,68 @@ calculate_urb_setup(const struct intel_device_info *devinfo,
       uint64_t unique_fs_attrs = inputs_read & BRW_FS_VARYING_INPUT_MASK;
 
       if (inputs_read & clip_dist_bits) {
-         assert(mue_map->per_vertex_header_size_dw > 8);
+         assert(!mue_map || mue_map->per_vertex_header_size_dw > 8);
          unique_fs_attrs &= ~clip_dist_bits;
       }
 
-      unsigned per_vertex_start_dw = mue_map->per_vertex_start_dw;
-      unsigned per_vertex_size_dw = mue_map->per_vertex_pitch_dw;
+      if (mue_map) {
+         unsigned per_vertex_start_dw = mue_map->per_vertex_start_dw;
+         unsigned per_vertex_size_dw = mue_map->per_vertex_pitch_dw;
 
-      /* Per-Vertex header is available to fragment shader only if there's
-       * user data there.
-       */
-      if (!mue_map->user_data_in_vertex_header) {
-         per_vertex_start_dw += 8;
-         per_vertex_size_dw -= 8;
-      }
+         /* Per-Vertex header is available to fragment shader only if there's
+          * user data there.
+          */
+         if (!mue_map->user_data_in_vertex_header) {
+            per_vertex_start_dw += 8;
+            per_vertex_size_dw -= 8;
+         }
 
-      /* In Mesh, CLIP_DIST slots are always at the beginning, because
-       * they come from MUE Vertex Header, not Per-Vertex Attributes.
-       */
-      if (inputs_read & clip_dist_bits) {
-         prog_data->urb_setup[VARYING_SLOT_CLIP_DIST0] = urb_next;
-         prog_data->urb_setup[VARYING_SLOT_CLIP_DIST1] = urb_next + 1;
-      } else if (mue_map->per_vertex_header_size_dw > 8) {
-         /* Clip distances are in MUE, but we are not reading them in FS. */
-         per_vertex_start_dw += 8;
-         per_vertex_size_dw -= 8;
-      }
+         /* In Mesh, CLIP_DIST slots are always at the beginning, because
+          * they come from MUE Vertex Header, not Per-Vertex Attributes.
+          */
+         if (inputs_read & clip_dist_bits) {
+            prog_data->urb_setup[VARYING_SLOT_CLIP_DIST0] = urb_next;
+            prog_data->urb_setup[VARYING_SLOT_CLIP_DIST1] = urb_next + 1;
+         } else if (mue_map && mue_map->per_vertex_header_size_dw > 8) {
+            /* Clip distances are in MUE, but we are not reading them in FS. */
+            per_vertex_start_dw += 8;
+            per_vertex_size_dw -= 8;
+         }
 
-      /* Per-Vertex attributes are laid out ordered.  Because we always link
-       * Mesh and Fragment shaders, the which slots are written and read by
-       * each of them will match. */
+         /* Per-Vertex attributes are laid out ordered.  Because we always link
+          * Mesh and Fragment shaders, the which slots are written and read by
+          * each of them will match. */
+         u_foreach_bit64(i, unique_fs_attrs) {
+            int start = mue_map->start_dw[i];
 
-      u_foreach_bit64(i, unique_fs_attrs) {
-         int start = mue_map->start_dw[i];
+            assert(start >= 0);
+            assert(mue_map->len_dw[i] > 0);
 
-         assert(start >= 0);
-         assert(mue_map->len_dw[i] > 0);
+            assert(unsigned(start) >= per_vertex_start_dw);
+            unsigned pos_dw = unsigned(start) - per_vertex_start_dw;
 
-         assert(unsigned(start) >= per_vertex_start_dw);
-         unsigned pos_dw = unsigned(start) - per_vertex_start_dw;
+            prog_data->urb_setup[i] = urb_next + pos_dw / 4;
+            prog_data->urb_setup_channel[i] = pos_dw % 4;
+         }
 
-         prog_data->urb_setup[i] = urb_next + pos_dw / 4;
-         prog_data->urb_setup_channel[i] = pos_dw % 4;
-      }
+         urb_next += per_vertex_size_dw / 4;
+      } else {
+         /* If we don't have an MUE map, just lay down the inputs the FS reads
+          * in varying order, as we do for the legacy pipeline.
+          */
+         if (inputs_read & clip_dist_bits) {
+            prog_data->urb_setup[VARYING_SLOT_CLIP_DIST0] = urb_next++;
+            prog_data->urb_setup[VARYING_SLOT_CLIP_DIST1] = urb_next++;
+         }
 
-      urb_next += per_vertex_size_dw / 4;
+         for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
+            if (unique_fs_attrs & BITFIELD64_BIT(i))
+               prog_data->urb_setup[i] = urb_next++;
+         }
+      }
    } else if (devinfo->ver >= 6) {
+      assert(!nir->info.per_primitive_inputs);
+
       uint64_t vue_header_bits =
          VARYING_BIT_PSIZ | VARYING_BIT_LAYER | VARYING_BIT_VIEWPORT;