anv: split pipeline programming into instructions
authorLionel Landwerlin <lionel.g.landwerlin@intel.com>
Tue, 1 Aug 2023 09:20:19 +0000 (12:20 +0300)
committerMarge Bot <emma+marge@anholt.net>
Wed, 6 Sep 2023 20:07:02 +0000 (20:07 +0000)
The goal of this change is to move away from a single batch buffer
containing all kind of pipeline instructions to a list of instructions
we can emit separately.

We will later implement pipeline diffing and finer state tracking that
will allow fewer instructions to be emitted.

This changes the following things :

   * instead of having a batch & partially packed instructions, move
     everything into the batch

   * add a set of pointers in the batch that allow us to point to each
     instruction (almost... we group some like URB instructions,
     etc...).

At pipeline emission time, we just go through all of those pointers
and emit the instruction into the batch. No additional packing is
involved.

Signed-off-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com>
Reviewed-by: Tapani Pälli <tapani.palli@intel.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/24536>

src/intel/vulkan/anv_batch_chain.c
src/intel/vulkan/anv_genX.h
src/intel/vulkan/anv_private.h
src/intel/vulkan/genX_cmd_buffer.c
src/intel/vulkan/genX_gfx_state.c
src/intel/vulkan/genX_pipeline.c

index 2f8039e..73f7b29 100644 (file)
@@ -136,7 +136,7 @@ anv_reloc_list_clear(struct anv_reloc_list *list)
       memset(list->deps, 0, list->dep_words * sizeof(BITSET_WORD));
 }
 
-static VkResult
+VkResult
 anv_reloc_list_append(struct anv_reloc_list *list,
                       struct anv_reloc_list *other)
 {
index 2779a85..e7ba120 100644 (file)
@@ -96,8 +96,9 @@ void genX(apply_task_urb_workaround)(struct anv_cmd_buffer *cmd_buffer);
 
 void genX(emit_vertex_input)(struct anv_batch *batch,
                              uint32_t *vertex_element_dws,
-                             const struct anv_graphics_pipeline *pipeline,
-                             const struct vk_vertex_input_state *vi);
+                             struct anv_graphics_pipeline *pipeline,
+                             const struct vk_vertex_input_state *vi,
+                             bool emit_in_pipeline);
 
 enum anv_pipe_bits
 genX(emit_apply_pipe_flushes)(struct anv_batch *batch,
@@ -125,7 +126,7 @@ void genX(emit_l3_config)(struct anv_batch *batch,
 void genX(cmd_buffer_config_l3)(struct anv_cmd_buffer *cmd_buffer,
                                 const struct intel_l3_config *cfg);
 
-void genX(cmd_buffer_flush_dynamic_state)(struct anv_cmd_buffer *cmd_buffer);
+void genX(cmd_buffer_flush_gfx_hw_state)(struct anv_cmd_buffer *cmd_buffer);
 
 void genX(cmd_buffer_enable_pma_fix)(struct anv_cmd_buffer *cmd_buffer,
                                      bool enable);
index 2122165..777c398 100644 (file)
@@ -1464,6 +1464,9 @@ anv_reloc_list_add_bo(struct anv_reloc_list *list, struct anv_bo *target_bo)
    return list->uses_relocs ? anv_reloc_list_add_bo_impl(list, target_bo) : VK_SUCCESS;
 }
 
+VkResult anv_reloc_list_append(struct anv_reloc_list *list,
+                               struct anv_reloc_list *other);
+
 struct anv_batch_bo {
    /* Link in the anv_cmd_buffer.owned_batch_bos list */
    struct list_head                             link;
@@ -1603,14 +1606,16 @@ _anv_combine_address(struct anv_batch *batch, void *location,
       __dst;                                               \
    })
 
-#define anv_batch_emit_merge(batch, cmd, prepacked, name)               \
+#define anv_batch_emit_merge(batch, cmd, pipeline, state, name)         \
    for (struct cmd name = { 0 },                                        \
         *_dst = anv_batch_emit_dwords(batch, __anv_cmd_length(cmd));    \
         __builtin_expect(_dst != NULL, 1);                              \
         ({ uint32_t _partial[__anv_cmd_length(cmd)];                    \
            __anv_cmd_pack(cmd)(batch, _partial, &name);                 \
-           for (uint32_t i = 0; i < __anv_cmd_length(cmd); i++)         \
-              ((uint32_t *)_dst)[i] = _partial[i] | (prepacked)[i];     \
+           for (uint32_t i = 0; i < __anv_cmd_length(cmd); i++) {       \
+              ((uint32_t *)_dst)[i] = _partial[i] |                     \
+                 (pipeline)->batch_data[(pipeline)->state.offset + i];  \
+           }                                                            \
            VG(VALGRIND_CHECK_MEM_IS_DEFINED(_dst, __anv_cmd_length(cmd) * 4)); \
            _dst = NULL;                                                 \
          }))
@@ -3515,6 +3520,12 @@ struct anv_graphics_lib_pipeline {
    bool                                         retain_shaders;
 };
 
+struct anv_gfx_state_ptr {
+   /* Both in dwords */
+   uint16_t  offset;
+   uint16_t  len;
+};
+
 /* The final graphics pipeline object has all the graphics state ready to be
  * programmed into HW packets (dynamic_state field) or fully baked in its
  * batch.
@@ -3564,7 +3575,7 @@ struct anv_graphics_pipeline {
     * this array only holds the svgs_count elements.
     */
    uint32_t                                     vertex_input_elems;
-   uint32_t                                     vertex_input_data[96];
+   uint32_t                                     vertex_input_data[2 * 31 /* MAX_VES + 2 internal */];
 
    enum brw_wm_msaa_flags                       fs_msaa_flags;
 
@@ -3575,25 +3586,75 @@ struct anv_graphics_pipeline {
 
    /* Fully backed instructions, ready to be emitted in the anv_cmd_buffer */
    struct {
-      uint32_t                                  hs[9];
-      uint32_t                                  ds[11];
+      struct anv_gfx_state_ptr                  urb;
+      struct anv_gfx_state_ptr                  vf_statistics;
+      struct anv_gfx_state_ptr                  vf_sgvs;
+      struct anv_gfx_state_ptr                  vf_sgvs_2;
+      struct anv_gfx_state_ptr                  vf_sgvs_instancing;
+      struct anv_gfx_state_ptr                  vf_instancing;
+      struct anv_gfx_state_ptr                  primitive_replication;
+      struct anv_gfx_state_ptr                  sbe;
+      struct anv_gfx_state_ptr                  sbe_swiz;
+      struct anv_gfx_state_ptr                  so_decl_list;
+      struct anv_gfx_state_ptr                  ms;
+      struct anv_gfx_state_ptr                  vs;
+      struct anv_gfx_state_ptr                  hs;
+      struct anv_gfx_state_ptr                  ds;
+      struct anv_gfx_state_ptr                  ps;
+      struct anv_gfx_state_ptr                  ps_extra;
+
+      struct anv_gfx_state_ptr                  task_control;
+      struct anv_gfx_state_ptr                  task_shader;
+      struct anv_gfx_state_ptr                  task_redistrib;
+      struct anv_gfx_state_ptr                  clip_mesh;
+      struct anv_gfx_state_ptr                  mesh_control;
+      struct anv_gfx_state_ptr                  mesh_shader;
+      struct anv_gfx_state_ptr                  mesh_distrib;
+      struct anv_gfx_state_ptr                  sbe_mesh;
    } final;
 
    /* Pre packed CS instructions & structures that need to be merged later
     * with dynamic state.
     */
    struct {
-      uint32_t                                  clip[4];
-      uint32_t                                  sf[4];
-      uint32_t                                  raster[5];
-      uint32_t                                  wm[2];
-      uint32_t                                  streamout_state[5];
-      uint32_t                                  gs[10];
-      uint32_t                                  te[4];
-      uint32_t                                  vfg[4];
+      struct anv_gfx_state_ptr                  clip;
+      struct anv_gfx_state_ptr                  sf;
+      struct anv_gfx_state_ptr                  raster;
+      struct anv_gfx_state_ptr                  wm;
+      struct anv_gfx_state_ptr                  so;
+      struct anv_gfx_state_ptr                  gs;
+      struct anv_gfx_state_ptr                  te;
+      struct anv_gfx_state_ptr                  vfg;
    } partial;
 };
 
+#define anv_batch_merge_pipeline_state(batch, dwords0, pipeline, state) \
+   do {                                                                 \
+      uint32_t *dw;                                                     \
+                                                                        \
+      assert(ARRAY_SIZE(dwords0) == (pipeline)->state.len);             \
+      dw = anv_batch_emit_dwords((batch), ARRAY_SIZE(dwords0));         \
+      if (!dw)                                                          \
+         break;                                                         \
+      for (uint32_t i = 0; i < ARRAY_SIZE(dwords0); i++)                \
+         dw[i] = (dwords0)[i] |                                         \
+            (pipeline)->batch_data[(pipeline)->state.offset + i];       \
+      VG(VALGRIND_CHECK_MEM_IS_DEFINED(dw, ARRAY_SIZE(dwords0) * 4));   \
+   } while (0)
+
+#define anv_batch_emit_pipeline_state(batch, pipeline, state)           \
+   do {                                                                 \
+      if ((pipeline)->state.len == 0)                                   \
+         break;                                                         \
+      uint32_t *dw;                                                     \
+      dw = anv_batch_emit_dwords((batch), (pipeline)->state.len);       \
+      if (!dw)                                                          \
+         break;                                                         \
+      memcpy(dw, &(pipeline)->batch_data[(pipeline)->state.offset],     \
+             4 * (pipeline)->state.len);                                \
+   } while (0)
+
+
 struct anv_compute_pipeline {
    struct anv_pipeline                          base;
 
index 6157e55..86ab22a 100644 (file)
@@ -2994,10 +2994,7 @@ genX(emit_hs)(struct anv_cmd_buffer *cmd_buffer)
    if (!anv_pipeline_has_stage(pipeline, MESA_SHADER_TESS_EVAL))
       return;
 
-   uint32_t *dw =
-      anv_batch_emitn(&cmd_buffer->batch, GENX(3DSTATE_HS_length),
-                         GENX(3DSTATE_HS));
-   memcpy(dw, &pipeline->final.hs, sizeof(pipeline->final.hs));
+   anv_batch_emit_pipeline_state(&cmd_buffer->batch, pipeline, final.hs);
 }
 
 ALWAYS_INLINE static void
@@ -3022,10 +3019,7 @@ genX(emit_ds)(struct anv_cmd_buffer *cmd_buffer)
    if (!anv_pipeline_has_stage(pipeline, MESA_SHADER_TESS_EVAL))
       return;
 
-   uint32_t *dw =
-      anv_batch_emitn(&cmd_buffer->batch, GENX(3DSTATE_DS_length),
-                         GENX(3DSTATE_DS));
-   memcpy(dw, &pipeline->final.ds, sizeof(pipeline->final.ds));
+   anv_batch_emit_pipeline_state(&cmd_buffer->batch, pipeline, final.ds);
 #endif
 }
 
@@ -3224,13 +3218,22 @@ genX(cmd_buffer_flush_gfx_state)(struct anv_cmd_buffer *cmd_buffer)
       }
    }
 
-   if (cmd_buffer->state.gfx.dirty & ANV_CMD_DIRTY_PIPELINE) {
-      anv_batch_emit_batch(&cmd_buffer->batch, &pipeline->base.base.batch);
+   if (any_dynamic_state_dirty || cmd_buffer->state.gfx.dirty)
+      genX(cmd_buffer_flush_gfx_hw_state)(cmd_buffer);
 
-      /* If the pipeline changed, we may need to re-allocate push constant
-       * space in the URB.
-       */
+   /* If the pipeline changed, we may need to re-allocate push constant space
+    * in the URB.
+    */
+   if (cmd_buffer->state.gfx.dirty & ANV_CMD_DIRTY_PIPELINE) {
       cmd_buffer_alloc_gfx_push_constants(cmd_buffer);
+
+      /* Also add the relocations (scratch buffers) */
+      VkResult result = anv_reloc_list_append(cmd_buffer->batch.relocs,
+                                              pipeline->base.base.batch.relocs);
+      if (result != VK_SUCCESS) {
+         anv_batch_set_error(&cmd_buffer->batch, result);
+         return;
+      }
    }
 
    /* Render targets live in the same binding table as fragment descriptors */
@@ -3274,8 +3277,9 @@ genX(cmd_buffer_flush_gfx_state)(struct anv_cmd_buffer *cmd_buffer)
                                           dirty & VK_SHADER_STAGE_ALL_GRAPHICS);
    }
 
-   if (any_dynamic_state_dirty || cmd_buffer->state.gfx.dirty)
-      genX(cmd_buffer_flush_dynamic_state)(cmd_buffer);
+   /* When we're done, there is no more dirty gfx state. */
+   vk_dynamic_graphics_state_clear_dirty(&cmd_buffer->vk.dynamic_graphics_state);
+   cmd_buffer->state.gfx.dirty = 0;
 }
 
 #include "genX_cmd_draw_generated_indirect.h"
index 3d4ec69..8a30316 100644 (file)
@@ -215,15 +215,12 @@ genX(cmd_emit_te)(struct anv_cmd_buffer *cmd_buffer)
 
    if (!tes_prog_data ||
        !anv_pipeline_has_stage(pipeline, MESA_SHADER_TESS_EVAL)) {
-      uint32_t *dw =
-         anv_batch_emitn(&cmd_buffer->batch, GENX(3DSTATE_TE_length),
-                         GENX(3DSTATE_TE));
-      memcpy(dw, &pipeline->partial.te, sizeof(pipeline->partial.te));
+      anv_batch_emit_pipeline_state(&cmd_buffer->batch, pipeline, partial.te);
       return;
    }
 
    anv_batch_emit_merge(&cmd_buffer->batch, GENX(3DSTATE_TE),
-                        pipeline->partial.te, te) {
+                        pipeline, partial.te, te) {
       if (dyn->ts.domain_origin == VK_TESSELLATION_DOMAIN_ORIGIN_LOWER_LEFT) {
          te.OutputTopology = tes_prog_data->output_topology;
       } else {
@@ -244,14 +241,14 @@ genX(emit_gs)(struct anv_cmd_buffer *cmd_buffer)
 {
    struct anv_graphics_pipeline *pipeline = cmd_buffer->state.gfx.pipeline;
    if (!anv_pipeline_has_stage(pipeline, MESA_SHADER_GEOMETRY)) {
-      anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_GS), gs);
+      anv_batch_emit_pipeline_state(&cmd_buffer->batch, pipeline, partial.gs);
       return;
    }
 
    const struct vk_dynamic_graphics_state *dyn =
       &cmd_buffer->vk.dynamic_graphics_state;
    anv_batch_emit_merge(&cmd_buffer->batch, GENX(3DSTATE_GS),
-                        pipeline->partial.gs, gs) {
+                        pipeline, partial.gs, gs) {
       switch (dyn->rs.provoking_vertex) {
       case VK_PROVOKING_VERTEX_MODE_FIRST_VERTEX_EXT:
          gs.ReorderMode = LEADING;
@@ -463,7 +460,7 @@ cmd_buffer_emit_clip(struct anv_cmd_buffer *cmd_buffer)
       return;
 
    anv_batch_emit_merge(&cmd_buffer->batch, GENX(3DSTATE_CLIP),
-                        pipeline->partial.clip, clip) {
+                        pipeline, partial.clip, clip) {
       /* Take dynamic primitive topology in to account with
        *    3DSTATE_CLIP::ViewportXYClipTestEnable
        */
@@ -532,7 +529,7 @@ cmd_buffer_emit_streamout(struct anv_cmd_buffer *cmd_buffer)
    genX(streamout_prologue)(cmd_buffer);
 
    anv_batch_emit_merge(&cmd_buffer->batch, GENX(3DSTATE_STREAMOUT),
-                        pipeline->partial.streamout_state, so) {
+                        pipeline, partial.so, so) {
       so.RenderingDisable = dyn->rs.rasterizer_discard_enable;
       so.RenderStreamSelect = dyn->rs.rasterization_stream;
 #if INTEL_NEEDS_WA_18022508906
@@ -802,13 +799,58 @@ cmd_buffer_emit_scissor(struct anv_cmd_buffer *cmd_buffer)
    }
 }
 
+#define cmd_buffer_emit_pipeline_state(batch, pipeline, state)          \
+   do {                                                                 \
+      if ((pipeline)->state.len == 0)                                   \
+         break;                                                         \
+      void *dw = anv_batch_emit_dwords(batch, (pipeline)->state.len);   \
+      if (!dw)                                                          \
+         break;                                                         \
+      memcpy(dw,                                                        \
+             &(pipeline)->batch_data[(pipeline)->state.offset],         \
+             4 * (pipeline)->state.len);                                \
+   } while (0)
+
+
 void
-genX(cmd_buffer_flush_dynamic_state)(struct anv_cmd_buffer *cmd_buffer)
+genX(cmd_buffer_flush_gfx_hw_state)(struct anv_cmd_buffer *cmd_buffer)
 {
    struct anv_graphics_pipeline *pipeline = cmd_buffer->state.gfx.pipeline;
    struct vk_dynamic_graphics_state *dyn =
       &cmd_buffer->vk.dynamic_graphics_state;
 
+   if (cmd_buffer->state.gfx.dirty & ANV_CMD_DIRTY_PIPELINE) {
+      struct anv_batch *batch = &cmd_buffer->batch;
+
+      cmd_buffer_emit_pipeline_state(batch, pipeline, final.urb);
+      cmd_buffer_emit_pipeline_state(batch, pipeline, final.ms);
+      cmd_buffer_emit_pipeline_state(batch, pipeline, final.primitive_replication);
+      cmd_buffer_emit_pipeline_state(batch, pipeline, final.vf_instancing);
+      cmd_buffer_emit_pipeline_state(batch, pipeline, final.vf_sgvs_instancing);
+      cmd_buffer_emit_pipeline_state(batch, pipeline, final.vf_sgvs);
+      cmd_buffer_emit_pipeline_state(batch, pipeline, final.vf_sgvs_2);
+      cmd_buffer_emit_pipeline_state(batch, pipeline, final.vs);
+      cmd_buffer_emit_pipeline_state(batch, pipeline, final.hs);
+      cmd_buffer_emit_pipeline_state(batch, pipeline, final.ds);
+      cmd_buffer_emit_pipeline_state(batch, pipeline, final.vf_statistics);
+      cmd_buffer_emit_pipeline_state(batch, pipeline, final.so_decl_list);
+      cmd_buffer_emit_pipeline_state(batch, pipeline, final.sbe);
+      cmd_buffer_emit_pipeline_state(batch, pipeline, final.sbe_swiz);
+      cmd_buffer_emit_pipeline_state(batch, pipeline, final.ps);
+      cmd_buffer_emit_pipeline_state(batch, pipeline, final.ps_extra);
+
+      if (cmd_buffer->device->vk.enabled_extensions.EXT_mesh_shader) {
+         cmd_buffer_emit_pipeline_state(batch, pipeline, final.task_control);
+         cmd_buffer_emit_pipeline_state(batch, pipeline, final.task_shader);
+         cmd_buffer_emit_pipeline_state(batch, pipeline, final.task_redistrib);
+         cmd_buffer_emit_pipeline_state(batch, pipeline, final.clip_mesh);
+         cmd_buffer_emit_pipeline_state(batch, pipeline, final.mesh_control);
+         cmd_buffer_emit_pipeline_state(batch, pipeline, final.mesh_shader);
+         cmd_buffer_emit_pipeline_state(batch, pipeline, final.mesh_distrib);
+         cmd_buffer_emit_pipeline_state(batch, pipeline, final.sbe_mesh);
+      }
+   }
+
    cmd_buffer_emit_clip(cmd_buffer);
 
    if ((cmd_buffer->state.gfx.dirty & (ANV_CMD_DIRTY_PIPELINE |
@@ -865,7 +907,7 @@ genX(cmd_buffer_flush_dynamic_state)(struct anv_cmd_buffer *cmd_buffer)
          } else {
             /* Use dyn->vi to emit the dynamic VERTEX_ELEMENT_STATE input. */
             genX(emit_vertex_input)(&cmd_buffer->batch, p + 1,
-                                    pipeline, dyn->vi);
+                                    pipeline, dyn->vi, false /* emit_in_pipeline */);
             /* Then append the VERTEX_ELEMENT_STATE for the draw parameters */
             memcpy(p + 1 + 2 * pipeline->vs_input_elements,
                    pipeline->vertex_input_data,
@@ -896,7 +938,7 @@ genX(cmd_buffer_flush_dynamic_state)(struct anv_cmd_buffer *cmd_buffer)
        BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_RS_PROVOKING_VERTEX) ||
        BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_RS_DEPTH_BIAS_FACTORS)) {
       anv_batch_emit_merge(&cmd_buffer->batch, GENX(3DSTATE_SF),
-                           pipeline->partial.sf, sf) {
+                           pipeline, partial.sf, sf) {
          ANV_SETUP_PROVOKING_VERTEX(sf, dyn->rs.provoking_vertex);
 
          sf.LineWidth = dyn->rs.line.width;
@@ -978,7 +1020,7 @@ genX(cmd_buffer_flush_dynamic_state)(struct anv_cmd_buffer *cmd_buffer)
          vk_rasterization_state_depth_clip_enable(&dyn->rs);
 
       anv_batch_emit_merge(&cmd_buffer->batch, GENX(3DSTATE_RASTER),
-                           pipeline->partial.raster, raster) {
+                           pipeline, partial.raster, raster) {
          raster.APIMode = api_mode;
          raster.DXMultisampleRasterizationEnable   = msaa_raster_enable;
          raster.AntialiasingEnable                 = aa_enable;
@@ -1120,7 +1162,7 @@ genX(cmd_buffer_flush_dynamic_state)(struct anv_cmd_buffer *cmd_buffer)
    if ((cmd_buffer->state.gfx.dirty & ANV_CMD_DIRTY_PIPELINE) ||
        BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_IA_PRIMITIVE_RESTART_ENABLE)) {
       anv_batch_emit_merge(&cmd_buffer->batch, GENX(3DSTATE_VFG),
-                           pipeline->partial.vfg, vfg) {
+                           pipeline, partial.vfg, vfg) {
          vfg.ListCutIndexEnable = dyn->ia.primitive_restart_enable;
       }
    }
@@ -1141,7 +1183,7 @@ genX(cmd_buffer_flush_dynamic_state)(struct anv_cmd_buffer *cmd_buffer)
        * threads.
        */
       anv_batch_emit_merge(&cmd_buffer->batch, GENX(3DSTATE_WM),
-                           pipeline->partial.wm, wm) {
+                           pipeline, partial.wm, wm) {
          wm.ForceThreadDispatchEnable = anv_pipeline_has_stage(pipeline, MESA_SHADER_FRAGMENT) &&
                                         (pipeline->force_fragment_thread_dispatch ||
                                         anv_cmd_buffer_all_color_write_masked(cmd_buffer)) ?
@@ -1365,8 +1407,4 @@ genX(cmd_buffer_flush_dynamic_state)(struct anv_cmd_buffer *cmd_buffer)
          ccp.ColorCalcStatePointerValid = true;
       }
    }
-
-   /* When we're done, there is no more dirty gfx state. */
-   vk_dynamic_graphics_state_clear_dirty(&cmd_buffer->vk.dynamic_graphics_state);
-   cmd_buffer->state.gfx.dirty = 0;
 }
index 8174ab5..b7f4dba 100644 (file)
 #include "vk_log.h"
 #include "vk_render_pass.h"
 
+static inline struct anv_batch *
+anv_gfx_pipeline_add(struct anv_graphics_pipeline *pipeline,
+                     struct anv_gfx_state_ptr *ptr,
+                     uint32_t n_dwords)
+{
+   struct anv_batch *batch = &pipeline->base.base.batch;
+
+   assert(ptr->len == 0 ||
+          (batch->next - batch->start) / 4 == (ptr->offset + ptr->len));
+   if (ptr->len == 0)
+      ptr->offset = (batch->next - batch->start) / 4;
+   ptr->len += n_dwords;
+
+   return batch;
+}
+
+#define anv_pipeline_emit(pipeline, state, cmd, name)                   \
+   for (struct cmd name = { __anv_cmd_header(cmd) },                    \
+           *_dst = anv_batch_emit_dwords(                               \
+              anv_gfx_pipeline_add(pipeline,                            \
+                                   &(pipeline)->state,                  \
+                                   __anv_cmd_length(cmd)),              \
+              __anv_cmd_length(cmd));                                   \
+        __builtin_expect(_dst != NULL, 1);                              \
+        ({ __anv_cmd_pack(cmd)(&(pipeline)->base.base.batch,            \
+                               _dst, &name);                            \
+           VG(VALGRIND_CHECK_MEM_IS_DEFINED(_dst, __anv_cmd_length(cmd) * 4)); \
+           _dst = NULL;                                                 \
+        }))
+
+#define anv_pipeline_emitn(pipeline, state, n, cmd, ...) ({             \
+   void *__dst = anv_batch_emit_dwords(                                 \
+      anv_gfx_pipeline_add(pipeline, &(pipeline)->state, n), n);        \
+   if (__dst) {                                                         \
+      struct cmd __template = {                                         \
+         __anv_cmd_header(cmd),                                         \
+         .DWordLength = n - __anv_cmd_length_bias(cmd),                 \
+         __VA_ARGS__                                                    \
+      };                                                                \
+      __anv_cmd_pack(cmd)(&pipeline->base.base.batch,                   \
+                          __dst, &__template);                          \
+   }                                                                    \
+   __dst;                                                               \
+   })
+
+
 static uint32_t
 vertex_element_comp_control(enum isl_format format, unsigned comp)
 {
@@ -91,8 +137,9 @@ vertex_element_comp_control(enum isl_format format, unsigned comp)
 void
 genX(emit_vertex_input)(struct anv_batch *batch,
                         uint32_t *vertex_element_dws,
-                        const struct anv_graphics_pipeline *pipeline,
-                        const struct vk_vertex_input_state *vi)
+                        struct anv_graphics_pipeline *pipeline,
+                        const struct vk_vertex_input_state *vi,
+                        bool emit_in_pipeline)
 {
    const struct anv_device *device = pipeline->base.base.device;
    const struct brw_vs_prog_data *vs_prog_data = get_vs_prog_data(pipeline);
@@ -169,15 +216,28 @@ genX(emit_vertex_input)(struct anv_batch *batch,
        * that controls instancing.  On Haswell and prior, that's part of
        * VERTEX_BUFFER_STATE which we emit later.
        */
-      anv_batch_emit(batch, GENX(3DSTATE_VF_INSTANCING), vfi) {
-         bool per_instance = vi->bindings[binding].input_rate ==
-                             VK_VERTEX_INPUT_RATE_INSTANCE;
-         uint32_t divisor = vi->bindings[binding].divisor *
-                            pipeline->instance_multiplier;
-
-         vfi.InstancingEnable = per_instance;
-         vfi.VertexElementIndex = slot;
-         vfi.InstanceDataStepRate = per_instance ? divisor : 1;
+      if (emit_in_pipeline) {
+         anv_pipeline_emit(pipeline, final.vf_instancing, GENX(3DSTATE_VF_INSTANCING), vfi) {
+            bool per_instance = vi->bindings[binding].input_rate ==
+               VK_VERTEX_INPUT_RATE_INSTANCE;
+            uint32_t divisor = vi->bindings[binding].divisor *
+               pipeline->instance_multiplier;
+
+            vfi.InstancingEnable = per_instance;
+            vfi.VertexElementIndex = slot;
+            vfi.InstanceDataStepRate = per_instance ? divisor : 1;
+         }
+      } else {
+         anv_batch_emit(batch, GENX(3DSTATE_VF_INSTANCING), vfi) {
+            bool per_instance = vi->bindings[binding].input_rate ==
+               VK_VERTEX_INPUT_RATE_INSTANCE;
+            uint32_t divisor = vi->bindings[binding].divisor *
+               pipeline->instance_multiplier;
+
+            vfi.InstancingEnable = per_instance;
+            vfi.VertexElementIndex = slot;
+            vfi.InstanceDataStepRate = per_instance ? divisor : 1;
+         }
       }
    }
 }
@@ -187,15 +247,13 @@ emit_vertex_input(struct anv_graphics_pipeline *pipeline,
                   const struct vk_graphics_pipeline_state *state,
                   const struct vk_vertex_input_state *vi)
 {
-   struct anv_batch *batch = &pipeline->base.base.batch;
-
    /* Only pack the VERTEX_ELEMENT_STATE if not dynamic so we can just memcpy
     * everything in gfx8_cmd_buffer.c
     */
    if (!BITSET_TEST(state->dynamic, MESA_VK_DYNAMIC_VI)) {
-      genX(emit_vertex_input)(batch,
+      genX(emit_vertex_input)(NULL,
                               pipeline->vertex_input_data,
-                              pipeline, vi);
+                              pipeline, vi, true /* emit_in_pipeline */);
    }
 
    const struct brw_vs_prog_data *vs_prog_data = get_vs_prog_data(pipeline);
@@ -207,6 +265,7 @@ emit_vertex_input(struct anv_graphics_pipeline *pipeline,
       assert(pipeline->vertex_input_elems >= pipeline->svgs_count);
       uint32_t slot_offset =
          pipeline->vertex_input_elems - pipeline->svgs_count;
+
       if (needs_svgs_elem) {
 #if GFX_VER < 11
          /* From the Broadwell PRM for the 3D_Vertex_Component_Control enum:
@@ -243,7 +302,8 @@ emit_vertex_input(struct anv_graphics_pipeline *pipeline,
                                          &element);
          slot_offset++;
 
-         anv_batch_emit(batch, GENX(3DSTATE_VF_INSTANCING), vfi) {
+         anv_pipeline_emit(pipeline, final.vf_sgvs_instancing,
+                           GENX(3DSTATE_VF_INSTANCING), vfi) {
             vfi.VertexElementIndex = id_slot;
          }
       }
@@ -268,13 +328,14 @@ emit_vertex_input(struct anv_graphics_pipeline *pipeline,
                                          &element);
          slot_offset++;
 
-         anv_batch_emit(batch, GENX(3DSTATE_VF_INSTANCING), vfi) {
+         anv_pipeline_emit(pipeline, final.vf_sgvs_instancing,
+                           GENX(3DSTATE_VF_INSTANCING), vfi) {
             vfi.VertexElementIndex = drawid_slot;
          }
       }
    }
 
-   anv_batch_emit(batch, GENX(3DSTATE_VF_SGVS), sgvs) {
+   anv_pipeline_emit(pipeline, final.vf_sgvs, GENX(3DSTATE_VF_SGVS), sgvs) {
       sgvs.VertexIDEnable              = vs_prog_data->uses_vertexid;
       sgvs.VertexIDComponentNumber     = 2;
       sgvs.VertexIDElementOffset       = id_slot;
@@ -284,7 +345,7 @@ emit_vertex_input(struct anv_graphics_pipeline *pipeline,
    }
 
 #if GFX_VER >= 11
-   anv_batch_emit(batch, GENX(3DSTATE_VF_SGVS_2), sgvs) {
+   anv_pipeline_emit(pipeline, final.vf_sgvs_2, GENX(3DSTATE_VF_SGVS_2), sgvs) {
       /* gl_BaseVertex */
       sgvs.XP0Enable                   = vs_prog_data->uses_firstvertex;
       sgvs.XP0SourceSelect             = XP0_PARAMETER;
@@ -306,32 +367,30 @@ emit_vertex_input(struct anv_graphics_pipeline *pipeline,
 
 #if GFX_VERx10 >= 125
    struct anv_device *device = pipeline->base.base.device;
-   struct GENX(3DSTATE_VFG) vfg = {
-      GENX(3DSTATE_VFG_header),
+   anv_pipeline_emit(pipeline, partial.vfg, GENX(3DSTATE_VFG), vfg) {
       /* If 3DSTATE_TE: TE Enable == 1 then RR_STRICT else RR_FREE*/
-      .DistributionMode =
+      vfg.DistributionMode =
          anv_pipeline_has_stage(pipeline, MESA_SHADER_TESS_EVAL) ? RR_STRICT :
-         RR_FREE,
-      .DistributionGranularity = BatchLevelGranularity,
-   };
-   /* Wa_14014890652 */
-   if (intel_device_info_is_dg2(device->info))
-      vfg.GranularityThresholdDisable = 1;
-   /* 192 vertices for TRILIST_ADJ */
-   vfg.ListNBatchSizeScale = 0;
-   /* Batch size of 384 vertices */
-   vfg.List3BatchSizeScale = 2;
-   /* Batch size of 128 vertices */
-   vfg.List2BatchSizeScale = 1;
-   /* Batch size of 128 vertices */
-   vfg.List1BatchSizeScale = 2;
-   /* Batch size of 256 vertices for STRIP topologies */
-   vfg.StripBatchSizeScale = 3;
-   /* 192 control points for PATCHLIST_3 */
-   vfg.PatchBatchSizeScale = 1;
-   /* 192 control points for PATCHLIST_3 */
-   vfg.PatchBatchSizeMultiplier = 31;
-   GENX(3DSTATE_VFG_pack)(NULL, pipeline->partial.vfg, &vfg);
+         RR_FREE;
+      vfg.DistributionGranularity = BatchLevelGranularity;
+      /* Wa_14014890652 */
+      if (intel_device_info_is_dg2(device->info))
+         vfg.GranularityThresholdDisable = 1;
+      /* 192 vertices for TRILIST_ADJ */
+      vfg.ListNBatchSizeScale = 0;
+      /* Batch size of 384 vertices */
+      vfg.List3BatchSizeScale = 2;
+      /* Batch size of 128 vertices */
+      vfg.List2BatchSizeScale = 1;
+      /* Batch size of 128 vertices */
+      vfg.List1BatchSizeScale = 2;
+      /* Batch size of 256 vertices for STRIP topologies */
+      vfg.StripBatchSizeScale = 3;
+      /* 192 control points for PATCHLIST_3 */
+      vfg.PatchBatchSizeScale = 1;
+      /* 192 control points for PATCHLIST_3 */
+      vfg.PatchBatchSizeMultiplier = 31;
+   }
 #endif
 }
 
@@ -375,7 +434,6 @@ static void
 emit_urb_setup_mesh(struct anv_graphics_pipeline *pipeline,
                     enum intel_urb_deref_block_size *deref_block_size)
 {
-   struct anv_batch *batch = &pipeline->base.base.batch;
    const struct intel_device_info *devinfo = pipeline->base.base.device->info;
 
    const struct brw_task_prog_data *task_prog_data =
@@ -390,12 +448,12 @@ emit_urb_setup_mesh(struct anv_graphics_pipeline *pipeline,
 
    /* Zero out the primitive pipeline URB allocations. */
    for (int i = 0; i <= MESA_SHADER_GEOMETRY; i++) {
-      anv_batch_emit(batch, GENX(3DSTATE_URB_VS), urb) {
+      anv_pipeline_emit(pipeline, final.urb, GENX(3DSTATE_URB_VS), urb) {
          urb._3DCommandSubOpcode += i;
       }
    }
 
-   anv_batch_emit(batch, GENX(3DSTATE_URB_ALLOC_TASK), urb) {
+   anv_pipeline_emit(pipeline, final.urb, GENX(3DSTATE_URB_ALLOC_TASK), urb) {
       if (task_prog_data) {
          urb.TASKURBEntryAllocationSize   = alloc.task_entry_size_64b - 1;
          urb.TASKNumberofURBEntriesSlice0 = alloc.task_entries;
@@ -405,7 +463,7 @@ emit_urb_setup_mesh(struct anv_graphics_pipeline *pipeline,
       }
    }
 
-   anv_batch_emit(batch, GENX(3DSTATE_URB_ALLOC_MESH), urb) {
+   anv_pipeline_emit(pipeline, final.urb, GENX(3DSTATE_URB_ALLOC_MESH), urb) {
       urb.MESHURBEntryAllocationSize   = alloc.mesh_entry_size_64b - 1;
       urb.MESHNumberofURBEntriesSlice0 = alloc.mesh_entries;
       urb.MESHNumberofURBEntriesSliceN = alloc.mesh_entries;
@@ -437,192 +495,207 @@ emit_urb_setup(struct anv_graphics_pipeline *pipeline,
       entry_size[i] = prog_data ? prog_data->urb_entry_size : 1;
    }
 
-   genX(emit_urb_setup)(pipeline->base.base.device,
-                        &pipeline->base.base.batch,
+   struct anv_device *device = pipeline->base.base.device;
+   const struct intel_device_info *devinfo = device->info;
+
+   unsigned entries[4];
+   unsigned start[4];
+   bool constrained;
+   intel_get_urb_config(devinfo,
                         pipeline->base.base.l3_config,
-                        pipeline->base.base.active_stages, entry_size,
-                        deref_block_size);
+                        pipeline->base.base.active_stages &
+                           VK_SHADER_STAGE_TESSELLATION_EVALUATION_BIT,
+                        pipeline->base.base.active_stages &
+                           VK_SHADER_STAGE_GEOMETRY_BIT,
+                        entry_size, entries, start, deref_block_size,
+                        &constrained);
+
+   for (int i = 0; i <= MESA_SHADER_GEOMETRY; i++) {
+      anv_pipeline_emit(pipeline, final.urb, GENX(3DSTATE_URB_VS), urb) {
+         urb._3DCommandSubOpcode      += i;
+         urb.VSURBStartingAddress      = start[i];
+         urb.VSURBEntryAllocationSize  = entry_size[i] - 1;
+         urb.VSNumberofURBEntries      = entries[i];
+      }
+   }
+#if GFX_VERx10 >= 125
+   if (device->vk.enabled_extensions.EXT_mesh_shader) {
+      anv_pipeline_emit(pipeline, final.urb, GENX(3DSTATE_URB_ALLOC_TASK), zero);
+      anv_pipeline_emit(pipeline, final.urb, GENX(3DSTATE_URB_ALLOC_MESH), zero);
+   }
+#endif
+
 }
 
 static void
 emit_3dstate_sbe(struct anv_graphics_pipeline *pipeline)
 {
-   struct anv_batch *batch = &pipeline->base.base.batch;
    const struct brw_wm_prog_data *wm_prog_data = get_wm_prog_data(pipeline);
 
    if (!anv_pipeline_has_stage(pipeline, MESA_SHADER_FRAGMENT)) {
-      anv_batch_emit(batch, GENX(3DSTATE_SBE), sbe);
-      anv_batch_emit(batch, GENX(3DSTATE_SBE_SWIZ), sbe);
+      anv_pipeline_emit(pipeline, final.sbe, GENX(3DSTATE_SBE), sbe);
+      anv_pipeline_emit(pipeline, final.sbe_swiz, GENX(3DSTATE_SBE_SWIZ), sbe);
 #if GFX_VERx10 >= 125
       if (anv_pipeline_is_mesh(pipeline))
-         anv_batch_emit(batch, GENX(3DSTATE_SBE_MESH), sbe_mesh);
+         anv_pipeline_emit(pipeline, final.sbe_mesh, GENX(3DSTATE_SBE_MESH), sbe);
 #endif
       return;
    }
 
-   struct GENX(3DSTATE_SBE) sbe = {
-      GENX(3DSTATE_SBE_header),
+   anv_pipeline_emit(pipeline, final.sbe, GENX(3DSTATE_SBE), sbe) {
+   anv_pipeline_emit(pipeline, final.sbe_swiz, GENX(3DSTATE_SBE_SWIZ), swiz) {
+
       /* TODO(mesh): Figure out cases where we need attribute swizzling.  See also
        * calculate_urb_setup() and related functions.
        */
-      .AttributeSwizzleEnable = anv_pipeline_is_primitive(pipeline),
-      .PointSpriteTextureCoordinateOrigin = UPPERLEFT,
-      .NumberofSFOutputAttributes = wm_prog_data->num_varying_inputs,
-      .ConstantInterpolationEnable = wm_prog_data->flat_inputs,
-   };
-
-   for (unsigned i = 0; i < 32; i++)
-      sbe.AttributeActiveComponentFormat[i] = ACF_XYZW;
-
-   /* On Broadwell, they broke 3DSTATE_SBE into two packets */
-   struct GENX(3DSTATE_SBE_SWIZ) swiz = {
-      GENX(3DSTATE_SBE_SWIZ_header),
-   };
-
-   if (anv_pipeline_is_primitive(pipeline)) {
-      const struct brw_vue_map *fs_input_map =
-         &anv_pipeline_get_last_vue_prog_data(pipeline)->vue_map;
-
-      int first_slot = brw_compute_first_urb_slot_required(wm_prog_data->inputs,
-                                                           fs_input_map);
-      assert(first_slot % 2 == 0);
-      unsigned urb_entry_read_offset = first_slot / 2;
-      int max_source_attr = 0;
-      for (uint8_t idx = 0; idx < wm_prog_data->urb_setup_attribs_count; idx++) {
-         uint8_t attr = wm_prog_data->urb_setup_attribs[idx];
-         int input_index = wm_prog_data->urb_setup[attr];
-
-         assert(0 <= input_index);
-
-         /* gl_Viewport, gl_Layer and FragmentShadingRateKHR are stored in the
-          * VUE header
-          */
-         if (attr == VARYING_SLOT_VIEWPORT ||
-             attr == VARYING_SLOT_LAYER ||
-             attr == VARYING_SLOT_PRIMITIVE_SHADING_RATE) {
-            continue;
-         }
-
-         if (attr == VARYING_SLOT_PNTC) {
-            sbe.PointSpriteTextureCoordinateEnable = 1 << input_index;
-            continue;
-         }
-
-         const int slot = fs_input_map->varying_to_slot[attr];
-
-         if (slot == -1) {
-            /* This attribute does not exist in the VUE--that means that the
-             * vertex shader did not write to it.  It could be that it's a
-             * regular varying read by the fragment shader but not written by
-             * the vertex shader or it's gl_PrimitiveID. In the first case the
-             * value is undefined, in the second it needs to be
-             * gl_PrimitiveID.
+      sbe.AttributeSwizzleEnable = anv_pipeline_is_primitive(pipeline);
+      sbe.PointSpriteTextureCoordinateOrigin = UPPERLEFT;
+      sbe.NumberofSFOutputAttributes = wm_prog_data->num_varying_inputs;
+      sbe.ConstantInterpolationEnable = wm_prog_data->flat_inputs;
+
+      for (unsigned i = 0; i < 32; i++)
+         sbe.AttributeActiveComponentFormat[i] = ACF_XYZW;
+
+      if (anv_pipeline_is_primitive(pipeline)) {
+         const struct brw_vue_map *fs_input_map =
+            &anv_pipeline_get_last_vue_prog_data(pipeline)->vue_map;
+
+         int first_slot =
+            brw_compute_first_urb_slot_required(wm_prog_data->inputs,
+                                                fs_input_map);
+         assert(first_slot % 2 == 0);
+         unsigned urb_entry_read_offset = first_slot / 2;
+         int max_source_attr = 0;
+         for (uint8_t idx = 0; idx < wm_prog_data->urb_setup_attribs_count; idx++) {
+            uint8_t attr = wm_prog_data->urb_setup_attribs[idx];
+            int input_index = wm_prog_data->urb_setup[attr];
+
+            assert(0 <= input_index);
+
+            /* gl_Viewport, gl_Layer and FragmentShadingRateKHR are stored in the
+             * VUE header
+             */
+            if (attr == VARYING_SLOT_VIEWPORT ||
+                attr == VARYING_SLOT_LAYER ||
+                attr == VARYING_SLOT_PRIMITIVE_SHADING_RATE) {
+               continue;
+            }
+
+            if (attr == VARYING_SLOT_PNTC) {
+               sbe.PointSpriteTextureCoordinateEnable = 1 << input_index;
+               continue;
+            }
+
+            const int slot = fs_input_map->varying_to_slot[attr];
+
+            if (slot == -1) {
+               /* This attribute does not exist in the VUE--that means that
+                * the vertex shader did not write to it. It could be that it's
+                * a regular varying read by the fragment shader but not
+                * written by the vertex shader or it's gl_PrimitiveID. In the
+                * first case the value is undefined, in the second it needs to
+                * be gl_PrimitiveID.
+                */
+               swiz.Attribute[input_index].ConstantSource = PRIM_ID;
+               swiz.Attribute[input_index].ComponentOverrideX = true;
+               swiz.Attribute[input_index].ComponentOverrideY = true;
+               swiz.Attribute[input_index].ComponentOverrideZ = true;
+               swiz.Attribute[input_index].ComponentOverrideW = true;
+               continue;
+            }
+
+            /* We have to subtract two slots to account for the URB entry
+             * output read offset in the VS and GS stages.
+             */
+            const int source_attr = slot - 2 * urb_entry_read_offset;
+            assert(source_attr >= 0 && source_attr < 32);
+            max_source_attr = MAX2(max_source_attr, source_attr);
+            /* The hardware can only do overrides on 16 overrides at a time,
+             * and the other up to 16 have to be lined up so that the input
+             * index = the output index. We'll need to do some tweaking to
+             * make sure that's the case.
              */
-            swiz.Attribute[input_index].ConstantSource = PRIM_ID;
-            swiz.Attribute[input_index].ComponentOverrideX = true;
-            swiz.Attribute[input_index].ComponentOverrideY = true;
-            swiz.Attribute[input_index].ComponentOverrideZ = true;
-            swiz.Attribute[input_index].ComponentOverrideW = true;
-            continue;
+            if (input_index < 16)
+               swiz.Attribute[input_index].SourceAttribute = source_attr;
+            else
+               assert(source_attr == input_index);
          }
 
-         /* We have to subtract two slots to account for the URB entry output
-          * read offset in the VS and GS stages.
-          */
-         const int source_attr = slot - 2 * urb_entry_read_offset;
-         assert(source_attr >= 0 && source_attr < 32);
-         max_source_attr = MAX2(max_source_attr, source_attr);
-         /* The hardware can only do overrides on 16 overrides at a time, and the
-          * other up to 16 have to be lined up so that the input index = the
-          * output index. We'll need to do some tweaking to make sure that's the
-          * case.
-          */
-         if (input_index < 16)
-            swiz.Attribute[input_index].SourceAttribute = source_attr;
-         else
-            assert(source_attr == input_index);
-      }
-
-      sbe.VertexURBEntryReadOffset = urb_entry_read_offset;
-      sbe.VertexURBEntryReadLength = DIV_ROUND_UP(max_source_attr + 1, 2);
-      sbe.ForceVertexURBEntryReadOffset = true;
-      sbe.ForceVertexURBEntryReadLength = true;
+         sbe.VertexURBEntryReadOffset = urb_entry_read_offset;
+         sbe.VertexURBEntryReadLength = DIV_ROUND_UP(max_source_attr + 1, 2);
+         sbe.ForceVertexURBEntryReadOffset = true;
+         sbe.ForceVertexURBEntryReadLength = true;
 
-      /* Ask the hardware to supply PrimitiveID if the fragment shader
-       * reads it but a previous stage didn't write one.
-       */
-      if ((wm_prog_data->inputs & VARYING_BIT_PRIMITIVE_ID) &&
-          fs_input_map->varying_to_slot[VARYING_SLOT_PRIMITIVE_ID] == -1) {
-         sbe.PrimitiveIDOverrideAttributeSelect =
-            wm_prog_data->urb_setup[VARYING_SLOT_PRIMITIVE_ID];
-         sbe.PrimitiveIDOverrideComponentX = true;
-         sbe.PrimitiveIDOverrideComponentY = true;
-         sbe.PrimitiveIDOverrideComponentZ = true;
-         sbe.PrimitiveIDOverrideComponentW = true;
-         pipeline->primitive_id_override = true;
-      }
-   } else {
-      assert(anv_pipeline_is_mesh(pipeline));
-#if GFX_VERx10 >= 125
-      const struct brw_mesh_prog_data *mesh_prog_data = get_mesh_prog_data(pipeline);
-      anv_batch_emit(batch, GENX(3DSTATE_SBE_MESH), sbe_mesh) {
-         const struct brw_mue_map *mue = &mesh_prog_data->map;
-
-         assert(mue->per_vertex_header_size_dw % 8 == 0);
-         sbe_mesh.PerVertexURBEntryOutputReadOffset = mue->per_vertex_header_size_dw / 8;
-         sbe_mesh.PerVertexURBEntryOutputReadLength = DIV_ROUND_UP(mue->per_vertex_data_size_dw, 8);
-
-         /* Clip distance array is passed in the per-vertex header so that
-          * it can be consumed by the HW. If user wants to read it in the FS,
-          * adjust the offset and length to cover it. Conveniently it is at
-          * the end of the per-vertex header, right before per-vertex
-          * attributes.
-          *
-          * Note that FS attribute reading must be aware that the clip
-          * distances have fixed position.
+         /* Ask the hardware to supply PrimitiveID if the fragment shader
+          * reads it but a previous stage didn't write one.
           */
-         if (mue->per_vertex_header_size_dw > 8 &&
-               (wm_prog_data->urb_setup[VARYING_SLOT_CLIP_DIST0] >= 0 ||
-                wm_prog_data->urb_setup[VARYING_SLOT_CLIP_DIST1] >= 0)) {
-            sbe_mesh.PerVertexURBEntryOutputReadOffset -= 1;
-            sbe_mesh.PerVertexURBEntryOutputReadLength += 1;
+         if ((wm_prog_data->inputs & VARYING_BIT_PRIMITIVE_ID) &&
+             fs_input_map->varying_to_slot[VARYING_SLOT_PRIMITIVE_ID] == -1) {
+            sbe.PrimitiveIDOverrideAttributeSelect =
+               wm_prog_data->urb_setup[VARYING_SLOT_PRIMITIVE_ID];
+            sbe.PrimitiveIDOverrideComponentX = true;
+            sbe.PrimitiveIDOverrideComponentY = true;
+            sbe.PrimitiveIDOverrideComponentZ = true;
+            sbe.PrimitiveIDOverrideComponentW = true;
+            pipeline->primitive_id_override = true;
          }
-
-         if (mue->user_data_in_vertex_header) {
-            sbe_mesh.PerVertexURBEntryOutputReadOffset -= 1;
-            sbe_mesh.PerVertexURBEntryOutputReadLength += 1;
-         }
-
-         assert(mue->per_primitive_header_size_dw % 8 == 0);
-         sbe_mesh.PerPrimitiveURBEntryOutputReadOffset = mue->per_primitive_header_size_dw / 8;
-         sbe_mesh.PerPrimitiveURBEntryOutputReadLength = DIV_ROUND_UP(mue->per_primitive_data_size_dw, 8);
-
-         /* Just like with clip distances, if Primitive Shading Rate,
-          * Viewport Index or Layer is read back in the FS, adjust
-          * the offset and length to cover the Primitive Header, where
-          * PSR, Viewport Index & Layer are stored.
-          */
-         if (wm_prog_data->urb_setup[VARYING_SLOT_VIEWPORT] >= 0 ||
-             wm_prog_data->urb_setup[VARYING_SLOT_PRIMITIVE_SHADING_RATE] >= 0 ||
-             wm_prog_data->urb_setup[VARYING_SLOT_LAYER] >= 0 ||
-             mue->user_data_in_primitive_header) {
-            assert(sbe_mesh.PerPrimitiveURBEntryOutputReadOffset > 0);
-            sbe_mesh.PerPrimitiveURBEntryOutputReadOffset -= 1;
-            sbe_mesh.PerPrimitiveURBEntryOutputReadLength += 1;
+      } else {
+         assert(anv_pipeline_is_mesh(pipeline));
+#if GFX_VERx10 >= 125
+         const struct brw_mesh_prog_data *mesh_prog_data = get_mesh_prog_data(pipeline);
+         anv_pipeline_emit(pipeline, final.sbe_mesh,
+                           GENX(3DSTATE_SBE_MESH), sbe_mesh) {
+            const struct brw_mue_map *mue = &mesh_prog_data->map;
+
+            assert(mue->per_vertex_header_size_dw % 8 == 0);
+            sbe_mesh.PerVertexURBEntryOutputReadOffset = mue->per_vertex_header_size_dw / 8;
+            sbe_mesh.PerVertexURBEntryOutputReadLength = DIV_ROUND_UP(mue->per_vertex_data_size_dw, 8);
+
+            /* Clip distance array is passed in the per-vertex header so that
+             * it can be consumed by the HW. If user wants to read it in the
+             * FS, adjust the offset and length to cover it. Conveniently it
+             * is at the end of the per-vertex header, right before per-vertex
+             * attributes.
+             *
+             * Note that FS attribute reading must be aware that the clip
+             * distances have fixed position.
+             */
+            if (mue->per_vertex_header_size_dw > 8 &&
+                (wm_prog_data->urb_setup[VARYING_SLOT_CLIP_DIST0] >= 0 ||
+                 wm_prog_data->urb_setup[VARYING_SLOT_CLIP_DIST1] >= 0)) {
+               sbe_mesh.PerVertexURBEntryOutputReadOffset -= 1;
+               sbe_mesh.PerVertexURBEntryOutputReadLength += 1;
+            }
+
+            if (mue->user_data_in_vertex_header) {
+               sbe_mesh.PerVertexURBEntryOutputReadOffset -= 1;
+               sbe_mesh.PerVertexURBEntryOutputReadLength += 1;
+            }
+
+            assert(mue->per_primitive_header_size_dw % 8 == 0);
+            sbe_mesh.PerPrimitiveURBEntryOutputReadOffset =
+               mue->per_primitive_header_size_dw / 8;
+            sbe_mesh.PerPrimitiveURBEntryOutputReadLength =
+               DIV_ROUND_UP(mue->per_primitive_data_size_dw, 8);
+
+            /* Just like with clip distances, if Primitive Shading Rate,
+             * Viewport Index or Layer is read back in the FS, adjust the
+             * offset and length to cover the Primitive Header, where PSR,
+             * Viewport Index & Layer are stored.
+             */
+            if (wm_prog_data->urb_setup[VARYING_SLOT_VIEWPORT] >= 0 ||
+                wm_prog_data->urb_setup[VARYING_SLOT_PRIMITIVE_SHADING_RATE] >= 0 ||
+                wm_prog_data->urb_setup[VARYING_SLOT_LAYER] >= 0 ||
+                mue->user_data_in_primitive_header) {
+               assert(sbe_mesh.PerPrimitiveURBEntryOutputReadOffset > 0);
+               sbe_mesh.PerPrimitiveURBEntryOutputReadOffset -= 1;
+               sbe_mesh.PerPrimitiveURBEntryOutputReadLength += 1;
+            }
          }
-      }
 #endif
+      }
+   }
    }
-
-   uint32_t *dw = anv_batch_emit_dwords(batch, GENX(3DSTATE_SBE_length));
-   if (!dw)
-      return;
-   GENX(3DSTATE_SBE_pack)(batch, dw, &sbe);
-
-   dw = anv_batch_emit_dwords(batch, GENX(3DSTATE_SBE_SWIZ_length));
-   if (!dw)
-      return;
-   GENX(3DSTATE_SBE_SWIZ_pack)(batch, dw, &swiz);
 }
 
 /** Returns the final polygon mode for rasterization
@@ -729,63 +802,55 @@ emit_rs_state(struct anv_graphics_pipeline *pipeline,
               const struct vk_render_pass_state *rp,
               enum intel_urb_deref_block_size urb_deref_block_size)
 {
-   struct GENX(3DSTATE_SF) sf = {
-      GENX(3DSTATE_SF_header),
-   };
-
-   sf.ViewportTransformEnable = true;
-   sf.StatisticsEnable = true;
-   sf.VertexSubPixelPrecisionSelect = _8Bit;
-   sf.AALineDistanceMode = true;
+   anv_pipeline_emit(pipeline, partial.sf, GENX(3DSTATE_SF), sf) {
+      sf.ViewportTransformEnable = true;
+      sf.StatisticsEnable = true;
+      sf.VertexSubPixelPrecisionSelect = _8Bit;
+      sf.AALineDistanceMode = true;
 
 #if GFX_VER >= 12
-   sf.DerefBlockSize = urb_deref_block_size;
+      sf.DerefBlockSize = urb_deref_block_size;
 #endif
 
-   bool point_from_shader;
-   if (anv_pipeline_is_primitive(pipeline)) {
-      const struct brw_vue_prog_data *last_vue_prog_data =
-         anv_pipeline_get_last_vue_prog_data(pipeline);
-      point_from_shader = last_vue_prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ;
-   } else {
-      assert(anv_pipeline_is_mesh(pipeline));
-      const struct brw_mesh_prog_data *mesh_prog_data = get_mesh_prog_data(pipeline);
-      point_from_shader = mesh_prog_data->map.start_dw[VARYING_SLOT_PSIZ] >= 0;
-   }
+      bool point_from_shader;
+      if (anv_pipeline_is_primitive(pipeline)) {
+         const struct brw_vue_prog_data *last_vue_prog_data =
+            anv_pipeline_get_last_vue_prog_data(pipeline);
+         point_from_shader = last_vue_prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ;
+      } else {
+         assert(anv_pipeline_is_mesh(pipeline));
+         const struct brw_mesh_prog_data *mesh_prog_data = get_mesh_prog_data(pipeline);
+         point_from_shader = mesh_prog_data->map.start_dw[VARYING_SLOT_PSIZ] >= 0;
+      }
 
-   if (point_from_shader) {
-      sf.PointWidthSource = Vertex;
-   } else {
-      sf.PointWidthSource = State;
-      sf.PointWidth = 1.0;
+      if (point_from_shader) {
+         sf.PointWidthSource = Vertex;
+      } else {
+         sf.PointWidthSource = State;
+         sf.PointWidth = 1.0;
+      }
    }
 
-   struct GENX(3DSTATE_RASTER) raster = {
-      GENX(3DSTATE_RASTER_header),
-   };
-
-   /* For details on 3DSTATE_RASTER multisample state, see the BSpec table
-    * "Multisample Modes State".
-    */
-   /* NOTE: 3DSTATE_RASTER::ForcedSampleCount affects the BDW and SKL PMA fix
-    * computations.  If we ever set this bit to a different value, they will
-    * need to be updated accordingly.
-    */
-   raster.ForcedSampleCount = FSC_NUMRASTSAMPLES_0;
-   raster.ForceMultisampling = false;
-
-   raster.ScissorRectangleEnable = true;
+   anv_pipeline_emit(pipeline, partial.raster, GENX(3DSTATE_RASTER), raster) {
+      /* For details on 3DSTATE_RASTER multisample state, see the BSpec table
+       * "Multisample Modes State".
+       */
+      /* NOTE: 3DSTATE_RASTER::ForcedSampleCount affects the BDW and SKL PMA fix
+       * computations.  If we ever set this bit to a different value, they will
+       * need to be updated accordingly.
+       */
+      raster.ForcedSampleCount = FSC_NUMRASTSAMPLES_0;
+      raster.ForceMultisampling = false;
 
-   GENX(3DSTATE_SF_pack)(NULL, pipeline->partial.sf, &sf);
-   GENX(3DSTATE_RASTER_pack)(NULL, pipeline->partial.raster, &raster);
+      raster.ScissorRectangleEnable = true;
+   }
 }
 
 static void
 emit_ms_state(struct anv_graphics_pipeline *pipeline,
               const struct vk_multisample_state *ms)
 {
-   struct anv_batch *batch = &pipeline->base.base.batch;
-   anv_batch_emit(batch, GENX(3DSTATE_MULTISAMPLE), ms) {
+   anv_pipeline_emit(pipeline, final.ms, GENX(3DSTATE_MULTISAMPLE), ms) {
       ms.NumberofMultisamples       = __builtin_ffs(pipeline->rasterization_samples) - 1;
 
       ms.PixelLocation              = CENTER;
@@ -862,71 +927,67 @@ emit_3dstate_clip(struct anv_graphics_pipeline *pipeline,
    const struct brw_wm_prog_data *wm_prog_data = get_wm_prog_data(pipeline);
    (void) wm_prog_data;
 
-   struct GENX(3DSTATE_CLIP) clip = {
-      GENX(3DSTATE_CLIP_header),
-   };
-
-   clip.ClipEnable               = true;
-   clip.StatisticsEnable         = true;
-   clip.EarlyCullEnable          = true;
-   clip.GuardbandClipTestEnable  = true;
+   anv_pipeline_emit(pipeline, partial.clip, GENX(3DSTATE_CLIP), clip) {
+      clip.ClipEnable               = true;
+      clip.StatisticsEnable         = true;
+      clip.EarlyCullEnable          = true;
+      clip.GuardbandClipTestEnable  = true;
 
-   clip.VertexSubPixelPrecisionSelect = _8Bit;
-   clip.ClipMode = CLIPMODE_NORMAL;
+      clip.VertexSubPixelPrecisionSelect = _8Bit;
+      clip.ClipMode = CLIPMODE_NORMAL;
 
-   clip.MinimumPointWidth = 0.125;
-   clip.MaximumPointWidth = 255.875;
+      clip.MinimumPointWidth = 0.125;
+      clip.MaximumPointWidth = 255.875;
 
-   /* TODO(mesh): Multiview. */
-   if (anv_pipeline_is_primitive(pipeline)) {
-      const struct brw_vue_prog_data *last =
-         anv_pipeline_get_last_vue_prog_data(pipeline);
+      /* TODO(mesh): Multiview. */
+      if (anv_pipeline_is_primitive(pipeline)) {
+         const struct brw_vue_prog_data *last =
+            anv_pipeline_get_last_vue_prog_data(pipeline);
 
-      /* From the Vulkan 1.0.45 spec:
-       *
-       *    "If the last active vertex processing stage shader entry point's
-       *    interface does not include a variable decorated with
-       *    ViewportIndex, then the first viewport is used."
-       */
-      if (vp && (last->vue_map.slots_valid & VARYING_BIT_VIEWPORT)) {
-         clip.MaximumVPIndex = vp->viewport_count > 0 ?
-                               vp->viewport_count - 1 : 0;
-      } else {
-         clip.MaximumVPIndex = 0;
-      }
+         /* From the Vulkan 1.0.45 spec:
+          *
+          *    "If the last active vertex processing stage shader entry
+          *    point's interface does not include a variable decorated with
+          *    ViewportIndex, then the first viewport is used."
+          */
+         if (vp && (last->vue_map.slots_valid & VARYING_BIT_VIEWPORT)) {
+            clip.MaximumVPIndex = vp->viewport_count > 0 ?
+               vp->viewport_count - 1 : 0;
+         } else {
+            clip.MaximumVPIndex = 0;
+         }
 
-      /* From the Vulkan 1.0.45 spec:
-       *
-       *    "If the last active vertex processing stage shader entry point's
-       *    interface does not include a variable decorated with Layer, then
-       *    the first layer is used."
-       */
-      clip.ForceZeroRTAIndexEnable =
-         !(last->vue_map.slots_valid & VARYING_BIT_LAYER);
+         /* From the Vulkan 1.0.45 spec:
+          *
+          *    "If the last active vertex processing stage shader entry point's
+          *    interface does not include a variable decorated with Layer, then
+          *    the first layer is used."
+          */
+         clip.ForceZeroRTAIndexEnable =
+            !(last->vue_map.slots_valid & VARYING_BIT_LAYER);
+
+      } else if (anv_pipeline_is_mesh(pipeline)) {
+         const struct brw_mesh_prog_data *mesh_prog_data = get_mesh_prog_data(pipeline);
+         if (vp && vp->viewport_count > 0 &&
+             mesh_prog_data->map.start_dw[VARYING_SLOT_VIEWPORT] >= 0) {
+            clip.MaximumVPIndex = vp->viewport_count - 1;
+         } else {
+            clip.MaximumVPIndex = 0;
+         }
 
-   } else if (anv_pipeline_is_mesh(pipeline)) {
-      const struct brw_mesh_prog_data *mesh_prog_data = get_mesh_prog_data(pipeline);
-      if (vp && vp->viewport_count > 0 &&
-          mesh_prog_data->map.start_dw[VARYING_SLOT_VIEWPORT] >= 0) {
-         clip.MaximumVPIndex = vp->viewport_count - 1;
-      } else {
-         clip.MaximumVPIndex = 0;
+         clip.ForceZeroRTAIndexEnable =
+            mesh_prog_data->map.start_dw[VARYING_SLOT_LAYER] < 0;
       }
 
-      clip.ForceZeroRTAIndexEnable =
-            mesh_prog_data->map.start_dw[VARYING_SLOT_LAYER] < 0;
+      clip.NonPerspectiveBarycentricEnable = wm_prog_data ?
+         wm_prog_data->uses_nonperspective_interp_modes : 0;
    }
 
-   clip.NonPerspectiveBarycentricEnable = wm_prog_data ?
-      wm_prog_data->uses_nonperspective_interp_modes : 0;
-
-   GENX(3DSTATE_CLIP_pack)(NULL, pipeline->partial.clip, &clip);
-
 #if GFX_VERx10 >= 125
    if (anv_pipeline_is_mesh(pipeline)) {
-      struct anv_batch *batch = &pipeline->base.base.batch;
       const struct brw_mesh_prog_data *mesh_prog_data = get_mesh_prog_data(pipeline);
-      anv_batch_emit(batch, GENX(3DSTATE_CLIP_MESH), clip_mesh) {
+      anv_pipeline_emit(pipeline, final.clip_mesh,
+                        GENX(3DSTATE_CLIP_MESH), clip_mesh) {
          clip_mesh.PrimitiveHeaderEnable = mesh_prog_data->map.per_primitive_header_size_dw > 0;
          clip_mesh.UserClipDistanceClipTestEnableBitmask = mesh_prog_data->clip_distance_mask;
          clip_mesh.UserClipDistanceCullTestEnableBitmask = mesh_prog_data->cull_distance_mask;
@@ -939,8 +1000,6 @@ static void
 emit_3dstate_streamout(struct anv_graphics_pipeline *pipeline,
                        const struct vk_rasterization_state *rs)
 {
-   struct anv_batch *batch = &pipeline->base.base.batch;
-   const struct anv_device *device = pipeline->base.base.device;
    const struct brw_vue_prog_data *prog_data =
       anv_pipeline_get_last_vue_prog_data(pipeline);
    const struct brw_vue_map *vue_map = &prog_data->vue_map;
@@ -1034,25 +1093,17 @@ emit_3dstate_streamout(struct anv_graphics_pipeline *pipeline,
             sbs[xfb_info->buffer_to_stream[b]] |= 1 << b;
       }
 
-      /* Wa_16011773973:
-       * If SOL is enabled and SO_DECL state has to be programmed,
-       *    1. Send 3D State SOL state with SOL disabled
-       *    2. Send SO_DECL NP state
-       *    3. Send 3D State SOL with SOL Enabled
-       */
-      if (intel_device_info_is_dg2(device->info))
-         anv_batch_emit(batch, GENX(3DSTATE_STREAMOUT), so);
-
-      uint32_t *dw = anv_batch_emitn(batch, 3 + 2 * max_decls,
-                                     GENX(3DSTATE_SO_DECL_LIST),
-                                     .StreamtoBufferSelects0 = sbs[0],
-                                     .StreamtoBufferSelects1 = sbs[1],
-                                     .StreamtoBufferSelects2 = sbs[2],
-                                     .StreamtoBufferSelects3 = sbs[3],
-                                     .NumEntries0 = decls[0],
-                                     .NumEntries1 = decls[1],
-                                     .NumEntries2 = decls[2],
-                                     .NumEntries3 = decls[3]);
+      uint32_t *dw = anv_pipeline_emitn(pipeline, final.so_decl_list,
+                                        3 + 2 * max_decls,
+                                        GENX(3DSTATE_SO_DECL_LIST),
+                                        .StreamtoBufferSelects0 = sbs[0],
+                                        .StreamtoBufferSelects1 = sbs[1],
+                                        .StreamtoBufferSelects2 = sbs[2],
+                                        .StreamtoBufferSelects3 = sbs[3],
+                                        .NumEntries0 = decls[0],
+                                        .NumEntries1 = decls[1],
+                                        .NumEntries2 = decls[2],
+                                        .NumEntries3 = decls[3]);
 
       for (int i = 0; i < max_decls; i++) {
          GENX(SO_DECL_ENTRY_pack)(NULL, dw + 3 + i * 2,
@@ -1063,47 +1114,38 @@ emit_3dstate_streamout(struct anv_graphics_pipeline *pipeline,
                .Stream3Decl = so_decl[3][i],
             });
       }
-
-#if GFX_VERx10 == 125
-      /* Wa_14015946265: Send PC with CS stall after SO_DECL. */
-      genX(batch_emit_pipe_control)(batch, device->info, ANV_PIPE_CS_STALL_BIT);
-#endif
    }
 
-   struct GENX(3DSTATE_STREAMOUT) so = {
-      GENX(3DSTATE_STREAMOUT_header),
-   };
-
-   if (xfb_info) {
-      pipeline->uses_xfb = true;
+   anv_pipeline_emit(pipeline, partial.so, GENX(3DSTATE_STREAMOUT), so) {
+      if (xfb_info) {
+         pipeline->uses_xfb = true;
 
-      so.SOFunctionEnable = true;
-      so.SOStatisticsEnable = true;
+         so.SOFunctionEnable = true;
+         so.SOStatisticsEnable = true;
 
-      so.Buffer0SurfacePitch = xfb_info->buffers[0].stride;
-      so.Buffer1SurfacePitch = xfb_info->buffers[1].stride;
-      so.Buffer2SurfacePitch = xfb_info->buffers[2].stride;
-      so.Buffer3SurfacePitch = xfb_info->buffers[3].stride;
+         so.Buffer0SurfacePitch = xfb_info->buffers[0].stride;
+         so.Buffer1SurfacePitch = xfb_info->buffers[1].stride;
+         so.Buffer2SurfacePitch = xfb_info->buffers[2].stride;
+         so.Buffer3SurfacePitch = xfb_info->buffers[3].stride;
 
-      int urb_entry_read_offset = 0;
-      int urb_entry_read_length =
-         (prog_data->vue_map.num_slots + 1) / 2 - urb_entry_read_offset;
+         int urb_entry_read_offset = 0;
+         int urb_entry_read_length =
+            (prog_data->vue_map.num_slots + 1) / 2 - urb_entry_read_offset;
 
-      /* We always read the whole vertex.  This could be reduced at some
-       * point by reading less and offsetting the register index in the
-       * SO_DECLs.
-       */
-      so.Stream0VertexReadOffset = urb_entry_read_offset;
-      so.Stream0VertexReadLength = urb_entry_read_length - 1;
-      so.Stream1VertexReadOffset = urb_entry_read_offset;
-      so.Stream1VertexReadLength = urb_entry_read_length - 1;
-      so.Stream2VertexReadOffset = urb_entry_read_offset;
-      so.Stream2VertexReadLength = urb_entry_read_length - 1;
-      so.Stream3VertexReadOffset = urb_entry_read_offset;
-      so.Stream3VertexReadLength = urb_entry_read_length - 1;
+         /* We always read the whole vertex. This could be reduced at some
+          * point by reading less and offsetting the register index in the
+          * SO_DECLs.
+          */
+         so.Stream0VertexReadOffset = urb_entry_read_offset;
+         so.Stream0VertexReadLength = urb_entry_read_length - 1;
+         so.Stream1VertexReadOffset = urb_entry_read_offset;
+         so.Stream1VertexReadLength = urb_entry_read_length - 1;
+         so.Stream2VertexReadOffset = urb_entry_read_offset;
+         so.Stream2VertexReadLength = urb_entry_read_length - 1;
+         so.Stream3VertexReadOffset = urb_entry_read_offset;
+         so.Stream3VertexReadLength = urb_entry_read_length - 1;
+      }
    }
-
-   GENX(3DSTATE_STREAMOUT_pack)(NULL, pipeline->partial.streamout_state, &so);
 }
 
 static uint32_t
@@ -1158,7 +1200,6 @@ get_scratch_surf(struct anv_pipeline *pipeline,
 static void
 emit_3dstate_vs(struct anv_graphics_pipeline *pipeline)
 {
-   struct anv_batch *batch = &pipeline->base.base.batch;
    const struct intel_device_info *devinfo = pipeline->base.base.device->info;
    const struct brw_vs_prog_data *vs_prog_data = get_vs_prog_data(pipeline);
    const struct anv_shader_bin *vs_bin =
@@ -1166,7 +1207,7 @@ emit_3dstate_vs(struct anv_graphics_pipeline *pipeline)
 
    assert(anv_pipeline_has_stage(pipeline, MESA_SHADER_VERTEX));
 
-   anv_batch_emit(batch, GENX(3DSTATE_VS), vs) {
+   anv_pipeline_emit(pipeline, final.vs, GENX(3DSTATE_VS), vs) {
       vs.Enable               = true;
       vs.StatisticsEnable     = true;
       vs.KernelStartPointer   = vs_bin->kernel.offset;
@@ -1237,11 +1278,9 @@ static void
 emit_3dstate_hs_ds(struct anv_graphics_pipeline *pipeline,
                    const struct vk_tessellation_state *ts)
 {
-   struct anv_batch *batch = &pipeline->base.base.batch;
-
    if (!anv_pipeline_has_stage(pipeline, MESA_SHADER_TESS_EVAL)) {
-      anv_batch_emit(batch, GENX(3DSTATE_HS), hs);
-      anv_batch_emit(batch, GENX(3DSTATE_DS), ds);
+      anv_pipeline_emit(pipeline, final.hs, GENX(3DSTATE_HS), hs);
+      anv_pipeline_emit(pipeline, final.ds, GENX(3DSTATE_DS), ds);
       return;
    }
 
@@ -1254,120 +1293,101 @@ emit_3dstate_hs_ds(struct anv_graphics_pipeline *pipeline,
    const struct brw_tcs_prog_data *tcs_prog_data = get_tcs_prog_data(pipeline);
    const struct brw_tes_prog_data *tes_prog_data = get_tes_prog_data(pipeline);
 
-   struct GENX(3DSTATE_HS) hs = {
-      GENX(3DSTATE_HS_header),
-   };
-
-   hs.Enable = true;
-   hs.StatisticsEnable = true;
-   hs.KernelStartPointer = tcs_bin->kernel.offset;
-   /* Wa_1606682166 */
-   hs.SamplerCount = GFX_VER == 11 ? 0 : get_sampler_count(tcs_bin);
-   hs.BindingTableEntryCount = tcs_bin->bind_map.surface_count;
+   anv_pipeline_emit(pipeline, final.hs, GENX(3DSTATE_HS), hs) {
+      hs.Enable = true;
+      hs.StatisticsEnable = true;
+      hs.KernelStartPointer = tcs_bin->kernel.offset;
+      /* Wa_1606682166 */
+      hs.SamplerCount = GFX_VER == 11 ? 0 : get_sampler_count(tcs_bin);
+      hs.BindingTableEntryCount = tcs_bin->bind_map.surface_count;
 
 #if GFX_VER >= 12
-   /* Wa_1604578095:
-    *
-    *    Hang occurs when the number of max threads is less than 2 times
-    *    the number of instance count. The number of max threads must be
-    *    more than 2 times the number of instance count.
-    */
-   assert((devinfo->max_tcs_threads / 2) > tcs_prog_data->instances);
+      /* Wa_1604578095:
+       *
+       *    Hang occurs when the number of max threads is less than 2 times
+       *    the number of instance count. The number of max threads must be
+       *    more than 2 times the number of instance count.
+       */
+      assert((devinfo->max_tcs_threads / 2) > tcs_prog_data->instances);
 #endif
 
-   hs.MaximumNumberofThreads = devinfo->max_tcs_threads - 1;
-   hs.IncludeVertexHandles = true;
-   hs.InstanceCount = tcs_prog_data->instances - 1;
+      hs.MaximumNumberofThreads = devinfo->max_tcs_threads - 1;
+      hs.IncludeVertexHandles = true;
+      hs.InstanceCount = tcs_prog_data->instances - 1;
 
-   hs.VertexURBEntryReadLength = 0;
-   hs.VertexURBEntryReadOffset = 0;
-   hs.DispatchGRFStartRegisterForURBData =
-      tcs_prog_data->base.base.dispatch_grf_start_reg & 0x1f;
+      hs.VertexURBEntryReadLength = 0;
+      hs.VertexURBEntryReadOffset = 0;
+      hs.DispatchGRFStartRegisterForURBData =
+         tcs_prog_data->base.base.dispatch_grf_start_reg & 0x1f;
 #if GFX_VER >= 12
-   hs.DispatchGRFStartRegisterForURBData5 =
-      tcs_prog_data->base.base.dispatch_grf_start_reg >> 5;
+      hs.DispatchGRFStartRegisterForURBData5 =
+         tcs_prog_data->base.base.dispatch_grf_start_reg >> 5;
 #endif
 
 #if GFX_VERx10 >= 125
-   hs.ScratchSpaceBuffer =
-      get_scratch_surf(&pipeline->base.base, MESA_SHADER_TESS_CTRL, tcs_bin);
+      hs.ScratchSpaceBuffer =
+         get_scratch_surf(&pipeline->base.base, MESA_SHADER_TESS_CTRL, tcs_bin);
 #else
-   hs.PerThreadScratchSpace = get_scratch_space(tcs_bin);
-   hs.ScratchSpaceBasePointer =
-      get_scratch_address(&pipeline->base.base, MESA_SHADER_TESS_CTRL, tcs_bin);
+      hs.PerThreadScratchSpace = get_scratch_space(tcs_bin);
+      hs.ScratchSpaceBasePointer =
+         get_scratch_address(&pipeline->base.base, MESA_SHADER_TESS_CTRL, tcs_bin);
 #endif
 
 #if GFX_VER == 12
-   /*  Patch Count threshold specifies the maximum number of patches that
-    *  will be accumulated before a thread dispatch is forced.
-    */
-   hs.PatchCountThreshold = tcs_prog_data->patch_count_threshold;
+      /*  Patch Count threshold specifies the maximum number of patches that
+       *  will be accumulated before a thread dispatch is forced.
+       */
+      hs.PatchCountThreshold = tcs_prog_data->patch_count_threshold;
 #endif
 
-   hs.DispatchMode = tcs_prog_data->base.dispatch_mode;
-   hs.IncludePrimitiveID = tcs_prog_data->include_primitive_id;
-
-   STATIC_ASSERT(ARRAY_SIZE(pipeline->final.hs) == GENX(3DSTATE_HS_length));
-   GENX(3DSTATE_HS_pack)(&pipeline->base.base.batch, pipeline->final.hs, &hs);
-
-   struct GENX(3DSTATE_DS) ds = {
-      GENX(3DSTATE_DS_header),
+      hs.DispatchMode = tcs_prog_data->base.dispatch_mode;
+      hs.IncludePrimitiveID = tcs_prog_data->include_primitive_id;
    };
 
-   ds.Enable = true;
-   ds.StatisticsEnable = true;
-   ds.KernelStartPointer = tes_bin->kernel.offset;
-   /* Wa_1606682166 */
-   ds.SamplerCount = GFX_VER == 11 ? 0 : get_sampler_count(tes_bin);
-   ds.BindingTableEntryCount = tes_bin->bind_map.surface_count;
-   ds.MaximumNumberofThreads = devinfo->max_tes_threads - 1;
+   anv_pipeline_emit(pipeline, final.ds, GENX(3DSTATE_DS), ds) {
+      ds.Enable = true;
+      ds.StatisticsEnable = true;
+      ds.KernelStartPointer = tes_bin->kernel.offset;
+      /* Wa_1606682166 */
+      ds.SamplerCount = GFX_VER == 11 ? 0 : get_sampler_count(tes_bin);
+      ds.BindingTableEntryCount = tes_bin->bind_map.surface_count;
+      ds.MaximumNumberofThreads = devinfo->max_tes_threads - 1;
 
-   ds.ComputeWCoordinateEnable =
-      tes_prog_data->domain == BRW_TESS_DOMAIN_TRI;
+      ds.ComputeWCoordinateEnable =
+         tes_prog_data->domain == BRW_TESS_DOMAIN_TRI;
 
-   ds.PatchURBEntryReadLength = tes_prog_data->base.urb_read_length;
-   ds.PatchURBEntryReadOffset = 0;
-   ds.DispatchGRFStartRegisterForURBData =
-      tes_prog_data->base.base.dispatch_grf_start_reg;
+      ds.PatchURBEntryReadLength = tes_prog_data->base.urb_read_length;
+      ds.PatchURBEntryReadOffset = 0;
+      ds.DispatchGRFStartRegisterForURBData =
+         tes_prog_data->base.base.dispatch_grf_start_reg;
 
 #if GFX_VER < 11
-   ds.DispatchMode =
-      tes_prog_data->base.dispatch_mode == DISPATCH_MODE_SIMD8 ?
-      DISPATCH_MODE_SIMD8_SINGLE_PATCH :
-      DISPATCH_MODE_SIMD4X2;
+      ds.DispatchMode =
+         tes_prog_data->base.dispatch_mode == DISPATCH_MODE_SIMD8 ?
+         DISPATCH_MODE_SIMD8_SINGLE_PATCH :
+         DISPATCH_MODE_SIMD4X2;
 #else
-   assert(tes_prog_data->base.dispatch_mode == DISPATCH_MODE_SIMD8);
-   ds.DispatchMode = DISPATCH_MODE_SIMD8_SINGLE_PATCH;
+      assert(tes_prog_data->base.dispatch_mode == DISPATCH_MODE_SIMD8);
+      ds.DispatchMode = DISPATCH_MODE_SIMD8_SINGLE_PATCH;
 #endif
 
-   ds.UserClipDistanceClipTestEnableBitmask =
-      tes_prog_data->base.clip_distance_mask;
-   ds.UserClipDistanceCullTestEnableBitmask =
-      tes_prog_data->base.cull_distance_mask;
+      ds.UserClipDistanceClipTestEnableBitmask =
+         tes_prog_data->base.clip_distance_mask;
+      ds.UserClipDistanceCullTestEnableBitmask =
+         tes_prog_data->base.cull_distance_mask;
 
 #if GFX_VER >= 12
-   ds.PrimitiveIDNotRequired = !tes_prog_data->include_primitive_id;
+      ds.PrimitiveIDNotRequired = !tes_prog_data->include_primitive_id;
 #endif
 #if GFX_VERx10 >= 125
-   ds.ScratchSpaceBuffer =
-      get_scratch_surf(&pipeline->base.base, MESA_SHADER_TESS_EVAL, tes_bin);
+      ds.ScratchSpaceBuffer =
+         get_scratch_surf(&pipeline->base.base, MESA_SHADER_TESS_EVAL, tes_bin);
 #else
-   ds.PerThreadScratchSpace = get_scratch_space(tes_bin);
-   ds.ScratchSpaceBasePointer =
-      get_scratch_address(&pipeline->base.base, MESA_SHADER_TESS_EVAL, tes_bin);
+      ds.PerThreadScratchSpace = get_scratch_space(tes_bin);
+      ds.ScratchSpaceBasePointer =
+         get_scratch_address(&pipeline->base.base, MESA_SHADER_TESS_EVAL, tes_bin);
 #endif
-
-   /* Wa_14019750404:
-    * See genX(emit_ds)().
-    * We need to both emit 3DSTATE_DS now, and before each 3DPRIMITIVE, so
-    * we pack it to have it later, and memcpy into the current batch.
-    */
-   STATIC_ASSERT(ARRAY_SIZE(pipeline->final.ds) == GENX(3DSTATE_DS_length));
-   GENX(3DSTATE_DS_pack)(&pipeline->base.base.batch, pipeline->final.ds, &ds);
-
-   uint32_t *dw =
-      anv_batch_emitn(batch, GENX(3DSTATE_DS_length), GENX(3DSTATE_DS));
-   memcpy(dw, &pipeline->final.ds, sizeof(pipeline->final.ds));
+   }
 }
 
 static UNUSED bool
@@ -1391,63 +1411,59 @@ geom_or_tess_prim_id_used(struct anv_graphics_pipeline *pipeline)
 static void
 emit_3dstate_te(struct anv_graphics_pipeline *pipeline)
 {
-   struct GENX(3DSTATE_TE) te = {
-      GENX(3DSTATE_TE_header),
-   };
-
-   if (anv_pipeline_has_stage(pipeline, MESA_SHADER_TESS_EVAL)) {
-      const struct brw_tes_prog_data *tes_prog_data =
-         get_tes_prog_data(pipeline);
-
-      te.Partitioning = tes_prog_data->partitioning;
-      te.TEDomain = tes_prog_data->domain;
-      te.TEEnable = true;
-      te.MaximumTessellationFactorOdd = 63.0;
-      te.MaximumTessellationFactorNotOdd = 64.0;
+   anv_pipeline_emit(pipeline, partial.te, GENX(3DSTATE_TE), te) {
+      if (anv_pipeline_has_stage(pipeline, MESA_SHADER_TESS_EVAL)) {
+         const struct brw_tes_prog_data *tes_prog_data =
+            get_tes_prog_data(pipeline);
+
+         te.Partitioning = tes_prog_data->partitioning;
+         te.TEDomain = tes_prog_data->domain;
+         te.TEEnable = true;
+         te.MaximumTessellationFactorOdd = 63.0;
+         te.MaximumTessellationFactorNotOdd = 64.0;
 #if GFX_VERx10 >= 125
-      const struct anv_device *device = pipeline->base.base.device;
-      if (intel_needs_workaround(device->info, 22012699309))
-         te.TessellationDistributionMode = TEDMODE_RR_STRICT;
-      else
-         te.TessellationDistributionMode = TEDMODE_RR_FREE;
+         const struct anv_device *device = pipeline->base.base.device;
+         if (intel_needs_workaround(device->info, 22012699309))
+            te.TessellationDistributionMode = TEDMODE_RR_STRICT;
+         else
+            te.TessellationDistributionMode = TEDMODE_RR_FREE;
 
-      if (intel_needs_workaround(device->info, 14015055625)) {
-         /* Wa_14015055625:
-          *
-          * Disable Tessellation Distribution when primitive Id is enabled.
-          */
-         if (pipeline->primitive_id_override ||
-             geom_or_tess_prim_id_used(pipeline))
-            te.TessellationDistributionMode = TEDMODE_OFF;
-      }
+         if (intel_needs_workaround(device->info, 14015055625)) {
+            /* Wa_14015055625:
+             *
+             * Disable Tessellation Distribution when primitive Id is enabled.
+             */
+            if (pipeline->primitive_id_override ||
+                geom_or_tess_prim_id_used(pipeline))
+               te.TessellationDistributionMode = TEDMODE_OFF;
+         }
 
-      te.TessellationDistributionLevel = TEDLEVEL_PATCH;
-      /* 64_TRIANGLES */
-      te.SmallPatchThreshold = 3;
-      /* 1K_TRIANGLES */
-      te.TargetBlockSize = 8;
-      /* 1K_TRIANGLES */
-      te.LocalBOPAccumulatorThreshold = 1;
+         te.TessellationDistributionLevel = TEDLEVEL_PATCH;
+         /* 64_TRIANGLES */
+         te.SmallPatchThreshold = 3;
+         /* 1K_TRIANGLES */
+         te.TargetBlockSize = 8;
+         /* 1K_TRIANGLES */
+         te.LocalBOPAccumulatorThreshold = 1;
 #endif
+      }
    }
-
-   GENX(3DSTATE_TE_pack)(NULL, pipeline->partial.te, &te);
 }
 
 static void
 emit_3dstate_gs(struct anv_graphics_pipeline *pipeline)
 {
+   if (!anv_pipeline_has_stage(pipeline, MESA_SHADER_GEOMETRY)) {
+      anv_pipeline_emit(pipeline, partial.gs, GENX(3DSTATE_GS), gs);
+      return;
+   }
+
    const struct intel_device_info *devinfo = pipeline->base.base.device->info;
    const struct anv_shader_bin *gs_bin =
       pipeline->base.shaders[MESA_SHADER_GEOMETRY];
+   const struct brw_gs_prog_data *gs_prog_data = get_gs_prog_data(pipeline);
 
-   struct GENX(3DSTATE_GS) gs = {
-      GENX(3DSTATE_GS_header),
-   };
-
-   if (anv_pipeline_has_stage(pipeline, MESA_SHADER_GEOMETRY)) {
-       const struct brw_gs_prog_data *gs_prog_data = get_gs_prog_data(pipeline);
-
+   anv_pipeline_emit(pipeline, partial.gs, GENX(3DSTATE_GS), gs) {
       gs.Enable                  = true;
       gs.StatisticsEnable        = true;
       gs.KernelStartPointer      = gs_bin->kernel.offset;
@@ -1493,8 +1509,6 @@ emit_3dstate_gs(struct anv_graphics_pipeline *pipeline)
          get_scratch_address(&pipeline->base.base, MESA_SHADER_GEOMETRY, gs_bin);
 #endif
    }
-
-   GENX(3DSTATE_GS_pack)(&pipeline->base.base.batch, pipeline->partial.gs, &gs);
 }
 
 static bool
@@ -1514,49 +1528,46 @@ emit_3dstate_wm(struct anv_graphics_pipeline *pipeline,
 {
    const struct brw_wm_prog_data *wm_prog_data = get_wm_prog_data(pipeline);
 
-   struct GENX(3DSTATE_WM) wm = {
-      GENX(3DSTATE_WM_header),
-   };
-   wm.StatisticsEnable                    = true;
-   wm.LineEndCapAntialiasingRegionWidth   = _05pixels;
-   wm.LineAntialiasingRegionWidth         = _10pixels;
-   wm.PointRasterizationRule              = RASTRULE_UPPER_LEFT;
+   anv_pipeline_emit(pipeline, partial.wm, GENX(3DSTATE_WM), wm) {
+      wm.StatisticsEnable                    = true;
+      wm.LineEndCapAntialiasingRegionWidth   = _05pixels;
+      wm.LineAntialiasingRegionWidth         = _10pixels;
+      wm.PointRasterizationRule              = RASTRULE_UPPER_LEFT;
 
-   if (anv_pipeline_has_stage(pipeline, MESA_SHADER_FRAGMENT)) {
-      if (wm_prog_data->early_fragment_tests) {
+      if (anv_pipeline_has_stage(pipeline, MESA_SHADER_FRAGMENT)) {
+         if (wm_prog_data->early_fragment_tests) {
             wm.EarlyDepthStencilControl         = EDSC_PREPS;
-      } else if (wm_prog_data->has_side_effects) {
-         wm.EarlyDepthStencilControl         = EDSC_PSEXEC;
-      } else {
-         wm.EarlyDepthStencilControl         = EDSC_NORMAL;
-      }
+         } else if (wm_prog_data->has_side_effects) {
+            wm.EarlyDepthStencilControl         = EDSC_PSEXEC;
+         } else {
+            wm.EarlyDepthStencilControl         = EDSC_NORMAL;
+         }
 
-      /* Gen8 hardware tries to compute ThreadDispatchEnable for us but
-       * doesn't take into account KillPixels when no depth or stencil
-       * writes are enabled.  In order for occlusion queries to work
-       * correctly with no attachments, we need to force-enable PS thread
-       * dispatch.
-       *
-       * The BDW docs are pretty clear that that this bit isn't validated
-       * and probably shouldn't be used in production:
-       *
-       *    "This must always be set to Normal. This field should not be
-       *    tested for functional validation."
-       *
-       * Unfortunately, however, the other mechanism we have for doing this
-       * is 3DSTATE_PS_EXTRA::PixelShaderHasUAV which causes hangs on BDW.
-       * Given two bad options, we choose the one which works.
-       */
-      pipeline->force_fragment_thread_dispatch =
-         wm_prog_data->has_side_effects ||
-         wm_prog_data->uses_kill;
+         /* Gen8 hardware tries to compute ThreadDispatchEnable for us but
+          * doesn't take into account KillPixels when no depth or stencil
+          * writes are enabled. In order for occlusion queries to work
+          * correctly with no attachments, we need to force-enable PS thread
+          * dispatch.
+          *
+          * The BDW docs are pretty clear that that this bit isn't validated
+          * and probably shouldn't be used in production:
+          *
+          *    "This must always be set to Normal. This field should not be
+          *     tested for functional validation."
+          *
+          * Unfortunately, however, the other mechanism we have for doing this
+          * is 3DSTATE_PS_EXTRA::PixelShaderHasUAV which causes hangs on BDW.
+          * Given two bad options, we choose the one which works.
+          */
+         pipeline->force_fragment_thread_dispatch =
+            wm_prog_data->has_side_effects ||
+            wm_prog_data->uses_kill;
 
-      wm.BarycentricInterpolationMode =
-         wm_prog_data_barycentric_modes(wm_prog_data,
-                                        pipeline->fs_msaa_flags);
+         wm.BarycentricInterpolationMode =
+            wm_prog_data_barycentric_modes(wm_prog_data,
+                                           pipeline->fs_msaa_flags);
+      }
    }
-
-   GENX(3DSTATE_WM_pack)(NULL, pipeline->partial.wm, &wm);
 }
 
 static void
@@ -1564,21 +1575,19 @@ emit_3dstate_ps(struct anv_graphics_pipeline *pipeline,
                 const struct vk_multisample_state *ms,
                 const struct vk_color_blend_state *cb)
 {
-   struct anv_batch *batch = &pipeline->base.base.batch;
    UNUSED const struct intel_device_info *devinfo =
       pipeline->base.base.device->info;
    const struct anv_shader_bin *fs_bin =
       pipeline->base.shaders[MESA_SHADER_FRAGMENT];
 
    if (!anv_pipeline_has_stage(pipeline, MESA_SHADER_FRAGMENT)) {
-      anv_batch_emit(batch, GENX(3DSTATE_PS), ps) {
-      }
+      anv_pipeline_emit(pipeline, final.ps, GENX(3DSTATE_PS), ps);
       return;
    }
 
    const struct brw_wm_prog_data *wm_prog_data = get_wm_prog_data(pipeline);
 
-   anv_batch_emit(batch, GENX(3DSTATE_PS), ps) {
+   anv_pipeline_emit(pipeline, final.ps, GENX(3DSTATE_PS), ps) {
       intel_set_ps_dispatch_state(&ps, devinfo, wm_prog_data,
                                   ms != NULL ? ms->rasterization_samples : 1,
                                   pipeline->fs_msaa_flags);
@@ -1629,15 +1638,14 @@ emit_3dstate_ps_extra(struct anv_graphics_pipeline *pipeline,
                       const struct vk_rasterization_state *rs,
                       const struct vk_render_pass_state *rp)
 {
-   struct anv_batch *batch = &pipeline->base.base.batch;
    const struct brw_wm_prog_data *wm_prog_data = get_wm_prog_data(pipeline);
 
    if (!anv_pipeline_has_stage(pipeline, MESA_SHADER_FRAGMENT)) {
-      anv_batch_emit(batch, GENX(3DSTATE_PS_EXTRA), ps);
+      anv_pipeline_emit(pipeline, final.ps_extra, GENX(3DSTATE_PS_EXTRA), ps);
       return;
    }
 
-   anv_batch_emit(batch, GENX(3DSTATE_PS_EXTRA), ps) {
+   anv_pipeline_emit(pipeline, final.ps_extra, GENX(3DSTATE_PS_EXTRA), ps) {
       ps.PixelShaderValid              = true;
       ps.AttributeEnable               = wm_prog_data->num_varying_inputs > 0;
       ps.oMaskPresenttoRenderTarget    = wm_prog_data->uses_omask;
@@ -1689,8 +1697,8 @@ emit_3dstate_ps_extra(struct anv_graphics_pipeline *pipeline,
 static void
 emit_3dstate_vf_statistics(struct anv_graphics_pipeline *pipeline)
 {
-   struct anv_batch *batch = &pipeline->base.base.batch;
-   anv_batch_emit(batch, GENX(3DSTATE_VF_STATISTICS), vfs) {
+   anv_pipeline_emit(pipeline, final.vf_statistics,
+                     GENX(3DSTATE_VF_STATISTICS), vfs) {
       vfs.StatisticsEnable = true;
    }
 }
@@ -1733,10 +1741,9 @@ static void
 emit_3dstate_primitive_replication(struct anv_graphics_pipeline *pipeline,
                                    const struct vk_render_pass_state *rp)
 {
-   struct anv_batch *batch = &pipeline->base.base.batch;
-
    if (anv_pipeline_is_mesh(pipeline)) {
-      anv_batch_emit(batch, GENX(3DSTATE_PRIMITIVE_REPLICATION), pr);
+      anv_pipeline_emit(pipeline, final.primitive_replication,
+                        GENX(3DSTATE_PRIMITIVE_REPLICATION), pr);
       return;
    }
 
@@ -1745,14 +1752,16 @@ emit_3dstate_primitive_replication(struct anv_graphics_pipeline *pipeline,
 
    assert(replication_count >= 1);
    if (replication_count == 1) {
-      anv_batch_emit(batch, GENX(3DSTATE_PRIMITIVE_REPLICATION), pr);
+      anv_pipeline_emit(pipeline, final.primitive_replication,
+                        GENX(3DSTATE_PRIMITIVE_REPLICATION), pr);
       return;
    }
 
    assert(replication_count == util_bitcount(rp->view_mask));
    assert(replication_count <= MAX_VIEWS_FOR_PRIMITIVE_REPLICATION);
 
-   anv_batch_emit(batch, GENX(3DSTATE_PRIMITIVE_REPLICATION), pr) {
+   anv_pipeline_emit(pipeline, final.primitive_replication,
+                     GENX(3DSTATE_PRIMITIVE_REPLICATION), pr) {
       pr.ReplicaMask = (1 << replication_count) - 1;
       pr.ReplicationCount = replication_count - 1;
 
@@ -1769,18 +1778,19 @@ emit_3dstate_primitive_replication(struct anv_graphics_pipeline *pipeline,
 static void
 emit_task_state(struct anv_graphics_pipeline *pipeline)
 {
-   struct anv_batch *batch = &pipeline->base.base.batch;
    assert(anv_pipeline_is_mesh(pipeline));
 
    if (!anv_pipeline_has_stage(pipeline, MESA_SHADER_TASK)) {
-      anv_batch_emit(batch, GENX(3DSTATE_TASK_CONTROL), zero);
+      anv_pipeline_emit(pipeline, final.task_control,
+                        GENX(3DSTATE_TASK_CONTROL), zero);
       return;
    }
 
    const struct anv_shader_bin *task_bin =
       pipeline->base.shaders[MESA_SHADER_TASK];
 
-   anv_batch_emit(batch, GENX(3DSTATE_TASK_CONTROL), tc) {
+   anv_pipeline_emit(pipeline, final.task_control,
+                     GENX(3DSTATE_TASK_CONTROL), tc) {
       tc.TaskShaderEnable = true;
       tc.ScratchSpaceBuffer =
          get_scratch_surf(&pipeline->base.base, MESA_SHADER_TASK, task_bin);
@@ -1792,7 +1802,8 @@ emit_task_state(struct anv_graphics_pipeline *pipeline)
    const struct brw_cs_dispatch_info task_dispatch =
       brw_cs_get_dispatch_info(devinfo, &task_prog_data->base, NULL);
 
-   anv_batch_emit(batch, GENX(3DSTATE_TASK_SHADER), task) {
+   anv_pipeline_emit(pipeline, final.task_shader,
+                     GENX(3DSTATE_TASK_SHADER), task) {
       task.KernelStartPointer                = task_bin->kernel.offset;
       task.SIMDSize                          = task_dispatch.simd_size / 16;
       task.MessageSIMD                       = task.SIMDSize;
@@ -1818,7 +1829,8 @@ emit_task_state(struct anv_graphics_pipeline *pipeline)
    }
 
    /* Recommended values from "Task and Mesh Distribution Programming". */
-   anv_batch_emit(batch, GENX(3DSTATE_TASK_REDISTRIB), redistrib) {
+   anv_pipeline_emit(pipeline, final.task_redistrib,
+                     GENX(3DSTATE_TASK_REDISTRIB), redistrib) {
       redistrib.LocalBOTAccumulatorThreshold = MULTIPLIER_1;
       redistrib.SmallTaskThreshold = 1; /* 2^N */
       redistrib.TargetMeshBatchSize = devinfo->num_slices > 2 ? 3 : 5; /* 2^N */
@@ -1830,12 +1842,12 @@ emit_task_state(struct anv_graphics_pipeline *pipeline)
 static void
 emit_mesh_state(struct anv_graphics_pipeline *pipeline)
 {
-   struct anv_batch *batch = &pipeline->base.base.batch;
    assert(anv_pipeline_is_mesh(pipeline));
 
    const struct anv_shader_bin *mesh_bin = pipeline->base.shaders[MESA_SHADER_MESH];
 
-   anv_batch_emit(batch, GENX(3DSTATE_MESH_CONTROL), mc) {
+   anv_pipeline_emit(pipeline, final.mesh_control,
+                     GENX(3DSTATE_MESH_CONTROL), mc) {
       mc.MeshShaderEnable = true;
       mc.ScratchSpaceBuffer =
          get_scratch_surf(&pipeline->base.base, MESA_SHADER_MESH, mesh_bin);
@@ -1864,7 +1876,8 @@ emit_mesh_state(struct anv_graphics_pipeline *pipeline)
       unreachable("invalid index format");
    }
 
-   anv_batch_emit(batch, GENX(3DSTATE_MESH_SHADER), mesh) {
+   anv_pipeline_emit(pipeline, final.mesh_shader,
+                     GENX(3DSTATE_MESH_SHADER), mesh) {
       mesh.KernelStartPointer                = mesh_bin->kernel.offset;
       mesh.SIMDSize                          = mesh_dispatch.simd_size / 16;
       mesh.MessageSIMD                       = mesh.SIMDSize;
@@ -1897,7 +1910,8 @@ emit_mesh_state(struct anv_graphics_pipeline *pipeline)
    }
 
    /* Recommended values from "Task and Mesh Distribution Programming". */
-   anv_batch_emit(batch, GENX(3DSTATE_MESH_DISTRIB), distrib) {
+   anv_pipeline_emit(pipeline, final.mesh_distrib,
+                     GENX(3DSTATE_MESH_DISTRIB), distrib) {
       distrib.DistributionMode = MESH_RR_FREE;
       distrib.TaskDistributionBatchSize = devinfo->num_slices > 2 ? 4 : 9; /* 2^N thread groups */
       distrib.MeshDistributionBatchSize = devinfo->num_slices > 2 ? 3 : 3; /* 2^N thread groups */
@@ -1909,7 +1923,6 @@ void
 genX(graphics_pipeline_emit)(struct anv_graphics_pipeline *pipeline,
                              const struct vk_graphics_pipeline_state *state)
 {
-   struct anv_batch *batch = &pipeline->base.base.batch;
    enum intel_urb_deref_block_size urb_deref_block_size;
    emit_urb_setup(pipeline, &urb_deref_block_size);
 
@@ -1940,10 +1953,10 @@ genX(graphics_pipeline_emit)(struct anv_graphics_pipeline *pipeline,
       const struct anv_device *device = pipeline->base.base.device;
       /* Disable Mesh. */
       if (device->vk.enabled_extensions.EXT_mesh_shader) {
-         struct anv_batch *batch = &pipeline->base.base.batch;
-
-         anv_batch_emit(batch, GENX(3DSTATE_MESH_CONTROL), zero);
-         anv_batch_emit(batch, GENX(3DSTATE_TASK_CONTROL), zero);
+         anv_pipeline_emit(pipeline, final.mesh_control,
+                           GENX(3DSTATE_MESH_CONTROL), zero);
+         anv_pipeline_emit(pipeline, final.task_control,
+                           GENX(3DSTATE_TASK_CONTROL), zero);
       }
 #endif
    } else {
@@ -1952,7 +1965,7 @@ genX(graphics_pipeline_emit)(struct anv_graphics_pipeline *pipeline,
       /* BSpec 46303 forbids both 3DSTATE_MESH_CONTROL.MeshShaderEnable
        * and 3DSTATE_STREAMOUT.SOFunctionEnable to be 1.
        */
-      anv_batch_emit(batch, GENX(3DSTATE_STREAMOUT), so) {}
+      anv_pipeline_emit(pipeline, partial.so, GENX(3DSTATE_STREAMOUT), so);
 
 #if GFX_VERx10 >= 125
       emit_task_state(pipeline);