anv: split pipeline programming into instructions
authorLionel Landwerlin <lionel.g.landwerlin@intel.com>
Tue, 1 Aug 2023 09:20:19 +0000 (12:20 +0300)
committerMarge Bot <emma+marge@anholt.net>
Wed, 6 Sep 2023 20:07:02 +0000 (20:07 +0000)
The goal of this change is to move away from a single batch buffer
containing all kind of pipeline instructions to a list of instructions
we can emit separately.

We will later implement pipeline diffing and finer state tracking that
will allow fewer instructions to be emitted.

This changes the following things :

   * instead of having a batch & partially packed instructions, move
     everything into the batch

   * add a set of pointers in the batch that allow us to point to each
     instruction (almost... we group some like URB instructions,
     etc...).

At pipeline emission time, we just go through all of those pointers
and emit the instruction into the batch. No additional packing is
involved.

Signed-off-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com>
Reviewed-by: Tapani Pälli <tapani.palli@intel.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/24536>

src/intel/vulkan/anv_batch_chain.c
src/intel/vulkan/anv_genX.h
src/intel/vulkan/anv_private.h
src/intel/vulkan/genX_cmd_buffer.c
src/intel/vulkan/genX_gfx_state.c
src/intel/vulkan/genX_pipeline.c

index 2f8039e..73f7b29 100644 (file)
@@ -136,7 +136,7 @@ anv_reloc_list_clear(struct anv_reloc_list *list)
       memset(list->deps, 0, list->dep_words * sizeof(BITSET_WORD));
 }
 
-static VkResult
+VkResult
 anv_reloc_list_append(struct anv_reloc_list *list,
                       struct anv_reloc_list *other)
 {
index 2779a85..e7ba120 100644 (file)
@@ -96,8 +96,9 @@ void genX(apply_task_urb_workaround)(struct anv_cmd_buffer *cmd_buffer);
 
 void genX(emit_vertex_input)(struct anv_batch *batch,
                              uint32_t *vertex_element_dws,
-                             const struct anv_graphics_pipeline *pipeline,
-                             const struct vk_vertex_input_state *vi);
+                             struct anv_graphics_pipeline *pipeline,
+                             const struct vk_vertex_input_state *vi,
+                             bool emit_in_pipeline);
 
 enum anv_pipe_bits
 genX(emit_apply_pipe_flushes)(struct anv_batch *batch,
@@ -125,7 +126,7 @@ void genX(emit_l3_config)(struct anv_batch *batch,
 void genX(cmd_buffer_config_l3)(struct anv_cmd_buffer *cmd_buffer,
                                 const struct intel_l3_config *cfg);
 
-void genX(cmd_buffer_flush_dynamic_state)(struct anv_cmd_buffer *cmd_buffer);
+void genX(cmd_buffer_flush_gfx_hw_state)(struct anv_cmd_buffer *cmd_buffer);
 
 void genX(cmd_buffer_enable_pma_fix)(struct anv_cmd_buffer *cmd_buffer,
                                      bool enable);
index 2122165..777c398 100644 (file)
@@ -1464,6 +1464,9 @@ anv_reloc_list_add_bo(struct anv_reloc_list *list, struct anv_bo *target_bo)
    return list->uses_relocs ? anv_reloc_list_add_bo_impl(list, target_bo) : VK_SUCCESS;
 }
 
+VkResult anv_reloc_list_append(struct anv_reloc_list *list,
+                               struct anv_reloc_list *other);
+
 struct anv_batch_bo {
    /* Link in the anv_cmd_buffer.owned_batch_bos list */
    struct list_head                             link;
@@ -1603,14 +1606,16 @@ _anv_combine_address(struct anv_batch *batch, void *location,
       __dst;                                               \
    })
 
-#define anv_batch_emit_merge(batch, cmd, prepacked, name)               \
+#define anv_batch_emit_merge(batch, cmd, pipeline, state, name)         \
    for (struct cmd name = { 0 },                                        \
         *_dst = anv_batch_emit_dwords(batch, __anv_cmd_length(cmd));    \
         __builtin_expect(_dst != NULL, 1);                              \
         ({ uint32_t _partial[__anv_cmd_length(cmd)];                    \
            __anv_cmd_pack(cmd)(batch, _partial, &name);                 \
-           for (uint32_t i = 0; i < __anv_cmd_length(cmd); i++)         \
-              ((uint32_t *)_dst)[i] = _partial[i] | (prepacked)[i];     \
+           for (uint32_t i = 0; i < __anv_cmd_length(cmd); i++) {       \
+              ((uint32_t *)_dst)[i] = _partial[i] |                     \
+                 (pipeline)->batch_data[(pipeline)->state.offset + i];  \
+           }                                                            \
            VG(VALGRIND_CHECK_MEM_IS_DEFINED(_dst, __anv_cmd_length(cmd) * 4)); \
            _dst = NULL;                                                 \
          }))
@@ -3515,6 +3520,12 @@ struct anv_graphics_lib_pipeline {
    bool                                         retain_shaders;
 };
 
+struct anv_gfx_state_ptr {
+   /* Both in dwords */
+   uint16_t  offset;
+   uint16_t  len;
+};
+
 /* The final graphics pipeline object has all the graphics state ready to be
  * programmed into HW packets (dynamic_state field) or fully baked in its
  * batch.
@@ -3564,7 +3575,7 @@ struct anv_graphics_pipeline {
     * this array only holds the svgs_count elements.
     */
    uint32_t                                     vertex_input_elems;
-   uint32_t                                     vertex_input_data[96];
+   uint32_t                                     vertex_input_data[2 * 31 /* MAX_VES + 2 internal */];
 
    enum brw_wm_msaa_flags                       fs_msaa_flags;
 
@@ -3575,25 +3586,75 @@ struct anv_graphics_pipeline {
 
    /* Fully backed instructions, ready to be emitted in the anv_cmd_buffer */
    struct {
-      uint32_t                                  hs[9];
-      uint32_t                                  ds[11];
+      struct anv_gfx_state_ptr                  urb;
+      struct anv_gfx_state_ptr                  vf_statistics;
+      struct anv_gfx_state_ptr                  vf_sgvs;
+      struct anv_gfx_state_ptr                  vf_sgvs_2;
+      struct anv_gfx_state_ptr                  vf_sgvs_instancing;
+      struct anv_gfx_state_ptr                  vf_instancing;
+      struct anv_gfx_state_ptr                  primitive_replication;
+      struct anv_gfx_state_ptr                  sbe;
+      struct anv_gfx_state_ptr                  sbe_swiz;
+      struct anv_gfx_state_ptr                  so_decl_list;
+      struct anv_gfx_state_ptr                  ms;
+      struct anv_gfx_state_ptr                  vs;
+      struct anv_gfx_state_ptr                  hs;
+      struct anv_gfx_state_ptr                  ds;
+      struct anv_gfx_state_ptr                  ps;
+      struct anv_gfx_state_ptr                  ps_extra;
+
+      struct anv_gfx_state_ptr                  task_control;
+      struct anv_gfx_state_ptr                  task_shader;
+      struct anv_gfx_state_ptr                  task_redistrib;
+      struct anv_gfx_state_ptr                  clip_mesh;
+      struct anv_gfx_state_ptr                  mesh_control;
+      struct anv_gfx_state_ptr                  mesh_shader;
+      struct anv_gfx_state_ptr                  mesh_distrib;
+      struct anv_gfx_state_ptr                  sbe_mesh;
    } final;
 
    /* Pre packed CS instructions & structures that need to be merged later
     * with dynamic state.
     */
    struct {
-      uint32_t                                  clip[4];
-      uint32_t                                  sf[4];
-      uint32_t                                  raster[5];
-      uint32_t                                  wm[2];
-      uint32_t                                  streamout_state[5];
-      uint32_t                                  gs[10];
-      uint32_t                                  te[4];
-      uint32_t                                  vfg[4];
+      struct anv_gfx_state_ptr                  clip;
+      struct anv_gfx_state_ptr                  sf;
+      struct anv_gfx_state_ptr                  raster;
+      struct anv_gfx_state_ptr                  wm;
+      struct anv_gfx_state_ptr                  so;
+      struct anv_gfx_state_ptr                  gs;
+      struct anv_gfx_state_ptr                  te;
+      struct anv_gfx_state_ptr                  vfg;
    } partial;
 };
 
+#define anv_batch_merge_pipeline_state(batch, dwords0, pipeline, state) \
+   do {                                                                 \
+      uint32_t *dw;                                                     \
+                                                                        \
+      assert(ARRAY_SIZE(dwords0) == (pipeline)->state.len);             \
+      dw = anv_batch_emit_dwords((batch), ARRAY_SIZE(dwords0));         \
+      if (!dw)                                                          \
+         break;                                                         \
+      for (uint32_t i = 0; i < ARRAY_SIZE(dwords0); i++)                \
+         dw[i] = (dwords0)[i] |                                         \
+            (pipeline)->batch_data[(pipeline)->state.offset + i];       \
+      VG(VALGRIND_CHECK_MEM_IS_DEFINED(dw, ARRAY_SIZE(dwords0) * 4));   \
+   } while (0)
+
+#define anv_batch_emit_pipeline_state(batch, pipeline, state)           \
+   do {                                                                 \
+      if ((pipeline)->state.len == 0)                                   \
+         break;                                                         \
+      uint32_t *dw;                                                     \
+      dw = anv_batch_emit_dwords((batch), (pipeline)->state.len);       \
+      if (!dw)                                                          \
+         break;                                                         \
+      memcpy(dw, &(pipeline)->batch_data[(pipeline)->state.offset],     \
+             4 * (pipeline)->state.len);                                \
+   } while (0)
+
+
 struct anv_compute_pipeline {
    struct anv_pipeline                          base;
 
index 6157e55..86ab22a 100644 (file)
@@ -2994,10 +2994,7 @@ genX(emit_hs)(struct anv_cmd_buffer *cmd_buffer)
    if (!anv_pipeline_has_stage(pipeline, MESA_SHADER_TESS_EVAL))
       return;
 
-   uint32_t *dw =
-      anv_batch_emitn(&cmd_buffer->batch, GENX(3DSTATE_HS_length),
-                         GENX(3DSTATE_HS));
-   memcpy(dw, &pipeline->final.hs, sizeof(pipeline->final.hs));
+   anv_batch_emit_pipeline_state(&cmd_buffer->batch, pipeline, final.hs);
 }
 
 ALWAYS_INLINE static void
@@ -3022,10 +3019,7 @@ genX(emit_ds)(struct anv_cmd_buffer *cmd_buffer)
    if (!anv_pipeline_has_stage(pipeline, MESA_SHADER_TESS_EVAL))
       return;
 
-   uint32_t *dw =
-      anv_batch_emitn(&cmd_buffer->batch, GENX(3DSTATE_DS_length),
-                         GENX(3DSTATE_DS));
-   memcpy(dw, &pipeline->final.ds, sizeof(pipeline->final.ds));
+   anv_batch_emit_pipeline_state(&cmd_buffer->batch, pipeline, final.ds);
 #endif
 }
 
@@ -3224,13 +3218,22 @@ genX(cmd_buffer_flush_gfx_state)(struct anv_cmd_buffer *cmd_buffer)
       }
    }
 
-   if (cmd_buffer->state.gfx.dirty & ANV_CMD_DIRTY_PIPELINE) {
-      anv_batch_emit_batch(&cmd_buffer->batch, &pipeline->base.base.batch);
+   if (any_dynamic_state_dirty || cmd_buffer->state.gfx.dirty)
+      genX(cmd_buffer_flush_gfx_hw_state)(cmd_buffer);
 
-      /* If the pipeline changed, we may need to re-allocate push constant
-       * space in the URB.
-       */
+   /* If the pipeline changed, we may need to re-allocate push constant space
+    * in the URB.
+    */
+   if (cmd_buffer->state.gfx.dirty & ANV_CMD_DIRTY_PIPELINE) {
       cmd_buffer_alloc_gfx_push_constants(cmd_buffer);
+
+      /* Also add the relocations (scratch buffers) */
+      VkResult result = anv_reloc_list_append(cmd_buffer->batch.relocs,
+                                              pipeline->base.base.batch.relocs);
+      if (result != VK_SUCCESS) {
+         anv_batch_set_error(&cmd_buffer->batch, result);
+         return;
+      }
    }
 
    /* Render targets live in the same binding table as fragment descriptors */
@@ -3274,8 +3277,9 @@ genX(cmd_buffer_flush_gfx_state)(struct anv_cmd_buffer *cmd_buffer)
                                           dirty & VK_SHADER_STAGE_ALL_GRAPHICS);
    }
 
-   if (any_dynamic_state_dirty || cmd_buffer->state.gfx.dirty)
-      genX(cmd_buffer_flush_dynamic_state)(cmd_buffer);
+   /* When we're done, there is no more dirty gfx state. */
+   vk_dynamic_graphics_state_clear_dirty(&cmd_buffer->vk.dynamic_graphics_state);
+   cmd_buffer->state.gfx.dirty = 0;
 }
 
 #include "genX_cmd_draw_generated_indirect.h"
index 3d4ec69..8a30316 100644 (file)
@@ -215,15 +215,12 @@ genX(cmd_emit_te)(struct anv_cmd_buffer *cmd_buffer)
 
    if (!tes_prog_data ||
        !anv_pipeline_has_stage(pipeline, MESA_SHADER_TESS_EVAL)) {
-      uint32_t *dw =
-         anv_batch_emitn(&cmd_buffer->batch, GENX(3DSTATE_TE_length),
-                         GENX(3DSTATE_TE));
-      memcpy(dw, &pipeline->partial.te, sizeof(pipeline->partial.te));
+      anv_batch_emit_pipeline_state(&cmd_buffer->batch, pipeline, partial.te);
       return;
    }
 
    anv_batch_emit_merge(&cmd_buffer->batch, GENX(3DSTATE_TE),
-                        pipeline->partial.te, te) {
+                        pipeline, partial.te, te) {
       if (dyn->ts.domain_origin == VK_TESSELLATION_DOMAIN_ORIGIN_LOWER_LEFT) {
          te.OutputTopology = tes_prog_data->output_topology;
       } else {
@@ -244,14 +241,14 @@ genX(emit_gs)(struct anv_cmd_buffer *cmd_buffer)
 {
    struct anv_graphics_pipeline *pipeline = cmd_buffer->state.gfx.pipeline;
    if (!anv_pipeline_has_stage(pipeline, MESA_SHADER_GEOMETRY)) {
-      anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_GS), gs);
+      anv_batch_emit_pipeline_state(&cmd_buffer->batch, pipeline, partial.gs);
       return;
    }
 
    const struct vk_dynamic_graphics_state *dyn =
       &cmd_buffer->vk.dynamic_graphics_state;
    anv_batch_emit_merge(&cmd_buffer->batch, GENX(3DSTATE_GS),
-                        pipeline->partial.gs, gs) {
+                        pipeline, partial.gs, gs) {
       switch (dyn->rs.provoking_vertex) {
       case VK_PROVOKING_VERTEX_MODE_FIRST_VERTEX_EXT:
          gs.ReorderMode = LEADING;
@@ -463,7 +460,7 @@ cmd_buffer_emit_clip(struct anv_cmd_buffer *cmd_buffer)
       return;
 
    anv_batch_emit_merge(&cmd_buffer->batch, GENX(3DSTATE_CLIP),
-                        pipeline->partial.clip, clip) {
+                        pipeline, partial.clip, clip) {
       /* Take dynamic primitive topology in to account with
        *    3DSTATE_CLIP::ViewportXYClipTestEnable
        */
@@ -532,7 +529,7 @@ cmd_buffer_emit_streamout(struct anv_cmd_buffer *cmd_buffer)
    genX(streamout_prologue)(cmd_buffer);
 
    anv_batch_emit_merge(&cmd_buffer->batch, GENX(3DSTATE_STREAMOUT),
-                        pipeline->partial.streamout_state, so) {
+                        pipeline, partial.so, so) {
       so.RenderingDisable = dyn->rs.rasterizer_discard_enable;
       so.RenderStreamSelect = dyn->rs.rasterization_stream;
 #if INTEL_NEEDS_WA_18022508906
@@ -802,13 +799,58 @@ cmd_buffer_emit_scissor(struct anv_cmd_buffer *cmd_buffer)
    }
 }
 
+#define cmd_buffer_emit_pipeline_state(batch, pipeline, state)          \
+   do {                                                                 \
+      if ((pipeline)->state.len == 0)                                   \
+         break;                                                         \
+      void *dw = anv_batch_emit_dwords(batch, (pipeline)->state.len);   \
+      if (!dw)                                                          \
+         break;                                                         \
+      memcpy(dw,                                                        \
+             &(pipeline)->batch_data[(pipeline)->state.offset],         \
+             4 * (pipeline)->state.len);                                \
+   } while (0)
+
+
 void
-genX(cmd_buffer_flush_dynamic_state)(struct anv_cmd_buffer *cmd_buffer)
+genX(cmd_buffer_flush_gfx_hw_state)(struct anv_cmd_buffer *cmd_buffer)
 {
    struct anv_graphics_pipeline *pipeline = cmd_buffer->state.gfx.pipeline;
    struct vk_dynamic_graphics_state *dyn =
       &cmd_buffer->vk.dynamic_graphics_state;
 
+   if (cmd_buffer->state.gfx.dirty & ANV_CMD_DIRTY_PIPELINE) {
+      struct anv_batch *batch = &cmd_buffer->batch;
+
+      cmd_buffer_emit_pipeline_state(batch, pipeline, final.urb);
+      cmd_buffer_emit_pipeline_state(batch, pipeline, final.ms);
+      cmd_buffer_emit_pipeline_state(batch, pipeline, final.primitive_replication);
+      cmd_buffer_emit_pipeline_state(batch, pipeline, final.vf_instancing);
+      cmd_buffer_emit_pipeline_state(batch, pipeline, final.vf_sgvs_instancing);
+      cmd_buffer_emit_pipeline_state(batch, pipeline, final.vf_sgvs);
+      cmd_buffer_emit_pipeline_state(batch, pipeline, final.vf_sgvs_2);
+      cmd_buffer_emit_pipeline_state(batch, pipeline, final.vs);
+      cmd_buffer_emit_pipeline_state(batch, pipeline, final.hs);
+      cmd_buffer_emit_pipeline_state(batch, pipeline, final.ds);
+      cmd_buffer_emit_pipeline_state(batch, pipeline, final.vf_statistics);
+      cmd_buffer_emit_pipeline_state(batch, pipeline, final.so_decl_list);
+      cmd_buffer_emit_pipeline_state(batch, pipeline, final.sbe);
+      cmd_buffer_emit_pipeline_state(batch, pipeline, final.sbe_swiz);
+      cmd_buffer_emit_pipeline_state(batch, pipeline, final.ps);
+      cmd_buffer_emit_pipeline_state(batch, pipeline, final.ps_extra);
+
+      if (cmd_buffer->device->vk.enabled_extensions.EXT_mesh_shader) {
+         cmd_buffer_emit_pipeline_state(batch, pipeline, final.task_control);
+         cmd_buffer_emit_pipeline_state(batch, pipeline, final.task_shader);
+         cmd_buffer_emit_pipeline_state(batch, pipeline, final.task_redistrib);
+         cmd_buffer_emit_pipeline_state(batch, pipeline, final.clip_mesh);
+         cmd_buffer_emit_pipeline_state(batch, pipeline, final.mesh_control);
+         cmd_buffer_emit_pipeline_state(batch, pipeline, final.mesh_shader);
+         cmd_buffer_emit_pipeline_state(batch, pipeline, final.mesh_distrib);
+         cmd_buffer_emit_pipeline_state(batch, pipeline, final.sbe_mesh);
+      }
+   }
+
    cmd_buffer_emit_clip(cmd_buffer);
 
    if ((cmd_buffer->state.gfx.dirty & (ANV_CMD_DIRTY_PIPELINE |
@@ -865,7 +907,7 @@ genX(cmd_buffer_flush_dynamic_state)(struct anv_cmd_buffer *cmd_buffer)
          } else {
             /* Use dyn->vi to emit the dynamic VERTEX_ELEMENT_STATE input. */
             genX(emit_vertex_input)(&cmd_buffer->batch, p + 1,
-                                    pipeline, dyn->vi);
+                                    pipeline, dyn->vi, false /* emit_in_pipeline */);
             /* Then append the VERTEX_ELEMENT_STATE for the draw parameters */
             memcpy(p + 1 + 2 * pipeline->vs_input_elements,
                    pipeline->vertex_input_data,
@@ -896,7 +938,7 @@ genX(cmd_buffer_flush_dynamic_state)(struct anv_cmd_buffer *cmd_buffer)
        BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_RS_PROVOKING_VERTEX) ||
        BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_RS_DEPTH_BIAS_FACTORS)) {
       anv_batch_emit_merge(&cmd_buffer->batch, GENX(3DSTATE_SF),
-                           pipeline->partial.sf, sf) {
+                           pipeline, partial.sf, sf) {
          ANV_SETUP_PROVOKING_VERTEX(sf, dyn->rs.provoking_vertex);
 
          sf.LineWidth = dyn->rs.line.width;
@@ -978,7 +1020,7 @@ genX(cmd_buffer_flush_dynamic_state)(struct anv_cmd_buffer *cmd_buffer)
          vk_rasterization_state_depth_clip_enable(&dyn->rs);
 
       anv_batch_emit_merge(&cmd_buffer->batch, GENX(3DSTATE_RASTER),
-                           pipeline->partial.raster, raster) {
+                           pipeline, partial.raster, raster) {
          raster.APIMode = api_mode;
          raster.DXMultisampleRasterizationEnable   = msaa_raster_enable;
          raster.AntialiasingEnable                 = aa_enable;
@@ -1120,7 +1162,7 @@ genX(cmd_buffer_flush_dynamic_state)(struct anv_cmd_buffer *cmd_buffer)
    if ((cmd_buffer->state.gfx.dirty & ANV_CMD_DIRTY_PIPELINE) ||
        BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_IA_PRIMITIVE_RESTART_ENABLE)) {
       anv_batch_emit_merge(&cmd_buffer->batch, GENX(3DSTATE_VFG),
-                           pipeline->partial.vfg, vfg) {
+                           pipeline, partial.vfg, vfg) {
          vfg.ListCutIndexEnable = dyn->ia.primitive_restart_enable;
       }
    }
@@ -1141,7 +1183,7 @@ genX(cmd_buffer_flush_dynamic_state)(struct anv_cmd_buffer *cmd_buffer)
        * threads.
        */
       anv_batch_emit_merge(&cmd_buffer->batch, GENX(3DSTATE_WM),
-                           pipeline->partial.wm, wm) {
+                           pipeline, partial.wm, wm) {
          wm.ForceThreadDispatchEnable = anv_pipeline_has_stage(pipeline, MESA_SHADER_FRAGMENT) &&
                                         (pipeline->force_fragment_thread_dispatch ||
                                         anv_cmd_buffer_all_color_write_masked(cmd_buffer)) ?
@@ -1365,8 +1407,4 @@ genX(cmd_buffer_flush_dynamic_state)(struct anv_cmd_buffer *cmd_buffer)
          ccp.ColorCalcStatePointerValid = true;
       }
    }
-
-   /* When we're done, there is no more dirty gfx state. */
-   vk_dynamic_graphics_state_clear_dirty(&cmd_buffer->vk.dynamic_graphics_state);
-   cmd_buffer->state.gfx.dirty = 0;
 }
index 8174ab5..b7f4dba 100644 (file)
 #include "vk_log.h"
 #include "vk_render_pass.h"
 
+static inline struct anv_batch *
+anv_gfx_pipeline_add(struct anv_graphics_pipeline *pipeline,
+                     struct anv_gfx_state_ptr *ptr,
+                     uint32_t n_dwords)
+{
+   struct anv_batch *batch = &pipeline->base.base.batch;
+
+   assert(ptr->len == 0 ||
+          (batch->next - batch->start) / 4 == (ptr->offset + ptr->len));
+   if (ptr->len == 0)
+      ptr->offset = (batch->next - batch->start) / 4;
+   ptr->len += n_dwords;
+
+   return batch;
+}
+
+#define anv_pipeline_emit(pipeline, state, cmd, name)                   \
+   for (struct cmd name = { __anv_cmd_header(cmd) },                    \
+           *_dst = anv_batch_emit_dwords(                               \
+              anv_gfx_pipeline_add(pipeline,                            \
+                                   &(pipeline)->state,                  \
+                                   __anv_cmd_length(cmd)),              \
+              __anv_cmd_length(cmd));                                   \
+        __builtin_expect(_dst != NULL, 1);                              \
+        ({ __anv_cmd_pack(cmd)(&(pipeline)->base.base.batch,            \
+                               _dst, &name);                            \
+           VG(VALGRIND_CHECK_MEM_IS_DEFINED(_dst, __anv_cmd_length(cmd) * 4)); \
+           _dst = NULL;                                                 \
+        }))
+
+#define anv_pipeline_emitn(pipeline, state, n, cmd, ...) ({             \
+   void *__dst = anv_batch_emit_dwords(                                 \
+      anv_gfx_pipeline_add(pipeline, &(pipeline)->state, n), n);        \
+   if (__dst) {                                                         \
+      struct cmd __template = {                                         \
+         __anv_cmd_header(cmd),                                         \
+         .DWordLength = n - __anv_cmd_length_bias(cmd),                 \
+         __VA_ARGS__                                                    \
+      };                                                                \
+      __anv_cmd_pack(cmd)(&pipeline->base.base.batch,                   \
+                          __dst, &__template);                          \
+   }                                                                    \
+   __dst;                                                               \
+   })
+
+
 static uint32_t
 vertex_element_comp_control(enum isl_format format, unsigned comp)
 {
@@ -91,8 +137,9 @@ vertex_element_comp_control(enum isl_format format, unsigned comp)
 void
 genX(emit_vertex_input)(struct anv_batch *batch,
                         uint32_t *vertex_element_dws,
-                        const struct anv_graphics_pipeline *pipeline,
-                        const struct vk_vertex_input_state *vi)
+                        struct anv_graphics_pipeline *pipeline,
+                        const struct vk_vertex_input_state *vi,
+                        bool emit_in_pipeline)
 {
    const struct anv_device *device = pipeline->base.base.device;
    const struct brw_vs_prog_data *vs_prog_data = get_vs_prog_data(pipeline);
@@ -169,15 +216,28 @@ genX(emit_vertex_input)(struct anv_batch *batch,
        * that controls instancing.  On Haswell and prior, that's part of
        * VERTEX_BUFFER_STATE which we emit later.
        */
-      anv_batch_emit(batch, GENX(3DSTATE_VF_INSTANCING), vfi) {
-         bool per_instance = vi->bindings[binding].input_rate ==
-                             VK_VERTEX_INPUT_RATE_INSTANCE;
-         uint32_t divisor = vi->bindings[binding].divisor *
-                            pipeline->instance_multiplier;
-
-         vfi.InstancingEnable = per_instance;
-         vfi.VertexElementIndex = slot;
-         vfi.InstanceDataStepRate = per_instance ? divisor : 1;
+      if (emit_in_pipeline) {
+         anv_pipeline_emit(pipeline, final.vf_instancing, GENX(3DSTATE_VF_INSTANCING), vfi) {
+            bool per_instance = vi->bindings[binding].input_rate ==
+               VK_VERTEX_INPUT_RATE_INSTANCE;
+            uint32_t divisor = vi->bindings[binding].divisor *
+               pipeline->instance_multiplier;
+
+            vfi.InstancingEnable = per_instance;
+            vfi.VertexElementIndex = slot;
+            vfi.InstanceDataStepRate = per_instance ? divisor : 1;
+         }
+      } else {
+         anv_batch_emit(batch, GENX(3DSTATE_VF_INSTANCING), vfi) {
+            bool per_instance = vi->bindings[binding].input_rate ==
+               VK_VERTEX_INPUT_RATE_INSTANCE;
+            uint32_t divisor = vi->bindings[binding].divisor *
+               pipeline->instance_multiplier;
+
+            vfi.InstancingEnable = per_instance;
+            vfi.VertexElementIndex = slot;
+            vfi.InstanceDataStepRate = per_instance ? divisor : 1;
+         }
       }
    }
 }
@@ -187,15 +247,13 @@ emit_vertex_input(struct anv_graphics_pipeline *pipeline,
                   const struct vk_graphics_pipeline_state *state,
                   const struct vk_vertex_input_state *vi)
 {
-   struct anv_batch *batch = &pipeline->base.base.batch;
-
    /* Only pack the VERTEX_ELEMENT_STATE if not dynamic so we can just memcpy
     * everything in gfx8_cmd_buffer.c
     */
    if (!BITSET_TEST(state->dynamic, MESA_VK_DYNAMIC_VI)) {
-      genX(emit_vertex_input)(batch,
+      genX(emit_vertex_input)(NULL,
                               pipeline->vertex_input_data,
-                              pipeline, vi);
+                              pipeline, vi, true /* emit_in_pipeline */);
    }
 
    const struct brw_vs_prog_data *vs_prog_data = get_vs_prog_data(pipeline);
@@ -207,6 +265,7 @@ emit_vertex_input(struct anv_graphics_pipeline *pipeline,
       assert(pipeline->vertex_input_elems >= pipeline->svgs_count);
       uint32_t slot_offset =
          pipeline->vertex_input_elems - pipeline->svgs_count;
+
       if (needs_svgs_elem) {
 #if GFX_VER < 11
          /* From the Broadwell PRM for the 3D_Vertex_Component_Control enum:
@@ -243,7 +302,8 @@ emit_vertex_input(struct anv_graphics_pipeline *pipeline,
                                          &element);
          slot_offset++;
 
-         anv_batch_emit(batch, GENX(3DSTATE_VF_INSTANCING), vfi) {
+         anv_pipeline_emit(pipeline, final.vf_sgvs_instancing,
+                           GENX(3DSTATE_VF_INSTANCING), vfi) {
             vfi.VertexElementIndex = id_slot;
          }
       }
@@ -268,13 +328,14 @@ emit_vertex_input(struct anv_graphics_pipeline *pipeline,
                                          &element);
          slot_offset++;
 
-         anv_batch_emit(batch, GENX(3DSTATE_VF_INSTANCING), vfi) {
+         anv_pipeline_emit(pipeline, final.vf_sgvs_instancing,
+                           GENX(3DSTATE_VF_INSTANCING), vfi) {
             vfi.VertexElementIndex = drawid_slot;
          }
       }
    }
 
-   anv_batch_emit(batch, GENX(3DSTATE_VF_SGVS), sgvs) {
+   anv_pipeline_emit(pipeline, final.vf_sgvs, GENX(3DSTATE_VF_SGVS), sgvs) {
       sgvs.VertexIDEnable              = vs_prog_data->uses_vertexid;
       sgvs.VertexIDComponentNumber     = 2;
       sgvs.VertexIDElementOffset       = id_slot;
@@ -284,7 +345,7 @@ emit_vertex_input(struct anv_graphics_pipeline *pipeline,
    }
 
 #if GFX_VER >= 11
-   anv_batch_emit(batch, GENX(3DSTATE_VF_SGVS_2), sgvs) {
+   anv_pipeline_emit(pipeline, final.vf_sgvs_2, GENX(3DSTATE_VF_SGVS_2), sgvs) {
       /* gl_BaseVertex */
       sgvs.XP0Enable                   = vs_prog_data->uses_firstvertex;
       sgvs.XP0SourceSelect             = XP0_PARAMETER;
@@ -306,32 +367,30 @@ emit_vertex_input(struct anv_graphics_pipeline *pipeline,
 
 #if GFX_VERx10 >= 125
    struct anv_device *device = pipeline->base.base.device;
-   struct GENX(3DSTATE_VFG) vfg = {
-      GENX(3DSTATE_VFG_header),
+   anv_pipeline_emit(pipeline, partial.vfg, GENX(3DSTATE_VFG), vfg) {
       /* If 3DSTATE_TE: TE Enable == 1 then RR_STRICT else RR_FREE*/
-      .DistributionMode =
+      vfg.DistributionMode =
          anv_pipeline_has_stage(pipeline, MESA_SHADER_TESS_EVAL) ? RR_STRICT :
-         RR_FREE,
-      .DistributionGranularity = BatchLevelGranularity,
-   };
-   /* Wa_14014890652 */
-   if (intel_device_info_is_dg2(device->info))
-      vfg.GranularityThresholdDisable = 1;
-   /* 192 vertices for TRILIST_ADJ */
-   vfg.ListNBatchSizeScale = 0;
-   /* Batch size of 384 vertices */
-   vfg.List3BatchSizeScale = 2;
-   /* Batch size of 128 vertices */
-   vfg.List2BatchSizeScale = 1;
-   /* Batch size of 128 vertices */
-   vfg.List1BatchSizeScale = 2;
-   /* Batch size of 256 vertices for STRIP topologies */
-   vfg.StripBatchSizeScale = 3;
-   /* 192 control points for PATCHLIST_3 */
-   vfg.PatchBatchSizeScale = 1;
-   /* 192 control points for PATCHLIST_3 */
-   vfg.PatchBatchSizeMultiplier = 31;
-   GENX(3DSTATE_VFG_pack)(NULL, pipeline->partial.vfg, &vfg);
+         RR_FREE;
+      vfg.DistributionGranularity = BatchLevelGranularity;
+      /* Wa_14014890652 */
+      if (intel_device_info_is_dg2(device->info))
+         vfg.GranularityThresholdDisable = 1;
+      /* 192 vertices for TRILIST_ADJ */
+      vfg.ListNBatchSizeScale = 0;
+      /* Batch size of 384 vertices */
+      vfg.List3BatchSizeScale = 2;
+      /* Batch size of 128 vertices */
+      vfg.List2BatchSizeScale = 1;
+      /* Batch size of 128 vertices */
+      vfg.List1BatchSizeScale = 2;
+      /* Batch size of 256 vertices for STRIP topologies */
+      vfg.StripBatchSizeScale = 3;
+      /* 192 control points for PATCHLIST_3 */
+      vfg.PatchBatchSizeScale = 1;
+      /* 192 control points for PATCHLIST_3 */
+      vfg.PatchBatchSizeMultiplier = 31;
+   }
 #endif
 }
 
@@ -375,7 +434,6 @@ static void
 emit_urb_setup_mesh(struct anv_graphics_pipeline *pipeline,
                     enum intel_urb_deref_block_size *deref_block_size)
 {
-   struct anv_batch *batch = &pipeline->base.base.batch;
    const struct intel_device_info *devinfo = pipeline->base.base.device->info;
 
    const struct brw_task_prog_data *task_prog_data =
@@ -390,12 +448,12 @@ emit_urb_setup_mesh(struct anv_graphics_pipeline *pipeline,
 
    /* Zero out the primitive pipeline URB allocations. */
    for (int i = 0; i <= MESA_SHADER_GEOMETRY; i++) {
-      anv_batch_emit(batch, GENX(3DSTATE_URB_VS), urb) {
+      anv_pipeline_emit(pipeline, final.urb, GENX(3DSTATE_URB_VS), urb) {
          urb._3DCommandSubOpcode += i;
       }
    }
 
-   anv_batch_emit(batch, GENX(3DSTATE_URB_ALLOC_TASK), urb) {
+   anv_pipeline_emit(pipeline, final.urb, GENX(3DSTATE_URB_ALLOC_TASK), urb) {
       if (task_prog_data) {
          urb.TASKURBEntryAllocationSize   = alloc.task_entry_size_64b - 1;
          urb.TASKNumberofURBEntriesSlice0 = alloc.task_entries;
@@ -405,7 +463,7 @@ emit_urb_setup_mesh(struct anv_graphics_pipeline *pipeline,
       }
    }
 
-   anv_batch_emit(batch, GENX(3DSTATE_URB_ALLOC_MESH), urb) {
+   anv_pipeline_emit(pipeline, final.urb, GENX(3DSTATE_URB_ALLOC_MESH), urb) {
       urb.MESHURBEntryAllocationSize   = alloc.mesh_entry_size_64b - 1;
       urb.MESHNumberofURBEntriesSlice0 = alloc.mesh_entries;
       urb.MESHNumberofURBEntriesSliceN = alloc.mesh_entries;
@@ -437,192 +495,207 @@ emit_urb_setup(struct anv_graphics_pipeline *pipeline,
       entry_size[i] = prog_data ? prog_data->urb_entry_size : 1;
    }
 
-   genX(emit_urb_setup)(pipeline->base.base.device,
-                        &pipeline->base.base.batch,
+   struct anv_device *device = pipeline->base.base.device;
+   const struct intel_device_info *devinfo = device->info;
+
+   unsigned entries[4];
+   unsigned start[4];
+   bool constrained;
+   intel_get_urb_config(devinfo,
                         pipeline->base.base.l3_config,
-                        pipeline->base.base.active_stages, entry_size,
-                        deref_block_size);
+                        pipeline->base.base.active_stages &
+                           VK_SHADER_STAGE_TESSELLATION_EVALUATION_BIT,
+                        pipeline->base.base.active_stages &
+                           VK_SHADER_STAGE_GEOMETRY_BIT,
+                        entry_size, entries, start, deref_block_size,
+                        &constrained);
+
+   for (int i = 0; i <= MESA_SHADER_GEOMETRY; i++) {
+      anv_pipeline_emit(pipeline, final.urb, GENX(3DSTATE_URB_VS), urb) {
+         urb._3DCommandSubOpcode      += i;
+         urb.VSURBStartingAddress      = start[i];
+         urb.VSURBEntryAllocationSize  = entry_size[i] - 1;
+         urb.VSNumberofURBEntries      = entries[i];
+      }
+   }
+#if GFX_VERx10 >= 125
+   if (device->vk.enabled_extensions.EXT_mesh_shader) {
+      anv_pipeline_emit(pipeline, final.urb, GENX(3DSTATE_URB_ALLOC_TASK), zero);
+      anv_pipeline_emit(pipeline, final.urb, GENX(3DSTATE_URB_ALLOC_MESH), zero);
+   }
+#endif
+
 }
 
 static void
 emit_3dstate_sbe(struct anv_graphics_pipeline *pipeline)
 {
-   struct anv_batch *batch = &pipeline->base.base.batch;
    const struct brw_wm_prog_data *wm_prog_data = get_wm_prog_data(pipeline);
 
    if (!anv_pipeline_has_stage(pipeline, MESA_SHADER_FRAGMENT)) {
-      anv_batch_emit(batch, GENX(3DSTATE_SBE), sbe);
-      anv_batch_emit(batch, GENX(3DSTATE_SBE_SWIZ), sbe);
+      anv_pipeline_emit(pipeline, final.sbe, GENX(3DSTATE_SBE), sbe);
+      anv_pipeline_emit(pipeline, final.sbe_swiz, GENX(3DSTATE_SBE_SWIZ), sbe);
 #if GFX_VERx10 >= 125
       if (anv_pipeline_is_mesh(pipeline))
-         anv_batch_emit(batch, GENX(3DSTATE_SBE_MESH), sbe_mesh);
+         anv_pipeline_emit(pipeline, final.sbe_mesh, GENX(3DSTATE_SBE_MESH), sbe);
 #endif
       return;
    }
 
-   struct GENX(3DSTATE_SBE) sbe = {
-      GENX(3DSTATE_SBE_header),
+   anv_pipeline_emit(pipeline, final.sbe, GENX(3DSTATE_SBE), sbe) {
+   anv_pipeline_emit(pipeline, final.sbe_swiz, GENX(3DSTATE_SBE_SWIZ), swiz) {
+
       /* TODO(mesh): Figure out cases where we need attribute swizzling.  See also
        * calculate_urb_setup() and related functions.
        */
-      .AttributeSwizzleEnable = anv_pipeline_is_primitive(pipeline),
-      .PointSpriteTextureCoordinateOrigin = UPPERLEFT,
-      .NumberofSFOutputAttributes = wm_prog_data->num_varying_inputs,
-      .ConstantInterpolationEnable = wm_prog_data->flat_inputs,
-   };
-
-   for (unsigned i = 0; i < 32; i++)
-      sbe.AttributeActiveComponentFormat[i] = ACF_XYZW;
-
-   /* On Broadwell, they broke 3DSTATE_SBE into two packets */
-   struct GENX(3DSTATE_SBE_SWIZ) swiz = {
-      GENX(3DSTATE_SBE_SWIZ_header),
-   };
-
-   if (anv_pipeline_is_primitive(pipeline)) {
-      const struct brw_vue_map *fs_input_map =
-         &anv_pipeline_get_last_vue_prog_data(pipeline)->vue_map;
-
-      int first_slot = brw_compute_first_urb_slot_required(wm_prog_data->inputs,
-                                                           fs_input_map);
-      assert(first_slot % 2 == 0);
-      unsigned urb_entry_read_offset = first_slot / 2;
-      int max_source_attr = 0;
-      for (uint8_t idx = 0; idx < wm_prog_data->urb_setup_attribs_count; idx++) {
-         uint8_t attr = wm_prog_data->urb_setup_attribs[idx];
-         int input_index = wm_prog_data->urb_setup[attr];
-
-         assert(0 <= input_index);
-
-         /* gl_Viewport, gl_Layer and FragmentShadingRateKHR are stored in the
-          * VUE header
-          */
-         if (attr == VARYING_SLOT_VIEWPORT ||
-             attr == VARYING_SLOT_LAYER ||
-             attr == VARYING_SLOT_PRIMITIVE_SHADING_RATE) {
-            continue;
-         }
-
-         if (attr == VARYING_SLOT_PNTC) {
-            sbe.PointSpriteTextureCoordinateEnable = 1 << input_index;
-            continue;
-         }
-
-         const int slot = fs_input_map->varying_to_slot[attr];
-
-         if (slot == -1) {
-            /* This attribute does not exist in the VUE--that means that the
-             * vertex shader did not write to it.  It could be that it's a
-             * regular varying read by the fragment shader but not written by
-             * the vertex shader or it's gl_PrimitiveID. In the first case the
-             * value is undefined, in the second it needs to be
-             * gl_PrimitiveID.
+      sbe.AttributeSwizzleEnable = anv_pipeline_is_primitive(pipeline);
+      sbe.PointSpriteTextureCoordinateOrigin = UPPERLEFT;
+      sbe.NumberofSFOutputAttributes = wm_prog_data->num_varying_inputs;
+      sbe.ConstantInterpolationEnable = wm_prog_data->flat_inputs;
+
+      for (unsigned i = 0; i < 32; i++)
+         sbe.AttributeActiveComponentFormat[i] = ACF_XYZW;
+
+      if (anv_pipeline_is_primitive(pipeline)) {
+         const struct brw_vue_map *fs_input_map =
+            &anv_pipeline_get_last_vue_prog_data(pipeline)->vue_map;
+
+         int first_slot =
+            brw_compute_first_urb_slot_required(wm_prog_data->inputs,
+                                                fs_input_map);
+         assert(first_slot % 2 == 0);
+         unsigned urb_entry_read_offset = first_slot / 2;
+         int max_source_attr = 0;
+         for (uint8_t idx = 0; idx < wm_prog_data->urb_setup_attribs_count; idx++) {
+            uint8_t attr = wm_prog_data->urb_setup_attribs[idx];
+            int input_index = wm_prog_data->urb_setup[attr];
+
+            assert(0 <= input_index);
+
+            /* gl_Viewport, gl_Layer and FragmentShadingRateKHR are stored in the
+             * VUE header
+             */
+            if (attr == VARYING_SLOT_VIEWPORT ||
+                attr == VARYING_SLOT_LAYER ||
+                attr == VARYING_SLOT_PRIMITIVE_SHADING_RATE) {
+               continue;
+            }
+
+            if (attr == VARYING_SLOT_PNTC) {
+               sbe.PointSpriteTextureCoordinateEnable = 1 << input_index;
+               continue;
+            }
+
+            const int slot = fs_input_map->varying_to_slot[attr];
+
+            if (slot == -1) {
+               /* This attribute does not exist in the VUE--that means that
+                * the vertex shader did not write to it. It could be that it's
+                * a regular varying read by the fragment shader but not
+                * written by the vertex shader or it's gl_PrimitiveID. In the
+                * first case the value is undefined, in the second it needs to
+                * be gl_PrimitiveID.
+                */
+               swiz.Attribute[input_index].ConstantSource = PRIM_ID;
+               swiz.Attribute[input_index].ComponentOverrideX = true;
+               swiz.Attribute[input_index].ComponentOverrideY = true;
+               swiz.Attribute[input_index].ComponentOverrideZ = true;
+               swiz.Attribute[input_index].ComponentOverrideW = true;
+               continue;
+            }
+
+            /* We have to subtract two slots to account for the URB entry
+             * output read offset in the VS and GS stages.
+             */
+            const int source_attr = slot - 2 * urb_entry_read_offset;
+            assert(source_attr >= 0 && source_attr < 32);
+            max_source_attr = MAX2(max_source_attr, source_attr);
+            /* The hardware can only do overrides on 16 overrides at a time,
+             * and the other up to 16 have to be lined up so that the input
+             * index = the output index. We'll need to do some tweaking to
+             * make sure that's the case.
              */
-            swiz.Attribute[input_index].ConstantSource = PRIM_ID;
-            swiz.Attribute[input_index].ComponentOverrideX = true;
-            swiz.Attribute[input_index].ComponentOverrideY = true;
-            swiz.Attribute[input_index].ComponentOverrideZ = true;
-            swiz.Attribute[input_index].ComponentOverrideW = true;
-            continue;
+            if (input_index < 16)
+               swiz.Attribute[input_index].SourceAttribute = source_attr;
+            else
+               assert(source_attr == input_index);
          }
 
-         /* We have to subtract two slots to account for the URB entry output
-          * read offset in the VS and GS stages.
-          */
-         const int source_attr = slot - 2 * urb_entry_read_offset;
-         assert(source_attr >= 0 && source_attr < 32);
-         max_source_attr = MAX2(max_source_attr, source_attr);
-         /* The hardware can only do overrides on 16 overrides at a time, and the
-          * other up to 16 have to be lined up so that the input index = the
-          * output index. We'll need to do some tweaking to make sure that's the
-          * case.
-          */
-         if (input_index < 16)
-            swiz.Attribute[input_index].SourceAttribute = source_attr;
-         else
-            assert(source_attr == input_index);
-      }
-
-      sbe.VertexURBEntryReadOffset = urb_entry_read_offset;
-      sbe.VertexURBEntryReadLength = DIV_ROUND_UP(max_source_attr + 1, 2);
-      sbe.ForceVertexURBEntryReadOffset = true;
-      sbe.ForceVertexURBEntryReadLength = true;
+         sbe.VertexURBEntryReadOffset = urb_entry_read_offset;
+         sbe.VertexURBEntryReadLength = DIV_ROUND_UP(max_source_attr + 1, 2);
+         sbe.ForceVertexURBEntryReadOffset = true;
+         sbe.ForceVertexURBEntryReadLength = true;
 
-      /* Ask the hardware to supply PrimitiveID if the fragment shader
-       * reads it but a previous stage didn't write one.
-       */
-      if ((wm_prog_data->inputs & VARYING_BIT_PRIMITIVE_ID) &&
-          fs_input_map->varying_to_slot[VARYING_SLOT_PRIMITIVE_ID] == -1) {
-         sbe.PrimitiveIDOverrideAttributeSelect =
-            wm_prog_data->urb_setup[VARYING_SLOT_PRIMITIVE_ID];
-         sbe.PrimitiveIDOverrideComponentX = true;
-         sbe.PrimitiveIDOverrideComponentY = true;
-         sbe.PrimitiveIDOverrideComponentZ = true;
-         sbe.PrimitiveIDOverrideComponentW = true;
-         pipeline->primitive_id_override = true;
-      }
-   } else {
-      assert(anv_pipeline_is_mesh(pipeline));
-#if GFX_VERx10 >= 125
-      const struct brw_mesh_prog_data *mesh_prog_data = get_mesh_prog_data(pipeline);
-      anv_batch_emit(batch, GENX(3DSTATE_SBE_MESH), sbe_mesh) {
-         const struct brw_mue_map *mue = &mesh_prog_data->map;
-
-         assert(mue->per_vertex_header_size_dw % 8 == 0);
-         sbe_mesh.PerVertexURBEntryOutputReadOffset = mue->per_vertex_header_size_dw / 8;
-         sbe_mesh.PerVertexURBEntryOutputReadLength = DIV_ROUND_UP(mue->per_vertex_data_size_dw, 8);
-
-         /* Clip distance array is passed in the per-vertex header so that
-          * it can be consumed by the HW. If user wants to read it in the FS,
-          * adjust the offset and length to cover it. Conveniently it is at
-          * the end of the per-vertex header, right before per-vertex
-          * attributes.
-          *
-          * Note that FS attribute reading must be aware that the clip
-          * distances have fixed position.
+         /* Ask the hardware to supply PrimitiveID if the fragment shader
+          * reads it but a previous stage didn't write one.
           */
-         if (mue->per_vertex_header_size_dw > 8 &&
-               (wm_prog_data->urb_setup[VARYING_SLOT_CLIP_DIST0] >= 0 ||
-                wm_prog_data->urb_setup[VARYING_SLOT_CLIP_DIST1] >= 0)) {
-            sbe_mesh.PerVertexURBEntryOutputReadOffset -= 1;
-            sbe_mesh.PerVertexURBEntryOutputReadLength += 1;
+         if ((wm_prog_data->inputs & VARYING_BIT_PRIMITIVE_ID) &&
+             fs_input_map->varying_to_slot[VARYING_SLOT_PRIMITIVE_ID] == -1) {
+            sbe.PrimitiveIDOverrideAttributeSelect =
+               wm_prog_data->urb_setup[VARYING_SLOT_PRIMITIVE_ID];
+            sbe.PrimitiveIDOverrideComponentX = true;
+            sbe.PrimitiveIDOverrideComponentY = true;
+            sbe.PrimitiveIDOverrideComponentZ = true;
+            sbe.PrimitiveIDOverrideComponentW = true;
+            pipeline->primitive_id_override = true;
          }
-
-         if (mue->user_data_in_vertex_header) {
-            sbe_mesh.PerVertexURBEntryOutputReadOffset -= 1;
-            sbe_mesh.PerVertexURBEntryOutputReadLength += 1;
-         }
-
-         assert(mue->per_primitive_header_size_dw % 8 == 0);
-         sbe_mesh.PerPrimitiveURBEntryOutputReadOffset = mue->per_primitive_header_size_dw / 8;
-         sbe_mesh.PerPrimitiveURBEntryOutputReadLength = DIV_ROUND_UP(mue->per_primitive_data_size_dw, 8);
-
-         /* Just like with clip distances, if Primitive Shading Rate,
-          * Viewport Index or Layer is read back in the FS, adjust
-          * the offset and length to cover the Primitive Header, where
-          * PSR, Viewport Index & Layer are stored.
-          */
-         if (wm_prog_data->urb_setup[VARYING_SLOT_VIEWPORT] >= 0 ||
-             wm_prog_data->urb_setup[VARYING_SLOT_PRIMITIVE_SHADING_RATE] >= 0 ||
-             wm_prog_data->urb_setup[VARYING_SLOT_LAYER] >= 0 ||
-             mue->user_data_in_primitive_header) {
-            assert(sbe_mesh.PerPrimitiveURBEntryOutputReadOffset > 0);
-            sbe_mesh.PerPrimitiveURBEntryOutputReadOffset -= 1;
-            sbe_mesh.PerPrimitiveURBEntryOutputReadLength += 1;
+      } else {
+         assert(anv_pipeline_is_mesh(pipeline));
+#if GFX_VERx10 >= 125
+         const struct brw_mesh_prog_data *mesh_prog_data = get_mesh_prog_data(pipeline);
+         anv_pipeline_emit(pipeline, final.sbe_mesh,
+                           GENX(3DSTATE_SBE_MESH), sbe_mesh) {
+            const struct brw_mue_map *mue = &mesh_prog_data->map;
+
+            assert(mue->per_vertex_header_size_dw % 8 == 0);
+            sbe_mesh.PerVertexURBEntryOutputReadOffset = mue->per_vertex_header_size_dw / 8;
+            sbe_mesh.PerVertexURBEntryOutputReadLength = DIV_ROUND_UP(mue->per_vertex_data_size_dw, 8);
+
+            /* Clip distance array is passed in the per-vertex header so that
+             * it can be consumed by the HW. If user wants to read it in the
+             * FS, adjust the offset and length to cover it. Conveniently it
+             * is at the end of the per-vertex header, right before per-vertex
+             * attributes.
+             *
+             * Note that FS attribute reading must be aware that the clip
+             * distances have fixed position.
+             */
+            if (mue->per_vertex_header_size_dw > 8 &&
+                (wm_prog_data->urb_setup[VARYING_SLOT_CLIP_DIST0] >= 0 ||
+                 wm_prog_data->urb_setup[VARYING_SLOT_CLIP_DIST1] >= 0)) {
+               sbe_mesh.PerVertexURBEntryOutputReadOffset -= 1;
+               sbe_mesh.PerVertexURBEntryOutputReadLength += 1;
+            }
+
+            if (mue->user_data_in_vertex_header) {
+               sbe_mesh.PerVertexURBEntryOutputReadOffset -= 1;
+               sbe_mesh.PerVertexURBEntryOutputReadLength += 1;
+            }
+
+            assert(mue->per_primitive_header_size_dw % 8 == 0);
+            sbe_mesh.PerPrimitiveURBEntryOutputReadOffset =
+               mue->per_primitive_header_size_dw / 8;
+            sbe_mesh.PerPrimitiveURBEntryOutputReadLength =
+               DIV_ROUND_UP(mue->per_primitive_data_size_dw, 8);
+
+            /* Just like with clip distances, if Primitive Shading Rate,
+             * Viewport Index or Layer is read back in the FS, adjust the
+             * offset and length to cover the Primitive Header, where PSR,
+             * Viewport Index & Layer are stored.
+             */
+            if (wm_prog_data->urb_setup[VARYING_SLOT_VIEWPORT] >= 0 ||
+                wm_prog_data->urb_setup[VARYING_SLOT_PRIMITIVE_SHADING_RATE] >= 0 ||
+                wm_prog_data->urb_setup[VARYING_SLOT_LAYER] >= 0 ||
+                mue->user_data_in_primitive_header) {
+               assert(sbe_mesh.PerPrimitiveURBEntryOutputReadOffset > 0);
+               sbe_mesh.PerPrimitiveURBEntryOutputReadOffset -= 1;
+               sbe_mesh.PerPrimitiveURBEntryOutputReadLength += 1;
+            }
          }
-      }
 #endif
+      }
+   }
    }
-
-   uint32_t *dw = anv_batch_emit_dwords(batch, GENX(3DSTATE_SBE_length));
-   if (!dw)
-      return;
-   GENX(3DSTATE_SBE_pack)(batch, dw, &sbe);
-
-   dw = anv_batch_emit_dwords(batch, GENX(3DSTATE_SBE_SWIZ_length));
-   if (!dw)
-      return;
-   GENX(3DSTATE_SBE_SWIZ_pack)(batch, dw, &swiz);
 }
 
 /** Returns the final polygon mode for rasterization
@@ -729,63 +802,55 @@ emit_rs_state(struct anv_graphics_pipeline *pipeline,
               const struct vk_render_pass_state *rp,
               enum intel_urb_deref_block_size urb_deref_block_size)
 {
-   struct GENX(3DSTATE_SF) sf = {
-      GENX(3DSTATE_SF_header),
-   };
-
-   sf.ViewportTransformEnable = true;
-   sf.StatisticsEnable = true;
-   sf.VertexSubPixelPrecisionSelect = _8Bit;
-   sf.AALineDistanceMode = true;
+   anv_pipeline_emit(pipeline, partial.sf, GENX(3DSTATE_SF), sf) {
+      sf.ViewportTransformEnable = true;
+      sf.StatisticsEnable = true;
+      sf.VertexSubPixelPrecisionSelect = _8Bit;
+      sf.AALineDistanceMode = true;
 
 #if GFX_VER >= 12
-   sf.DerefBlockSize = urb_deref_block_size;
+      sf.DerefBlockSize = urb_deref_block_size;
 #endif
 
-   bool point_from_shader;
-   if (anv_pipeline_is_primitive(pipeline)) {
-      const struct brw_vue_prog_data *last_vue_prog_data =
-         anv_pipeline_get_last_vue_prog_data(pipeline);
-      point_from_shader = last_vue_prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ;
-   } else {
-      assert(anv_pipeline_is_mesh(pipeline));
-      const struct brw_mesh_prog_data *mesh_prog_data = get_mesh_prog_data(pipeline);
-      point_from_shader = mesh_prog_data->map.start_dw[VARYING_SLOT_PSIZ] >= 0;
-   }
+      bool point_from_shader;
+      if (anv_pipeline_is_primitive(pipeline)) {
+         const struct brw_vue_prog_data *last_vue_prog_data =
+            anv_pipeline_get_last_vue_prog_data(pipeline);
+         point_from_shader = last_vue_prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ;
+      } else {
+         assert(anv_pipeline_is_mesh(pipeline));
+         const struct brw_mesh_prog_data *mesh_prog_data = get_mesh_prog_data(pipeline);
+         point_from_shader = mesh_prog_data->map.start_dw[VARYING_SLOT_PSIZ] >= 0;
+      }
 
-   if (point_from_shader) {
-      sf.PointWidthSource = Vertex;
-   } else {
-      sf.PointWidthSource = State;
-      sf.PointWidth = 1.0;
+      if (point_from_shader) {
+         sf.PointWidthSource = Vertex;
+      } else {
+         sf.PointWidthSource = State;
+         sf.PointWidth = 1.0;
+      }
    }
 
-   struct GENX(3DSTATE_RASTER) raster = {
-      GENX(3DSTATE_RASTER_header),
-   };
-
-   /* For details on 3DSTATE_RASTER multisample state, see the BSpec table
-    * "Multisample Modes State".
-    */
-   /* NOTE: 3DSTATE_RASTER::ForcedSampleCount affects the BDW and SKL PMA fix
-    * computations.  If we ever set this bit to a different value, they will
-    * need to be updated accordingly.
-    */
-   raster.ForcedSampleCount = FSC_NUMRASTSAMPLES_0;
-   raster.ForceMultisampling = false;
-
-   raster.ScissorRectangleEnable = true;
+   anv_pipeline_emit(pipeline, partial.raster, GENX(3DSTATE_RASTER), raster) {
+      /* For details on 3DSTATE_RASTER multisample state, see the BSpec table
+       * "Multisample Modes State".
+       */
+      /* NOTE: 3DSTATE_RASTER::ForcedSampleCount affects the BDW and SKL PMA fix
+       * computations.  If we ever set this bit to a different value, they will
+       * need to be updated accordingly.
+       */
+      raster.ForcedSampleCount = FSC_NUMRASTSAMPLES_0;
+      raster.ForceMultisampling = false;
 
-   GENX(3DSTATE_SF_pack)(NULL, pipeline->partial.sf, &sf);
-   GENX(3DSTATE_RASTER_pack)(NULL, pipeline->partial.raster, &raster);
+      raster.ScissorRectangleEnable = true;
+   }
 }
 
 static void
 emit_ms_state(struct anv_graphics_pipeline *pipeline,
               const struct vk_multisample_state *ms)
 {
-   struct anv_batch *batch = &pipeline->base.base.batch;
-   anv_batch_emit(batch, GENX(3DSTATE_MULTISAMPLE), ms) {
+   anv_pipeline_emit(pipeline, final.ms, GENX(3DSTATE_MULTISAMPLE), ms) {
       ms.NumberofMultisamples       = __builtin_ffs(pipeline->rasterization_samples) - 1;
 
       ms.PixelLocation              = CENTER;
@@ -862,71 +927,67 @@ emit_3dstate_clip(struct anv_graphics_pipeline *pipeline,
    const struct brw_wm_prog_data *wm_prog_data = get_wm_prog_data(pipeline);
    (void) wm_prog_data;
 
-   struct GENX(3DSTATE_CLIP) clip = {
-      GENX(3DSTATE_CLIP_header),
-   };
-
-   clip.ClipEnable               = true;
-   clip.StatisticsEnable         = true;
-   clip.EarlyCullEnable          = true;
-   clip.GuardbandClipTestEnable  = true;
+   anv_pipeline_emit(pipeline, partial.clip, GENX(3DSTATE_CLIP), clip) {
+      clip.ClipEnable               = true;
+      clip.StatisticsEnable         = true;
+      clip.EarlyCullEnable          = true;
+      clip.GuardbandClipTestEnable  = true;
 
-   clip.VertexSubPixelPrecisionSelect = _8Bit;
-   clip.ClipMode = CLIPMODE_NORMAL;
+      clip.VertexSubPixelPrecisionSelect = _8Bit;
+      clip.ClipMode = CLIPMODE_NORMAL;
 
-   clip.MinimumPointWidth = 0.125;
-   clip.MaximumPointWidth = 255.875;
+      clip.MinimumPointWidth = 0.125;
+      clip.MaximumPointWidth = 255.875;
 
-   /* TODO(mesh): Multiview. */
-   if (anv_pipeline_is_primitive(pipeline)) {
-      const struct brw_vue_prog_data *last =
-         anv_pipeline_get_last_vue_prog_data(pipeline);
+      /* TODO(mesh): Multiview. */
+      if (anv_pipeline_is_primitive(pipeline)) {
+         const struct brw_vue_prog_data *last =
+            anv_pipeline_get_last_vue_prog_data(pipeline);
 
-      /* From the Vulkan 1.0.45 spec:
-       *
-       *    "If the last active vertex processing stage shader entry point's
-       *    interface does not include a variable decorated with
-       *    ViewportIndex, then the first viewport is used."
-       */
-      if (vp && (last->vue_map.slots_valid & VARYING_BIT_VIEWPORT)) {
-         clip.MaximumVPIndex = vp->viewport_count > 0 ?
-                               vp->viewport_count - 1 : 0;
-      } else {
-         clip.MaximumVPIndex = 0;
-      }
+         /* From the Vulkan 1.0.45 spec:
+          *
+          *    "If the last active vertex processing stage shader entry
+          *    point's interface does not include a variable decorated with
+          *    ViewportIndex, then the first viewport is used."
+          */
+         if (vp && (last->vue_map.slots_valid & VARYING_BIT_VIEWPORT)) {
+            clip.MaximumVPIndex = vp->viewport_count > 0 ?
+               vp->viewport_count - 1 : 0;
+         } else {
+            clip.MaximumVPIndex = 0;
+         }
 
-      /* From the Vulkan 1.0.45 spec:
-       *
-       *    "If the last active vertex processing stage shader entry point's
-       *    interface does not include a variable decorated with Layer, then
-       *    the first layer is used."
-       */
-      clip.ForceZeroRTAIndexEnable =
-         !(last->vue_map.slots_valid & VARYING_BIT_LAYER);
+         /* From the Vulkan 1.0.45 spec:
+          *
+          *    "If the last active vertex processing stage shader entry point's
+          *    interface does not include a variable decorated with Layer, then
+          *    the first layer is used."
+          */
+         clip.ForceZeroRTAIndexEnable =
+            !(last->vue_map.slots_valid & VARYING_BIT_LAYER);
+
+      } else if (anv_pipeline_is_mesh(pipeline)) {
+         const struct brw_mesh_prog_data *mesh_prog_data = get_mesh_prog_data(pipeline);
+         if (vp && vp->viewport_count > 0 &&
+             mesh_prog_data->map.start_dw[VARYING_SLOT_VIEWPORT] >= 0) {
+            clip.MaximumVPIndex = vp->viewport_count - 1;
+         } else {
+            clip.MaximumVPIndex = 0;
+         }
 
-   } else if (anv_pipeline_is_mesh(pipeline)) {
-      const struct brw_mesh_prog_data *mesh_prog_data = get_mesh_prog_data(pipeline);
-      if (vp && vp->viewport_count > 0 &&
-          mesh_prog_data->map.start_dw[VARYING_SLOT_VIEWPORT] >= 0) {
-         clip.MaximumVPIndex = vp->viewport_count - 1;
-      } else {
-         clip.MaximumVPIndex = 0;
+         clip.ForceZeroRTAIndexEnable =
+            mesh_prog_data->map.start_dw[VARYING_SLOT_LAYER] < 0;
       }
 
-      clip.ForceZeroRTAIndexEnable =
-            mesh_prog_data->map.start_dw[VARYING_SLOT_LAYER] < 0;
+      clip.NonPerspectiveBarycentricEnable = wm_prog_data ?
+         wm_prog_data->uses_nonperspective_interp_modes : 0;
    }
 
-   clip.NonPerspectiveBarycentricEnable = wm_prog_data ?
-      wm_prog_data->uses_nonperspective_interp_modes : 0;
-
-   GENX(3DSTATE_CLIP_pack)(NULL, pipeline->partial.clip, &clip);
-
 #if GFX_VERx10 >= 125
    if (anv_pipeline_is_mesh(pipeline)) {
-      struct anv_batch *batch = &pipeline->base.base.batch;
       const struct brw_mesh_prog_data *mesh_prog_data = get_mesh_prog_data(pipeline);
-      anv_batch_emit(batch, GENX(3DSTATE_CLIP_MESH), clip_mesh) {
+      anv_pipeline_emit(pipeline, final.clip_mesh,
+                        GENX(3DSTATE_CLIP_MESH), clip_mesh) {
          clip_mesh.PrimitiveHeaderEnable = mesh_prog_data->map.per_primitive_header_size_dw > 0;
          clip_mesh.UserClipDistanceClipTestEnableBitmask = mesh_prog_data->clip_distance_mask;
          clip_mesh.UserClipDistanceCullTestEnableBitmask = mesh_prog_data->cull_distance_mask;
@@ -939,8 +1000,6 @@ static void
 emit_3dstate_streamout(struct anv_graphics_pipeline *pipeline,
                        const struct vk_rasterization_state *rs)
 {
-   struct anv_batch *batch = &pipeline->base.base.batch;
-   const struct anv_device *device = pipeline->base.base.device;
    const struct brw_vue_prog_data *prog_data =
       anv_pipeline_get_last_vue_prog_data(pipeline);
    const struct brw_vue_map *vue_map = &prog_data->vue_map;
@@ -1034,25 +1093,17 @@ emit_3dstate_streamout(struct anv_graphics_pipeline *pipeline,
             sbs[xfb_info->buffer_to_stream[b]] |= 1 << b;
       }
 
-      /* Wa_16011773973:
-       * If SOL is enabled and SO_DECL state has to be programmed,
-       *    1. Send 3D State SOL state with SOL disabled
-       *    2. Send SO_DECL NP state
-       *    3. Send 3D State SOL with SOL Enabled
-       */
-      if (intel_device_info_is_dg2(device->info))
-         anv_batch_emit(batch, GENX(3DSTATE_STREAMOUT), so);
-
-      uint32_t *dw = anv_batch_emitn(batch, 3 + 2 * max_decls,
-                                     GENX(3DSTATE_SO_DECL_LIST),
-                                     .StreamtoBufferSelects0 = sbs[0],
-                                     .StreamtoBufferSelects1 = sbs[1],
-                                     .StreamtoBufferSelects2 = sbs[2],
-                                     .StreamtoBufferSelects3 = sbs[3],
-                                     .NumEntries0 = decls[0],
-                                     .NumEntries1 = decls[1],
-                                     .NumEntries2 = decls[2],
-                                     .NumEntries3 = decls[3]);
+      uint32_t *dw = anv_pipeline_emitn(pipeline, final.so_decl_list,
+                                        3 + 2 * max_decls,
+                                        GENX(3DSTATE_SO_DECL_LIST),
+                                        .StreamtoBufferSelects0 = sbs[0],
+                                        .StreamtoBufferSelects1 = sbs[1],
+                                        .StreamtoBufferSelects2 = sbs[2],
+                                        .StreamtoBufferSelects3 = sbs[3],
+                                        .NumEntries0 = decls[0],
+                                        .NumEntries1 = decls[1],
+                                        .NumEntries2 = decls[2],
+                                        .NumEntries3 = decls[3]);
 
       for (int i = 0; i < max_decls; i++) {
          GENX(SO_DECL_ENTRY_pack)(NULL, dw + 3 + i * 2,
@@ -1063,47 +1114,38 @@ emit_3dstate_streamout(struct anv_graphics_pipeline *pipeline,
                .Stream3Decl = so_decl[3][i],
             });
       }
-
-#if GFX_VERx10 == 125
-      /* Wa_14015946265: Send PC with CS stall after SO_DECL. */
-      genX(batch_emit_pipe_control)(batch, device->info, ANV_PIPE_CS_STALL_BIT);
-#endif
    }
 
-   struct GENX(3DSTATE_STREAMOUT) so = {
-      GENX(3DSTATE_STREAMOUT_header),
-   };
-
-   if (xfb_info) {
-      pipeline->uses_xfb = true;
+   anv_pipeline_emit(pipeline, partial.so, GENX(3DSTATE_STREAMOUT), so) {
+      if (xfb_info) {
+         pipeline->uses_xfb = true;
 
-      so.SOFunctionEnable = true;
-      so.SOStatisticsEnable = true;
+         so.SOFunctionEnable = true;
+         so.SOStatisticsEnable = true;
 
-      so.Buffer0SurfacePitch = xfb_info->buffers[0].stride;
-      so.Buffer1SurfacePitch = xfb_info->buffers[1].stride;
-      so.Buffer2SurfacePitch = xfb_info->buffers[2].stride;
-      so.Buffer3SurfacePitch = xfb_info->buffers[3].stride;
+         so.Buffer0SurfacePitch = xfb_info->buffers[0].stride;
+         so.Buffer1SurfacePitch = xfb_info->buffers[1].stride;
+         so.Buffer2SurfacePitch = xfb_info->buffers[2].stride;
+         so.Buffer3SurfacePitch = xfb_info->buffers[3].stride;
 
-      int urb_entry_read_offset = 0;
-      int urb_entry_read_length =
-         (prog_data->vue_map.num_slots + 1) / 2 - urb_entry_read_offset;
+         int urb_entry_read_offset = 0;
+         int urb_entry_read_length =
+            (prog_data->vue_map.num_slots + 1) / 2 - urb_entry_read_offset;
 
-      /* We always read the whole vertex.  This could be reduced at some
-       * point by reading less and offsetting the register index in the
-       * SO_DECLs.
-       */
-      so.Stream0VertexReadOffset = urb_entry_read_offset;
-      so.Stream0VertexReadLength = urb_entry_read_length - 1;
-      so.Stream1VertexReadOffset = urb_entry_read_offset;
-      so.Stream1VertexReadLength = urb_entry_read_length - 1;
-      so.Stream2VertexReadOffset = urb_entry_read_offset;
-      so.Stream2VertexReadLength = urb_entry_read_length - 1;
-      so.Stream3VertexReadOffset = urb_entry_read_offset;
-      so.Stream3VertexReadLength = urb_entry_read_length - 1;
+         /* We always read the whole vertex. This could be reduced at some
+          * point by reading less and offsetting the register index in the
+          * SO_DECLs.
+          */
+         so.Stream0VertexReadOffset = urb_entry_read_offset;
+         so.Stream0VertexReadLength = urb_entry_read_length - 1;
+         so.Stream1VertexReadOffset = urb_entry_read_offset;
+         so.Stream1VertexReadLength = urb_entry_read_length - 1;
+         so.Stream2VertexReadOffset = urb_entry_read_offset;
+         so.Stream2VertexReadLength = urb_entry_read_length - 1;
+         so.Stream3VertexReadOffset = urb_entry_read_offset;
+         so.Stream3VertexReadLength = urb_entry_read_length - 1;
+      }
    }
-
-   GENX(3DSTATE_STREAMOUT_pack)(NULL, pipeline->partial.streamout_state, &so);
 }
 
 static uint32_t
@@ -1158,7 +1200,6 @@ get_scratch_surf(struct anv_pipeline *pipeline,
 static void
 emit_3dstate_vs(struct anv_graphics_pipeline *pipeline)
 {
-   struct anv_batch *batch = &pipeline->base.base.batch;
    const struct intel_device_info *devinfo = pipeline->base.base.device->info;
    const struct brw_vs_prog_data *vs_prog_data = get_vs_prog_data(pipeline);
    const struct anv_shader_bin *vs_bin =
@@ -1166,7 +1207,7 @@ emit_3dstate_vs(struct anv_graphics_pipeline *pipeline)
 
    assert(anv_pipeline_has_stage(pipeline, MESA_SHADER_VERTEX));
 
-   anv_batch_emit(batch, GENX(3DSTATE_VS), vs) {
+   anv_pipeline_emit(pipeline, final.vs, GENX(3DSTATE_VS), vs) {
       vs.Enable               = true;
       vs.StatisticsEnable     = true;
       vs.KernelStartPointer   = vs_bin->kernel.offset;
@@ -1237,11 +1278,9 @@ static void
 emit_3dstate_hs_ds(struct anv_graphics_pipeline *pipeline,
                    const struct vk_tessellation_state *ts)
 {
-   struct anv_batch *batch = &pipeline->base.base.batch;
-
    if (!anv_pipeline_has_stage(pipeline, MESA_SHADER_TESS_EVAL)) {
-      anv_batch_emit(batch, GENX(3DSTATE_HS), hs);
-      anv_batch_emit(batch, GENX(3DSTATE_DS), ds);
+      anv_pipeline_emit(pipeline, final.hs, GENX(3DSTATE_HS), hs);
+      anv_pipeline_emit(pipeline, final.ds, GENX(3DSTATE_DS), ds);
       return;
    }
 
@@ -1254,120 +1293,101 @@ emit_3dstate_hs_ds(struct anv_graphics_pipeline *pipeline,
    const struct brw_tcs_prog_data *tcs_prog_data = get_tcs_prog_data(pipeline);
    const struct brw_tes_prog_data *tes_prog_data = get_tes_prog_data(pipeline);
 
-   struct GENX(3DSTATE_HS) hs = {
-      GENX(3DSTATE_HS_header),
-   };
-
-   hs.Enable = true;
-   hs.StatisticsEnable = true;
-   hs.KernelStartPointer = tcs_bin->kernel.offset;
-   /* Wa_1606682166 */
-   hs.SamplerCount = GFX_VER == 11 ? 0 : get_sampler_count(tcs_bin);
-   hs.BindingTableEntryCount = tcs_bin->bind_map.surface_count;
+   anv_pipeline_emit(pipeline, final.hs, GENX(3DSTATE_HS), hs) {
+      hs.Enable = true;
+      hs.StatisticsEnable = true;
+      hs.KernelStartPointer = tcs_bin->kernel.offset;
+      /* Wa_1606682166 */
+      hs.SamplerCount = GFX_VER == 11 ? 0 : get_sampler_count(tcs_bin);
+      hs.BindingTableEntryCount = tcs_bin->bind_map.surface_count;
 
 #if GFX_VER >= 12
-   /* Wa_1604578095:
-    *
-    *    Hang occurs when the number of max threads is less than 2 times
-    *    the number of instance count. The number of max threads must be
-    *    more than 2 times the number of instance count.
-    */
-   assert((devinfo->max_tcs_threads / 2) > tcs_prog_data->instances);
+      /* Wa_1604578095:
+       *
+       *    Hang occurs when the number of max threads is less than 2 times
+       *    the number of instance count. The number of max threads must be
+       *    more than 2 times the number of instance count.
+       */
+      assert((devinfo->max_tcs_threads / 2) > tcs_prog_data->instances);
 #endif
 
-   hs.MaximumNumberofThreads = devinfo->max_tcs_threads - 1;
-   hs.IncludeVertexHandles = true;
-   hs.InstanceCount = tcs_prog_data->instances - 1;
+      hs.MaximumNumberofThreads = devinfo->max_tcs_threads - 1;
+      hs.IncludeVertexHandles = true;
+      hs.InstanceCount = tcs_prog_data->instances - 1;
 
-   hs.VertexURBEntryReadLength = 0;
-   hs.VertexURBEntryReadOffset = 0;
-   hs.DispatchGRFStartRegisterForURBData =
-      tcs_prog_data->base.base.dispatch_grf_start_reg & 0x1f;
+      hs.VertexURBEntryReadLength = 0;
+      hs.VertexURBEntryReadOffset = 0;
+      hs.DispatchGRFStartRegisterForURBData =
+         tcs_prog_data->base.base.dispatch_grf_start_reg & 0x1f;
 #if GFX_VER >= 12
-   hs.DispatchGRFStartRegisterForURBData5 =
-      tcs_prog_data->base.base.dispatch_grf_start_reg >> 5;
+      hs.DispatchGRFStartRegisterForURBData5 =
+         tcs_prog_data->base.base.dispatch_grf_start_reg >> 5;
 #endif
 
 #if GFX_VERx10 >= 125
-   hs.ScratchSpaceBuffer =
-      get_scratch_surf(&pipeline->base.base, MESA_SHADER_TESS_CTRL, tcs_bin);
+      hs.ScratchSpaceBuffer =
+         get_scratch_surf(&pipeline->base.base, MESA_SHADER_TESS_CTRL, tcs_bin);
 #else
-   hs.PerThreadScratchSpace = get_scratch_space(tcs_bin);
-   hs.ScratchSpaceBasePointer =
-      get_scratch_address(&pipeline->base.base, MESA_SHADER_TESS_CTRL, tcs_bin);
+      hs.PerThreadScratchSpace = get_scratch_space(tcs_bin);
+      hs.ScratchSpaceBasePointer =
+         get_scratch_address(&pipeline->base.base, MESA_SHADER_TESS_CTRL, tcs_bin);
 #endif
 
 #if GFX_VER == 12
-   /*  Patch Count threshold specifies the maximum number of patches that
-    *  will be accumulated before a thread dispatch is forced.
-    */
-   hs.PatchCountThreshold = tcs_prog_data->patch_count_threshold;
+      /*  Patch Count threshold specifies the maximum number of patches that
+       *  will be accumulated before a thread dispatch is forced.
+       */
+      hs.PatchCountThreshold = tcs_prog_data->patch_count_threshold;
 #endif
 
-   hs.DispatchMode = tcs_prog_data->base.dispatch_mode;
-   hs.IncludePrimitiveID = tcs_prog_data->include_primitive_id;
-
-   STATIC_ASSERT(ARRAY_SIZE(pipeline->final.hs) == GENX(3DSTATE_HS_length));
-   GENX(3DSTATE_HS_pack)(&pipeline->base.base.batch, pipeline->final.hs, &hs);
-
-   struct GENX(3DSTATE_DS) ds = {
-      GENX(3DSTATE_DS_header),
+      hs.DispatchMode = tcs_prog_data->base.dispatch_mode;
+      hs.IncludePrimitiveID = tcs_prog_data->include_primitive_id;
    };
 
-   ds.Enable = true;
-   ds.StatisticsEnable = true;
-   ds.KernelStartPointer = tes_bin->kernel.offset;
-   /* Wa_1606682166 */
-   ds.SamplerCount = GFX_VER == 11 ? 0 : get_sampler_count(tes_bin);
-   ds.BindingTableEntryCount = tes_bin->bind_map.surface_count;
-   ds.MaximumNumberofThreads = devinfo->max_tes_threads - 1;
+   anv_pipeline_emit(pipeline, final.ds, GENX(3DSTATE_DS), ds) {
+      ds.Enable = true;
+      ds.StatisticsEnable = true;
+      ds.KernelStartPointer = tes_bin->kernel.offset;
+      /* Wa_1606682166 */
+      ds.SamplerCount = GFX_VER == 11 ? 0 : get_sampler_count(tes_bin);
+      ds.BindingTableEntryCount = tes_bin->bind_map.surface_count;
+      ds.MaximumNumberofThreads = devinfo->max_tes_threads - 1;
 
-   ds.ComputeWCoordinateEnable =
-      tes_prog_data->domain == BRW_TESS_DOMAIN_TRI;
+      ds.ComputeWCoordinateEnable =
+         tes_prog_data->domain == BRW_TESS_DOMAIN_TRI;
 
-   ds.PatchURBEntryReadLength = tes_prog_data->base.urb_read_length;
-   ds.PatchURBEntryReadOffset = 0;
-   ds.DispatchGRFStartRegisterForURBData =
-      tes_prog_data->base.base.dispatch_grf_start_reg;
+      ds.PatchURBEntryReadLength = tes_prog_data->base.urb_read_length;
+      ds.PatchURBEntryReadOffset = 0;
+      ds.DispatchGRFStartRegisterForURBData =
+         tes_prog_data->base.base.dispatch_grf_start_reg;
 
 #if GFX_VER < 11
-   ds.DispatchMode =
-      tes_prog_data->base.dispatch_mode == DISPATCH_MODE_SIMD8 ?
-      DISPATCH_MODE_SIMD8_SINGLE_PATCH :
-      DISPATCH_MODE_SIMD4X2;
+      ds.DispatchMode =
+         tes_prog_data->base.dispatch_mode == DISPATCH_MODE_SIMD8 ?
+         DISPATCH_MODE_SIMD8_SINGLE_PATCH :
+         DISPATCH_MODE_SIMD4X2;
 #else
-   assert(tes_prog_data->base.dispatch_mode == DISPATCH_MODE_SIMD8);
-   ds.DispatchMode = DISPATCH_MODE_SIMD8_SINGLE_PATCH;
+      assert(tes_prog_data->base.dispatch_mode == DISPATCH_MODE_SIMD8);
+      ds.DispatchMode = DISPATCH_MODE_SIMD8_SINGLE_PATCH;
 #endif
 
-   ds.UserClipDistanceClipTestEnableBitmask =
-      tes_prog_data->base.clip_distance_mask;
-   ds.UserClipDistanceCullTestEnableBitmask =
-      tes_prog_data->base.cull_distance_mask;
+      ds.UserClipDistanceClipTestEnableBitmask =
+         tes_prog_data->base.clip_distance_mask;
+      ds.UserClipDistanceCullTestEnableBitmask =
+         tes_prog_data->base.cull_distance_mask;
 
 #if GFX_VER >= 12
-   ds.PrimitiveIDNotRequired = !tes_prog_data->include_primitive_id;
+      ds.PrimitiveIDNotRequired = !tes_prog_data->include_primitive_id;
 #endif
 #if GFX_VERx10 >= 125
-   ds.ScratchSpaceBuffer =
-      get_scratch_surf(&pipeline->base.base, MESA_SHADER_TESS_EVAL, tes_bin);
+      ds.ScratchSpaceBuffer =
+         get_scratch_surf(&pipeline->base.base, MESA_SHADER_TESS_EVAL, tes_bin);
 #else
-   ds.PerThreadScratchSpace = get_scratch_space(tes_bin);
-   ds.ScratchSpaceBasePointer =
-      get_scratch_address(&pipeline->base.base, MESA_SHADER_TESS_EVAL, tes_bin);
+      ds.PerThreadScratchSpace = get_scratch_space(tes_bin);
+      ds.ScratchSpaceBasePointer =
+         get_scratch_address(&pipeline->base.base, MESA_SHADER_TESS_EVAL, tes_bin);
 #endif
-
-   /* Wa_14019750404:
-    * See genX(emit_ds)().
-    * We need to both emit 3DSTATE_DS now, and before each 3DPRIMITIVE, so
-    * we pack it to have it later, and memcpy into the current batch.
-    */
-   STATIC_ASSERT(ARRAY_SIZE(pipeline->final.ds) == GENX(3DSTATE_DS_length));
-   GENX(3DSTATE_DS_pack)(&pipeline->base.base.batch, pipeline->final.ds, &ds);
-
-   uint32_t *dw =
-      anv_batch_emitn(batch, GENX(3DSTATE_DS_length), GENX(3DSTATE_DS));
-   memcpy(dw, &pipeline->final.ds, sizeof(pipeline->final.ds));
+   }
 }
 
 static UNUSED bool
@@ -1391,63 +1411,59 @@ geom_or_tess_prim_id_used(struct anv_graphics_pipeline *pipeline)
 static void
 emit_3dstate_te(struct anv_graphics_pipeline *pipeline)
 {
-   struct GENX(3DSTATE_TE) te = {
-      GENX(3DSTATE_TE_header),
-   };
-
-   if (anv_pipeline_has_stage(pipeline, MESA_SHADER_TESS_EVAL)) {
-      const struct brw_tes_prog_data *tes_prog_data =
-         get_tes_prog_data(pipeline);
-
-      te.Partitioning = tes_prog_data->partitioning;
-      te.TEDomain = tes_prog_data->domain;
-      te.TEEnable = true;
-      te.MaximumTessellationFactorOdd = 63.0;
-      te.MaximumTessellationFactorNotOdd = 64.0;
+   anv_pipeline_emit(pipeline, partial.te, GENX(3DSTATE_TE), te) {
+      if (anv_pipeline_has_stage(pipeline, MESA_SHADER_TESS_EVAL)) {
+         const struct brw_tes_prog_data *tes_prog_data =
+            get_tes_prog_data(pipeline);
+
+         te.Partitioning = tes_prog_data->partitioning;
+         te.TEDomain = tes_prog_data->domain;
+         te.TEEnable = true;
+         te.MaximumTessellationFactorOdd = 63.0;
+         te.MaximumTessellationFactorNotOdd = 64.0;
 #if GFX_VERx10 >= 125
-      const struct anv_device *device = pipeline->base.base.device;
-      if (intel_needs_workaround(device->info, 22012699309))
-         te.TessellationDistributionMode = TEDMODE_RR_STRICT;
-      else
-         te.TessellationDistributionMode = TEDMODE_RR_FREE;
+         const struct anv_device *device = pipeline->base.base.device;
+         if (intel_needs_workaround(device->info, 22012699309))
+            te.TessellationDistributionMode = TEDMODE_RR_STRICT;
+         else
+            te.TessellationDistributionMode = TEDMODE_RR_FREE;
 
-      if (intel_needs_workaround(device->info, 14015055625)) {
-         /* Wa_14015055625:
-          *
-          * Disable Tessellation Distribution when primitive Id is enabled.
-          */
-         if (pipeline->primitive_id_override ||
-             geom_or_tess_prim_id_used(pipeline))
-            te.TessellationDistributionMode = TEDMODE_OFF;
-      }
+         if (intel_needs_workaround(device->info, 14015055625)) {
+            /* Wa_14015055625:
+             *
+             * Disable Tessellation Distribution when primitive Id is enabled.
+             */
+            if (pipeline->primitive_id_override ||
+                geom_or_tess_prim_id_used(pipeline))
+               te.TessellationDistributionMode = TEDMODE_OFF;
+         }
 
-      te.TessellationDistributionLevel = TEDLEVEL_PATCH;
-      /* 64_TRIANGLES */
-      te.SmallPatchThreshold = 3;
-      /* 1K_TRIANGLES */
-      te.TargetBlockSize = 8;
-      /* 1K_TRIANGLES */
-      te.LocalBOPAccumulatorThreshold = 1;
+         te.TessellationDistributionLevel = TEDLEVEL_PATCH;
+         /* 64_TRIANGLES */
+         te.SmallPatchThreshold = 3;
+         /* 1K_TRIANGLES */
+         te.TargetBlockSize = 8;
+         /* 1K_TRIANGLES */
+         te.LocalBOPAccumulatorThreshold = 1;
 #endif
+      }
    }
-
-   GENX(3DSTATE_TE_pack)(NULL, pipeline->partial.te, &te);
 }
 
 static void
 emit_3dstate_gs(struct anv_graphics_pipeline *pipeline)
 {
+   if (!anv_pipeline_has_stage(pipeline, MESA_SHADER_GEOMETRY)) {
+      anv_pipeline_emit(pipeline, partial.gs, GENX(3DSTATE_GS), gs);
+      return;
+   }
+
    const struct intel_device_info *devinfo = pipeline->base.base.device->info;
    const struct anv_shader_bin *gs_bin =
       pipeline->base.shaders[MESA_SHADER_GEOMETRY];
+   const struct brw_gs_prog_data *gs_prog_data = get_gs_prog_data(pipeline);
 
-   struct GENX(3DSTATE_GS) gs = {
-      GENX(3DSTATE_GS_header),
-   };
-
-   if (anv_pipeline_has_stage(pipeline, MESA_SHADER_GEOMETRY)) {
-       const struct brw_gs_prog_data *gs_prog_data = get_gs_prog_data(pipeline);
-
+   anv_pipeline_emit(pipeline, partial.gs, GENX(3DSTATE_GS), gs) {
       gs.Enable                  = true;
       gs.StatisticsEnable        = true;
       gs.KernelStartPointer      = gs_bin->kernel.offset;
@@ -1493,8 +1509,6 @@ emit_3dstate_gs(struct anv_graphics_pipeline *pipeline)
          get_scratch_address(&pipeline->base.base, MESA_SHADER_GEOMETRY, gs_bin);
 #endif
    }
-
-   GENX(3DSTATE_GS_pack)(&pipeline->base.base.batch, pipeline->partial.gs, &gs);
 }
 
 static bool
@@ -1514,49 +1528,46 @@ emit_3dstate_wm(struct anv_graphics_pipeline *pipeline,
 {
    const struct brw_wm_prog_data *wm_prog_data = get_wm_prog_data(pipeline);
 
-   struct GENX(3DSTATE_WM) wm = {
-      GENX(3DSTATE_WM_header),
-   };
-   wm.StatisticsEnable                    = true;
-   wm.LineEndCapAntialiasingRegionWidth   = _05pixels;
-   wm.LineAntialiasingRegionWidth         = _10pixels;
-   wm.PointRasterizationRule              = RASTRULE_UPPER_LEFT;
+   anv_pipeline_emit(pipeline, partial.wm, GENX(3DSTATE_WM), wm) {
+      wm.StatisticsEnable                    = true;
+      wm.LineEndCapAntialiasingRegionWidth   = _05pixels;
+      wm.LineAntialiasingRegionWidth         = _10pixels;
+      wm.PointRasterizationRule              = RASTRULE_UPPER_LEFT;
 
-   if (anv_pipeline_has_stage(pipeline, MESA_SHADER_FRAGMENT)) {
-      if (wm_prog_data->early_fragment_tests) {
+      if (anv_pipeline_has_stage(pipeline, MESA_SHADER_FRAGMENT)) {
+         if (wm_prog_data->early_fragment_tests) {
             wm.EarlyDepthStencilControl         = EDSC_PREPS;
-      } else if (wm_prog_data->has_side_effects) {
-         wm.EarlyDepthStencilControl         = EDSC_PSEXEC;
-      } else {
-         wm.EarlyDepthStencilControl         = EDSC_NORMAL;
-      }
+         } else if (wm_prog_data->has_side_effects) {
+            wm.EarlyDepthStencilControl         = EDSC_PSEXEC;
+         } else {
+            wm.EarlyDepthStencilControl         = EDSC_NORMAL;
+         }
 
-      /* Gen8 hardware tries to compute ThreadDispatchEnable for us but
-       * doesn't take into account KillPixels when no depth or stencil
-       * writes are enabled.  In order for occlusion queries to work
-       * correctly with no attachments, we need to force-enable PS thread
-       * dispatch.
-       *
-       * The BDW docs are pretty clear that that this bit isn't validated
-       * and probably shouldn't be used in production:
-       *
-       *    "This must always be set to Normal. This field should not be
-       *    tested for functional validation."
-       *
-       * Unfortunately, however, the other mechanism we have for doing this
-       * is 3DSTATE_PS_EXTRA::PixelShaderHasUAV which causes hangs on BDW.
-       * Given two bad options, we choose the one which works.
-       */
-      pipeline->force_fragment_thread_dispatch =
-         wm_prog_data->has_side_effects ||
-         wm_prog_data->uses_kill;
+         /* Gen8 hardware tries to compute ThreadDispatchEnable for us but
+          * doesn't take into account KillPixels when no depth or stencil
+          * writes are enabled. In order for occlusion queries to work
+          * correctly with no attachments, we need to force-enable PS thread
+          * dispatch.
+          *
+          * The BDW docs are pretty clear that that this bit isn't validated
+          * and probably shouldn't be used in production:
+          *
+          *    "This must always be set to Normal. This field should not be
+          *     tested for functional validation."
+          *
+          * Unfortunately, however, the other mechanism we have for doing this
+          * is 3DSTATE_PS_EXTRA::PixelShaderHasUAV which causes hangs on BDW.
+          * Given two bad options, we choose the one which works.
+          */
+         pipeline->force_fragment_thread_dispatch =
+            wm_prog_data->has_side_effects ||
+            wm_prog_data->uses_kill;
 
-      wm.BarycentricInterpolationMode =
-         wm_prog_data_barycentric_modes(wm_prog_data,
-                                        pipeline->fs_msaa_flags);
+         wm.BarycentricInterpolationMode =
+            wm_prog_data_barycentric_modes(wm_prog_data,
+                                           pipeline->fs_msaa_flags);
+      }
    }
-
-   GENX(3DSTATE_WM_pack)(NULL, pipeline->partial.wm, &wm);
 }
 
 static void
@@ -1564,21 +1575,19 @@ emit_3dstate_ps(struct anv_graphics_pipeline *pipeline,
                 const struct vk_multisample_state *ms,
                 const struct vk_color_blend_state *cb)
 {
-   struct anv_batch *batch = &pipeline->base.base.batch;
    UNUSED const struct intel_device_info *devinfo =
       pipeline->base.base.device->info;
    const struct anv_shader_bin *fs_bin =
       pipeline->base.shaders[MESA_SHADER_FRAGMENT];
 
    if (!anv_pipeline_has_stage(pipeline, MESA_SHADER_FRAGMENT)) {
-      anv_batch_emit(batch, GENX(3DSTATE_PS), ps) {
-      }
+      anv_pipeline_emit(pipeline, final.ps, GENX(3DSTATE_PS), ps);
       return;
    }
 
    const struct brw_wm_prog_data *wm_prog_data = get_wm_prog_data(pipeline);
 
-   anv_batch_emit(batch, GENX(3DSTATE_PS), ps) {
+   anv_pipeline_emit(pipeline, final.ps, GENX(3DSTATE_PS), ps) {
       intel_set_ps_dispatch_state(&ps, devinfo, wm_prog_data,
                                   ms != NULL ? ms->rasterization_samples : 1,
                                   pipeline->fs_msaa_flags);
@@ -1629,15 +1638,14 @@ emit_3dstate_ps_extra(struct anv_graphics_pipeline *pipeline,
                       const struct vk_rasterization_state *rs,
                       const struct vk_render_pass_state *rp)
 {
-   struct anv_batch *batch = &pipeline->base.base.batch;
    const struct brw_wm_prog_data *wm_prog_data = get_wm_prog_data(pipeline);
 
    if (!anv_pipeline_has_stage(pipeline, MESA_SHADER_FRAGMENT)) {
-      anv_batch_emit(batch, GENX(3DSTATE_PS_EXTRA), ps);
+      anv_pipeline_emit(pipeline, final.ps_extra, GENX(3DSTATE_PS_EXTRA), ps);
       return;
    }
 
-   anv_batch_emit(batch, GENX(3DSTATE_PS_EXTRA), ps) {
+   anv_pipeline_emit(pipeline, final.ps_extra, GENX(3DSTATE_PS_EXTRA), ps) {
       ps.PixelShaderValid              = true;
       ps.AttributeEnable               = wm_prog_data->num_varying_inputs > 0;
       ps.oMaskPresenttoRenderTarget    = wm_prog_data->uses_omask;
@@ -1689,8 +1697,8 @@ emit_3dstate_ps_extra(struct anv_graphics_pipeline *pipeline,
 static void
 emit_3dstate_vf_statistics(struct anv_graphics_pipeline *pipeline)
 {
-   struct anv_batch *batch = &pipeline->base.base.batch;
-   anv_batch_emit(batch, GENX(3DSTATE_VF_STATISTICS), vfs) {
+   anv_pipeline_emit(pipeline, final.vf_statistics,
+                     GENX(3DSTATE_VF_STATISTICS), vfs) {
       vfs.StatisticsEnable = true;
    }
 }
@@ -1733,10 +1741,9 @@ static void
 emit_3dstate_primitive_replication(struct anv_graphics_pipeline *pipeline,
                                    const struct vk_render_pass_state *rp)
 {
-   struct anv_batch *batch = &pipeline->base.base.batch;
-
    if (anv_pipeline_is_mesh(pipeline)) {
-      anv_batch_emit(batch, GENX(3DSTATE_PRIMITIVE_REPLICATION), pr);
+      anv_pipeline_emit(pipeline, final.primitive_replication,
+                        GENX(3DSTATE_PRIMITIVE_REPLICATION), pr);
       return;
    }
 
@@ -1745,14 +1752,16 @@ emit_3dstate_primitive_replication(struct anv_graphics_pipeline *pipeline,
 
    assert(replication_count >= 1);
    if (replication_count == 1) {
-      anv_batch_emit(batch, GENX(3DSTATE_PRIMITIVE_REPLICATION), pr);
+      anv_pipeline_emit(pipeline, final.primitive_replication,
+                        GENX(3DSTATE_PRIMITIVE_REPLICATION), pr);
       return;
    }
 
    assert(replication_count == util_bitcount(rp->view_mask));
    assert(replication_count <= MAX_VIEWS_FOR_PRIMITIVE_REPLICATION);
 
-   anv_batch_emit(batch, GENX(3DSTATE_PRIMITIVE_REPLICATION), pr) {
+   anv_pipeline_emit(pipeline, final.primitive_replication,
+                     GENX(3DSTATE_PRIMITIVE_REPLICATION), pr) {
       pr.ReplicaMask = (1 << replication_count) - 1;
       pr.ReplicationCount = replication_count - 1;
 
@@ -1769,18 +1778,19 @@ emit_3dstate_primitive_replication(struct anv_graphics_pipeline *pipeline,
 static void
 emit_task_state(struct anv_graphics_pipeline *pipeline)
 {
-   struct anv_batch *batch = &pipeline->base.base.batch;
    assert(anv_pipeline_is_mesh(pipeline));
 
    if (!anv_pipeline_has_stage(pipeline, MESA_SHADER_TASK)) {
-      anv_batch_emit(batch, GENX(3DSTATE_TASK_CONTROL), zero);
+      anv_pipeline_emit(pipeline, final.task_control,
+                        GENX(3DSTATE_TASK_CONTROL), zero);
       return;
    }
 
    const struct anv_shader_bin *task_bin =
       pipeline->base.shaders[MESA_SHADER_TASK];
 
-   anv_batch_emit(batch, GENX(3DSTATE_TASK_CONTROL), tc) {
+   anv_pipeline_emit(pipeline, final.task_control,
+                     GENX(3DSTATE_TASK_CONTROL), tc) {
       tc.TaskShaderEnable = true;
       tc.ScratchSpaceBuffer =
          get_scratch_surf(&pipeline->base.base, MESA_SHADER_TASK, task_bin);
@@ -1792,7 +1802,8 @@ emit_task_state(struct anv_graphics_pipeline *pipeline)
    const struct brw_cs_dispatch_info task_dispatch =
       brw_cs_get_dispatch_info(devinfo, &task_prog_data->base, NULL);
 
-   anv_batch_emit(batch, GENX(3DSTATE_TASK_SHADER), task) {
+   anv_pipeline_emit(pipeline, final.task_shader,
+                     GENX(3DSTATE_TASK_SHADER), task) {
       task.KernelStartPointer                = task_bin->kernel.offset;
       task.SIMDSize                          = task_dispatch.simd_size / 16;
       task.MessageSIMD                       = task.SIMDSize;
@@ -1818,7 +1829,8 @@ emit_task_state(struct anv_graphics_pipeline *pipeline)
    }
 
    /* Recommended values from "Task and Mesh Distribution Programming". */
-   anv_batch_emit(batch, GENX(3DSTATE_TASK_REDISTRIB), redistrib) {
+   anv_pipeline_emit(pipeline, final.task_redistrib,
+                     GENX(3DSTATE_TASK_REDISTRIB), redistrib) {
       redistrib.LocalBOTAccumulatorThreshold = MULTIPLIER_1;
       redistrib.SmallTaskThreshold = 1; /* 2^N */
       redistrib.TargetMeshBatchSize = devinfo->num_slices > 2 ? 3 : 5; /* 2^N */
@@ -1830,12 +1842,12 @@ emit_task_state(struct anv_graphics_pipeline *pipeline)
 static void
 emit_mesh_state(struct anv_graphics_pipeline *pipeline)
 {
-   struct anv_batch *batch = &pipeline->base.base.batch;
    assert(anv_pipeline_is_mesh(pipeline));
 
    const struct anv_shader_bin *mesh_bin = pipeline->base.shaders[MESA_SHADER_MESH];
 
-   anv_batch_emit(batch, GENX(3DSTATE_MESH_CONTROL), mc) {
+   anv_pipeline_emit(pipeline, final.mesh_control,
+                     GENX(3DSTATE_MESH_CONTROL), mc) {
       mc.MeshShaderEnable = true;
       mc.ScratchSpaceBuffer =
          get_scratch_surf(&pipeline->base.base, MESA_SHADER_MESH, mesh_bin);
@@ -1864,7 +1876,8 @@ emit_mesh_state(struct anv_graphics_pipeline *pipeline)
       unreachable("invalid index format");
    }
 
-   anv_batch_emit(batch, GENX(3DSTATE_MESH_SHADER), mesh) {
+   anv_pipeline_emit(pipeline, final.mesh_shader,
+                     GENX(3DSTATE_MESH_SHADER), mesh) {
       mesh.KernelStartPointer                = mesh_bin->kernel.offset;
       mesh.SIMDSize                          = mesh_dispatch.simd_size / 16;
       mesh.MessageSIMD                       = mesh.SIMDSize;
@@ -1897,7 +1910,8 @@ emit_mesh_state(struct anv_graphics_pipeline *pipeline)
    }
 
    /* Recommended values from "Task and Mesh Distribution Programming". */
-   anv_batch_emit(batch, GENX(3DSTATE_MESH_DISTRIB), distrib) {
+   anv_pipeline_emit(pipeline, final.mesh_distrib,
+                     GENX(3DSTATE_MESH_DISTRIB), distrib) {
       distrib.DistributionMode = MESH_RR_FREE;
       distrib.TaskDistributionBatchSize = devinfo->num_slices > 2 ? 4 : 9; /* 2^N thread groups */
       distrib.MeshDistributionBatchSize = devinfo->num_slices > 2 ? 3 : 3; /* 2^N thread groups */
@@ -1909,7 +1923,6 @@ void
 genX(graphics_pipeline_emit)(struct anv_graphics_pipeline *pipeline,
                              const struct vk_graphics_pipeline_state *state)
 {
-   struct anv_batch *batch = &pipeline->base.base.batch;
    enum intel_urb_deref_block_size urb_deref_block_size;
    emit_urb_setup(pipeline, &urb_deref_block_size);
 
@@ -1940,10 +1953,10 @@ genX(graphics_pipeline_emit)(struct anv_graphics_pipeline *pipeline,
       const struct anv_device *device = pipeline->base.base.device;
       /* Disable Mesh. */
       if (device->vk.enabled_extensions.EXT_mesh_shader) {
-         struct anv_batch *batch = &pipeline->base.base.batch;
-
-         anv_batch_emit(batch, GENX(3DSTATE_MESH_CONTROL), zero);
-         anv_batch_emit(batch, GENX(3DSTATE_TASK_CONTROL), zero);
+         anv_pipeline_emit(pipeline, final.mesh_control,
+                           GENX(3DSTATE_MESH_CONTROL), zero);
+         anv_pipeline_emit(pipeline, final.task_control,
+                           GENX(3DSTATE_TASK_CONTROL), zero);
       }
 #endif
    } else {
@@ -1952,7 +1965,7 @@ genX(graphics_pipeline_emit)(struct anv_graphics_pipeline *pipeline,
       /* BSpec 46303 forbids both 3DSTATE_MESH_CONTROL.MeshShaderEnable
        * and 3DSTATE_STREAMOUT.SOFunctionEnable to be 1.
        */
-      anv_batch_emit(batch, GENX(3DSTATE_STREAMOUT), so) {}
+      anv_pipeline_emit(pipeline, partial.so, GENX(3DSTATE_STREAMOUT), so);
 
 #if GFX_VERx10 >= 125
       emit_task_state(pipeline);