v3dv: implement double-buffer mode
[platform/upstream/mesa.git] src/broadcom/vulkan/v3dv_cmd_buffer.c
index 283591e..df2f488 100644
  */
 
 #include "v3dv_private.h"
-#include "broadcom/cle/v3dx_pack.h"
-#include "util/half_float.h"
 #include "util/u_pack_color.h"
-#include "vk_format_info.h"
-
-const struct v3dv_dynamic_state default_dynamic_state = {
-   .viewport = {
-      .count = 0,
-   },
-   .scissor = {
-      .count = 0,
-   },
-   .stencil_compare_mask =
-   {
-     .front = ~0u,
-     .back = ~0u,
-   },
-   .stencil_write_mask =
-   {
-     .front = ~0u,
-     .back = ~0u,
-   },
-   .stencil_reference =
-   {
-     .front = 0u,
-     .back = 0u,
-   },
-   .blend_constants = { 0.0f, 0.0f, 0.0f, 0.0f },
-   .depth_bias = {
-      .constant_factor = 0.0f,
-      .slope_factor = 0.0f,
-   },
-   .line_width = 1.0f,
-};
+#include "vk_util.h"
 
 void
 v3dv_job_add_bo(struct v3dv_job *job, struct v3dv_bo *bo)
@@ -63,17 +31,26 @@ v3dv_job_add_bo(struct v3dv_job *job, struct v3dv_bo *bo)
    if (!bo)
       return;
 
-   if (_mesa_set_search(job->bos, bo))
-      return;
+   if (job->bo_handle_mask & bo->handle_bit) {
+      if (_mesa_set_search(job->bos, bo))
+         return;
+   }
 
    _mesa_set_add(job->bos, bo);
    job->bo_count++;
+   job->bo_handle_mask |= bo->handle_bit;
 }
 
-static void
-cmd_buffer_emit_render_pass_rcl(struct v3dv_cmd_buffer *cmd_buffer);
+void
+v3dv_job_add_bo_unchecked(struct v3dv_job *job, struct v3dv_bo *bo)
+{
+   assert(bo);
+   _mesa_set_add(job->bos, bo);
+   job->bo_count++;
+   job->bo_handle_mask |= bo->handle_bit;
+}
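
Note on the bo_handle_mask check above: it acts as a small Bloom-filter style pre-check in front of the hash-set lookup. A clear bit means the BO cannot already be in job->bos, so _mesa_set_search() is skipped; a set bit only means "maybe present". A minimal sketch of the idea, assuming handle_bit is derived from the GEM handle as 1ull << (handle % 64) (the actual derivation happens at BO creation time and is not part of this patch):

#include <stdbool.h>
#include <stdint.h>

/* Illustrative stand-ins, not the driver's real structs. */
struct demo_bo  { uint32_t handle; uint64_t handle_bit; };
struct demo_job { uint64_t bo_handle_mask; };

static void
demo_bo_init(struct demo_bo *bo, uint32_t handle)
{
   bo->handle = handle;
   /* One bit per (handle % 64); collisions just cost an extra set lookup. */
   bo->handle_bit = 1ull << (handle % 64);
}

/* True when the expensive _mesa_set_search() can be skipped outright. */
static bool
demo_job_definitely_lacks_bo(const struct demo_job *job,
                             const struct demo_bo *bo)
{
   return (job->bo_handle_mask & bo->handle_bit) == 0;
}

v3dv_job_add_bo_unchecked() skips both the mask test and the lookup, and is used later in the patch for BOs that were just allocated for the job (tile_alloc and tile_state), where a duplicate check is unnecessary.
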
 
-VkResult
+VKAPI_ATTR VkResult VKAPI_CALL
 v3dv_CreateCommandPool(VkDevice _device,
                        const VkCommandPoolCreateInfo *pCreateInfo,
                        const VkAllocationCallbacks *pAllocator,
@@ -85,15 +62,15 @@ v3dv_CreateCommandPool(VkDevice _device,
    /* We only support one queue */
    assert(pCreateInfo->queueFamilyIndex == 0);
 
-   pool = vk_alloc2(&device->alloc, pAllocator, sizeof(*pool), 8,
-                     VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
+   pool = vk_object_zalloc(&device->vk, pAllocator, sizeof(*pool),
+                           VK_OBJECT_TYPE_COMMAND_POOL);
    if (pool == NULL)
-      return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY);
+      return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
 
    if (pAllocator)
       pool->alloc = *pAllocator;
    else
-      pool->alloc = device->alloc;
+      pool->alloc = device->vk.alloc;
 
    list_inithead(&pool->cmd_buffers);
 
@@ -108,13 +85,13 @@ cmd_buffer_init(struct v3dv_cmd_buffer *cmd_buffer,
                 struct v3dv_cmd_pool *pool,
                 VkCommandBufferLevel level)
 {
-   /* Do not reset the loader data header! If we are calling this from
-    * a command buffer reset that would reset the loader's dispatch table for
-    * the command buffer.
+   /* Do not reset the base object! If we are calling this from a command
+    * buffer reset, that would reset the loader's dispatch table for the
+    * command buffer, as well as any other relevant info from vk_object_base.
     */
-   const uint32_t ld_size = sizeof(VK_LOADER_DATA);
-   uint8_t *cmd_buffer_driver_start = ((uint8_t *) cmd_buffer) + ld_size;
-   memset(cmd_buffer_driver_start, 0, sizeof(*cmd_buffer) - ld_size);
+   const uint32_t base_size = sizeof(struct vk_command_buffer);
+   uint8_t *cmd_buffer_driver_start = ((uint8_t *) cmd_buffer) + base_size;
+   memset(cmd_buffer_driver_start, 0, sizeof(*cmd_buffer) - base_size);
 
    cmd_buffer->device = device;
    cmd_buffer->pool = pool;
@@ -140,14 +117,22 @@ cmd_buffer_create(struct v3dv_device *device,
                   VkCommandBuffer *pCommandBuffer)
 {
    struct v3dv_cmd_buffer *cmd_buffer;
-   cmd_buffer = vk_alloc(&pool->alloc, sizeof(*cmd_buffer), 8,
-                         VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
+   cmd_buffer = vk_zalloc2(&device->vk.alloc,
+                           &pool->alloc,
+                           sizeof(*cmd_buffer),
+                           8,
+                           VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
    if (cmd_buffer == NULL)
-      return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY);
+      return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
 
-   cmd_buffer_init(cmd_buffer, device, pool, level);
+   VkResult result;
+   result = vk_command_buffer_init(&cmd_buffer->vk, &device->vk);
+   if (result != VK_SUCCESS) {
+      vk_free2(&device->vk.alloc, &pool->alloc, cmd_buffer);
+      return result;
+   }
 
-   cmd_buffer->_loader_data.loaderMagic = ICD_LOADER_MAGIC;
+   cmd_buffer_init(cmd_buffer, device, pool, level);
 
    *pCommandBuffer = v3dv_cmd_buffer_to_handle(cmd_buffer);
 
@@ -181,17 +166,17 @@ job_destroy_cloned_gpu_cl_resources(struct v3dv_job *job)
 
    list_for_each_entry_safe(struct v3dv_bo, bo, &job->bcl.bo_list, list_link) {
       list_del(&bo->list_link);
-      vk_free(&job->device->alloc, bo);
+      vk_free(&job->device->vk.alloc, bo);
    }
 
    list_for_each_entry_safe(struct v3dv_bo, bo, &job->rcl.bo_list, list_link) {
       list_del(&bo->list_link);
-      vk_free(&job->device->alloc, bo);
+      vk_free(&job->device->vk.alloc, bo);
    }
 
    list_for_each_entry_safe(struct v3dv_bo, bo, &job->indirect.bo_list, list_link) {
       list_del(&bo->list_link);
-      vk_free(&job->device->alloc, bo);
+      vk_free(&job->device->vk.alloc, bo);
    }
 }
 
@@ -214,7 +199,7 @@ job_destroy_cpu_wait_events_resources(struct v3dv_job *job)
 {
    assert(job->type == V3DV_JOB_TYPE_CPU_WAIT_EVENTS);
    assert(job->cmd_buffer);
-   vk_free(&job->cmd_buffer->device->alloc, job->cpu.event_wait.events);
+   vk_free(&job->cmd_buffer->device->vk.alloc, job->cpu.event_wait.events);
 }
 
 static void
@@ -260,7 +245,7 @@ v3dv_job_destroy(struct v3dv_job *job)
          job_destroy_cloned_gpu_cl_resources(job);
    }
 
-   vk_free(&job->device->alloc, job);
+   vk_free(&job->device->vk.alloc, job);
 }
 
 void
@@ -269,7 +254,7 @@ v3dv_cmd_buffer_add_private_obj(struct v3dv_cmd_buffer *cmd_buffer,
                                 v3dv_cmd_buffer_private_obj_destroy_cb destroy_cb)
 {
    struct v3dv_cmd_buffer_private_obj *pobj =
-      vk_alloc(&cmd_buffer->device->alloc, sizeof(*pobj), 8,
+      vk_alloc(&cmd_buffer->device->vk.alloc, sizeof(*pobj), 8,
                VK_SYSTEM_ALLOCATION_SCOPE_COMMAND);
    if (!pobj) {
       v3dv_flag_oom(cmd_buffer, NULL);
@@ -289,9 +274,9 @@ cmd_buffer_destroy_private_obj(struct v3dv_cmd_buffer *cmd_buffer,
    assert(pobj && pobj->obj && pobj->destroy_cb);
    pobj->destroy_cb(v3dv_device_to_handle(cmd_buffer->device),
                     pobj->obj,
-                    &cmd_buffer->device->alloc);
+                    &cmd_buffer->device->vk.alloc);
    list_del(&pobj->list_link);
-   vk_free(&cmd_buffer->device->alloc, pobj);
+   vk_free(&cmd_buffer->device->vk.alloc, pobj);
 }
 
 static void
@@ -309,7 +294,7 @@ cmd_buffer_free_resources(struct v3dv_cmd_buffer *cmd_buffer)
       vk_free(&cmd_buffer->pool->alloc, cmd_buffer->state.attachments);
 
    if (cmd_buffer->state.query.end.alloc_count > 0)
-      vk_free(&cmd_buffer->device->alloc, cmd_buffer->state.query.end.states);
+      vk_free(&cmd_buffer->device->vk.alloc, cmd_buffer->state.query.end.states);
 
    if (cmd_buffer->push_constants_resource.bo)
       v3dv_bo_free(cmd_buffer->device, cmd_buffer->push_constants_resource.bo);
@@ -319,15 +304,9 @@ cmd_buffer_free_resources(struct v3dv_cmd_buffer *cmd_buffer)
       cmd_buffer_destroy_private_obj(cmd_buffer, pobj);
    }
 
-   if (cmd_buffer->meta.blit.dspool) {
-      v3dv_DestroyDescriptorPool(v3dv_device_to_handle(cmd_buffer->device),
-                                 cmd_buffer->meta.blit.dspool,
-                                 &cmd_buffer->device->alloc);
-   }
-
    if (cmd_buffer->state.meta.attachments) {
          assert(cmd_buffer->state.meta.attachment_alloc_count > 0);
-         vk_free(&cmd_buffer->device->alloc, cmd_buffer->state.meta.attachments);
+         vk_free(&cmd_buffer->device->vk.alloc, cmd_buffer->state.meta.attachments);
    }
 }
 
@@ -336,18 +315,9 @@ cmd_buffer_destroy(struct v3dv_cmd_buffer *cmd_buffer)
 {
    list_del(&cmd_buffer->pool_link);
    cmd_buffer_free_resources(cmd_buffer);
-   vk_free(&cmd_buffer->pool->alloc, cmd_buffer);
-}
-
-void
-v3dv_job_emit_binning_flush(struct v3dv_job *job)
-{
-   assert(job);
-
-   v3dv_cl_ensure_space_with_branch(&job->bcl, cl_packet_length(FLUSH));
-   v3dv_return_if_oom(NULL, job);
-
-   cl_emit(&job->bcl, FLUSH, flush);
+   vk_command_buffer_finish(&cmd_buffer->vk);
+   vk_free2(&cmd_buffer->device->vk.alloc, &cmd_buffer->pool->alloc,
+            cmd_buffer);
 }
 
 static bool
@@ -406,6 +376,13 @@ cmd_buffer_can_merge_subpass(struct v3dv_cmd_buffer *cmd_buffer,
    struct v3dv_subpass *prev_subpass = &state->pass->subpasses[state->subpass_idx];
    struct v3dv_subpass *subpass = &state->pass->subpasses[subpass_idx];
 
+   /* Don't merge if the subpasses have different view masks, since in that
+    * case the framebuffer setup is different and we need to emit different
+    * RCLs.
+    */
+   if (subpass->view_mask != prev_subpass->view_mask)
+      return false;
+
    /* Because the list of subpass attachments can include VK_ATTACHMENT_UNUSED,
     * we need to check that for each subpass all its used attachments are
     * used by the other subpass.
@@ -426,12 +403,18 @@ cmd_buffer_can_merge_subpass(struct v3dv_cmd_buffer *cmd_buffer,
    if (!compatible)
       return false;
 
-   /* FIXME: resolve attachments */
-
    if (subpass->ds_attachment.attachment !=
        prev_subpass->ds_attachment.attachment)
       return false;
 
+   /* FIXME: Since some attachment formats can't be resolved using the TLB we
+    * need to emit separate resolve jobs for them and that would not be
+    * compatible with subpass merges. We could fix that by testing if any of
+    * the attachments to resolve doesn't support TLB resolves.
+    */
+   if (prev_subpass->resolve_attachments || subpass->resolve_attachments)
+      return false;
+
    return true;
 }
 
@@ -445,16 +428,9 @@ job_compute_frame_tiling(struct v3dv_job *job,
                          uint32_t height,
                          uint32_t layers,
                          uint32_t render_target_count,
-                         uint8_t max_internal_bpp)
+                         uint8_t max_internal_bpp,
+                         bool msaa)
 {
-   static const uint8_t tile_sizes[] = {
-      64, 64,
-      64, 32,
-      32, 32,
-      32, 16,
-      16, 16,
-   };
-
    assert(job);
    struct v3dv_frame_tiling *tiling = &job->frame_tiling;
 
@@ -462,22 +438,25 @@ job_compute_frame_tiling(struct v3dv_job *job,
    tiling->height = height;
    tiling->layers = layers;
    tiling->render_target_count = render_target_count;
+   tiling->msaa = msaa;
+   tiling->internal_bpp = max_internal_bpp;
 
-   uint32_t tile_size_index = 0;
-
-   /* FIXME: MSAA */
-
-   if (render_target_count > 2)
-      tile_size_index += 2;
-   else if (render_target_count > 1)
-      tile_size_index += 1;
+   /* We can use double-buffer when MSAA is disabled to reduce tile store
+    * overhead.
+    *
+    * FIXME: if we are emitting any tile loads the hardware will serialize
+    * loads and stores across tiles effectivley disabling double buffering,
+    * so we would want to check for that and not enable it in that case to
+    * avoid reducing the tile size.
+    */
+   tiling->double_buffer =
+      unlikely(V3D_DEBUG & V3D_DEBUG_DOUBLE_BUFFER) && !msaa;
 
-   tiling->internal_bpp = max_internal_bpp;
-   tile_size_index += tiling->internal_bpp;
-   assert(tile_size_index < ARRAY_SIZE(tile_sizes));
+   assert(!tiling->msaa || !tiling->double_buffer);
 
-   tiling->tile_width = tile_sizes[tile_size_index * 2];
-   tiling->tile_height = tile_sizes[tile_size_index * 2 + 1];
+   v3d_choose_tile_size(render_target_count, max_internal_bpp,
+                        tiling->msaa, tiling->double_buffer,
+                        &tiling->tile_width, &tiling->tile_height);
 
    tiling->draw_tiles_x = DIV_ROUND_UP(width, tiling->tile_width);
    tiling->draw_tiles_y = DIV_ROUND_UP(height, tiling->tile_height);
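
For a feel of what double-buffer mode costs in tile geometry, here is a standalone sketch of the selection this code now delegates to v3d_choose_tile_size(). It assumes the helper keeps the same size table the removed code used (64x64 down to 16x16) and that double-buffer steps the size down one entry so two tiles can live in the tile buffer at once; treat the exact indexing as an assumption, not the helper's definitive behavior:

#include <stdint.h>
#include <stdio.h>

/* Sketch only; not the real v3d_choose_tile_size(). */
static void
sketch_choose_tile_size(uint32_t rt_count, uint32_t max_internal_bpp,
                        int msaa, int double_buffer,
                        uint32_t *w, uint32_t *h)
{
   static const uint8_t tile_sizes[] = { 64, 64, 64, 32, 32, 32, 32, 16, 16, 16 };
   uint32_t idx = 0;
   if (rt_count > 2)
      idx += 2;
   else if (rt_count > 1)
      idx += 1;
   if (msaa)                    /* 4x MSAA needs 4x the per-pixel storage */
      idx += 2;
   else if (double_buffer)      /* half the tile buffer is busy storing */
      idx += 1;
   idx += max_internal_bpp;     /* 0 = 32bpp, 1 = 64bpp, 2 = 128bpp */
   if (idx > 4)
      idx = 4;                  /* clamp to the smallest tile */
   *w = tile_sizes[idx * 2];
   *h = tile_sizes[idx * 2 + 1];
}

int
main(void)
{
   uint32_t w, h;
   sketch_choose_tile_size(1, 0, 0, 0, &w, &h);
   printf("1 RT, 32bpp, single-buffer: %ux%u\n", w, h);   /* 64x64 */
   sketch_choose_tile_size(1, 0, 0, 1, &w, &h);
   printf("1 RT, 32bpp, double-buffer: %ux%u\n", w, h);   /* 64x32 */
   return 0;
}

Halving the tile height roughly doubles draw_tiles_y, which is the overhead the FIXME above wants to avoid paying when tile loads would serialize the pipeline anyway.
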
@@ -510,8 +489,10 @@ v3dv_job_start_frame(struct v3dv_job *job,
                      uint32_t width,
                      uint32_t height,
                      uint32_t layers,
+                     bool allocate_tile_state_for_all_layers,
                      uint32_t render_target_count,
-                     uint8_t max_internal_bpp)
+                     uint8_t max_internal_bpp,
+                     bool msaa)
 {
    assert(job);
 
@@ -519,11 +500,21 @@ v3dv_job_start_frame(struct v3dv_job *job,
    const struct v3dv_frame_tiling *tiling =
       job_compute_frame_tiling(job,
                                width, height, layers,
-                               render_target_count, max_internal_bpp);
+                               render_target_count, max_internal_bpp, msaa);
 
    v3dv_cl_ensure_space_with_branch(&job->bcl, 256);
    v3dv_return_if_oom(NULL, job);
 
+   /* We only need to allocate tile state for all layers if the binner
+    * writes primitives to layers other than the first. This can only be
+    * done using layered rendering (writing gl_Layer from a geometry shader),
+    * so for other cases of multilayered framebuffers (typically with
+    * meta copy/clear operations) that won't use layered rendering, we only
+    * need one layer's worth of tile state for the binner.
+    */
+   if (!allocate_tile_state_for_all_layers)
+      layers = 1;
+
    /* The PTB will request the tile alloc initial size per tile at start
     * of tile binning.
     */
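
A rough, illustrative calculation of what capping layers to 1 saves, based on the 256 bytes of tile state allocated per tile per layer a few lines below (the framebuffer dimensions and layer count are made up for the example):

#include <stdint.h>
#include <stdio.h>

#define DIV_ROUND_UP(a, b) (((a) + (b) - 1) / (b))

int
main(void)
{
   const uint32_t tsda_per_tile_size = 256;   /* bytes of tile state per tile */
   const uint32_t width = 1920, height = 1080;
   const uint32_t tile_w = 64, tile_h = 64;
   const uint32_t layers = 6;                 /* e.g. a cube map touched by a meta job */

   const uint32_t tiles = DIV_ROUND_UP(width, tile_w) *
                          DIV_ROUND_UP(height, tile_h);     /* 30 x 17 = 510 */
   const uint32_t per_layer = tiles * tsda_per_tile_size;   /* ~128 KiB */

   printf("tile state: all layers %u bytes, binner-only layer %u bytes\n",
          per_layer * layers, per_layer);
   return 0;
}

When no geometry shader writes gl_Layer the binner only ever touches layer 0, so the remaining five layers' worth of tile state (roughly 640 KiB in this example) never needs to be allocated.
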
@@ -553,7 +544,7 @@ v3dv_job_start_frame(struct v3dv_job *job,
       return;
    }
 
-   v3dv_job_add_bo(job, job->tile_alloc);
+   v3dv_job_add_bo_unchecked(job, job->tile_alloc);
 
    const uint32_t tsda_per_tile_size = 256;
    const uint32_t tile_state_size = tiling->layers *
@@ -566,33 +557,12 @@ v3dv_job_start_frame(struct v3dv_job *job,
       return;
    }
 
-   v3dv_job_add_bo(job, job->tile_state);
-
-   /* This must go before the binning mode configuration. It is
-    * required for layered framebuffers to work.
-    */
-   cl_emit(&job->bcl, NUMBER_OF_LAYERS, config) {
-      config.number_of_layers = layers;
-   }
-
-   cl_emit(&job->bcl, TILE_BINNING_MODE_CFG, config) {
-      config.width_in_pixels = tiling->width;
-      config.height_in_pixels = tiling->height;
-      config.number_of_render_targets = MAX2(tiling->render_target_count, 1);
-      config.multisample_mode_4x = false; /* FIXME */
-      config.maximum_bpp_of_all_render_targets = tiling->internal_bpp;
-   }
-
-   /* There's definitely nothing in the VCD cache we want. */
-   cl_emit(&job->bcl, FLUSH_VCD_CACHE, bin);
+   v3dv_job_add_bo_unchecked(job, job->tile_state);
 
-   /* "Binning mode lists must have a Start Tile Binning item (6) after
-    *  any prefix state data before the binning list proper starts."
-    */
-   cl_emit(&job->bcl, START_TILE_BINNING, bin);
+   v3dv_X(job->device, job_emit_binning_prolog)(job, tiling, layers);
 
-   job->ez_state = VC5_EZ_UNDECIDED;
-   job->first_ez_state = VC5_EZ_UNDECIDED;
+   job->ez_state = V3D_EZ_UNDECIDED;
+   job->first_ez_state = V3D_EZ_UNDECIDED;
 }
 
 static void
@@ -609,19 +579,9 @@ cmd_buffer_end_render_pass_frame(struct v3dv_cmd_buffer *cmd_buffer)
     * any RCL commands of its own.
     */
    if (v3dv_cl_offset(&cmd_buffer->state.job->rcl) == 0)
-      cmd_buffer_emit_render_pass_rcl(cmd_buffer);
-
-   v3dv_job_emit_binning_flush(cmd_buffer->state.job);
-}
+      v3dv_X(cmd_buffer->device, cmd_buffer_emit_render_pass_rcl)(cmd_buffer);
 
-static void
-cmd_buffer_end_render_pass_secondary(struct v3dv_cmd_buffer *cmd_buffer)
-{
-   assert(cmd_buffer->state.job);
-   v3dv_cl_ensure_space_with_branch(&cmd_buffer->state.job->bcl,
-                                    cl_packet_length(RETURN_FROM_SUB_LIST));
-   v3dv_return_if_oom(cmd_buffer, NULL);
-   cl_emit(&cmd_buffer->state.job->bcl, RETURN_FROM_SUB_LIST, ret);
+   v3dv_X(cmd_buffer->device, job_emit_binning_flush)(cmd_buffer->state.job);
 }
 
 struct v3dv_job *
@@ -630,7 +590,7 @@ v3dv_cmd_buffer_create_cpu_job(struct v3dv_device *device,
                                struct v3dv_cmd_buffer *cmd_buffer,
                                uint32_t subpass_idx)
 {
-   struct v3dv_job *job = vk_zalloc(&device->alloc,
+   struct v3dv_job *job = vk_zalloc(&device->vk.alloc,
                                     sizeof(struct v3dv_job), 8,
                                     VK_SYSTEM_ALLOCATION_SCOPE_COMMAND);
    if (!job) {
@@ -708,7 +668,7 @@ v3dv_cmd_buffer_finish_job(struct v3dv_cmd_buffer *cmd_buffer)
          cmd_buffer_end_render_pass_frame(cmd_buffer);
       } else {
          assert(job->type == V3DV_JOB_TYPE_GPU_CL_SECONDARY);
-         cmd_buffer_end_render_pass_secondary(cmd_buffer);
+         v3dv_X(cmd_buffer->device, cmd_buffer_end_render_pass_secondary)(cmd_buffer);
       }
    }
 
@@ -795,7 +755,7 @@ v3dv_job_init(struct v3dv_job *job,
 
       v3dv_cl_init(job, &job->indirect);
 
-      if (V3D_DEBUG & V3D_DEBUG_ALWAYS_FLUSH)
+      if (unlikely(V3D_DEBUG & V3D_DEBUG_ALWAYS_FLUSH))
          job->always_flush = true;
    }
 
@@ -813,6 +773,13 @@ v3dv_job_init(struct v3dv_job *job,
        * bits.
        */
       cmd_buffer->state.dirty = ~0;
+      cmd_buffer->state.dirty_descriptor_stages = ~0;
+
+      /* Honor inheritance of occlusion queries in secondaries if requested */
+      if (cmd_buffer->level == VK_COMMAND_BUFFER_LEVEL_SECONDARY &&
+          cmd_buffer->state.inheritance.occlusion_query_enable) {
+         cmd_buffer->state.dirty &= ~V3DV_CMD_DIRTY_OCCLUSION_QUERY;
+      }
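
For reference, this is the application-side setup that makes state.inheritance.occlusion_query_enable true; a minimal sketch using standard Vulkan structures (the handles and flags are placeholders):

#include <vulkan/vulkan.h>

/* Begin a secondary command buffer that may run while the primary has an
 * occlusion query active; occlusionQueryEnable is what the driver records
 * in state.inheritance.occlusion_query_enable.
 */
static VkResult
begin_secondary_with_occlusion_inheritance(VkCommandBuffer secondary,
                                           VkRenderPass render_pass,
                                           VkFramebuffer framebuffer,
                                           uint32_t subpass)
{
   const VkCommandBufferInheritanceInfo inheritance = {
      .sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_INHERITANCE_INFO,
      .renderPass = render_pass,
      .subpass = subpass,
      .framebuffer = framebuffer,
      .occlusionQueryEnable = VK_TRUE,
   };
   const VkCommandBufferBeginInfo begin_info = {
      .sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_BEGIN_INFO,
      .flags = VK_COMMAND_BUFFER_USAGE_RENDER_PASS_CONTINUE_BIT,
      .pInheritanceInfo = &inheritance,
   };
   return vkBeginCommandBuffer(secondary, &begin_info);
}

Clearing V3DV_CMD_DIRTY_OCCLUSION_QUERY here keeps the secondary from emitting its own (inactive) query state and overriding a query the primary may have begun.
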
 
       /* Keep track of the first subpass that we are recording in this new job.
        * We will use this when we emit the RCL to decide how to emit our loads
@@ -845,7 +812,7 @@ v3dv_cmd_buffer_start_job(struct v3dv_cmd_buffer *cmd_buffer,
       v3dv_cmd_buffer_finish_job(cmd_buffer);
 
    assert(cmd_buffer->state.job == NULL);
-   struct v3dv_job *job = vk_zalloc(&cmd_buffer->device->alloc,
+   struct v3dv_job *job = vk_zalloc(&cmd_buffer->device->vk.alloc,
                                     sizeof(struct v3dv_job), 8,
                                     VK_SYSTEM_ALLOCATION_SCOPE_COMMAND);
 
@@ -865,6 +832,7 @@ static VkResult
 cmd_buffer_reset(struct v3dv_cmd_buffer *cmd_buffer,
                  VkCommandBufferResetFlags flags)
 {
+   vk_command_buffer_reset(&cmd_buffer->vk);
    if (cmd_buffer->status != V3DV_CMD_BUFFER_STATUS_INITIALIZED) {
       struct v3dv_device *device = cmd_buffer->device;
       struct v3dv_cmd_pool *pool = cmd_buffer->pool;
@@ -888,7 +856,7 @@ cmd_buffer_reset(struct v3dv_cmd_buffer *cmd_buffer,
    return VK_SUCCESS;
 }
 
-VkResult
+VKAPI_ATTR VkResult VKAPI_CALL
 v3dv_AllocateCommandBuffers(VkDevice _device,
                             const VkCommandBufferAllocateInfo *pAllocateInfo,
                             VkCommandBuffer *pCommandBuffers)
@@ -916,7 +884,7 @@ v3dv_AllocateCommandBuffers(VkDevice _device,
    return result;
 }
 
-void
+VKAPI_ATTR void VKAPI_CALL
 v3dv_FreeCommandBuffers(VkDevice device,
                         VkCommandPool commandPool,
                         uint32_t commandBufferCount,
@@ -932,7 +900,7 @@ v3dv_FreeCommandBuffers(VkDevice device,
    }
 }
 
-void
+VKAPI_ATTR void VKAPI_CALL
 v3dv_DestroyCommandPool(VkDevice _device,
                         VkCommandPool commandPool,
                         const VkAllocationCallbacks *pAllocator)
@@ -948,7 +916,106 @@ v3dv_DestroyCommandPool(VkDevice _device,
       cmd_buffer_destroy(cmd_buffer);
    }
 
-   vk_free2(&device->alloc, pAllocator, pool);
+   vk_object_free(&device->vk, pAllocator, pool);
+}
+
+VKAPI_ATTR void VKAPI_CALL
+v3dv_TrimCommandPool(VkDevice device,
+                     VkCommandPool commandPool,
+                     VkCommandPoolTrimFlags flags)
+{
+   /* We don't need to do anything here, our command pools never hold on to
+    * any resources from command buffers that are freed or reset.
+    */
+}
+
+
+static void
+cmd_buffer_subpass_handle_pending_resolves(struct v3dv_cmd_buffer *cmd_buffer)
+{
+   assert(cmd_buffer->state.subpass_idx < cmd_buffer->state.pass->subpass_count);
+   const struct v3dv_render_pass *pass = cmd_buffer->state.pass;
+   const struct v3dv_subpass *subpass =
+      &pass->subpasses[cmd_buffer->state.subpass_idx];
+
+   if (!subpass->resolve_attachments)
+      return;
+
+   struct v3dv_framebuffer *fb = cmd_buffer->state.framebuffer;
+
+   /* At this point we have already ended the current subpass and now we are
+    * about to emit vkCmdResolveImage calls to get the resolves we can't handle
+    * in the subpass RCL.
+    *
+    * vkCmdResolveImage is not supposed to be called inside a render pass so
+    * before we call that we need to make sure our command buffer state reflects
+    * that we are no longer in a subpass by finishing the current job and
+    * resetting the framebuffer and render pass state temporarily and then
+    * restoring it after we are done with the resolves.
+    */
+   if (cmd_buffer->state.job)
+      v3dv_cmd_buffer_finish_job(cmd_buffer);
+   struct v3dv_framebuffer *restore_fb = cmd_buffer->state.framebuffer;
+   struct v3dv_render_pass *restore_pass = cmd_buffer->state.pass;
+   uint32_t restore_subpass_idx = cmd_buffer->state.subpass_idx;
+   cmd_buffer->state.framebuffer = NULL;
+   cmd_buffer->state.pass = NULL;
+   cmd_buffer->state.subpass_idx = -1;
+
+   VkCommandBuffer cmd_buffer_handle = v3dv_cmd_buffer_to_handle(cmd_buffer);
+   for (uint32_t i = 0; i < subpass->color_count; i++) {
+      const uint32_t src_attachment_idx =
+         subpass->color_attachments[i].attachment;
+      if (src_attachment_idx == VK_ATTACHMENT_UNUSED)
+         continue;
+
+      if (pass->attachments[src_attachment_idx].use_tlb_resolve)
+         continue;
+
+      const uint32_t dst_attachment_idx =
+         subpass->resolve_attachments[i].attachment;
+      if (dst_attachment_idx == VK_ATTACHMENT_UNUSED)
+         continue;
+
+      struct v3dv_image_view *src_iview = fb->attachments[src_attachment_idx];
+      struct v3dv_image_view *dst_iview = fb->attachments[dst_attachment_idx];
+
+      VkImageResolve2KHR region = {
+         .sType = VK_STRUCTURE_TYPE_IMAGE_RESOLVE_2_KHR,
+         .srcSubresource = {
+            VK_IMAGE_ASPECT_COLOR_BIT,
+            src_iview->vk.base_mip_level,
+            src_iview->vk.base_array_layer,
+            src_iview->vk.layer_count,
+         },
+         .srcOffset = { 0, 0, 0 },
+         .dstSubresource =  {
+            VK_IMAGE_ASPECT_COLOR_BIT,
+            dst_iview->vk.base_mip_level,
+            dst_iview->vk.base_array_layer,
+            dst_iview->vk.layer_count,
+         },
+         .dstOffset = { 0, 0, 0 },
+         .extent = src_iview->vk.image->extent,
+      };
+
+      struct v3dv_image *src_image = (struct v3dv_image *) src_iview->vk.image;
+      struct v3dv_image *dst_image = (struct v3dv_image *) dst_iview->vk.image;
+      VkResolveImageInfo2KHR resolve_info = {
+         .sType = VK_STRUCTURE_TYPE_RESOLVE_IMAGE_INFO_2_KHR,
+         .srcImage = v3dv_image_to_handle(src_image),
+         .srcImageLayout = VK_IMAGE_LAYOUT_GENERAL,
+         .dstImage = v3dv_image_to_handle(dst_image),
+         .dstImageLayout = VK_IMAGE_LAYOUT_GENERAL,
+         .regionCount = 1,
+         .pRegions = &region,
+      };
+      v3dv_CmdResolveImage2KHR(cmd_buffer_handle, &resolve_info);
+   }
+
+   cmd_buffer->state.framebuffer = restore_fb;
+   cmd_buffer->state.pass = restore_pass;
+   cmd_buffer->state.subpass_idx = restore_subpass_idx;
 }
 
 static VkResult
@@ -967,10 +1034,15 @@ cmd_buffer_begin_render_pass_secondary(
    cmd_buffer->state.framebuffer =
       v3dv_framebuffer_from_handle(inheritance_info->framebuffer);
 
+   assert(inheritance_info->subpass < cmd_buffer->state.pass->subpass_count);
+   cmd_buffer->state.subpass_idx = inheritance_info->subpass;
+
+   cmd_buffer->state.inheritance.occlusion_query_enable =
+      inheritance_info->occlusionQueryEnable;
+
    /* Secondaries that execute inside a render pass won't start subpasses
     * so we want to create a job for them here.
     */
-   assert(inheritance_info->subpass < cmd_buffer->state.pass->subpass_count);
    struct v3dv_job *job =
       v3dv_cmd_buffer_start_job(cmd_buffer, inheritance_info->subpass,
                                 V3DV_JOB_TYPE_GPU_CL_SECONDARY);
@@ -979,8 +1051,6 @@ cmd_buffer_begin_render_pass_secondary(
       return VK_ERROR_OUT_OF_HOST_MEMORY;
    }
 
-   cmd_buffer->state.subpass_idx = inheritance_info->subpass;
-
    /* Secondary command buffers don't know about the render area, but our
     * scissor setup accounts for it, so let's make sure we make it large
     * enough that it doesn't actually constrain any rendering. This should
@@ -1003,7 +1073,7 @@ cmd_buffer_begin_render_pass_secondary(
    return VK_SUCCESS;
 }
 
-VkResult
+VKAPI_ATTR VkResult VKAPI_CALL
 v3dv_BeginCommandBuffer(VkCommandBuffer commandBuffer,
                         const VkCommandBufferBeginInfo *pBeginInfo)
 {
@@ -1029,12 +1099,6 @@ v3dv_BeginCommandBuffer(VkCommandBuffer commandBuffer,
          if (result != VK_SUCCESS)
             return result;
       }
-
-      /* If the primary may have an active occlusion query we need to honor
-       * that in the secondary.
-       */
-      if (pBeginInfo->pInheritanceInfo->occlusionQueryEnable)
-         cmd_buffer->state.dirty &= ~V3DV_CMD_DIRTY_OCCLUSION_QUERY;
    }
 
    cmd_buffer->status = V3DV_CMD_BUFFER_STATUS_RECORDING;
@@ -1042,7 +1106,7 @@ v3dv_BeginCommandBuffer(VkCommandBuffer commandBuffer,
    return VK_SUCCESS;
 }
 
-VkResult
+VKAPI_ATTR VkResult VKAPI_CALL
 v3dv_ResetCommandBuffer(VkCommandBuffer commandBuffer,
                         VkCommandBufferResetFlags flags)
 {
@@ -1050,7 +1114,7 @@ v3dv_ResetCommandBuffer(VkCommandBuffer commandBuffer,
    return cmd_buffer_reset(cmd_buffer, flags);
 }
 
-VkResult
+VKAPI_ATTR VkResult VKAPI_CALL
 v3dv_ResetCommandPool(VkDevice device,
                       VkCommandPool commandPool,
                       VkCommandPoolResetFlags flags)
@@ -1067,27 +1131,7 @@ v3dv_ResetCommandPool(VkDevice device,
 
    return VK_SUCCESS;
 }
-static void
-emit_clip_window(struct v3dv_job *job, const VkRect2D *rect)
-{
-   assert(job);
-
-   v3dv_cl_ensure_space_with_branch(&job->bcl, cl_packet_length(CLIP_WINDOW));
-   v3dv_return_if_oom(NULL, job);
-
-   cl_emit(&job->bcl, CLIP_WINDOW, clip) {
-      clip.clip_window_left_pixel_coordinate = rect->offset.x;
-      clip.clip_window_bottom_pixel_coordinate = rect->offset.y;
-      clip.clip_window_width_in_pixels = rect->extent.width;
-      clip.clip_window_height_in_pixels = rect->extent.height;
-   }
-}
 
-/* Checks whether the render area rectangle covers a region that is aligned to
- * tile boundaries, which means that for all tiles covered by the render area
- * region, there are no uncovered pixels (unless they are also outside the
- * framebuffer).
- */
 static void
 cmd_buffer_update_tile_alignment(struct v3dv_cmd_buffer *cmd_buffer)
 {
@@ -1102,24 +1146,11 @@ cmd_buffer_update_tile_alignment(struct v3dv_cmd_buffer *cmd_buffer)
     * always have framebuffer information available.
     */
    assert(cmd_buffer->state.framebuffer);
-
-   const VkExtent2D fb_extent = {
-      .width = cmd_buffer->state.framebuffer->width,
-      .height = cmd_buffer->state.framebuffer->height
-   };
-
-   VkExtent2D granularity;
-   v3dv_subpass_get_granularity(cmd_buffer->state.pass,
-                                cmd_buffer->state.subpass_idx,
-                                &granularity);
-
    cmd_buffer->state.tile_aligned_render_area =
-      rect->offset.x % granularity.width == 0 &&
-      rect->offset.y % granularity.height == 0 &&
-      (rect->extent.width % granularity.width == 0 ||
-       rect->offset.x + rect->extent.width >= fb_extent.width) &&
-      (rect->extent.height % granularity.height == 0 ||
-       rect->offset.y + rect->extent.height >= fb_extent.height);
+      v3dv_subpass_area_is_tile_aligned(cmd_buffer->device, rect,
+                                        cmd_buffer->state.framebuffer,
+                                        cmd_buffer->state.pass,
+                                        cmd_buffer->state.subpass_idx);
 
    if (!cmd_buffer->state.tile_aligned_render_area) {
       perf_debug("Render area for subpass %d of render pass %p doesn't "
@@ -1128,42 +1159,6 @@ cmd_buffer_update_tile_alignment(struct v3dv_cmd_buffer *cmd_buffer)
    }
 }
 
-void
-v3dv_get_hw_clear_color(const VkClearColorValue *color,
-                        uint32_t internal_type,
-                        uint32_t internal_size,
-                        uint32_t *hw_color)
-{
-   union util_color uc;
-   switch (internal_type) {
-   case V3D_INTERNAL_TYPE_8:
-      util_pack_color(color->float32, PIPE_FORMAT_R8G8B8A8_UNORM, &uc);
-      memcpy(hw_color, uc.ui, internal_size);
-   break;
-   case V3D_INTERNAL_TYPE_8I:
-   case V3D_INTERNAL_TYPE_8UI:
-      hw_color[0] = ((color->uint32[0] & 0xff) |
-                     (color->uint32[1] & 0xff) << 8 |
-                     (color->uint32[2] & 0xff) << 16 |
-                     (color->uint32[3] & 0xff) << 24);
-   break;
-   case V3D_INTERNAL_TYPE_16F:
-      util_pack_color(color->float32, PIPE_FORMAT_R16G16B16A16_FLOAT, &uc);
-      memcpy(hw_color, uc.ui, internal_size);
-   break;
-   case V3D_INTERNAL_TYPE_16I:
-   case V3D_INTERNAL_TYPE_16UI:
-      hw_color[0] = ((color->uint32[0] & 0xffff) | color->uint32[1] << 16);
-      hw_color[1] = ((color->uint32[2] & 0xffff) | color->uint32[3] << 16);
-   break;
-   case V3D_INTERNAL_TYPE_32F:
-   case V3D_INTERNAL_TYPE_32I:
-   case V3D_INTERNAL_TYPE_32UI:
-      memcpy(hw_color, color->uint32, internal_size);
-      break;
-   }
-}
-
 static void
 cmd_buffer_state_set_attachment_clear_color(struct v3dv_cmd_buffer *cmd_buffer,
                                             uint32_t attachment_idx,
@@ -1175,18 +1170,19 @@ cmd_buffer_state_set_attachment_clear_color(struct v3dv_cmd_buffer *cmd_buffer,
       &cmd_buffer->state.pass->attachments[attachment_idx];
 
    uint32_t internal_type, internal_bpp;
-   const struct v3dv_format *format = v3dv_get_format(attachment->desc.format);
-   v3dv_get_internal_type_bpp_for_output_format(format->rt_type,
-                                                &internal_type,
-                                                &internal_bpp);
+   const struct v3dv_format *format =
+      v3dv_X(cmd_buffer->device, get_format)(attachment->desc.format);
+
+   v3dv_X(cmd_buffer->device, get_internal_type_bpp_for_output_format)
+      (format->rt_type, &internal_type, &internal_bpp);
 
    uint32_t internal_size = 4 << internal_bpp;
 
    struct v3dv_cmd_buffer_attachment_state *attachment_state =
       &cmd_buffer->state.attachments[attachment_idx];
 
-   v3dv_get_hw_clear_color(color, internal_type, internal_size,
-                           &attachment_state->clear_value.color[0]);
+   v3dv_X(cmd_buffer->device, get_hw_clear_color)
+      (color, internal_type, internal_size, &attachment_state->clear_value.color[0]);
 
    attachment_state->vk_clear_value.color = *color;
 }
@@ -1263,12 +1259,12 @@ cmd_buffer_ensure_render_pass_attachment_state(struct v3dv_cmd_buffer *cmd_buffe
    if (state->attachment_alloc_count < pass->attachment_count) {
       if (state->attachments > 0) {
          assert(state->attachment_alloc_count > 0);
-         vk_free(&cmd_buffer->device->alloc, state->attachments);
+         vk_free(&cmd_buffer->device->vk.alloc, state->attachments);
       }
 
       uint32_t size = sizeof(struct v3dv_cmd_buffer_attachment_state) *
                       pass->attachment_count;
-      state->attachments = vk_zalloc(&cmd_buffer->device->alloc, size, 8,
+      state->attachments = vk_zalloc(&cmd_buffer->device->vk.alloc, size, 8,
                                      VK_SYSTEM_ALLOCATION_SCOPE_COMMAND);
       if (!state->attachments) {
          v3dv_flag_oom(cmd_buffer, NULL);
@@ -1280,10 +1276,10 @@ cmd_buffer_ensure_render_pass_attachment_state(struct v3dv_cmd_buffer *cmd_buffe
    assert(state->attachment_alloc_count >= pass->attachment_count);
 }
 
-void
-v3dv_CmdBeginRenderPass(VkCommandBuffer commandBuffer,
-                        const VkRenderPassBeginInfo *pRenderPassBegin,
-                        VkSubpassContents contents)
+VKAPI_ATTR void VKAPI_CALL
+v3dv_CmdBeginRenderPass2(VkCommandBuffer commandBuffer,
+                         const VkRenderPassBeginInfo *pRenderPassBegin,
+                         const VkSubpassBeginInfo *pSubpassBeginInfo)
 {
    V3DV_FROM_HANDLE(v3dv_cmd_buffer, cmd_buffer, commandBuffer);
    V3DV_FROM_HANDLE(v3dv_render_pass, pass, pRenderPassBegin->renderPass);
@@ -1304,7 +1300,7 @@ v3dv_CmdBeginRenderPass(VkCommandBuffer commandBuffer,
     * to emit a new clip window to constrain it to the render area.
     */
    uint32_t min_render_x = state->render_area.offset.x;
-   uint32_t min_render_y = state->render_area.offset.x;
+   uint32_t min_render_y = state->render_area.offset.y;
    uint32_t max_render_x = min_render_x + state->render_area.extent.width - 1;
    uint32_t max_render_y = min_render_y + state->render_area.extent.height - 1;
    uint32_t min_clip_x = state->clip_window.offset.x;
@@ -1320,8 +1316,10 @@ v3dv_CmdBeginRenderPass(VkCommandBuffer commandBuffer,
    v3dv_cmd_buffer_subpass_start(cmd_buffer, 0);
 }
 
-void
-v3dv_CmdNextSubpass(VkCommandBuffer commandBuffer, VkSubpassContents contents)
+VKAPI_ATTR void VKAPI_CALL
+v3dv_CmdNextSubpass2(VkCommandBuffer commandBuffer,
+                     const VkSubpassBeginInfo *pSubpassBeginInfo,
+                     const VkSubpassEndInfo *pSubpassEndInfo)
 {
    V3DV_FROM_HANDLE(v3dv_cmd_buffer, cmd_buffer, commandBuffer);
 
@@ -1330,803 +1328,180 @@ v3dv_CmdNextSubpass(VkCommandBuffer commandBuffer, VkSubpassContents contents)
 
    /* Finish the previous subpass */
    v3dv_cmd_buffer_subpass_finish(cmd_buffer);
+   cmd_buffer_subpass_handle_pending_resolves(cmd_buffer);
 
    /* Start the next subpass */
    v3dv_cmd_buffer_subpass_start(cmd_buffer, state->subpass_idx + 1);
 }
 
-void
-v3dv_render_pass_setup_render_target(struct v3dv_cmd_buffer *cmd_buffer,
-                                     int rt,
-                                     uint32_t *rt_bpp,
-                                     uint32_t *rt_type,
-                                     uint32_t *rt_clamp)
-{
-   const struct v3dv_cmd_buffer_state *state = &cmd_buffer->state;
-
-   assert(state->subpass_idx < state->pass->subpass_count);
-   const struct v3dv_subpass *subpass =
-      &state->pass->subpasses[state->subpass_idx];
-
-   if (rt >= subpass->color_count)
-      return;
-
-   struct v3dv_subpass_attachment *attachment = &subpass->color_attachments[rt];
-   const uint32_t attachment_idx = attachment->attachment;
-   if (attachment_idx == VK_ATTACHMENT_UNUSED)
-      return;
-
-   const struct v3dv_framebuffer *framebuffer = state->framebuffer;
-   assert(attachment_idx < framebuffer->attachment_count);
-   struct v3dv_image_view *iview = framebuffer->attachments[attachment_idx];
-   assert(iview->aspects & VK_IMAGE_ASPECT_COLOR_BIT);
-
-   *rt_bpp = iview->internal_bpp;
-   *rt_type = iview->internal_type;
-   *rt_clamp =vk_format_is_int(iview->vk_format) ?
-      V3D_RENDER_TARGET_CLAMP_INT : V3D_RENDER_TARGET_CLAMP_NONE;
-}
-
 static void
-cmd_buffer_render_pass_emit_load(struct v3dv_cmd_buffer *cmd_buffer,
-                                 struct v3dv_cl *cl,
-                                 struct v3dv_image_view *iview,
-                                 uint32_t layer,
-                                 uint32_t buffer)
+cmd_buffer_emit_subpass_clears(struct v3dv_cmd_buffer *cmd_buffer)
 {
-   const struct v3dv_image *image = iview->image;
-   const struct v3d_resource_slice *slice = &image->slices[iview->base_level];
-   uint32_t layer_offset = v3dv_layer_offset(image,
-                                             iview->base_level,
-                                             iview->first_layer + layer);
-
-   cl_emit(cl, LOAD_TILE_BUFFER_GENERAL, load) {
-      load.buffer_to_load = buffer;
-      load.address = v3dv_cl_address(image->mem->bo, layer_offset);
-
-      load.input_image_format = iview->format->rt_type;
-      load.r_b_swap = iview->swap_rb;
-      load.memory_format = slice->tiling;
-
-      if (slice->tiling == VC5_TILING_UIF_NO_XOR ||
-          slice->tiling == VC5_TILING_UIF_XOR) {
-         load.height_in_ub_or_stride =
-            slice->padded_height_of_output_image_in_uif_blocks;
-      } else if (slice->tiling == VC5_TILING_RASTER) {
-         load.height_in_ub_or_stride = slice->stride;
-      }
-
-      if (image->samples > VK_SAMPLE_COUNT_1_BIT)
-         load.decimate_mode = V3D_DECIMATE_MODE_ALL_SAMPLES;
-      else
-         load.decimate_mode = V3D_DECIMATE_MODE_SAMPLE_0;
-   }
-}
+   assert(cmd_buffer->level == VK_COMMAND_BUFFER_LEVEL_PRIMARY);
 
-static void
-cmd_buffer_render_pass_emit_loads(struct v3dv_cmd_buffer *cmd_buffer,
-                                  struct v3dv_cl *cl,
-                                  uint32_t layer)
-{
+   assert(cmd_buffer->state.pass);
+   assert(cmd_buffer->state.subpass_idx < cmd_buffer->state.pass->subpass_count);
    const struct v3dv_cmd_buffer_state *state = &cmd_buffer->state;
-   const struct v3dv_framebuffer *framebuffer = state->framebuffer;
    const struct v3dv_render_pass *pass = state->pass;
    const struct v3dv_subpass *subpass = &pass->subpasses[state->subpass_idx];
 
-   for (uint32_t i = 0; i < subpass->color_count; i++) {
-      uint32_t attachment_idx = subpass->color_attachments[i].attachment;
+   /* We only need to emit subpass clears as draw calls when the render
+    * area is not aligned to tile boundaries or for GFXH-1461.
+    */
+   if (cmd_buffer->state.tile_aligned_render_area &&
+       !subpass->do_depth_clear_with_draw &&
+       !subpass->do_stencil_clear_with_draw) {
+      return;
+   }
 
-      if (attachment_idx == VK_ATTACHMENT_UNUSED)
-         continue;
+   uint32_t att_count = 0;
+   VkClearAttachment atts[V3D_MAX_DRAW_BUFFERS + 1]; /* 4 color + D/S */
 
-      const struct v3dv_render_pass_attachment *attachment =
-         &state->pass->attachments[attachment_idx];
+   /* We only need to emit subpass clears as draw calls for color attachments
+    * if the render area is not aligned to tile boundaries.
+    */
+   if (!cmd_buffer->state.tile_aligned_render_area) {
+      for (uint32_t i = 0; i < subpass->color_count; i++) {
+         const uint32_t att_idx = subpass->color_attachments[i].attachment;
+         if (att_idx == VK_ATTACHMENT_UNUSED)
+            continue;
 
-      /* According to the Vulkan spec:
-       *
-       *    "The load operation for each sample in an attachment happens before
-       *     any recorded command which accesses the sample in the first subpass
-       *     where the attachment is used."
-       *
-       * If the load operation is CLEAR, we must only clear once on the first
-       * subpass that uses the attachment (and in that case we don't LOAD).
-       * After that, we always want to load so we don't lose any rendering done
-       * by a previous subpass to the same attachment. We also want to load
-       * if the current job is continuing subpass work started by a previous
-       * job, for the same reason.
-       *
-       * If the render area is not aligned to tile boundaries then we have
-       * tiles which are partially covered by it. In this case, we need to
-       * load the tiles so we can preserve the pixels that are outside the
-       * render area for any such tiles.
-       */
-      assert(state->job->first_subpass >= attachment->first_subpass);
-      bool needs_load =
-         state->job->first_subpass > attachment->first_subpass ||
-         state->job->is_subpass_continue ||
-         attachment->desc.loadOp == VK_ATTACHMENT_LOAD_OP_LOAD ||
-         !state->tile_aligned_render_area;
-
-      if (needs_load) {
-         struct v3dv_image_view *iview = framebuffer->attachments[attachment_idx];
-         cmd_buffer_render_pass_emit_load(cmd_buffer, cl, iview,
-                                          layer, RENDER_TARGET_0 + i);
+         struct v3dv_render_pass_attachment *att = &pass->attachments[att_idx];
+         if (att->desc.loadOp != VK_ATTACHMENT_LOAD_OP_CLEAR)
+            continue;
+
+         if (state->subpass_idx != att->first_subpass)
+            continue;
+
+         atts[att_count].aspectMask = VK_IMAGE_ASPECT_COLOR_BIT;
+         atts[att_count].colorAttachment = i;
+         atts[att_count].clearValue = state->attachments[att_idx].vk_clear_value;
+         att_count++;
       }
    }
 
-   uint32_t ds_attachment_idx = subpass->ds_attachment.attachment;
-   if (ds_attachment_idx != VK_ATTACHMENT_UNUSED) {
-      const struct v3dv_render_pass_attachment *ds_attachment =
-         &state->pass->attachments[ds_attachment_idx];
-
-      assert(state->job->first_subpass >= ds_attachment->first_subpass);
-      const bool might_need_load =
-         state->job->first_subpass > ds_attachment->first_subpass ||
-         state->job->is_subpass_continue ||
-         !state->tile_aligned_render_area;
-
-      const bool needs_depth_load =
-         vk_format_has_depth(ds_attachment->desc.format) &&
-         (ds_attachment->desc.loadOp == VK_ATTACHMENT_LOAD_OP_LOAD ||
-          might_need_load);
-
-      const bool needs_stencil_load =
-         vk_format_has_stencil(ds_attachment->desc.format) &&
-         (ds_attachment->desc.stencilLoadOp == VK_ATTACHMENT_LOAD_OP_LOAD ||
-          might_need_load);
-
-      if (needs_depth_load || needs_stencil_load) {
-         struct v3dv_image_view *iview =
-            framebuffer->attachments[ds_attachment_idx];
-         /* From the Vulkan spec:
-          *
-          *   "When an image view of a depth/stencil image is used as a
-          *   depth/stencil framebuffer attachment, the aspectMask is ignored
-          *   and both depth and stencil image subresources are used."
-          *
-          * So we ignore the aspects from the subresource range of the image
-          * view for the depth/stencil attachment, but we still need to restrict
-          * the to aspects compatible with the render pass and the image.
-          */
-         const uint32_t zs_buffer =
-            v3dv_zs_buffer(needs_depth_load, needs_stencil_load);
-         cmd_buffer_render_pass_emit_load(cmd_buffer, cl,
-                                          iview, layer, zs_buffer);
+   /* For D/S we may also need to emit a subpass clear for GFXH-1461 */
+   const uint32_t ds_att_idx = subpass->ds_attachment.attachment;
+   if (ds_att_idx != VK_ATTACHMENT_UNUSED) {
+      struct v3dv_render_pass_attachment *att = &pass->attachments[ds_att_idx];
+      if (state->subpass_idx == att->first_subpass) {
+         VkImageAspectFlags aspects = vk_format_aspects(att->desc.format);
+         if (att->desc.loadOp != VK_ATTACHMENT_LOAD_OP_CLEAR ||
+             (cmd_buffer->state.tile_aligned_render_area &&
+              !subpass->do_depth_clear_with_draw)) {
+            aspects &= ~VK_IMAGE_ASPECT_DEPTH_BIT;
+         }
+         if (att->desc.stencilLoadOp != VK_ATTACHMENT_LOAD_OP_CLEAR ||
+             (cmd_buffer->state.tile_aligned_render_area &&
+              !subpass->do_stencil_clear_with_draw)) {
+            aspects &= ~VK_IMAGE_ASPECT_STENCIL_BIT;
+         }
+         if (aspects) {
+            atts[att_count].aspectMask = aspects;
+            atts[att_count].colorAttachment = 0; /* Ignored */
+            atts[att_count].clearValue =
+               state->attachments[ds_att_idx].vk_clear_value;
+            att_count++;
+         }
       }
    }
 
-   cl_emit(cl, END_OF_LOADS, end);
-}
-
-static void
-cmd_buffer_render_pass_emit_store(struct v3dv_cmd_buffer *cmd_buffer,
-                                  struct v3dv_cl *cl,
-                                  uint32_t attachment_idx,
-                                  uint32_t layer,
-                                  uint32_t buffer,
-                                  bool clear)
-{
-   const struct v3dv_image_view *iview =
-      cmd_buffer->state.framebuffer->attachments[attachment_idx];
-   const struct v3dv_image *image = iview->image;
-   const struct v3d_resource_slice *slice = &image->slices[iview->base_level];
-   uint32_t layer_offset = v3dv_layer_offset(image,
-                                             iview->base_level,
-                                             iview->first_layer + layer);
-
-   cl_emit(cl, STORE_TILE_BUFFER_GENERAL, store) {
-      store.buffer_to_store = buffer;
-      store.address = v3dv_cl_address(image->mem->bo, layer_offset);
-      store.clear_buffer_being_stored = clear;
-
-      store.output_image_format = iview->format->rt_type;
-      store.r_b_swap = iview->swap_rb;
-      store.memory_format = slice->tiling;
-
-      if (slice->tiling == VC5_TILING_UIF_NO_XOR ||
-          slice->tiling == VC5_TILING_UIF_XOR) {
-         store.height_in_ub_or_stride =
-            slice->padded_height_of_output_image_in_uif_blocks;
-      } else if (slice->tiling == VC5_TILING_RASTER) {
-         store.height_in_ub_or_stride = slice->stride;
-      }
+   if (att_count == 0)
+      return;
 
-      if (image->samples > VK_SAMPLE_COUNT_1_BIT)
-         store.decimate_mode = V3D_DECIMATE_MODE_ALL_SAMPLES;
-      else
-         store.decimate_mode = V3D_DECIMATE_MODE_SAMPLE_0;
+   if (!cmd_buffer->state.tile_aligned_render_area) {
+      perf_debug("Render area doesn't match render pass granularity, falling "
+                 "back to vkCmdClearAttachments for "
+                 "VK_ATTACHMENT_LOAD_OP_CLEAR.\n");
+   } else if (subpass->do_depth_clear_with_draw ||
+              subpass->do_stencil_clear_with_draw) {
+      perf_debug("Subpass clears DEPTH but loads STENCIL (or viceversa), "
+                 "falling back to vkCmdClearAttachments for "
+                 "VK_ATTACHMENT_LOAD_OP_CLEAR.\n");
    }
+
+   /* From the Vulkan 1.0 spec:
+    *
+    *    "VK_ATTACHMENT_LOAD_OP_CLEAR specifies that the contents within the
+    *     render area will be cleared to a uniform value, which is specified
+    *     when a render pass instance is begun."
+    *
+    * So the clear is only constrained by the render area and not by pipeline
+    * state such as scissor or viewport, these are the semantics of
+    * vkCmdClearAttachments as well.
+    */
+   VkCommandBuffer _cmd_buffer = v3dv_cmd_buffer_to_handle(cmd_buffer);
+   VkClearRect rect = {
+      .rect = state->render_area,
+      .baseArrayLayer = 0,
+      .layerCount = 1,
+   };
+   v3dv_CmdClearAttachments(_cmd_buffer, att_count, atts, 1, &rect);
 }
 
-static void
-cmd_buffer_render_pass_emit_stores(struct v3dv_cmd_buffer *cmd_buffer,
-                                   struct v3dv_cl *cl,
-                                   uint32_t layer)
+static struct v3dv_job *
+cmd_buffer_subpass_create_job(struct v3dv_cmd_buffer *cmd_buffer,
+                              uint32_t subpass_idx,
+                              enum v3dv_job_type type)
 {
+   assert(type == V3DV_JOB_TYPE_GPU_CL ||
+          type == V3DV_JOB_TYPE_GPU_CL_SECONDARY);
+
    struct v3dv_cmd_buffer_state *state = &cmd_buffer->state;
-   const struct v3dv_subpass *subpass =
-      &state->pass->subpasses[state->subpass_idx];
+   assert(subpass_idx < state->pass->subpass_count);
+
+   /* Starting a new job can trigger a finish of the current one, so don't
+    * change the command buffer state for the new job until we are done creating
+    * the new job.
+    */
+   struct v3dv_job *job =
+      v3dv_cmd_buffer_start_job(cmd_buffer, subpass_idx, type);
+   if (!job)
+      return NULL;
+
+   state->subpass_idx = subpass_idx;
 
-   bool has_stores = false;
-   bool use_global_clear = false;
+   /* If we are starting a new job we need to setup binning. We only do this
+    * for V3DV_JOB_TYPE_GPU_CL jobs because V3DV_JOB_TYPE_GPU_CL_SECONDARY
+    * jobs are not submitted to the GPU directly, and are instead meant to be
+    * branched to from other V3DV_JOB_TYPE_GPU_CL jobs.
+    */
+   if (type == V3DV_JOB_TYPE_GPU_CL &&
+       job->first_subpass == state->subpass_idx) {
+      const struct v3dv_subpass *subpass =
+         &state->pass->subpasses[state->subpass_idx];
 
-   /* FIXME: separate stencil */
-   uint32_t ds_attachment_idx = subpass->ds_attachment.attachment;
-   if (ds_attachment_idx != VK_ATTACHMENT_UNUSED) {
-      const struct v3dv_render_pass_attachment *ds_attachment =
-         &state->pass->attachments[ds_attachment_idx];
+      const struct v3dv_framebuffer *framebuffer = state->framebuffer;
 
-      assert(state->job->first_subpass >= ds_attachment->first_subpass);
-      assert(state->subpass_idx >= ds_attachment->first_subpass);
-      assert(state->subpass_idx <= ds_attachment->last_subpass);
+      uint8_t internal_bpp;
+      bool msaa;
+      v3dv_X(job->device, framebuffer_compute_internal_bpp_msaa)
+         (framebuffer, subpass, &internal_bpp, &msaa);
 
-      /* From the Vulkan spec, VkImageSubresourceRange:
-       *
-       *   "When an image view of a depth/stencil image is used as a
-       *   depth/stencil framebuffer attachment, the aspectMask is ignored
-       *   and both depth and stencil image subresources are used."
-       *
-       * So we ignore the aspects from the subresource range of the image
-       * view for the depth/stencil attachment, but we still need to restrict
-       * the to aspects compatible with the render pass and the image.
-       */
-      const VkImageAspectFlags aspects =
-         vk_format_aspects(ds_attachment->desc.format);
-
-      /* Only clear once on the first subpass that uses the attachment */
-      bool needs_depth_clear =
-         (aspects & VK_IMAGE_ASPECT_DEPTH_BIT) &&
-         state->tile_aligned_render_area &&
-         state->job->first_subpass == ds_attachment->first_subpass &&
-         ds_attachment->desc.loadOp == VK_ATTACHMENT_LOAD_OP_CLEAR &&
-         !state->job->is_subpass_continue;
-
-      bool needs_stencil_clear =
-         (aspects & VK_IMAGE_ASPECT_STENCIL_BIT) &&
-         state->tile_aligned_render_area &&
-         state->job->first_subpass == ds_attachment->first_subpass &&
-         ds_attachment->desc.stencilLoadOp == VK_ATTACHMENT_LOAD_OP_CLEAR &&
-         !state->job->is_subpass_continue;
-
-      /* Skip the last store if it is not required */
-      bool needs_depth_store =
-         (aspects & VK_IMAGE_ASPECT_DEPTH_BIT) &&
-         (state->subpass_idx < ds_attachment->last_subpass ||
-          ds_attachment->desc.storeOp == VK_ATTACHMENT_STORE_OP_STORE ||
-          !state->job->is_subpass_finish);
-
-      bool needs_stencil_store =
-         (aspects & VK_IMAGE_ASPECT_STENCIL_BIT) &&
-         (state->subpass_idx < ds_attachment->last_subpass ||
-          ds_attachment->desc.stencilStoreOp == VK_ATTACHMENT_STORE_OP_STORE ||
-          !state->job->is_subpass_finish);
-
-      /* GFXH-1461/GFXH-1689: The per-buffer store command's clear
-       * buffer bit is broken for depth/stencil.  In addition, the
-       * clear packet's Z/S bit is broken, but the RTs bit ends up
-       * clearing Z/S.
+      /* From the Vulkan spec:
        *
-       * So if we have to emit a clear of depth or stencil we don't use
-       * per-buffer clears, not even for color, since we will have to emit
-       * a clear command for all tile buffers (including color) to handle
-       * the depth/stencil clears.
+       *    "If the render pass uses multiview, then layers must be one and
+       *     each attachment requires a number of layers that is greater than
+       *     the maximum bit index set in the view mask in the subpasses in
+       *     which it is used."
        *
-       * Note that this bug is not reproduced in the simulator, where
-       * using the clear buffer bit in depth/stencil stores seems to work
-       * correctly.
+       * So when multiview is enabled, we take the number of layers from the
+       * last bit set in the view mask.
        */
-      use_global_clear = needs_depth_clear || needs_stencil_clear;
-      if (needs_depth_store || needs_stencil_store) {
-         const uint32_t zs_buffer =
-            v3dv_zs_buffer(needs_depth_store, needs_stencil_store);
-         cmd_buffer_render_pass_emit_store(cmd_buffer, cl,
-                                           ds_attachment_idx, layer,
-                                           zs_buffer, false);
-         has_stores = true;
+      uint32_t layers = framebuffer->layers;
+      if (subpass->view_mask != 0) {
+         assert(framebuffer->layers == 1);
+         layers = util_last_bit(subpass->view_mask);
       }
-   }
-
-   for (uint32_t i = 0; i < subpass->color_count; i++) {
-      uint32_t attachment_idx = subpass->color_attachments[i].attachment;
-
-      if (attachment_idx == VK_ATTACHMENT_UNUSED)
-         continue;
 
-      const struct v3dv_render_pass_attachment *attachment =
-         &state->pass->attachments[attachment_idx];
-
-      assert(state->job->first_subpass >= attachment->first_subpass);
-      assert(state->subpass_idx >= attachment->first_subpass);
-      assert(state->subpass_idx <= attachment->last_subpass);
-
-      /* Only clear once on the first subpass that uses the attachment */
-      bool needs_clear =
-         state->tile_aligned_render_area &&
-         state->job->first_subpass == attachment->first_subpass &&
-         attachment->desc.loadOp == VK_ATTACHMENT_LOAD_OP_CLEAR &&
-         !state->job->is_subpass_continue;
-
-      /* Skip the last store if it is not required  */
-      bool needs_store =
-         state->subpass_idx < attachment->last_subpass ||
-         attachment->desc.storeOp == VK_ATTACHMENT_STORE_OP_STORE ||
-         !state->job->is_subpass_finish;
-
-      if (needs_store) {
-         cmd_buffer_render_pass_emit_store(cmd_buffer, cl,
-                                           attachment_idx, layer,
-                                           RENDER_TARGET_0 + i,
-                                           needs_clear && !use_global_clear);
-         has_stores = true;
-      } else if (needs_clear) {
-         use_global_clear = true;
-      }
-   }
-
-   /* We always need to emit at least one dummy store */
-   if (!has_stores) {
-      cl_emit(cl, STORE_TILE_BUFFER_GENERAL, store) {
-         store.buffer_to_store = NONE;
-      }
-   }
-
-   /* If we have any depth/stencil clears we can't use the per-buffer clear
-    * bit and instead we have to emit a single clear of all tile buffers.
-    */
-   if (use_global_clear) {
-      cl_emit(cl, CLEAR_TILE_BUFFERS, clear) {
-         clear.clear_z_stencil_buffer = true;
-         clear.clear_all_render_targets = true;
-      }
-   }
-}
-
-static void
-cmd_buffer_render_pass_emit_per_tile_rcl(struct v3dv_cmd_buffer *cmd_buffer,
-                                         uint32_t layer)
-{
-   struct v3dv_job *job = cmd_buffer->state.job;
-   assert(job);
-
-   /* Emit the generic list in our indirect state -- the rcl will just
-    * have pointers into it.
-    */
-   struct v3dv_cl *cl = &job->indirect;
-   v3dv_cl_ensure_space(cl, 200, 1);
-   v3dv_return_if_oom(cmd_buffer, NULL);
-
-   struct v3dv_cl_reloc tile_list_start = v3dv_cl_get_address(cl);
-
-   cl_emit(cl, TILE_COORDINATES_IMPLICIT, coords);
-
-   cmd_buffer_render_pass_emit_loads(cmd_buffer, cl, layer);
-
-   /* The binner starts out writing tiles assuming that the initial mode
-    * is triangles, so make sure that's the case.
-    */
-   cl_emit(cl, PRIM_LIST_FORMAT, fmt) {
-      fmt.primitive_type = LIST_TRIANGLES;
-   }
-
-   cl_emit(cl, BRANCH_TO_IMPLICIT_TILE_LIST, branch);
-
-   cmd_buffer_render_pass_emit_stores(cmd_buffer, cl, layer);
-
-   cl_emit(cl, END_OF_TILE_MARKER, end);
-
-   cl_emit(cl, RETURN_FROM_SUB_LIST, ret);
-
-   cl_emit(&job->rcl, START_ADDRESS_OF_GENERIC_TILE_LIST, branch) {
-      branch.start = tile_list_start;
-      branch.end = v3dv_cl_get_address(cl);
-   }
-}
-
-static void
-cmd_buffer_emit_render_pass_layer_rcl(struct v3dv_cmd_buffer *cmd_buffer,
-                                      uint32_t layer)
-{
-   const struct v3dv_cmd_buffer_state *state = &cmd_buffer->state;
-
-   struct v3dv_job *job = cmd_buffer->state.job;
-   struct v3dv_cl *rcl = &job->rcl;
-
-   /* If doing multicore binning, we would need to initialize each
-    * core's tile list here.
-    */
-   const struct v3dv_frame_tiling *tiling = &job->frame_tiling;
-   const uint32_t tile_alloc_offset =
-      64 * layer * tiling->draw_tiles_x * tiling->draw_tiles_y;
-   cl_emit(rcl, MULTICORE_RENDERING_TILE_LIST_SET_BASE, list) {
-      list.address = v3dv_cl_address(job->tile_alloc, tile_alloc_offset);
-   }
-
-   cl_emit(rcl, MULTICORE_RENDERING_SUPERTILE_CFG, config) {
-      config.number_of_bin_tile_lists = 1;
-      config.total_frame_width_in_tiles = tiling->draw_tiles_x;
-      config.total_frame_height_in_tiles = tiling->draw_tiles_y;
-
-      config.supertile_width_in_tiles = tiling->supertile_width;
-      config.supertile_height_in_tiles = tiling->supertile_height;
-
-      config.total_frame_width_in_supertiles =
-         tiling->frame_width_in_supertiles;
-      config.total_frame_height_in_supertiles =
-         tiling->frame_height_in_supertiles;
-   }
-
-   /* Start by clearing the tile buffer. */
-   cl_emit(rcl, TILE_COORDINATES, coords) {
-      coords.tile_column_number = 0;
-      coords.tile_row_number = 0;
-   }
-
-   /* Emit an initial clear of the tile buffers. This is necessary
-    * for any buffers that should be cleared (since clearing
-    * normally happens at the *end* of the generic tile list), but
-    * it's also nice to clear everything so the first tile doesn't
-    * inherit any contents from some previous frame.
-    *
-    * Also, implement the GFXH-1742 workaround. There's a race in
-    * the HW between the RCL updating the TLB's internal type/size
-    * and the spawning of the QPU instances using the TLB's current
-    * internal type/size. To make sure the QPUs get the right
-    * state, we need 1 dummy store in between internal type/size
-    * changes on V3D 3.x, and 2 dummy stores on 4.x.
-    */
-   for (int i = 0; i < 2; i++) {
-      if (i > 0)
-         cl_emit(rcl, TILE_COORDINATES, coords);
-      cl_emit(rcl, END_OF_LOADS, end);
-      cl_emit(rcl, STORE_TILE_BUFFER_GENERAL, store) {
-         store.buffer_to_store = NONE;
-      }
-      if (i == 0 && cmd_buffer->state.tile_aligned_render_area) {
-         cl_emit(rcl, CLEAR_TILE_BUFFERS, clear) {
-            clear.clear_z_stencil_buffer = true;
-            clear.clear_all_render_targets = true;
-         }
-      }
-      cl_emit(rcl, END_OF_TILE_MARKER, end);
-   }
-
-   cl_emit(rcl, FLUSH_VCD_CACHE, flush);
-
-   cmd_buffer_render_pass_emit_per_tile_rcl(cmd_buffer, layer);
-
-   uint32_t supertile_w_in_pixels =
-      tiling->tile_width * tiling->supertile_width;
-   uint32_t supertile_h_in_pixels =
-      tiling->tile_height * tiling->supertile_height;
-   const uint32_t min_x_supertile =
-      state->render_area.offset.x / supertile_w_in_pixels;
-   const uint32_t min_y_supertile =
-      state->render_area.offset.y / supertile_h_in_pixels;
-
-   uint32_t max_render_x = state->render_area.offset.x;
-   if (state->render_area.extent.width > 0)
-      max_render_x += state->render_area.extent.width - 1;
-   uint32_t max_render_y = state->render_area.offset.y;
-   if (state->render_area.extent.height > 0)
-      max_render_y += state->render_area.extent.height - 1;
-   const uint32_t max_x_supertile = max_render_x / supertile_w_in_pixels;
-   const uint32_t max_y_supertile = max_render_y / supertile_h_in_pixels;
-
-   for (int y = min_y_supertile; y <= max_y_supertile; y++) {
-      for (int x = min_x_supertile; x <= max_x_supertile; x++) {
-         cl_emit(rcl, SUPERTILE_COORDINATES, coords) {
-            coords.column_number_in_supertiles = x;
-            coords.row_number_in_supertiles = y;
-         }
-      }
-   }
-}
-
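A minimal standalone sketch of the supertile-range arithmetic used by the loop above: the render area is turned into an inclusive range of supertile coordinates by dividing its pixel bounds by the supertile size in pixels. The tile and supertile dimensions and the render area below are made-up values for illustration, not taken from the driver.

   /* Standalone sketch (not driver code). */
   #include <stdint.h>
   #include <stdio.h>

   int main(void)
   {
      const uint32_t tile_w = 64, tile_h = 64;          /* assumed tile size in pixels */
      const uint32_t supertile_w = 2, supertile_h = 2;  /* assumed supertile size in tiles */

      /* Hypothetical render area: offset (100, 30), extent 600x400 */
      const uint32_t off_x = 100, off_y = 30;
      const uint32_t ext_w = 600, ext_h = 400;

      const uint32_t st_w_px = tile_w * supertile_w;    /* 128 */
      const uint32_t st_h_px = tile_h * supertile_h;    /* 128 */

      const uint32_t min_x = off_x / st_w_px;               /* 100 / 128 = 0 */
      const uint32_t min_y = off_y / st_h_px;               /*  30 / 128 = 0 */
      const uint32_t max_x = (off_x + ext_w - 1) / st_w_px; /* 699 / 128 = 5 */
      const uint32_t max_y = (off_y + ext_h - 1) / st_h_px; /* 429 / 128 = 3 */

      printf("supertiles x: [%u, %u], y: [%u, %u]\n",
             (unsigned)min_x, (unsigned)max_x, (unsigned)min_y, (unsigned)max_y);
      return 0;
   }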
-static void
-set_rcl_early_z_config(struct v3dv_job *job,
-                       bool *early_z_disable,
-                       uint32_t *early_z_test_and_update_direction)
-{
-   switch (job->first_ez_state) {
-   case VC5_EZ_UNDECIDED:
-   case VC5_EZ_LT_LE:
-      *early_z_disable = false;
-      *early_z_test_and_update_direction = EARLY_Z_DIRECTION_LT_LE;
-      break;
-   case VC5_EZ_GT_GE:
-      *early_z_disable = false;
-      *early_z_test_and_update_direction = EARLY_Z_DIRECTION_GT_GE;
-      break;
-   case VC5_EZ_DISABLED:
-      *early_z_disable = true;
-      break;
-   }
-}
-
-static void
-cmd_buffer_emit_render_pass_rcl(struct v3dv_cmd_buffer *cmd_buffer)
-{
-   struct v3dv_job *job = cmd_buffer->state.job;
-   assert(job);
-
-   const struct v3dv_cmd_buffer_state *state = &cmd_buffer->state;
-   const struct v3dv_framebuffer *framebuffer = state->framebuffer;
-
-   /* We can't emit the RCL until we have a framebuffer, which we may not have
-    * if we are recording a secondary command buffer. In that case, we will
-    * have to wait until vkCmdExecuteCommands is called from a primary command
-    * buffer.
-    */
-   if (!framebuffer) {
-      assert(cmd_buffer->level == VK_COMMAND_BUFFER_LEVEL_SECONDARY);
-      return;
-   }
-
-   const struct v3dv_frame_tiling *tiling = &job->frame_tiling;
-
-   const uint32_t fb_layers = framebuffer->layers;
-   v3dv_cl_ensure_space_with_branch(&job->rcl, 200 +
-                                    MAX2(fb_layers, 1) * 256 *
-                                    cl_packet_length(SUPERTILE_COORDINATES));
-   v3dv_return_if_oom(cmd_buffer, NULL);
-
-   assert(state->subpass_idx < state->pass->subpass_count);
-   const struct v3dv_subpass *subpass =
-      &state->pass->subpasses[state->subpass_idx];
-
-   struct v3dv_cl *rcl = &job->rcl;
-
-   /* Common config must be the first TILE_RENDERING_MODE_CFG and
-    * Z_STENCIL_CLEAR_VALUES must be last. The ones in between are optional
-    * updates to the previous HW state.
-    */
-   const uint32_t ds_attachment_idx = subpass->ds_attachment.attachment;
-
-   cl_emit(rcl, TILE_RENDERING_MODE_CFG_COMMON, config) {
-      config.image_width_pixels = framebuffer->width;
-      config.image_height_pixels = framebuffer->height;
-      config.number_of_render_targets = MAX2(subpass->color_count, 1);
-      config.multisample_mode_4x = false; /* FIXME */
-      config.maximum_bpp_of_all_render_targets = tiling->internal_bpp;
-
-      if (ds_attachment_idx != VK_ATTACHMENT_UNUSED) {
-         const struct v3dv_image_view *iview =
-            framebuffer->attachments[ds_attachment_idx];
-         config.internal_depth_type = iview->internal_type;
-      }
-
-      set_rcl_early_z_config(job,
-                             &config.early_z_disable,
-                             &config.early_z_test_and_update_direction);
-   }
-
-   for (uint32_t i = 0; i < subpass->color_count; i++) {
-      uint32_t attachment_idx = subpass->color_attachments[i].attachment;
-      if (attachment_idx == VK_ATTACHMENT_UNUSED)
-         continue;
-
-      struct v3dv_image_view *iview =
-         state->framebuffer->attachments[attachment_idx];
-
-      const struct v3dv_image *image = iview->image;
-      const struct v3d_resource_slice *slice = &image->slices[iview->base_level];
-
-      /* FIXME */
-      assert(image->samples == VK_SAMPLE_COUNT_1_BIT);
-
-      const uint32_t *clear_color =
-         &state->attachments[attachment_idx].clear_value.color[0];
-
-      uint32_t clear_pad = 0;
-      if (slice->tiling == VC5_TILING_UIF_NO_XOR ||
-          slice->tiling == VC5_TILING_UIF_XOR) {
-         int uif_block_height = v3d_utile_height(image->cpp) * 2;
-
-         uint32_t implicit_padded_height =
-            align(framebuffer->height, uif_block_height) / uif_block_height;
-
-         if (slice->padded_height_of_output_image_in_uif_blocks -
-             implicit_padded_height >= 15) {
-            clear_pad = slice->padded_height_of_output_image_in_uif_blocks;
-         }
-      }
-
-      cl_emit(rcl, TILE_RENDERING_MODE_CFG_CLEAR_COLORS_PART1, clear) {
-         clear.clear_color_low_32_bits = clear_color[0];
-         clear.clear_color_next_24_bits = clear_color[1] & 0xffffff;
-         clear.render_target_number = i;
-      };
-
-      if (iview->internal_bpp >= V3D_INTERNAL_BPP_64) {
-         cl_emit(rcl, TILE_RENDERING_MODE_CFG_CLEAR_COLORS_PART2, clear) {
-            clear.clear_color_mid_low_32_bits =
-              ((clear_color[1] >> 24) | (clear_color[2] << 8));
-            clear.clear_color_mid_high_24_bits =
-              ((clear_color[2] >> 24) | ((clear_color[3] & 0xffff) << 8));
-            clear.render_target_number = i;
-         };
-      }
-
-      if (iview->internal_bpp >= V3D_INTERNAL_BPP_128 || clear_pad) {
-         cl_emit(rcl, TILE_RENDERING_MODE_CFG_CLEAR_COLORS_PART3, clear) {
-            clear.uif_padded_height_in_uif_blocks = clear_pad;
-            clear.clear_color_high_16_bits = clear_color[3] >> 16;
-            clear.render_target_number = i;
-         };
-      }
-   }
-
-   cl_emit(rcl, TILE_RENDERING_MODE_CFG_COLOR, rt) {
-      v3dv_render_pass_setup_render_target(cmd_buffer, 0,
-                                           &rt.render_target_0_internal_bpp,
-                                           &rt.render_target_0_internal_type,
-                                           &rt.render_target_0_clamp);
-      v3dv_render_pass_setup_render_target(cmd_buffer, 1,
-                                           &rt.render_target_1_internal_bpp,
-                                           &rt.render_target_1_internal_type,
-                                           &rt.render_target_1_clamp);
-      v3dv_render_pass_setup_render_target(cmd_buffer, 2,
-                                           &rt.render_target_2_internal_bpp,
-                                           &rt.render_target_2_internal_type,
-                                           &rt.render_target_2_clamp);
-      v3dv_render_pass_setup_render_target(cmd_buffer, 3,
-                                           &rt.render_target_3_internal_bpp,
-                                           &rt.render_target_3_internal_type,
-                                           &rt.render_target_3_clamp);
-   }
-
-   /* Ends rendering mode config. */
-   if (ds_attachment_idx != VK_ATTACHMENT_UNUSED) {
-      cl_emit(rcl, TILE_RENDERING_MODE_CFG_ZS_CLEAR_VALUES, clear) {
-         clear.z_clear_value =
-            state->attachments[ds_attachment_idx].clear_value.z;
-         clear.stencil_clear_value =
-            state->attachments[ds_attachment_idx].clear_value.s;
-      };
-   } else {
-      cl_emit(rcl, TILE_RENDERING_MODE_CFG_ZS_CLEAR_VALUES, clear) {
-         clear.z_clear_value = 1.0f;
-         clear.stencil_clear_value = 0;
-      };
-   }
-
-   /* Always set initial block size before the first branch, which needs
-    * to match the value from binning mode config.
-    */
-   cl_emit(rcl, TILE_LIST_INITIAL_BLOCK_SIZE, init) {
-      init.use_auto_chained_tile_lists = true;
-      init.size_of_first_block_in_chained_tile_lists =
-         TILE_ALLOCATION_BLOCK_SIZE_64B;
-   }
-
-   for (int layer = 0; layer < MAX2(1, fb_layers); layer++)
-      cmd_buffer_emit_render_pass_layer_rcl(cmd_buffer, layer);
-
-   cl_emit(rcl, END_OF_RENDERING, end);
-}
-
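The CLEAR_COLORS_PART1/2/3 packets above split a 128-bit clear color across 32+24, 32+24 and 16-bit fields. Below is a small self-contained check, with made-up color words, showing that the shifts used above cover all 128 bits exactly once and can be reassembled losslessly.

   /* Standalone sketch (not driver code); field names mirror the packet fields. */
   #include <assert.h>
   #include <stdint.h>

   int main(void)
   {
      const uint32_t c[4] = { 0x11223344, 0x55667788, 0x99aabbcc, 0xddeeff00 };

      /* PART1: bits [0, 55] */
      uint32_t low_32  = c[0];
      uint32_t next_24 = c[1] & 0xffffff;

      /* PART2: bits [56, 111] */
      uint32_t mid_low_32  = (c[1] >> 24) | (c[2] << 8);
      uint32_t mid_high_24 = (c[2] >> 24) | ((c[3] & 0xffff) << 8);

      /* PART3: bits [112, 127] */
      uint32_t high_16 = c[3] >> 16;

      /* Reassemble and check that no bits were lost. */
      uint32_t r[4];
      r[0] = low_32;
      r[1] = next_24 | (mid_low_32 << 24);
      r[2] = (mid_low_32 >> 8) | (mid_high_24 << 24);
      r[3] = (mid_high_24 >> 8) | (high_16 << 16);

      for (int i = 0; i < 4; i++)
         assert(r[i] == c[i]);
      return 0;
   }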
-static void
-cmd_buffer_emit_subpass_clears(struct v3dv_cmd_buffer *cmd_buffer)
-{
-   assert(cmd_buffer->level == VK_COMMAND_BUFFER_LEVEL_PRIMARY);
-
-   assert(cmd_buffer->state.pass);
-   assert(cmd_buffer->state.subpass_idx < cmd_buffer->state.pass->subpass_count);
-   const struct v3dv_cmd_buffer_state *state = &cmd_buffer->state;
-   const struct v3dv_render_pass *pass = state->pass;
-   const struct v3dv_subpass *subpass = &pass->subpasses[state->subpass_idx];
-
-   uint32_t att_count = 0;
-   VkClearAttachment atts[V3D_MAX_DRAW_BUFFERS + 1]; /* 4 color + D/S */
-   for (uint32_t i = 0; i < subpass->color_count; i++) {
-      const uint32_t att_idx = subpass->color_attachments[i].attachment;
-      if (att_idx == VK_ATTACHMENT_UNUSED)
-         continue;
-
-      struct v3dv_render_pass_attachment *att = &pass->attachments[att_idx];
-      if (att->desc.loadOp != VK_ATTACHMENT_LOAD_OP_CLEAR)
-         continue;
-
-      if (state->subpass_idx != att->first_subpass)
-         continue;
-
-      atts[att_count].aspectMask = VK_IMAGE_ASPECT_COLOR_BIT;
-      atts[att_count].colorAttachment = i;
-      atts[att_count].clearValue = state->attachments[att_idx].vk_clear_value;
-      att_count++;
-   }
-
-   const uint32_t ds_att_idx = subpass->ds_attachment.attachment;
-   if (ds_att_idx != VK_ATTACHMENT_UNUSED) {
-      struct v3dv_render_pass_attachment *att = &pass->attachments[ds_att_idx];
-      if (state->subpass_idx == att->first_subpass) {
-         VkImageAspectFlags aspects = vk_format_aspects(att->desc.format);
-         if (att->desc.loadOp != VK_ATTACHMENT_LOAD_OP_CLEAR)
-            aspects &= ~VK_IMAGE_ASPECT_DEPTH_BIT;
-         if (att->desc.stencilLoadOp != VK_ATTACHMENT_LOAD_OP_CLEAR)
-            aspects &= ~VK_IMAGE_ASPECT_STENCIL_BIT;
-
-         if (aspects) {
-            atts[att_count].aspectMask = aspects;
-            atts[att_count].colorAttachment = 0; /* Ignored */
-            atts[att_count].clearValue =
-               state->attachments[ds_att_idx].vk_clear_value;
-            att_count++;
-         }
-      }
-   }
-
-   if (att_count == 0)
-      return;
-
-   perf_debug("Render area doesn't match render pass granularity, falling back "
-              "to vkCmdClearAttachments for VK_ATTACHMENT_LOAD_OP_CLEAR.\n");
-
-   /* From the Vulkan 1.0 spec:
-    *
-    *    "VK_ATTACHMENT_LOAD_OP_CLEAR specifies that the contents within the
-    *     render area will be cleared to a uniform value, which is specified
-    *     when a render pass instance is begun."
-    *
-    * So the clear is only constrained by the render area and not by pipeline
-    * state such as scissor or viewport, these are the semantics of
-    * vkCmdClearAttachments as well.
-    */
-   VkCommandBuffer _cmd_buffer = v3dv_cmd_buffer_to_handle(cmd_buffer);
-   VkClearRect rect = {
-      .rect = state->render_area,
-      .baseArrayLayer = 0,
-      .layerCount = 1,
-   };
-   v3dv_CmdClearAttachments(_cmd_buffer, att_count, atts, 1, &rect);
-}
-
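For reference, a hedged sketch of what the fallback above amounts to at the Vulkan API level: one clear rect covering the render area, with one VkClearAttachment entry per attachment that uses VK_ATTACHMENT_LOAD_OP_CLEAR. The attachment index, clear values and helper name below are hypothetical, and the call must be recorded inside a render pass instance.

   #include <vulkan/vulkan.h>

   /* Hypothetical helper: clear color attachment 0 and depth/stencil over the
    * render area, which is the effect the driver fallback above produces.
    */
   static void
   clear_render_area(VkCommandBuffer cmd, VkRect2D render_area)
   {
      const VkClearAttachment atts[2] = {
         {
            .aspectMask = VK_IMAGE_ASPECT_COLOR_BIT,
            .colorAttachment = 0,
            .clearValue = { .color = { .float32 = { 0.0f, 0.0f, 0.0f, 1.0f } } },
         },
         {
            .aspectMask = VK_IMAGE_ASPECT_DEPTH_BIT | VK_IMAGE_ASPECT_STENCIL_BIT,
            .colorAttachment = 0, /* ignored for depth/stencil */
            .clearValue = { .depthStencil = { 1.0f, 0 } },
         },
      };
      const VkClearRect rect = {
         .rect = render_area,    /* the render area given at vkCmdBeginRenderPass */
         .baseArrayLayer = 0,
         .layerCount = 1,
      };
      vkCmdClearAttachments(cmd, 2, atts, 1, &rect);
   }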
-static struct v3dv_job *
-cmd_buffer_subpass_create_job(struct v3dv_cmd_buffer *cmd_buffer,
-                              uint32_t subpass_idx,
-                              enum v3dv_job_type type)
-{
-   assert(type == V3DV_JOB_TYPE_GPU_CL ||
-          type == V3DV_JOB_TYPE_GPU_CL_SECONDARY);
-
-   struct v3dv_cmd_buffer_state *state = &cmd_buffer->state;
-   assert(subpass_idx < state->pass->subpass_count);
-
-   /* Starting a new job can trigger a finish of the current one, so don't
-    * change the command buffer state for the new job until we are done
-    * creating it.
-    */
-   struct v3dv_job *job =
-      v3dv_cmd_buffer_start_job(cmd_buffer, subpass_idx, type);
-   if (!job)
-      return NULL;
-
-   state->subpass_idx = subpass_idx;
-
-   /* If we are starting a new job we need to setup binning. We only do this
-    * for V3DV_JOB_TYPE_GPU_CL jobs because V3DV_JOB_TYPE_GPU_CL_SECONDARY
-    * jobs are not submitted to the GPU directly, and are instead meant to be
-    * branched to from other V3DV_JOB_TYPE_GPU_CL jobs.
-    */
-   if (type == V3DV_JOB_TYPE_GPU_CL &&
-       job->first_subpass == state->subpass_idx) {
-      const struct v3dv_subpass *subpass =
-         &state->pass->subpasses[state->subpass_idx];
-
-      const struct v3dv_framebuffer *framebuffer = state->framebuffer;
-
-      const uint8_t internal_bpp =
-         v3dv_framebuffer_compute_internal_bpp(framebuffer, subpass);
-
-      v3dv_job_start_frame(job,
-                           framebuffer->width,
-                           framebuffer->height,
-                           framebuffer->layers,
-                           subpass->color_count,
-                           internal_bpp);
-
-      /* FIXME: we don't support resolve attachments yet */
-      assert(subpass->resolve_attachments == NULL);
+      v3dv_job_start_frame(job,
+                           framebuffer->width,
+                           framebuffer->height,
+                           layers,
+                           true,
+                           subpass->color_count,
+                           internal_bpp,
+                           msaa);
    }
 
    return job;
@@ -2152,7 +1527,8 @@ v3dv_cmd_buffer_subpass_start(struct v3dv_cmd_buffer *cmd_buffer,
    cmd_buffer_update_tile_alignment(cmd_buffer);
 
    /* If we can't use TLB clears then we need to emit draw clears for any
-    * LOAD_OP_CLEAR attachments in this subpass now.
+    * LOAD_OP_CLEAR attachments in this subpass now. We might also need to emit
+    * Depth/Stencil clears if we hit GFXH-1461.
     *
     * Secondary command buffers don't start subpasses (and may not even have
     * framebuffer state), so we only care about this in primaries. The only
@@ -2161,10 +1537,8 @@ v3dv_cmd_buffer_subpass_start(struct v3dv_cmd_buffer *cmd_buffer,
     * attachment load clears, but we don't have any instances of that right
     * now.
     */
-   if (cmd_buffer->level == VK_COMMAND_BUFFER_LEVEL_PRIMARY &&
-       !cmd_buffer->state.tile_aligned_render_area) {
+   if (cmd_buffer->level == VK_COMMAND_BUFFER_LEVEL_PRIMARY)
       cmd_buffer_emit_subpass_clears(cmd_buffer);
-   }
 
    return job;
 }
@@ -2207,8 +1581,9 @@ v3dv_cmd_buffer_subpass_finish(struct v3dv_cmd_buffer *cmd_buffer)
       job->is_subpass_finish = true;
 }
 
-void
-v3dv_CmdEndRenderPass(VkCommandBuffer commandBuffer)
+VKAPI_ATTR void VKAPI_CALL
+v3dv_CmdEndRenderPass2(VkCommandBuffer commandBuffer,
+                       const VkSubpassEndInfo *pSubpassEndInfo)
 {
    V3DV_FROM_HANDLE(v3dv_cmd_buffer, cmd_buffer, commandBuffer);
 
@@ -2218,13 +1593,15 @@ v3dv_CmdEndRenderPass(VkCommandBuffer commandBuffer)
    v3dv_cmd_buffer_subpass_finish(cmd_buffer);
    v3dv_cmd_buffer_finish_job(cmd_buffer);
 
+   cmd_buffer_subpass_handle_pending_resolves(cmd_buffer);
+
    /* We are no longer inside a render pass */
    state->framebuffer = NULL;
    state->pass = NULL;
    state->subpass_idx = -1;
 }
 
-VkResult
+VKAPI_ATTR VkResult VKAPI_CALL
 v3dv_EndCommandBuffer(VkCommandBuffer commandBuffer)
 {
    V3DV_FROM_HANDLE(v3dv_cmd_buffer, cmd_buffer, commandBuffer);
@@ -2251,44 +1628,6 @@ v3dv_EndCommandBuffer(VkCommandBuffer commandBuffer)
 }
 
 static void
-emit_occlusion_query(struct v3dv_cmd_buffer *cmd_buffer);
-
-static void
-ensure_array_state(struct v3dv_cmd_buffer *cmd_buffer,
-                   uint32_t slot_size,
-                   uint32_t used_count,
-                   uint32_t *alloc_count,
-                   void **ptr);
-
-static void
-cmd_buffer_copy_secondary_end_query_state(struct v3dv_cmd_buffer *primary,
-                                          struct v3dv_cmd_buffer *secondary)
-{
-   struct v3dv_cmd_buffer_state *p_state = &primary->state;
-   struct v3dv_cmd_buffer_state *s_state = &secondary->state;
-
-   const uint32_t total_state_count =
-      p_state->query.end.used_count + s_state->query.end.used_count;
-   ensure_array_state(primary,
-                      sizeof(struct v3dv_end_query_cpu_job_info),
-                      total_state_count,
-                      &p_state->query.end.alloc_count,
-                      (void **) &p_state->query.end.states);
-   v3dv_return_if_oom(primary, NULL);
-
-   for (uint32_t i = 0; i < s_state->query.end.used_count; i++) {
-      const struct v3dv_end_query_cpu_job_info *s_qstate =
-         &secondary->state.query.end.states[i];
-
-      struct v3dv_end_query_cpu_job_info *p_qstate =
-         &p_state->query.end.states[p_state->query.end.used_count++];
-
-      p_qstate->pool = s_qstate->pool;
-      p_qstate->query = s_qstate->query;
-   }
-}
-
-static void
 clone_bo_list(struct v3dv_cmd_buffer *cmd_buffer,
               struct list_head *dst,
               struct list_head *src)
@@ -2298,7 +1637,7 @@ clone_bo_list(struct v3dv_cmd_buffer *cmd_buffer,
    list_inithead(dst);
    list_for_each_entry(struct v3dv_bo, bo, src, list_link) {
       struct v3dv_bo *clone_bo =
-         vk_alloc(&cmd_buffer->device->alloc, sizeof(struct v3dv_bo), 8,
+         vk_alloc(&cmd_buffer->device->vk.alloc, sizeof(struct v3dv_bo), 8,
                   VK_SYSTEM_ALLOCATION_SCOPE_COMMAND);
       if (!clone_bo) {
          v3dv_flag_oom(cmd_buffer, NULL);
@@ -2316,16 +1655,16 @@ clone_bo_list(struct v3dv_cmd_buffer *cmd_buffer,
  * for jobs recorded in secondary command buffers when we want to execute
  * them in primaries.
  */
-static void
-job_clone_in_cmd_buffer(struct v3dv_job *job,
-                        struct v3dv_cmd_buffer *cmd_buffer)
+struct v3dv_job *
+v3dv_job_clone_in_cmd_buffer(struct v3dv_job *job,
+                             struct v3dv_cmd_buffer *cmd_buffer)
 {
-   struct v3dv_job *clone_job = vk_alloc(&job->device->alloc,
+   struct v3dv_job *clone_job = vk_alloc(&job->device->vk.alloc,
                                          sizeof(struct v3dv_job), 8,
                                          VK_SYSTEM_ALLOCATION_SCOPE_COMMAND);
    if (!clone_job) {
       v3dv_flag_oom(cmd_buffer, NULL);
-      return;
+      return NULL;
    }
 
    /* Cloned jobs don't duplicate resources! */
@@ -2343,104 +1682,8 @@ job_clone_in_cmd_buffer(struct v3dv_job *job,
       clone_bo_list(cmd_buffer, &clone_job->indirect.bo_list,
                     &job->indirect.bo_list);
    }
-}
 
-static void
-cmd_buffer_execute_inside_pass(struct v3dv_cmd_buffer *primary,
-                               uint32_t cmd_buffer_count,
-                               const VkCommandBuffer *cmd_buffers)
-{
-   assert(primary->state.job);
-
-   if (primary->state.dirty & V3DV_CMD_DIRTY_OCCLUSION_QUERY)
-      emit_occlusion_query(primary);
-
-   for (uint32_t i = 0; i < cmd_buffer_count; i++) {
-      V3DV_FROM_HANDLE(v3dv_cmd_buffer, secondary, cmd_buffers[i]);
-
-      assert(secondary->usage_flags &
-             VK_COMMAND_BUFFER_USAGE_RENDER_PASS_CONTINUE_BIT);
-
-      list_for_each_entry(struct v3dv_job, secondary_job,
-                          &secondary->jobs, list_link) {
-         if (secondary_job->type == V3DV_JOB_TYPE_GPU_CL_SECONDARY) {
-            /* If the job is a CL, then we branch to it from the primary BCL.
-             * In this case the secondary's BCL is finished with a
-             * RETURN_FROM_SUB_LIST command to return back to the primary BCL
-             * once we are done executing it.
-             */
-            assert(v3dv_cl_offset(&secondary_job->rcl) == 0);
-            assert(secondary_job->bcl.bo);
-
-            /* Sanity check that secondary BCL ends with RETURN_FROM_SUB_LIST */
-            STATIC_ASSERT(cl_packet_length(RETURN_FROM_SUB_LIST) == 1);
-            assert(v3dv_cl_offset(&secondary_job->bcl) >= 1);
-            assert(*(((uint8_t *)secondary_job->bcl.next) - 1) ==
-                   V3D42_RETURN_FROM_SUB_LIST_opcode);
-
-            /* If we had to split the primary job while executing the secondary
-             * we will have to create a new one so we can emit the branch
-             * instruction.
-             *
-             * FIXME: in this case, maybe just copy the secondary BCL without
-             * the RETURN_FROM_SUB_LIST into the primary job to skip the
-             * branch?
-             */
-            struct v3dv_job *primary_job = primary->state.job;
-            if (!primary_job) {
-               primary_job =
-                  v3dv_cmd_buffer_subpass_resume(primary,
-                                                 primary->state.subpass_idx);
-            }
-
-            /* Make sure our primary job has all required BO references */
-            set_foreach(secondary_job->bos, entry) {
-               struct v3dv_bo *bo = (struct v3dv_bo *)entry->key;
-               v3dv_job_add_bo(primary_job, bo);
-            }
-
-            /* Emit the branch instruction */
-            v3dv_cl_ensure_space_with_branch(&primary_job->bcl,
-                                             cl_packet_length(BRANCH_TO_SUB_LIST));
-            v3dv_return_if_oom(primary, NULL);
-
-            cl_emit(&primary_job->bcl, BRANCH_TO_SUB_LIST, branch) {
-               branch.address = v3dv_cl_address(secondary_job->bcl.bo, 0);
-            }
-
-            /* If this secondary has barriers, we need to flag them in the
-             * primary job.
-             *
-             * FIXME: This might be moving the sync point too early though,
-             * maybe we would need to split the primary in this case to ensure
-             * that barriers execute right before the secondary.
-             */
-            primary_job->serialize |= secondary_job->serialize;
-            primary_job->needs_bcl_sync |= secondary_job->needs_bcl_sync;
-         } else if (secondary_job->type == V3DV_JOB_TYPE_CPU_CLEAR_ATTACHMENTS) {
-            const struct v3dv_clear_attachments_cpu_job_info *info =
-               &secondary_job->cpu.clear_attachments;
-            v3dv_CmdClearAttachments(v3dv_cmd_buffer_to_handle(primary),
-                                     info->attachment_count,
-                                     info->attachments,
-                                     info->rect_count,
-                                     info->rects);
-         } else {
-            /* This is a regular job (CPU or GPU), so just finish the current
-             * primary job (if any) and then add the secondary job to the
-             * primary's job list right after it.
-             */
-            v3dv_cmd_buffer_finish_job(primary);
-            job_clone_in_cmd_buffer(secondary_job, primary);
-         }
-      }
-
-      /* If the secondary has recorded any vkCmdEndQuery commands, we need to
-       * copy this state to the primary so it is processed properly when the
-       * current primary job is finished.
-       */
-      cmd_buffer_copy_secondary_end_query_state(primary, secondary);
-   }
+   return clone_job;
 }
 
 static void
@@ -2448,6 +1691,8 @@ cmd_buffer_execute_outside_pass(struct v3dv_cmd_buffer *primary,
                                 uint32_t cmd_buffer_count,
                                 const VkCommandBuffer *cmd_buffers)
 {
+   bool pending_barrier = false;
+   bool pending_bcl_barrier = false;
    for (uint32_t i = 0; i < cmd_buffer_count; i++) {
       V3DV_FROM_HANDLE(v3dv_cmd_buffer, secondary, cmd_buffers[i]);
 
@@ -2470,14 +1715,36 @@ cmd_buffer_execute_outside_pass(struct v3dv_cmd_buffer *primary,
       list_for_each_entry(struct v3dv_job, secondary_job,
                           &secondary->jobs, list_link) {
          /* These can only happen inside a render pass */
-         assert(secondary_job->type != V3DV_JOB_TYPE_CPU_CLEAR_ATTACHMENTS);
          assert(secondary_job->type != V3DV_JOB_TYPE_GPU_CL_SECONDARY);
-         job_clone_in_cmd_buffer(secondary_job, primary);
+         struct v3dv_job *job = v3dv_job_clone_in_cmd_buffer(secondary_job, primary);
+         if (!job)
+            return;
+
+         if (pending_barrier) {
+            job->serialize = true;
+            if (pending_bcl_barrier)
+               job->needs_bcl_sync = true;
+            pending_barrier = false;
+            pending_bcl_barrier = false;
+         }
       }
+
+      /* If this secondary had any pending barrier state we will need that
+       * barrier state to be consumed by whatever comes after it (the first
+       * job in the next secondary, or the primary if this was the last
+       * secondary).
+       */
+      assert(secondary->state.has_barrier || !secondary->state.has_bcl_barrier);
+      pending_barrier = secondary->state.has_barrier;
+      pending_bcl_barrier = secondary->state.has_bcl_barrier;
+   }
+
+   if (pending_barrier) {
+      primary->state.has_barrier = true;
+      primary->state.has_bcl_barrier |= pending_bcl_barrier;
    }
 }
 
-void
+VKAPI_ATTR void VKAPI_CALL
 v3dv_CmdExecuteCommands(VkCommandBuffer commandBuffer,
                         uint32_t commandBufferCount,
                         const VkCommandBuffer *pCommandBuffers)
@@ -2485,8 +1752,8 @@ v3dv_CmdExecuteCommands(VkCommandBuffer commandBuffer,
    V3DV_FROM_HANDLE(v3dv_cmd_buffer, primary, commandBuffer);
 
    if (primary->state.pass != NULL) {
-      cmd_buffer_execute_inside_pass(primary,
-                                     commandBufferCount, pCommandBuffers);
+      v3dv_X(primary->device, cmd_buffer_execute_inside_pass)
+         (primary, commandBufferCount, pCommandBuffers);
    } else {
       cmd_buffer_execute_outside_pass(primary,
                                       commandBufferCount, pCommandBuffers);
@@ -2578,265 +1845,15 @@ cmd_buffer_bind_pipeline_static_state(struct v3dv_cmd_buffer *cmd_buffer,
       }
    }
 
-   cmd_buffer->state.dynamic.mask = dynamic_mask;
-   cmd_buffer->state.dirty |= dirty;
-}
-
-static void
-job_update_ez_state(struct v3dv_job *job,
-                    struct v3dv_pipeline *pipeline,
-                    struct v3dv_cmd_buffer_state *state)
-{
-   /* If we don't have a depth attachment at all, disable */
-   if (!state->pass) {
-      job->ez_state = VC5_EZ_DISABLED;
-      return;
-   }
-
-   assert(state->subpass_idx < state->pass->subpass_count);
-   struct v3dv_subpass *subpass = &state->pass->subpasses[state->subpass_idx];
-   if (subpass->ds_attachment.attachment == VK_ATTACHMENT_UNUSED) {
-      job->ez_state = VC5_EZ_DISABLED;
-      return;
-   }
-
-   /* Otherwise, look at the currently bound pipeline state */
-   switch (pipeline->ez_state) {
-   case VC5_EZ_UNDECIDED:
-      /* If the pipeline didn't pick a direction but didn't disable, then go
-       * along with the current EZ state. This allows EZ optimization for Z
-       * func == EQUAL or NEVER.
-       */
-      break;
-
-   case VC5_EZ_LT_LE:
-   case VC5_EZ_GT_GE:
-      /* If the pipeline picked a direction, then it needs to match the current
-       * direction if we've decided on one.
-       */
-      if (job->ez_state == VC5_EZ_UNDECIDED)
-         job->ez_state = pipeline->ez_state;
-      else if (job->ez_state != pipeline->ez_state)
-         job->ez_state = VC5_EZ_DISABLED;
-      break;
-
-   case VC5_EZ_DISABLED:
-      /* If the pipeline disables EZ because of a bad Z func or stencil
-       * operation, then we can't do any more EZ in this frame.
-       */
-      job->ez_state = VC5_EZ_DISABLED;
-      break;
-   }
-
-   /* If the FS writes Z, then it may update against the chosen EZ direction */
-   if (pipeline->fs->current_variant->prog_data.fs->writes_z)
-      job->ez_state = VC5_EZ_DISABLED;
-
-   if (job->first_ez_state == VC5_EZ_UNDECIDED &&
-       job->ez_state != VC5_EZ_DISABLED) {
-      job->first_ez_state = job->ez_state;
-   }
-}
-
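A standalone sketch of the early-Z merge rules implemented above (ignoring the FS-writes-Z case): per-draw pipeline EZ states fold into a single job-wide state, where the first decided direction wins, conflicting directions disable EZ, and an explicit disable turns EZ off for the rest of the frame. The enum names below are illustrative, not the driver's.

   /* Standalone sketch (not driver code). */
   #include <stdio.h>

   enum ez_state { EZ_UNDECIDED, EZ_LT_LE, EZ_GT_GE, EZ_DISABLED };

   static enum ez_state
   merge_ez(enum ez_state job, enum ez_state pipeline)
   {
      switch (pipeline) {
      case EZ_UNDECIDED:
         return job;                                   /* keep whatever was decided so far */
      case EZ_LT_LE:
      case EZ_GT_GE:
         if (job == EZ_UNDECIDED)
            return pipeline;                           /* first direction wins */
         return job == pipeline ? job : EZ_DISABLED;   /* conflicting directions */
      case EZ_DISABLED:
      default:
         return EZ_DISABLED;                           /* once disabled, stays disabled */
      }
   }

   int main(void)
   {
      enum ez_state job = EZ_UNDECIDED;
      const enum ez_state draws[] = { EZ_UNDECIDED, EZ_LT_LE, EZ_LT_LE, EZ_GT_GE };

      for (unsigned i = 0; i < sizeof(draws) / sizeof(draws[0]); i++) {
         job = merge_ez(job, draws[i]);
         printf("after draw %u: job ez state = %d\n", i, (int)job);
      }
      return 0;
   }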
-/* Note that the following populate methods don't do a detailed fill-up of
- * the v3d_fs_key. Here we just fill up cmd_buffer-specific info. All info
- * coming from the pipeline create info was already filled in when the
- * pipeline was created.
- */
-static void
-cmd_buffer_populate_v3d_key(struct v3d_key *key,
-                            struct v3dv_cmd_buffer *cmd_buffer,
-                            VkPipelineBindPoint pipeline_binding)
-{
-   if (cmd_buffer->state.pipeline->combined_index_map != NULL) {
-      struct v3dv_descriptor_map *texture_map = &cmd_buffer->state.pipeline->texture_map;
-      struct v3dv_descriptor_map *sampler_map = &cmd_buffer->state.pipeline->sampler_map;
-      struct v3dv_descriptor_state *descriptor_state =
-         &cmd_buffer->state.descriptor_state[pipeline_binding];
-
-      hash_table_foreach(cmd_buffer->state.pipeline->combined_index_map, entry) {
-         uint32_t combined_idx = (uint32_t)(uintptr_t) (entry->data);
-         uint32_t combined_idx_key =
-            cmd_buffer->state.pipeline->combined_index_to_key_map[combined_idx];
-         uint32_t texture_idx;
-         uint32_t sampler_idx;
-
-         v3dv_pipeline_combined_index_key_unpack(combined_idx_key,
-                                                 &texture_idx, &sampler_idx);
-
-         struct v3dv_image_view *image_view =
-            v3dv_descriptor_map_get_image_view(descriptor_state,
-                                               texture_map,
-                                               cmd_buffer->state.pipeline->layout,
-                                               texture_idx);
-
-         assert(image_view);
-
-         const struct v3dv_sampler *sampler = NULL;
-         if (sampler_idx != V3DV_NO_SAMPLER_IDX) {
-            sampler =
-               v3dv_descriptor_map_get_sampler(descriptor_state,
-                                               sampler_map,
-                                               cmd_buffer->state.pipeline->layout,
-                                               sampler_idx);
-            assert(sampler);
-         }
-
-         key->tex[combined_idx].return_size =
-            v3dv_get_tex_return_size(image_view->format,
-                                     sampler ? sampler->compare_enable : false);
-
-         if (key->tex[combined_idx].return_size == 16) {
-            key->tex[combined_idx].return_channels = 2;
-         } else {
-            key->tex[combined_idx].return_channels = 4;
-         }
-
-         /* Note: In general, we don't need to do anything for the swizzle, as
-          * that is handled with the swizzle info at the Texture State, and the
-          * default values for key->tex[].swizzle were already filled in at
-          * pipeline creation time.
-          *
-          * The only exception in which we want to apply a texture swizzle
-          * lowering in the shader is to force alpha to 1 when using clamp
-          * to border with transparent black in combination with specific
-          * formats.
-          */
-         if (sampler && sampler->clamp_to_transparent_black_border) {
-            switch (image_view->vk_format) {
-            case VK_FORMAT_E5B9G9R9_UFLOAT_PACK32:
-            case VK_FORMAT_B10G11R11_UFLOAT_PACK32:
-               key->tex[combined_idx].swizzle[3] = PIPE_SWIZZLE_1;
-               break;
-            default:
-               break;
-            }
-         }
+   if (!(dynamic_mask & V3DV_DYNAMIC_COLOR_WRITE_ENABLE)) {
+      if (dest->color_write_enable != src->color_write_enable) {
+         dest->color_write_enable = src->color_write_enable;
+         dirty |= V3DV_CMD_DIRTY_COLOR_WRITE_ENABLE;
       }
    }
-}
-
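The loop above unpacks a texture index and a sampler index from a single 32-bit combined key. The actual bit layout is defined in the pipeline code; the pack/unpack pair below is only a hypothetical 16/16 split to illustrate the idea, including a placeholder "no sampler" value.

   /* Hypothetical sketch; bit layout and NO_SAMPLER_IDX are assumptions. */
   #include <assert.h>
   #include <stdint.h>

   #define NO_SAMPLER_IDX 0xffffu

   static uint32_t
   combined_key_pack(uint32_t texture_idx, uint32_t sampler_idx)
   {
      assert(texture_idx <= 0xffff && sampler_idx <= 0xffff);
      return (sampler_idx << 16) | texture_idx;
   }

   static void
   combined_key_unpack(uint32_t key, uint32_t *texture_idx, uint32_t *sampler_idx)
   {
      *texture_idx = key & 0xffff;
      *sampler_idx = key >> 16;
   }

   int main(void)
   {
      uint32_t tex, samp;
      combined_key_unpack(combined_key_pack(3, NO_SAMPLER_IDX), &tex, &samp);
      assert(tex == 3 && samp == NO_SAMPLER_IDX);
      return 0;
   }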
-static void
-update_fs_variant(struct v3dv_cmd_buffer *cmd_buffer)
-{
-   struct v3dv_shader_variant *variant;
-   struct v3dv_pipeline_stage *p_stage = cmd_buffer->state.pipeline->fs;
-   struct v3d_fs_key local_key;
-
-   /* We start with a copy of the original pipeline key */
-   memcpy(&local_key, &p_stage->key.fs, sizeof(struct v3d_fs_key));
-
-   cmd_buffer_populate_v3d_key(&local_key.base, cmd_buffer,
-                               VK_PIPELINE_BIND_POINT_GRAPHICS);
-
-   VkResult vk_result;
-   variant = v3dv_get_shader_variant(p_stage, &local_key.base,
-                                     sizeof(struct v3d_fs_key),
-                                     &cmd_buffer->device->alloc,
-                                     &vk_result);
-   /* At this point we are not creating a vulkan object to return to the
-    * API user, so we can't really return an OOM error
-    */
-   assert(variant);
-   assert(vk_result == VK_SUCCESS);
-
-   p_stage->current_variant = variant;
-}
-
-static void
-update_vs_variant(struct v3dv_cmd_buffer *cmd_buffer)
-{
-   struct v3dv_shader_variant *variant;
-   struct v3dv_pipeline_stage *p_stage;
-   struct v3d_vs_key local_key;
-   VkResult vk_result;
-
-   /* We start with a copy of the original pipeline key */
-   p_stage = cmd_buffer->state.pipeline->vs;
-   memcpy(&local_key, &p_stage->key.vs, sizeof(struct v3d_vs_key));
-
-   cmd_buffer_populate_v3d_key(&local_key.base, cmd_buffer,
-                               VK_PIPELINE_BIND_POINT_GRAPHICS);
-
-   variant = v3dv_get_shader_variant(p_stage, &local_key.base,
-                                     sizeof(struct v3d_vs_key),
-                                     &cmd_buffer->device->alloc,
-                                     &vk_result);
-   /* At this point we are not creating a vulkan object to return to the
-    * API user, so we can't really return an OOM error
-    */
-   assert(variant);
-   assert(vk_result == VK_SUCCESS);
-
-   p_stage->current_variant = variant;
-
-   /* Now the vs_bin */
-   p_stage = cmd_buffer->state.pipeline->vs_bin;
-   memcpy(&local_key, &p_stage->key.vs, sizeof(struct v3d_vs_key));
-
-   cmd_buffer_populate_v3d_key(&local_key.base, cmd_buffer,
-                               VK_PIPELINE_BIND_POINT_GRAPHICS);
-   variant = v3dv_get_shader_variant(p_stage, &local_key.base,
-                                     sizeof(struct v3d_vs_key),
-                                     &cmd_buffer->device->alloc,
-                                     &vk_result);
-
-   /* At this point we are not creating a vulkan object to return to the
-    * API user, so we can't really return an OOM error
-    */
-   assert(variant);
-   assert(vk_result == VK_SUCCESS);
-
-   p_stage->current_variant = variant;
-}
-
-static void
-update_cs_variant(struct v3dv_cmd_buffer *cmd_buffer)
-{
-   struct v3dv_shader_variant *variant;
-   struct v3dv_pipeline_stage *p_stage = cmd_buffer->state.pipeline->cs;
-   struct v3d_key local_key;
-
-   /* We start with a copy of the original pipeline key */
-   memcpy(&local_key, &p_stage->key.base, sizeof(struct v3d_key));
-
-   cmd_buffer_populate_v3d_key(&local_key, cmd_buffer,
-                               VK_PIPELINE_BIND_POINT_COMPUTE);
-
-   VkResult result;
-   variant = v3dv_get_shader_variant(p_stage, &local_key,
-                                     sizeof(struct v3d_key),
-                                     &cmd_buffer->device->alloc,
-                                     &result);
-   /* At this point we are not creating a vulkan object to return to the
-    * API user, so we can't really return an OOM error
-    */
-   assert(variant);
-   assert(result == VK_SUCCESS);
-
-   p_stage->current_variant = variant;
-}
 
-/*
- * Some command buffer updates also require updates to the shaders compiled
- * for the pipeline. The poster child here is textures, as the compiler needs
- * to do certain things depending on the texture format. So here we re-create
- * the v3d_keys and update the variants. Note that internally the pipeline
- * has a variant cache (hash table) to avoid unneeded compilations.
- */
-static void
-update_pipeline_variants(struct v3dv_cmd_buffer *cmd_buffer)
-{
-   assert(cmd_buffer->state.pipeline);
-
-   if (v3dv_pipeline_get_binding_point(cmd_buffer->state.pipeline) ==
-       VK_PIPELINE_BIND_POINT_GRAPHICS) {
-      update_fs_variant(cmd_buffer);
-      update_vs_variant(cmd_buffer);
-   } else {
-      update_cs_variant(cmd_buffer);
-   }
+   cmd_buffer->state.dynamic.mask = dynamic_mask;
+   cmd_buffer->state.dirty |= dirty;
 }
 
 static void
@@ -2844,30 +1861,10 @@ bind_graphics_pipeline(struct v3dv_cmd_buffer *cmd_buffer,
                        struct v3dv_pipeline *pipeline)
 {
    assert(pipeline && !(pipeline->active_stages & VK_SHADER_STAGE_COMPUTE_BIT));
-   if (cmd_buffer->state.pipeline == pipeline)
+   if (cmd_buffer->state.gfx.pipeline == pipeline)
       return;
 
-
-   /* Enable always flush if we are blending to sRGB render targets. This
-    * fixes test failures in:
-    * dEQP-VK.pipeline.blend.format.r8g8b8a8_srgb.*
-    *
-    * FIXME: not sure why we need this. The tile buffer is always linear, with
-    * conversion from/to sRGB happening on tile load/store operations. This
-    * means that when we enable flushing the only difference is that we convert
-    * to sRGB on the store after each draw call and we convert from sRGB on the
-    * load before each draw call, but the blend happens in linear format in the
-    * tile buffer anyway, which is the same scenario as if we didn't flush.
-    */
-   assert(pipeline->subpass);
-   if (pipeline->subpass->has_srgb_rt && pipeline->blend.enables) {
-      assert(cmd_buffer->state.job);
-      cmd_buffer->state.job->always_flush = true;
-      perf_debug("flushing draw calls for subpass %d because bound pipeline "
-                 "uses sRGB blending\n", cmd_buffer->state.subpass_idx);
-   }
-
-   cmd_buffer->state.pipeline = pipeline;
+   cmd_buffer->state.gfx.pipeline = pipeline;
 
    cmd_buffer_bind_pipeline_static_state(cmd_buffer, &pipeline->dynamic_state);
 
@@ -2880,14 +1877,14 @@ bind_compute_pipeline(struct v3dv_cmd_buffer *cmd_buffer,
 {
    assert(pipeline && pipeline->active_stages == VK_SHADER_STAGE_COMPUTE_BIT);
 
-   if (cmd_buffer->state.pipeline == pipeline)
+   if (cmd_buffer->state.compute.pipeline == pipeline)
       return;
 
-   cmd_buffer->state.pipeline = pipeline;
-   cmd_buffer->state.dirty |= V3DV_CMD_DIRTY_PIPELINE;
+   cmd_buffer->state.compute.pipeline = pipeline;
+   cmd_buffer->state.dirty |= V3DV_CMD_DIRTY_COMPUTE_PIPELINE;
 }
 
-void
+VKAPI_ATTR void VKAPI_CALL
 v3dv_CmdBindPipeline(VkCommandBuffer commandBuffer,
                      VkPipelineBindPoint pipelineBindPoint,
                      VkPipeline _pipeline)
@@ -2943,7 +1940,7 @@ v3dv_viewport_compute_xform(const VkViewport *viewport,
       scale[2] = min_abs_scale * (scale[2] < 0 ? -1.0f : 1.0f);
 }
 
-void
+VKAPI_ATTR void VKAPI_CALL
 v3dv_CmdSetViewport(VkCommandBuffer commandBuffer,
                     uint32_t firstViewport,
                     uint32_t viewportCount,
@@ -2976,7 +1973,7 @@ v3dv_CmdSetViewport(VkCommandBuffer commandBuffer,
    cmd_buffer->state.dirty |= V3DV_CMD_DIRTY_VIEWPORT;
 }
 
-void
+VKAPI_ATTR void VKAPI_CALL
 v3dv_CmdSetScissor(VkCommandBuffer commandBuffer,
                    uint32_t firstScissor,
                    uint32_t scissorCount,
@@ -3080,550 +2077,118 @@ emit_scissor(struct v3dv_cmd_buffer *cmd_buffer)
    cmd_buffer->state.clip_window.extent.width = maxx - minx;
    cmd_buffer->state.clip_window.extent.height = maxy - miny;
 
-   emit_clip_window(cmd_buffer->state.job, &cmd_buffer->state.clip_window);
+   v3dv_X(cmd_buffer->device, job_emit_clip_window)
+      (cmd_buffer->state.job, &cmd_buffer->state.clip_window);
 
    cmd_buffer->state.dirty &= ~V3DV_CMD_DIRTY_SCISSOR;
 }
 
 static void
-emit_viewport(struct v3dv_cmd_buffer *cmd_buffer)
+update_gfx_uniform_state(struct v3dv_cmd_buffer *cmd_buffer,
+                         uint32_t dirty_uniform_state)
 {
-   struct v3dv_dynamic_state *dynamic = &cmd_buffer->state.dynamic;
-   /* FIXME: right now we only support one viewport. viewports[0] would work
-    * for now, but this would need to change if we allow multiple viewports.
+   /* We need to update uniform streams if any piece of state that is passed
+    * to the shader as a uniform may have changed.
+    *
+    * If only descriptor sets are dirty then we can safely ignore updates
+    * for shader stages that don't access descriptors.
     */
-   float *vptranslate = dynamic->viewport.translate[0];
-   float *vpscale = dynamic->viewport.scale[0];
-
-   struct v3dv_job *job = cmd_buffer->state.job;
-   assert(job);
-
-   const uint32_t required_cl_size =
-      cl_packet_length(CLIPPER_XY_SCALING) +
-      cl_packet_length(CLIPPER_Z_SCALE_AND_OFFSET) +
-      cl_packet_length(CLIPPER_Z_MIN_MAX_CLIPPING_PLANES) +
-      cl_packet_length(VIEWPORT_OFFSET);
-   v3dv_cl_ensure_space_with_branch(&job->bcl, required_cl_size);
-   v3dv_return_if_oom(cmd_buffer, NULL);
-
-   cl_emit(&job->bcl, CLIPPER_XY_SCALING, clip) {
-      clip.viewport_half_width_in_1_256th_of_pixel = vpscale[0] * 256.0f;
-      clip.viewport_half_height_in_1_256th_of_pixel = vpscale[1] * 256.0f;
-   }
-
-   cl_emit(&job->bcl, CLIPPER_Z_SCALE_AND_OFFSET, clip) {
-      clip.viewport_z_offset_zc_to_zs = vptranslate[2];
-      clip.viewport_z_scale_zc_to_zs = vpscale[2];
-   }
-   cl_emit(&job->bcl, CLIPPER_Z_MIN_MAX_CLIPPING_PLANES, clip) {
-      /* Vulkan's Z NDC is [0..1], unlike OpenGL which is [-1, 1] */
-      float z1 = vptranslate[2];
-      float z2 = vptranslate[2] + vpscale[2];
-      clip.minimum_zw = MIN2(z1, z2);
-      clip.maximum_zw = MAX2(z1, z2);
-   }
-
-   cl_emit(&job->bcl, VIEWPORT_OFFSET, vp) {
-      vp.viewport_centre_x_coordinate = vptranslate[0];
-      vp.viewport_centre_y_coordinate = vptranslate[1];
-   }
-
-   cmd_buffer->state.dirty &= ~V3DV_CMD_DIRTY_VIEWPORT;
-}
-
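A small standalone example of why the clipper Z min/max above use MIN2/MAX2: assuming the usual Vulkan viewport depth mapping (translate_z = minDepth, scale_z = maxDepth - minDepth), a reversed depth range makes z2 < z1, so the two values have to be sorted before programming the clip planes. The depth ranges below are made up.

   /* Standalone sketch (not driver code). */
   #include <stdio.h>

   #define MIN2(a, b) ((a) < (b) ? (a) : (b))
   #define MAX2(a, b) ((a) > (b) ? (a) : (b))

   static void clip_planes(float min_depth, float max_depth)
   {
      float translate_z = min_depth;             /* assumed viewport transform */
      float scale_z = max_depth - min_depth;

      float z1 = translate_z;                    /* depth at NDC z = 0 */
      float z2 = translate_z + scale_z;          /* depth at NDC z = 1 */

      printf("minDepth=%.1f maxDepth=%.1f -> minimum_zw=%.1f maximum_zw=%.1f\n",
             min_depth, max_depth, MIN2(z1, z2), MAX2(z1, z2));
   }

   int main(void)
   {
      clip_planes(0.0f, 1.0f);   /* normal depth range   */
      clip_planes(1.0f, 0.0f);   /* reversed depth range */
      return 0;
   }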
-static void
-emit_stencil(struct v3dv_cmd_buffer *cmd_buffer)
-{
-   struct v3dv_job *job = cmd_buffer->state.job;
-   assert(job);
-
-   struct v3dv_pipeline *pipeline = cmd_buffer->state.pipeline;
-   struct v3dv_dynamic_state *dynamic_state = &cmd_buffer->state.dynamic;
-
-   const uint32_t dynamic_stencil_states = V3DV_DYNAMIC_STENCIL_COMPARE_MASK |
-                                           V3DV_DYNAMIC_STENCIL_WRITE_MASK |
-                                           V3DV_DYNAMIC_STENCIL_REFERENCE;
-
-   v3dv_cl_ensure_space_with_branch(&job->bcl,
-                                    2 * cl_packet_length(STENCIL_CFG));
-   v3dv_return_if_oom(cmd_buffer, NULL);
-
-   bool emitted_stencil = false;
-   for (uint32_t i = 0; i < 2; i++) {
-      if (pipeline->emit_stencil_cfg[i]) {
-         if (dynamic_state->mask & dynamic_stencil_states) {
-            cl_emit_with_prepacked(&job->bcl, STENCIL_CFG,
-                                   pipeline->stencil_cfg[i], config) {
-               if (dynamic_state->mask & V3DV_DYNAMIC_STENCIL_COMPARE_MASK) {
-                  config.stencil_test_mask =
-                     i == 0 ? dynamic_state->stencil_compare_mask.front :
-                              dynamic_state->stencil_compare_mask.back;
-               }
-               if (dynamic_state->mask & V3DV_DYNAMIC_STENCIL_WRITE_MASK) {
-                  config.stencil_write_mask =
-                     i == 0 ? dynamic_state->stencil_write_mask.front :
-                              dynamic_state->stencil_write_mask.back;
-               }
-               if (dynamic_state->mask & V3DV_DYNAMIC_STENCIL_REFERENCE) {
-                  config.stencil_ref_value =
-                     i == 0 ? dynamic_state->stencil_reference.front :
-                              dynamic_state->stencil_reference.back;
-               }
-            }
-         } else {
-            cl_emit_prepacked(&job->bcl, &pipeline->stencil_cfg[i]);
-         }
-
-         emitted_stencil = true;
-      }
-   }
-
-   if (emitted_stencil) {
-      const uint32_t dynamic_stencil_dirty_flags =
-               V3DV_CMD_DIRTY_STENCIL_COMPARE_MASK |
-               V3DV_CMD_DIRTY_STENCIL_WRITE_MASK |
-               V3DV_CMD_DIRTY_STENCIL_REFERENCE;
-      cmd_buffer->state.dirty &= ~dynamic_stencil_dirty_flags;
-   }
-}
-
-static void
-emit_depth_bias(struct v3dv_cmd_buffer *cmd_buffer)
-{
-   struct v3dv_pipeline *pipeline = cmd_buffer->state.pipeline;
-   assert(pipeline);
-
-   if (!pipeline->depth_bias.enabled)
-      return;
-
-   struct v3dv_job *job = cmd_buffer->state.job;
-   assert(job);
 
-   v3dv_cl_ensure_space_with_branch(&job->bcl, cl_packet_length(DEPTH_OFFSET));
-   v3dv_return_if_oom(cmd_buffer, NULL);
-
-   struct v3dv_dynamic_state *dynamic = &cmd_buffer->state.dynamic;
-   cl_emit(&job->bcl, DEPTH_OFFSET, bias) {
-      bias.depth_offset_factor = dynamic->depth_bias.slope_factor;
-      bias.depth_offset_units = dynamic->depth_bias.constant_factor;
-      if (pipeline->depth_bias.is_z16)
-         bias.depth_offset_units *= 256.0f;
-   }
-
-   cmd_buffer->state.dirty &= ~V3DV_CMD_DIRTY_DEPTH_BIAS;
-}
-
-static void
-emit_line_width(struct v3dv_cmd_buffer *cmd_buffer)
-{
-   struct v3dv_job *job = cmd_buffer->state.job;
-   assert(job);
-
-   v3dv_cl_ensure_space_with_branch(&job->bcl, cl_packet_length(LINE_WIDTH));
-   v3dv_return_if_oom(cmd_buffer, NULL);
-
-   cl_emit(&job->bcl, LINE_WIDTH, line) {
-      line.line_width = cmd_buffer->state.dynamic.line_width;
-   }
-
-   cmd_buffer->state.dirty &= ~V3DV_CMD_DIRTY_LINE_WIDTH;
-}
-
-static void
-emit_blend(struct v3dv_cmd_buffer *cmd_buffer)
-{
-   struct v3dv_job *job = cmd_buffer->state.job;
-   assert(job);
-
-   struct v3dv_pipeline *pipeline = cmd_buffer->state.pipeline;
-   assert(pipeline);
-
-   const uint32_t blend_packets_size =
-      cl_packet_length(BLEND_ENABLES) +
-      cl_packet_length(BLEND_CONSTANT_COLOR) +
-      cl_packet_length(BLEND_CFG) * V3D_MAX_DRAW_BUFFERS +
-      cl_packet_length(COLOR_WRITE_MASKS);
-
-   v3dv_cl_ensure_space_with_branch(&job->bcl, blend_packets_size);
-   v3dv_return_if_oom(cmd_buffer, NULL);
-
-   if (cmd_buffer->state.dirty & V3DV_CMD_DIRTY_PIPELINE) {
-      if (pipeline->blend.enables) {
-         cl_emit(&job->bcl, BLEND_ENABLES, enables) {
-            enables.mask = pipeline->blend.enables;
-         }
-      }
-
-      for (uint32_t i = 0; i < V3D_MAX_DRAW_BUFFERS; i++) {
-         if (pipeline->blend.enables & (1 << i))
-            cl_emit_prepacked(&job->bcl, &pipeline->blend.cfg[i]);
-      }
-
-      cl_emit(&job->bcl, COLOR_WRITE_MASKS, mask) {
-         mask.mask = pipeline->blend.color_write_masks;
-      }
-   }
-
-   if (pipeline->blend.needs_color_constants &&
-       cmd_buffer->state.dirty & V3DV_CMD_DIRTY_BLEND_CONSTANTS) {
-      struct v3dv_dynamic_state *dynamic = &cmd_buffer->state.dynamic;
-      cl_emit(&job->bcl, BLEND_CONSTANT_COLOR, color) {
-         color.red_f16 = _mesa_float_to_half(dynamic->blend_constants[0]);
-         color.green_f16 = _mesa_float_to_half(dynamic->blend_constants[1]);
-         color.blue_f16 = _mesa_float_to_half(dynamic->blend_constants[2]);
-         color.alpha_f16 = _mesa_float_to_half(dynamic->blend_constants[3]);
-      }
-      cmd_buffer->state.dirty &= ~V3DV_CMD_DIRTY_BLEND_CONSTANTS;
-   }
-}
-
-static void
-emit_flat_shade_flags(struct v3dv_job *job,
-                      int varying_offset,
-                      uint32_t varyings,
-                      enum V3DX(Varying_Flags_Action) lower,
-                      enum V3DX(Varying_Flags_Action) higher)
-{
-   v3dv_cl_ensure_space_with_branch(&job->bcl,
-                                    cl_packet_length(FLAT_SHADE_FLAGS));
-   v3dv_return_if_oom(NULL, job);
-
-   cl_emit(&job->bcl, FLAT_SHADE_FLAGS, flags) {
-      flags.varying_offset_v0 = varying_offset;
-      flags.flat_shade_flags_for_varyings_v024 = varyings;
-      flags.action_for_flat_shade_flags_of_lower_numbered_varyings = lower;
-      flags.action_for_flat_shade_flags_of_higher_numbered_varyings = higher;
-   }
-}
-
-static void
-emit_noperspective_flags(struct v3dv_job *job,
-                         int varying_offset,
-                         uint32_t varyings,
-                         enum V3DX(Varying_Flags_Action) lower,
-                         enum V3DX(Varying_Flags_Action) higher)
-{
-   v3dv_cl_ensure_space_with_branch(&job->bcl,
-                                    cl_packet_length(NON_PERSPECTIVE_FLAGS));
-   v3dv_return_if_oom(NULL, job);
-
-   cl_emit(&job->bcl, NON_PERSPECTIVE_FLAGS, flags) {
-      flags.varying_offset_v0 = varying_offset;
-      flags.non_perspective_flags_for_varyings_v024 = varyings;
-      flags.action_for_non_perspective_flags_of_lower_numbered_varyings = lower;
-      flags.action_for_non_perspective_flags_of_higher_numbered_varyings = higher;
-   }
-}
-
-static void
-emit_centroid_flags(struct v3dv_job *job,
-                    int varying_offset,
-                    uint32_t varyings,
-                    enum V3DX(Varying_Flags_Action) lower,
-                    enum V3DX(Varying_Flags_Action) higher)
-{
-   v3dv_cl_ensure_space_with_branch(&job->bcl,
-                                    cl_packet_length(CENTROID_FLAGS));
-   v3dv_return_if_oom(NULL, job);
-
-   cl_emit(&job->bcl, CENTROID_FLAGS, flags) {
-      flags.varying_offset_v0 = varying_offset;
-      flags.centroid_flags_for_varyings_v024 = varyings;
-      flags.action_for_centroid_flags_of_lower_numbered_varyings = lower;
-      flags.action_for_centroid_flags_of_higher_numbered_varyings = higher;
-   }
-}
-
-static bool
-emit_varying_flags(struct v3dv_job *job,
-                   uint32_t num_flags,
-                   const uint32_t *flags,
-                   void (*flag_emit_callback)(struct v3dv_job *job,
-                                              int varying_offset,
-                                              uint32_t flags,
-                                              enum V3DX(Varying_Flags_Action) lower,
-                                              enum V3DX(Varying_Flags_Action) higher))
-{
-   bool emitted_any = false;
-   for (int i = 0; i < num_flags; i++) {
-      if (!flags[i])
-         continue;
-
-      if (emitted_any) {
-        flag_emit_callback(job, i, flags[i],
-                           V3D_VARYING_FLAGS_ACTION_UNCHANGED,
-                           V3D_VARYING_FLAGS_ACTION_UNCHANGED);
-      } else if (i == 0) {
-        flag_emit_callback(job, i, flags[i],
-                           V3D_VARYING_FLAGS_ACTION_UNCHANGED,
-                           V3D_VARYING_FLAGS_ACTION_ZEROED);
-      } else {
-        flag_emit_callback(job, i, flags[i],
-                           V3D_VARYING_FLAGS_ACTION_ZEROED,
-                           V3D_VARYING_FLAGS_ACTION_ZEROED);
-      }
-
-      emitted_any = true;
-   }
-
-   return emitted_any;
-}
-
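A standalone sketch of the action selection in emit_varying_flags above: for each non-zero 32-bit word of per-varying flags, the first emitted packet zeroes the words not covered so far (higher-numbered, plus lower-numbered unless it is word 0), and every subsequent packet leaves the other words unchanged. The flag words below are made up.

   /* Standalone sketch (not driver code). */
   #include <stdint.h>
   #include <stdio.h>

   int main(void)
   {
      const uint32_t flags[4] = { 0x0, 0x5, 0x0, 0x1 };
      int emitted_any = 0;

      for (int i = 0; i < 4; i++) {
         if (!flags[i])
            continue;

         const char *lower, *higher;
         if (emitted_any) {
            lower = "UNCHANGED"; higher = "UNCHANGED";
         } else if (i == 0) {
            lower = "UNCHANGED"; higher = "ZEROED";
         } else {
            lower = "ZEROED"; higher = "ZEROED";
         }

         printf("word %d: flags 0x%x, lower=%s, higher=%s\n",
                i, (unsigned)flags[i], lower, higher);
         emitted_any = 1;
      }

      if (!emitted_any)
         printf("no flags set: emit the ZERO_ALL_* packet instead\n");
      return 0;
   }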
-static void
-emit_varyings_state(struct v3dv_cmd_buffer *cmd_buffer)
-{
-   struct v3dv_job *job = cmd_buffer->state.job;
-   struct v3dv_pipeline *pipeline = cmd_buffer->state.pipeline;
-
-   struct v3d_fs_prog_data *prog_data_fs =
-      pipeline->fs->current_variant->prog_data.fs;
-
-   const uint32_t num_flags =
-      ARRAY_SIZE(prog_data_fs->flat_shade_flags);
-   const uint32_t *flat_shade_flags = prog_data_fs->flat_shade_flags;
-   const uint32_t *noperspective_flags =  prog_data_fs->noperspective_flags;
-   const uint32_t *centroid_flags = prog_data_fs->centroid_flags;
-
-   if (!emit_varying_flags(job, num_flags, flat_shade_flags,
-                           emit_flat_shade_flags)) {
-      v3dv_cl_ensure_space_with_branch(
-         &job->bcl, cl_packet_length(ZERO_ALL_FLAT_SHADE_FLAGS));
-      v3dv_return_if_oom(cmd_buffer, NULL);
-
-      cl_emit(&job->bcl, ZERO_ALL_FLAT_SHADE_FLAGS, flags);
-   }
-
-   if (!emit_varying_flags(job, num_flags, noperspective_flags,
-                           emit_noperspective_flags)) {
-      v3dv_cl_ensure_space_with_branch(
-         &job->bcl, cl_packet_length(ZERO_ALL_NON_PERSPECTIVE_FLAGS));
-      v3dv_return_if_oom(cmd_buffer, NULL);
-
-      cl_emit(&job->bcl, ZERO_ALL_NON_PERSPECTIVE_FLAGS, flags);
-   }
-
-   if (!emit_varying_flags(job, num_flags, centroid_flags,
-                           emit_centroid_flags)) {
-      v3dv_cl_ensure_space_with_branch(
-         &job->bcl, cl_packet_length(ZERO_ALL_CENTROID_FLAGS));
-      v3dv_return_if_oom(cmd_buffer, NULL);
-
-      cl_emit(&job->bcl, ZERO_ALL_CENTROID_FLAGS, flags);
-   }
-}
-
-static void
-emit_configuration_bits(struct v3dv_cmd_buffer *cmd_buffer)
-{
-   struct v3dv_job *job = cmd_buffer->state.job;
-   assert(job);
-
-   struct v3dv_pipeline *pipeline = cmd_buffer->state.pipeline;
+   struct v3dv_pipeline *pipeline = cmd_buffer->state.gfx.pipeline;
    assert(pipeline);
 
-   job_update_ez_state(job, pipeline, &cmd_buffer->state);
+   const bool has_new_pipeline = dirty_uniform_state & V3DV_CMD_DIRTY_PIPELINE;
+   const bool has_new_viewport = dirty_uniform_state & V3DV_CMD_DIRTY_VIEWPORT;
+   const bool has_new_push_constants = dirty_uniform_state & V3DV_CMD_DIRTY_PUSH_CONSTANTS;
+   const bool has_new_descriptors = dirty_uniform_state & V3DV_CMD_DIRTY_DESCRIPTOR_SETS;
+   const bool has_new_view_index = dirty_uniform_state & V3DV_CMD_DIRTY_VIEW_INDEX;
 
-   v3dv_cl_ensure_space_with_branch(&job->bcl, cl_packet_length(CFG_BITS));
-   v3dv_return_if_oom(cmd_buffer, NULL);
-
-   cl_emit_with_prepacked(&job->bcl, CFG_BITS, pipeline->cfg_bits, config) {
-      config.early_z_updates_enable = job->ez_state != VC5_EZ_DISABLED;
-      config.early_z_enable = config.early_z_updates_enable;
-   }
-}
-
-static void
-emit_gl_shader_state(struct v3dv_cmd_buffer *cmd_buffer)
-{
-   struct v3dv_job *job = cmd_buffer->state.job;
-   assert(job);
-
-   struct v3dv_cmd_buffer_state *state = &cmd_buffer->state;
-   struct v3dv_pipeline *pipeline = state->pipeline;
-   assert(pipeline);
-
-   /* Upload the uniforms to the indirect CL first */
-   struct v3dv_cl_reloc fs_uniforms =
-      v3dv_write_uniforms(cmd_buffer, pipeline->fs);
-
-   struct v3dv_cl_reloc vs_uniforms =
-      v3dv_write_uniforms(cmd_buffer, pipeline->vs);
-
-   struct v3dv_cl_reloc vs_bin_uniforms =
-      v3dv_write_uniforms(cmd_buffer, pipeline->vs_bin);
-
-   /* Update the cache dirty flag based on the shader progs data */
-   job->tmu_dirty_rcl |= pipeline->vs_bin->current_variant->prog_data.vs->base.tmu_dirty_rcl;
-   job->tmu_dirty_rcl |= pipeline->vs->current_variant->prog_data.vs->base.tmu_dirty_rcl;
-   job->tmu_dirty_rcl |= pipeline->fs->current_variant->prog_data.fs->base.tmu_dirty_rcl;
+   /* VK_SHADER_STAGE_FRAGMENT_BIT */
+   const bool has_new_descriptors_fs =
+      has_new_descriptors &&
+      (cmd_buffer->state.dirty_descriptor_stages & VK_SHADER_STAGE_FRAGMENT_BIT);
 
-   /* See GFXH-930 workaround below */
-   uint32_t num_elements_to_emit = MAX2(pipeline->va_count, 1);
+   const bool has_new_push_constants_fs =
+      has_new_push_constants &&
+      (cmd_buffer->state.dirty_push_constants_stages & VK_SHADER_STAGE_FRAGMENT_BIT);
 
-   uint32_t shader_rec_offset =
-      v3dv_cl_ensure_space(&job->indirect,
-                           cl_packet_length(GL_SHADER_STATE_RECORD) +
-                           num_elements_to_emit *
-                           cl_packet_length(GL_SHADER_STATE_ATTRIBUTE_RECORD),
-                           32);
-   v3dv_return_if_oom(cmd_buffer, NULL);
+   const bool needs_fs_update = has_new_pipeline ||
+                                has_new_view_index ||
+                                has_new_push_constants_fs ||
+                                has_new_descriptors_fs;
 
-   cl_emit_with_prepacked(&job->indirect, GL_SHADER_STATE_RECORD,
-                          pipeline->shader_state_record, shader) {
+   if (needs_fs_update) {
+      struct v3dv_shader_variant *fs_variant =
+         pipeline->shared_data->variants[BROADCOM_SHADER_FRAGMENT];
 
-      /* FIXME: we are setting this values here and during the
-       * prepacking. This is because both cl_emit_with_prepacked and v3dv_pack
-       * asserts for minimum values of these. It would be good to get
-       * v3dv_pack to assert on the final value if possible
-       */
-      shader.min_coord_shader_input_segments_required_in_play =
-         pipeline->vpm_cfg_bin.As;
-      shader.min_vertex_shader_input_segments_required_in_play =
-         pipeline->vpm_cfg.As;
-
-      shader.coordinate_shader_code_address =
-         v3dv_cl_address(pipeline->vs_bin->current_variant->assembly_bo, 0);
-      shader.vertex_shader_code_address =
-         v3dv_cl_address(pipeline->vs->current_variant->assembly_bo, 0);
-      shader.fragment_shader_code_address =
-         v3dv_cl_address(pipeline->fs->current_variant->assembly_bo, 0);
-
-      shader.coordinate_shader_uniforms_address = vs_bin_uniforms;
-      shader.vertex_shader_uniforms_address = vs_uniforms;
-      shader.fragment_shader_uniforms_address = fs_uniforms;
-
-      shader.address_of_default_attribute_values =
-         v3dv_cl_address(pipeline->default_attribute_values, 0);
+      cmd_buffer->state.uniforms.fs =
+         v3dv_write_uniforms(cmd_buffer, pipeline, fs_variant);
    }
 
-   /* Upload vertex element attributes (SHADER_STATE_ATTRIBUTE_RECORD) */
-   struct v3d_vs_prog_data *prog_data_vs =
-      pipeline->vs->current_variant->prog_data.vs;
-
-   struct v3d_vs_prog_data *prog_data_vs_bin =
-      pipeline->vs_bin->current_variant->prog_data.vs;
-
-   bool cs_loaded_any = false;
-   const bool cs_uses_builtins = prog_data_vs_bin->uses_iid ||
-                                 prog_data_vs_bin->uses_biid ||
-                                 prog_data_vs_bin->uses_vid;
-   const uint32_t packet_length =
-      cl_packet_length(GL_SHADER_STATE_ATTRIBUTE_RECORD);
-
-   uint32_t emitted_va_count = 0;
-   for (uint32_t i = 0; emitted_va_count < pipeline->va_count; i++) {
-      assert(i < MAX_VERTEX_ATTRIBS);
-
-      if (pipeline->va[i].vk_format == VK_FORMAT_UNDEFINED)
-         continue;
-
-      const uint32_t binding = pipeline->va[i].binding;
+   /* VK_SHADER_STAGE_GEOMETRY_BIT */
+   if (pipeline->has_gs) {
+      const bool has_new_descriptors_gs =
+         has_new_descriptors &&
+         (cmd_buffer->state.dirty_descriptor_stages &
+          VK_SHADER_STAGE_GEOMETRY_BIT);
 
-      /* We store each vertex attribute in the array using its driver location
-       * as index.
-       */
-      const uint32_t location = i;
-
-      struct v3dv_vertex_binding *c_vb = &cmd_buffer->state.vertex_bindings[binding];
-
-      cl_emit_with_prepacked(&job->indirect, GL_SHADER_STATE_ATTRIBUTE_RECORD,
-                             &pipeline->vertex_attrs[i * packet_length], attr) {
-
-         assert(c_vb->buffer->mem->bo);
-         attr.address = v3dv_cl_address(c_vb->buffer->mem->bo,
-                                        c_vb->buffer->mem_offset +
-                                        pipeline->va[i].offset +
-                                        c_vb->offset);
-
-         attr.number_of_values_read_by_coordinate_shader =
-            prog_data_vs_bin->vattr_sizes[location];
-         attr.number_of_values_read_by_vertex_shader =
-            prog_data_vs->vattr_sizes[location];
-
-         /* GFXH-930: At least one attribute must be enabled and read by CS
-          * and VS.  If we have attributes being consumed by the VS but not
-          * the CS, then set up a dummy load of the last attribute into the
-          * CS's VPM inputs.  (Since CS is just dead-code-elimination compared
-          * to VS, we can't have CS loading but not VS).
-          *
-          * GFXH-1602: first attribute must be active if using builtins.
-          */
-         if (prog_data_vs_bin->vattr_sizes[location])
-            cs_loaded_any = true;
-
-         if (i == 0 && cs_uses_builtins && !cs_loaded_any) {
-            attr.number_of_values_read_by_coordinate_shader = 1;
-            cs_loaded_any = true;
-         } else if (i == pipeline->va_count - 1 && !cs_loaded_any) {
-            attr.number_of_values_read_by_coordinate_shader = 1;
-            cs_loaded_any = true;
-         }
+      const bool has_new_push_constants_gs =
+         has_new_push_constants &&
+         (cmd_buffer->state.dirty_push_constants_stages &
+          VK_SHADER_STAGE_GEOMETRY_BIT);
 
-         attr.maximum_index = 0xffffff;
-      }
+      const bool needs_gs_update = has_new_viewport ||
+                                   has_new_view_index ||
+                                   has_new_pipeline ||
+                                   has_new_push_constants_gs ||
+                                   has_new_descriptors_gs;
 
-      emitted_va_count++;
-   }
+      if (needs_gs_update) {
+         struct v3dv_shader_variant *gs_variant =
+            pipeline->shared_data->variants[BROADCOM_SHADER_GEOMETRY];
 
-   if (pipeline->va_count == 0) {
-      /* GFXH-930: At least one attribute must be enabled and read
-       * by CS and VS.  If we have no attributes being consumed by
-       * the shader, set up a dummy to be loaded into the VPM.
-       */
-      cl_emit(&job->indirect, GL_SHADER_STATE_ATTRIBUTE_RECORD, attr) {
-         /* Valid address of data whose value will be unused. */
-         attr.address = v3dv_cl_address(job->indirect.bo, 0);
+         struct v3dv_shader_variant *gs_bin_variant =
+            pipeline->shared_data->variants[BROADCOM_SHADER_GEOMETRY_BIN];
 
-         attr.type = ATTRIBUTE_FLOAT;
-         attr.stride = 0;
-         attr.vec_size = 1;
+         cmd_buffer->state.uniforms.gs =
+            v3dv_write_uniforms(cmd_buffer, pipeline, gs_variant);
 
-         attr.number_of_values_read_by_coordinate_shader = 1;
-         attr.number_of_values_read_by_vertex_shader = 1;
+         cmd_buffer->state.uniforms.gs_bin =
+            v3dv_write_uniforms(cmd_buffer, pipeline, gs_bin_variant);
       }
    }
 
-   if (cmd_buffer->state.dirty & V3DV_CMD_DIRTY_PIPELINE) {
-      v3dv_cl_ensure_space_with_branch(&job->bcl,
-                                       sizeof(pipeline->vcm_cache_size));
-      v3dv_return_if_oom(cmd_buffer, NULL);
-
-      cl_emit_prepacked(&job->bcl, &pipeline->vcm_cache_size);
-   }
+   /* VK_SHADER_STAGE_VERTEX_BIT */
+   const bool has_new_descriptors_vs =
+      has_new_descriptors &&
+      (cmd_buffer->state.dirty_descriptor_stages & VK_SHADER_STAGE_VERTEX_BIT);
 
-   v3dv_cl_ensure_space_with_branch(&job->bcl,
-                                    cl_packet_length(GL_SHADER_STATE));
-   v3dv_return_if_oom(cmd_buffer, NULL);
+   const bool has_new_push_constants_vs =
+      has_new_push_constants &&
+      (cmd_buffer->state.dirty_push_constants_stages & VK_SHADER_STAGE_VERTEX_BIT);
 
-   cl_emit(&job->bcl, GL_SHADER_STATE, state) {
-      state.address = v3dv_cl_address(job->indirect.bo,
-                                      shader_rec_offset);
-      state.number_of_attribute_arrays = num_elements_to_emit;
-   }
+   const bool needs_vs_update = has_new_viewport ||
+                                has_new_view_index ||
+                                has_new_pipeline ||
+                                has_new_push_constants_vs ||
+                                has_new_descriptors_vs;
 
-   cmd_buffer->state.dirty &= ~(V3DV_CMD_DIRTY_VERTEX_BUFFER |
-                                V3DV_CMD_DIRTY_DESCRIPTOR_SETS |
-                                V3DV_CMD_DIRTY_PUSH_CONSTANTS);
-}
+   if (needs_vs_update) {
+      struct v3dv_shader_variant *vs_variant =
+         pipeline->shared_data->variants[BROADCOM_SHADER_VERTEX];
 
-static void
-emit_occlusion_query(struct v3dv_cmd_buffer *cmd_buffer)
-{
-   struct v3dv_job *job = cmd_buffer->state.job;
-   assert(job);
+      struct v3dv_shader_variant *vs_bin_variant =
+         pipeline->shared_data->variants[BROADCOM_SHADER_VERTEX_BIN];
 
-   v3dv_cl_ensure_space_with_branch(&job->bcl,
-                                    cl_packet_length(OCCLUSION_QUERY_COUNTER));
-   v3dv_return_if_oom(cmd_buffer, NULL);
+      cmd_buffer->state.uniforms.vs =
+         v3dv_write_uniforms(cmd_buffer, pipeline, vs_variant);
 
-   cl_emit(&job->bcl, OCCLUSION_QUERY_COUNTER, counter) {
-      if (cmd_buffer->state.query.active_query) {
-         counter.address =
-            v3dv_cl_address(cmd_buffer->state.query.active_query, 0);
-      }
+      cmd_buffer->state.uniforms.vs_bin =
+         v3dv_write_uniforms(cmd_buffer, pipeline, vs_bin_variant);
    }
 
-   cmd_buffer->state.dirty &= ~V3DV_CMD_DIRTY_OCCLUSION_QUERY;
+   cmd_buffer->state.dirty &= ~V3DV_CMD_DIRTY_VIEW_INDEX;
 }
 
 /* This stores command buffer state that we might be about to stomp for
@@ -3646,9 +2211,9 @@ v3dv_cmd_buffer_meta_state_push(struct v3dv_cmd_buffer *cmd_buffer,
          attachment_state_item_size * state->attachment_alloc_count;
       if (state->meta.attachment_alloc_count < state->attachment_alloc_count) {
          if (state->meta.attachment_alloc_count > 0)
-            vk_free(&cmd_buffer->device->alloc, state->meta.attachments);
+            vk_free(&cmd_buffer->device->vk.alloc, state->meta.attachments);
 
-         state->meta.attachments = vk_zalloc(&cmd_buffer->device->alloc,
+         state->meta.attachments = vk_zalloc(&cmd_buffer->device->vk.alloc,
                                              attachment_state_total_size, 8,
                                              VK_SYSTEM_ALLOCATION_SCOPE_COMMAND);
          if (!state->meta.attachments) {
@@ -3665,20 +2230,19 @@ v3dv_cmd_buffer_meta_state_push(struct v3dv_cmd_buffer *cmd_buffer,
       memcpy(&state->meta.render_area, &state->render_area, sizeof(VkRect2D));
    }
 
-   state->meta.pipeline = v3dv_pipeline_to_handle(state->pipeline);
-   if (state->meta.pipeline) {
-      memcpy(&state->meta.dynamic, &state->dynamic, sizeof(state->dynamic));
-   }
-
-   /* We expect that meta operations are graphics-only and won't alter
-    * compute state.
+   /* We expect that meta operations are graphics-only, so we only take into
+    * account the graphics pipeline and the graphics state.
     */
+   state->meta.gfx.pipeline = state->gfx.pipeline;
+   memcpy(&state->meta.dynamic, &state->dynamic, sizeof(state->dynamic));
+
    struct v3dv_descriptor_state *gfx_descriptor_state =
-      &state->descriptor_state[VK_PIPELINE_BIND_POINT_GRAPHICS];
+      &cmd_buffer->state.gfx.descriptor_state;
+
    if (push_descriptor_state) {
       if (gfx_descriptor_state->valid != 0) {
-         memcpy(&state->meta.descriptor_state, gfx_descriptor_state,
-                sizeof(state->descriptor_state));
+         memcpy(&state->meta.gfx.descriptor_state, gfx_descriptor_state,
+                sizeof(state->gfx.descriptor_state));
       }
       state->meta.has_descriptor_state = true;
    } else {
@@ -3728,135 +2292,46 @@ v3dv_cmd_buffer_meta_state_pop(struct v3dv_cmd_buffer *cmd_buffer,
       state->subpass_idx = -1;
    }
 
-   if (state->meta.pipeline != VK_NULL_HANDLE) {
-      struct v3dv_pipeline *pipeline =
-            v3dv_pipeline_from_handle(state->meta.pipeline);
+   if (state->meta.gfx.pipeline != NULL) {
+      struct v3dv_pipeline *pipeline = state->meta.gfx.pipeline;
       VkPipelineBindPoint pipeline_binding =
          v3dv_pipeline_get_binding_point(pipeline);
       v3dv_CmdBindPipeline(v3dv_cmd_buffer_to_handle(cmd_buffer),
                            pipeline_binding,
-                           state->meta.pipeline);
-      if (pipeline_binding == VK_PIPELINE_BIND_POINT_GRAPHICS) {
-         memcpy(&state->dynamic, &state->meta.dynamic, sizeof(state->dynamic));
-         state->dirty |= dirty_dynamic_state;
-      }
+                           v3dv_pipeline_to_handle(state->meta.gfx.pipeline));
    } else {
-      state->pipeline = VK_NULL_HANDLE;
+      state->gfx.pipeline = NULL;
+   }
+
+   if (dirty_dynamic_state) {
+      memcpy(&state->dynamic, &state->meta.dynamic, sizeof(state->dynamic));
+      state->dirty |= dirty_dynamic_state;
    }
 
    if (state->meta.has_descriptor_state) {
-      if (state->meta.descriptor_state.valid != 0) {
-         memcpy(&state->descriptor_state[VK_PIPELINE_BIND_POINT_GRAPHICS],
-                &state->meta.descriptor_state,
-                sizeof(state->descriptor_state));
+      if (state->meta.gfx.descriptor_state.valid != 0) {
+         memcpy(&state->gfx.descriptor_state, &state->meta.gfx.descriptor_state,
+                sizeof(state->gfx.descriptor_state));
       } else {
-         state->descriptor_state[VK_PIPELINE_BIND_POINT_GRAPHICS].valid = 0;
+         state->gfx.descriptor_state.valid = 0;
       }
    }
 
    memcpy(cmd_buffer->push_constants_data, state->meta.push_constants,
           sizeof(state->meta.push_constants));
 
-   state->meta.pipeline = VK_NULL_HANDLE;
+   state->meta.gfx.pipeline = NULL;
    state->meta.framebuffer = VK_NULL_HANDLE;
    state->meta.pass = VK_NULL_HANDLE;
    state->meta.subpass_idx = -1;
    state->meta.has_descriptor_state = false;
 }
 
-/* FIXME: C&P from v3dx_draw. Refactor to common place? */
-static uint32_t
-v3d_hw_prim_type(enum pipe_prim_type prim_type)
-{
-   switch (prim_type) {
-   case PIPE_PRIM_POINTS:
-   case PIPE_PRIM_LINES:
-   case PIPE_PRIM_LINE_LOOP:
-   case PIPE_PRIM_LINE_STRIP:
-   case PIPE_PRIM_TRIANGLES:
-   case PIPE_PRIM_TRIANGLE_STRIP:
-   case PIPE_PRIM_TRIANGLE_FAN:
-      return prim_type;
-
-   case PIPE_PRIM_LINES_ADJACENCY:
-   case PIPE_PRIM_LINE_STRIP_ADJACENCY:
-   case PIPE_PRIM_TRIANGLES_ADJACENCY:
-   case PIPE_PRIM_TRIANGLE_STRIP_ADJACENCY:
-      return 8 + (prim_type - PIPE_PRIM_LINES_ADJACENCY);
-
-   default:
-      unreachable("Unsupported primitive type");
-   }
-}
-
-struct v3dv_draw_info {
-   uint32_t vertex_count;
-   uint32_t instance_count;
-   uint32_t first_vertex;
-   uint32_t first_instance;
-};
-
-static void
-cmd_buffer_emit_draw(struct v3dv_cmd_buffer *cmd_buffer,
-                     struct v3dv_draw_info *info)
-{
-   struct v3dv_job *job = cmd_buffer->state.job;
-   assert(job);
-
-   struct v3dv_cmd_buffer_state *state = &cmd_buffer->state;
-   struct v3dv_pipeline *pipeline = state->pipeline;
-
-   assert(pipeline);
-
-   uint32_t hw_prim_type = v3d_hw_prim_type(pipeline->vs->topology);
-
-   if (info->first_instance > 0) {
-      v3dv_cl_ensure_space_with_branch(
-         &job->bcl, cl_packet_length(BASE_VERTEX_BASE_INSTANCE));
-      v3dv_return_if_oom(cmd_buffer, NULL);
-
-      cl_emit(&job->bcl, BASE_VERTEX_BASE_INSTANCE, base) {
-         base.base_instance = info->first_instance;
-         base.base_vertex = 0;
-      }
-   }
-
-   if (info->instance_count > 1) {
-      v3dv_cl_ensure_space_with_branch(
-         &job->bcl, cl_packet_length(VERTEX_ARRAY_INSTANCED_PRIMS));
-      v3dv_return_if_oom(cmd_buffer, NULL);
-
-      cl_emit(&job->bcl, VERTEX_ARRAY_INSTANCED_PRIMS, prim) {
-         prim.mode = hw_prim_type;
-         prim.index_of_first_vertex = info->first_vertex;
-         prim.number_of_instances = info->instance_count;
-         prim.instance_length = info->vertex_count;
-      }
-   } else {
-      v3dv_cl_ensure_space_with_branch(
-         &job->bcl, cl_packet_length(VERTEX_ARRAY_PRIMS));
-      v3dv_return_if_oom(cmd_buffer, NULL);
-      cl_emit(&job->bcl, VERTEX_ARRAY_PRIMS, prim) {
-         prim.mode = hw_prim_type;
-         prim.length = info->vertex_count;
-         prim.index_of_first_vertex = info->first_vertex;
-      }
-   }
-}
-
 static struct v3dv_job *
 cmd_buffer_pre_draw_split_job(struct v3dv_cmd_buffer *cmd_buffer)
 {
-   /* If we emitted a pipeline barrier right before this draw we won't have
-    * an active job. In that case, create a new job continuing the current
-    * subpass.
-    */
    struct v3dv_job *job = cmd_buffer->state.job;
-   if (!job) {
-      job = v3dv_cmd_buffer_subpass_resume(cmd_buffer,
-                                           cmd_buffer->state.subpass_idx);
-      return job;
-   }
+   assert(job);
 
    /* If the job has been flagged with 'always_flush' and it has already
     * recorded any draw calls then we need to start a new job for it.
@@ -3883,11 +2358,102 @@ cmd_buffer_pre_draw_split_job(struct v3dv_cmd_buffer *cmd_buffer)
    return job;
 }
 
+/**
+ * The Vulkan spec states:
+ *
+ *   "It is legal for a subpass to use no color or depth/stencil
+ *    attachments (...)  This kind of subpass can use shader side effects such
+ *    as image stores and atomics to produce an output. In this case, the
+ *    subpass continues to use the width, height, and layers of the framebuffer
+ *    to define the dimensions of the rendering area, and the
+ *    rasterizationSamples from each pipeline’s
+ *    VkPipelineMultisampleStateCreateInfo to define the number of samples used
+ *    in rasterization."
+ *
+ * We need to enable MSAA in the TILE_BINNING_MODE_CFG packet, which we
+ * emit when we start a new frame at the beginning of a subpass. At that point,
+ * if the framebuffer doesn't have any attachments we won't enable MSAA and
+ * the job won't be valid in the scenario described by the spec.
+ *
+ * This function is intended to be called before a draw call and will test if
+ * we are in that scenario, in which case, it will restart the current job
+ * with MSAA enabled.
+ */
 static void
-cmd_buffer_emit_pre_draw(struct v3dv_cmd_buffer *cmd_buffer)
+cmd_buffer_restart_job_for_msaa_if_needed(struct v3dv_cmd_buffer *cmd_buffer)
+{
+   assert(cmd_buffer->state.job);
+
+   /* We don't support variableMultisampleRate so we know that all pipelines
+    * bound in the same subpass must have matching number of samples, so we
+    * can do this check only on the first draw call.
+    */
+   if (cmd_buffer->state.job->draw_count > 0)
+      return;
+
+   /* We only need to restart the frame if the pipeline requires MSAA but
+    * our frame tiling didn't enable it.
+    */
+   if (!cmd_buffer->state.gfx.pipeline->msaa ||
+       cmd_buffer->state.job->frame_tiling.msaa) {
+      return;
+   }
+
+   /* FIXME: Secondary command buffers don't start frames. Instead, they are
+    * recorded into primary jobs that start them. For secondaries, we should
+    * still handle this scenario, but we should do that when we record them
+    * into primaries by testing if any of the secondaries has multisampled
+    * draw calls in them, and then using that info to decide if we need to
+    * restart the primary job into which they are being recorded.
+    */
+   if (cmd_buffer->level != VK_COMMAND_BUFFER_LEVEL_PRIMARY)
+      return;
+
+   /* Drop the current job and restart it with MSAA enabled */
+   struct v3dv_job *old_job = cmd_buffer->state.job;
+   cmd_buffer->state.job = NULL;
+
+   struct v3dv_job *job = vk_zalloc(&cmd_buffer->device->vk.alloc,
+                                    sizeof(struct v3dv_job), 8,
+                                    VK_SYSTEM_ALLOCATION_SCOPE_COMMAND);
+   if (!job) {
+      v3dv_flag_oom(cmd_buffer, NULL);
+      return;
+   }
+
+   v3dv_job_init(job, V3DV_JOB_TYPE_GPU_CL, cmd_buffer->device, cmd_buffer,
+                 cmd_buffer->state.subpass_idx);
+   cmd_buffer->state.job = job;
+
+   v3dv_job_start_frame(job,
+                        old_job->frame_tiling.width,
+                        old_job->frame_tiling.height,
+                        old_job->frame_tiling.layers,
+                        true,
+                        old_job->frame_tiling.render_target_count,
+                        old_job->frame_tiling.internal_bpp,
+                        true /* msaa */);
+
+   v3dv_job_destroy(old_job);
+}
+
+void
+v3dv_cmd_buffer_emit_pre_draw(struct v3dv_cmd_buffer *cmd_buffer)
 {
-   assert(cmd_buffer->state.pipeline);
-   assert(!(cmd_buffer->state.pipeline->active_stages & VK_SHADER_STAGE_COMPUTE_BIT));
+   assert(cmd_buffer->state.gfx.pipeline);
+   assert(!(cmd_buffer->state.gfx.pipeline->active_stages & VK_SHADER_STAGE_COMPUTE_BIT));
+
+   /* If we emitted a pipeline barrier right before this draw we won't have
+    * an active job. In that case, create a new job continuing the current
+    * subpass.
+    */
+   if (!cmd_buffer->state.job) {
+      v3dv_cmd_buffer_subpass_resume(cmd_buffer,
+                                     cmd_buffer->state.subpass_idx);
+   }
+
+   /* Restart single sample job for MSAA pipeline if needed */
+   cmd_buffer_restart_job_for_msaa_if_needed(cmd_buffer);
 
    /* If the job is configured to flush on every draw call we need to create
     * a new job now.
@@ -3895,13 +2461,6 @@ cmd_buffer_emit_pre_draw(struct v3dv_cmd_buffer *cmd_buffer)
    struct v3dv_job *job = cmd_buffer_pre_draw_split_job(cmd_buffer);
    job->draw_count++;
 
-   /* We may need to compile shader variants based on bound textures */
-   uint32_t *dirty = &cmd_buffer->state.dirty;
-   if (*dirty & (V3DV_CMD_DIRTY_PIPELINE |
-                 V3DV_CMD_DIRTY_DESCRIPTOR_SETS)) {
-      update_pipeline_variants(cmd_buffer);
-   }
-
    /* GL shader state binds shaders, uniform and vertex attribute state. The
     * compiler injects uniforms to handle some descriptor types (such as
     * textures), so we need to regen that when descriptor state changes.
@@ -3909,17 +2468,26 @@ cmd_buffer_emit_pre_draw(struct v3dv_cmd_buffer *cmd_buffer)
     * We also need to emit new shader state if we have a dirty viewport since
     * that will require that we emit new uniform state for QUNIFORM_VIEWPORT_*.
     */
-   if (*dirty & (V3DV_CMD_DIRTY_PIPELINE |
-                 V3DV_CMD_DIRTY_VERTEX_BUFFER |
-                 V3DV_CMD_DIRTY_DESCRIPTOR_SETS |
-                 V3DV_CMD_DIRTY_PUSH_CONSTANTS |
-                 V3DV_CMD_DIRTY_VIEWPORT)) {
-      emit_gl_shader_state(cmd_buffer);
-   }
+   uint32_t *dirty = &cmd_buffer->state.dirty;
+
+   const uint32_t dirty_uniform_state =
+      *dirty & (V3DV_CMD_DIRTY_PIPELINE |
+                V3DV_CMD_DIRTY_PUSH_CONSTANTS |
+                V3DV_CMD_DIRTY_DESCRIPTOR_SETS |
+                V3DV_CMD_DIRTY_VIEWPORT |
+                V3DV_CMD_DIRTY_VIEW_INDEX);
+
+   if (dirty_uniform_state)
+      update_gfx_uniform_state(cmd_buffer, dirty_uniform_state);
+
+   struct v3dv_device *device = cmd_buffer->device;
+
+   if (dirty_uniform_state || (*dirty & V3DV_CMD_DIRTY_VERTEX_BUFFER))
+      v3dv_X(device, cmd_buffer_emit_gl_shader_state)(cmd_buffer);
 
    if (*dirty & (V3DV_CMD_DIRTY_PIPELINE)) {
-      emit_configuration_bits(cmd_buffer);
-      emit_varyings_state(cmd_buffer);
+      v3dv_X(device, cmd_buffer_emit_configuration_bits)(cmd_buffer);
+      v3dv_X(device, cmd_buffer_emit_varyings_state)(cmd_buffer);
    }
 
    if (*dirty & (V3DV_CMD_DIRTY_VIEWPORT | V3DV_CMD_DIRTY_SCISSOR)) {
@@ -3927,46 +2495,78 @@ cmd_buffer_emit_pre_draw(struct v3dv_cmd_buffer *cmd_buffer)
    }
 
    if (*dirty & V3DV_CMD_DIRTY_VIEWPORT) {
-      emit_viewport(cmd_buffer);
+      v3dv_X(device, cmd_buffer_emit_viewport)(cmd_buffer);
    }
 
+   if (*dirty & V3DV_CMD_DIRTY_INDEX_BUFFER)
+      v3dv_X(device, cmd_buffer_emit_index_buffer)(cmd_buffer);
+
    const uint32_t dynamic_stencil_dirty_flags =
       V3DV_CMD_DIRTY_STENCIL_COMPARE_MASK |
       V3DV_CMD_DIRTY_STENCIL_WRITE_MASK |
       V3DV_CMD_DIRTY_STENCIL_REFERENCE;
    if (*dirty & (V3DV_CMD_DIRTY_PIPELINE | dynamic_stencil_dirty_flags))
-      emit_stencil(cmd_buffer);
+      v3dv_X(device, cmd_buffer_emit_stencil)(cmd_buffer);
 
    if (*dirty & (V3DV_CMD_DIRTY_PIPELINE | V3DV_CMD_DIRTY_DEPTH_BIAS))
-      emit_depth_bias(cmd_buffer);
+      v3dv_X(device, cmd_buffer_emit_depth_bias)(cmd_buffer);
 
    if (*dirty & (V3DV_CMD_DIRTY_PIPELINE | V3DV_CMD_DIRTY_BLEND_CONSTANTS))
-      emit_blend(cmd_buffer);
+      v3dv_X(device, cmd_buffer_emit_blend)(cmd_buffer);
 
    if (*dirty & V3DV_CMD_DIRTY_OCCLUSION_QUERY)
-      emit_occlusion_query(cmd_buffer);
+      v3dv_X(device, cmd_buffer_emit_occlusion_query)(cmd_buffer);
 
    if (*dirty & V3DV_CMD_DIRTY_LINE_WIDTH)
-      emit_line_width(cmd_buffer);
+      v3dv_X(device, cmd_buffer_emit_line_width)(cmd_buffer);
+
+   if (*dirty & V3DV_CMD_DIRTY_PIPELINE)
+      v3dv_X(device, cmd_buffer_emit_sample_state)(cmd_buffer);
+
+   if (*dirty & (V3DV_CMD_DIRTY_PIPELINE | V3DV_CMD_DIRTY_COLOR_WRITE_ENABLE))
+      v3dv_X(device, cmd_buffer_emit_color_write_mask)(cmd_buffer);
 
    cmd_buffer->state.dirty &= ~V3DV_CMD_DIRTY_PIPELINE;
 }
 
+static inline void
+cmd_buffer_set_view_index(struct v3dv_cmd_buffer *cmd_buffer,
+                          uint32_t view_index)
+{
+   cmd_buffer->state.view_index = view_index;
+   cmd_buffer->state.dirty |= V3DV_CMD_DIRTY_VIEW_INDEX;
+}
+
 static void
 cmd_buffer_draw(struct v3dv_cmd_buffer *cmd_buffer,
                 struct v3dv_draw_info *info)
 {
-   cmd_buffer_emit_pre_draw(cmd_buffer);
-   cmd_buffer_emit_draw(cmd_buffer, info);
+
+   struct v3dv_render_pass *pass = cmd_buffer->state.pass;
+   if (likely(!pass->multiview_enabled)) {
+      v3dv_cmd_buffer_emit_pre_draw(cmd_buffer);
+      v3dv_X(cmd_buffer->device, cmd_buffer_emit_draw)(cmd_buffer, info);
+      return;
+   }
+
+   uint32_t view_mask = pass->subpasses[cmd_buffer->state.subpass_idx].view_mask;
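+   /* For multiview we replay the draw once per view in the subpass view mask,
+    * updating the view index state (and its uniforms) on each iteration.
+    */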
+   while (view_mask) {
+      cmd_buffer_set_view_index(cmd_buffer, u_bit_scan(&view_mask));
+      v3dv_cmd_buffer_emit_pre_draw(cmd_buffer);
+      v3dv_X(cmd_buffer->device, cmd_buffer_emit_draw)(cmd_buffer, info);
+   }
 }
 
-void
+VKAPI_ATTR void VKAPI_CALL
 v3dv_CmdDraw(VkCommandBuffer commandBuffer,
              uint32_t vertexCount,
              uint32_t instanceCount,
              uint32_t firstVertex,
              uint32_t firstInstance)
 {
+   if (vertexCount == 0 || instanceCount == 0)
+      return;
+
    V3DV_FROM_HANDLE(v3dv_cmd_buffer, cmd_buffer, commandBuffer);
    struct v3dv_draw_info info = {};
    info.vertex_count = vertexCount;
@@ -3977,7 +2577,7 @@ v3dv_CmdDraw(VkCommandBuffer commandBuffer,
    cmd_buffer_draw(cmd_buffer, &info);
 }
 
-void
+VKAPI_ATTR void VKAPI_CALL
 v3dv_CmdDrawIndexed(VkCommandBuffer commandBuffer,
                     uint32_t indexCount,
                     uint32_t instanceCount,
@@ -3985,131 +2585,93 @@ v3dv_CmdDrawIndexed(VkCommandBuffer commandBuffer,
                     int32_t vertexOffset,
                     uint32_t firstInstance)
 {
-   V3DV_FROM_HANDLE(v3dv_cmd_buffer, cmd_buffer, commandBuffer);
-
-   cmd_buffer_emit_pre_draw(cmd_buffer);
-
-   struct v3dv_job *job = cmd_buffer->state.job;
-   assert(job);
-
-   const struct v3dv_pipeline *pipeline = cmd_buffer->state.pipeline;
-   uint32_t hw_prim_type = v3d_hw_prim_type(pipeline->vs->topology);
-   uint8_t index_type = ffs(cmd_buffer->state.index_buffer.index_size) - 1;
-   uint32_t index_offset = firstIndex * cmd_buffer->state.index_buffer.index_size;
+   if (indexCount == 0 || instanceCount == 0)
+      return;
 
-   if (vertexOffset != 0 || firstInstance != 0) {
-      v3dv_cl_ensure_space_with_branch(
-         &job->bcl, cl_packet_length(BASE_VERTEX_BASE_INSTANCE));
-      v3dv_return_if_oom(cmd_buffer, NULL);
+   V3DV_FROM_HANDLE(v3dv_cmd_buffer, cmd_buffer, commandBuffer);
 
-      cl_emit(&job->bcl, BASE_VERTEX_BASE_INSTANCE, base) {
-         base.base_instance = firstInstance;
-         base.base_vertex = vertexOffset;
-      }
+   struct v3dv_render_pass *pass = cmd_buffer->state.pass;
+   if (likely(!pass->multiview_enabled)) {
+      v3dv_cmd_buffer_emit_pre_draw(cmd_buffer);
+      v3dv_X(cmd_buffer->device, cmd_buffer_emit_draw_indexed)
+         (cmd_buffer, indexCount, instanceCount,
+          firstIndex, vertexOffset, firstInstance);
+      return;
    }
 
-   if (instanceCount == 1) {
-      v3dv_cl_ensure_space_with_branch(
-         &job->bcl, cl_packet_length(INDEXED_PRIM_LIST));
-      v3dv_return_if_oom(cmd_buffer, NULL);
-
-      cl_emit(&job->bcl, INDEXED_PRIM_LIST, prim) {
-         prim.index_type = index_type;
-         prim.length = indexCount;
-         prim.index_offset = index_offset;
-         prim.mode = hw_prim_type;
-         prim.enable_primitive_restarts = pipeline->primitive_restart;
-      }
-   } else if (instanceCount > 1) {
-      v3dv_cl_ensure_space_with_branch(
-         &job->bcl, cl_packet_length(INDEXED_INSTANCED_PRIM_LIST));
-      v3dv_return_if_oom(cmd_buffer, NULL);
-
-      cl_emit(&job->bcl, INDEXED_INSTANCED_PRIM_LIST, prim) {
-         prim.index_type = index_type;
-         prim.index_offset = index_offset;
-         prim.mode = hw_prim_type;
-         prim.enable_primitive_restarts = pipeline->primitive_restart;
-         prim.number_of_instances = instanceCount;
-         prim.instance_length = indexCount;
-      }
+   uint32_t view_mask = pass->subpasses[cmd_buffer->state.subpass_idx].view_mask;
+   while (view_mask) {
+      cmd_buffer_set_view_index(cmd_buffer, u_bit_scan(&view_mask));
+      v3dv_cmd_buffer_emit_pre_draw(cmd_buffer);
+      v3dv_X(cmd_buffer->device, cmd_buffer_emit_draw_indexed)
+         (cmd_buffer, indexCount, instanceCount,
+          firstIndex, vertexOffset, firstInstance);
    }
 }
 
-void
+VKAPI_ATTR void VKAPI_CALL
 v3dv_CmdDrawIndirect(VkCommandBuffer commandBuffer,
                      VkBuffer _buffer,
                      VkDeviceSize offset,
                      uint32_t drawCount,
                      uint32_t stride)
 {
-   V3DV_FROM_HANDLE(v3dv_cmd_buffer, cmd_buffer, commandBuffer);
-   V3DV_FROM_HANDLE(v3dv_buffer, buffer, _buffer);
-
    /* drawCount is the number of draws to execute, and can be zero. */
    if (drawCount == 0)
       return;
 
-   cmd_buffer_emit_pre_draw(cmd_buffer);
-
-   struct v3dv_job *job = cmd_buffer->state.job;
-   assert(job);
-
-   const struct v3dv_pipeline *pipeline = cmd_buffer->state.pipeline;
-   uint32_t hw_prim_type = v3d_hw_prim_type(pipeline->vs->topology);
+   V3DV_FROM_HANDLE(v3dv_cmd_buffer, cmd_buffer, commandBuffer);
+   V3DV_FROM_HANDLE(v3dv_buffer, buffer, _buffer);
 
-   v3dv_cl_ensure_space_with_branch(
-      &job->bcl, cl_packet_length(INDIRECT_VERTEX_ARRAY_INSTANCED_PRIMS));
-   v3dv_return_if_oom(cmd_buffer, NULL);
+   struct v3dv_render_pass *pass = cmd_buffer->state.pass;
+   if (likely(!pass->multiview_enabled)) {
+      v3dv_cmd_buffer_emit_pre_draw(cmd_buffer);
+      v3dv_X(cmd_buffer->device, cmd_buffer_emit_draw_indirect)
+         (cmd_buffer, buffer, offset, drawCount, stride);
+      return;
+   }
 
-   cl_emit(&job->bcl, INDIRECT_VERTEX_ARRAY_INSTANCED_PRIMS, prim) {
-      prim.mode = hw_prim_type;
-      prim.number_of_draw_indirect_array_records = drawCount;
-      prim.stride_in_multiples_of_4_bytes = stride >> 2;
-      prim.address = v3dv_cl_address(buffer->mem->bo,
-                                     buffer->mem_offset + offset);
+   uint32_t view_mask = pass->subpasses[cmd_buffer->state.subpass_idx].view_mask;
+   while (view_mask) {
+      cmd_buffer_set_view_index(cmd_buffer, u_bit_scan(&view_mask));
+      v3dv_cmd_buffer_emit_pre_draw(cmd_buffer);
+      v3dv_X(cmd_buffer->device, cmd_buffer_emit_draw_indirect)
+         (cmd_buffer, buffer, offset, drawCount, stride);
    }
 }
 
-void
+VKAPI_ATTR void VKAPI_CALL
 v3dv_CmdDrawIndexedIndirect(VkCommandBuffer commandBuffer,
                             VkBuffer _buffer,
                             VkDeviceSize offset,
                             uint32_t drawCount,
                             uint32_t stride)
 {
-   V3DV_FROM_HANDLE(v3dv_cmd_buffer, cmd_buffer, commandBuffer);
-   V3DV_FROM_HANDLE(v3dv_buffer, buffer, _buffer);
-
    /* drawCount is the number of draws to execute, and can be zero. */
    if (drawCount == 0)
       return;
 
-   cmd_buffer_emit_pre_draw(cmd_buffer);
-
-   struct v3dv_job *job = cmd_buffer->state.job;
-   assert(job);
-
-   const struct v3dv_pipeline *pipeline = cmd_buffer->state.pipeline;
-   uint32_t hw_prim_type = v3d_hw_prim_type(pipeline->vs->topology);
-   uint8_t index_type = ffs(cmd_buffer->state.index_buffer.index_size) - 1;
+   V3DV_FROM_HANDLE(v3dv_cmd_buffer, cmd_buffer, commandBuffer);
+   V3DV_FROM_HANDLE(v3dv_buffer, buffer, _buffer);
 
-   v3dv_cl_ensure_space_with_branch(
-      &job->bcl, cl_packet_length(INDIRECT_INDEXED_INSTANCED_PRIM_LIST));
-   v3dv_return_if_oom(cmd_buffer, NULL);
+   struct v3dv_render_pass *pass = cmd_buffer->state.pass;
+   if (likely(!pass->multiview_enabled)) {
+      v3dv_cmd_buffer_emit_pre_draw(cmd_buffer);
+      v3dv_X(cmd_buffer->device, cmd_buffer_emit_indexed_indirect)
+         (cmd_buffer, buffer, offset, drawCount, stride);
+      return;
+   }
 
-   cl_emit(&job->bcl, INDIRECT_INDEXED_INSTANCED_PRIM_LIST, prim) {
-      prim.index_type = index_type;
-      prim.mode = hw_prim_type;
-      prim.enable_primitive_restarts = pipeline->primitive_restart;
-      prim.number_of_draw_indirect_indexed_records = drawCount;
-      prim.stride_in_multiples_of_4_bytes = stride >> 2;
-      prim.address = v3dv_cl_address(buffer->mem->bo,
-                                     buffer->mem_offset + offset);
+   uint32_t view_mask = pass->subpasses[cmd_buffer->state.subpass_idx].view_mask;
+   while (view_mask) {
+      cmd_buffer_set_view_index(cmd_buffer, u_bit_scan(&view_mask));
+      v3dv_cmd_buffer_emit_pre_draw(cmd_buffer);
+      v3dv_X(cmd_buffer->device, cmd_buffer_emit_indexed_indirect)
+         (cmd_buffer, buffer, offset, drawCount, stride);
    }
 }
 
-void
+VKAPI_ATTR void VKAPI_CALL
 v3dv_CmdPipelineBarrier(VkCommandBuffer commandBuffer,
                         VkPipelineStageFlags srcStageMask,
                         VkPipelineStageFlags dstStageMask,
@@ -4145,7 +2707,7 @@ v3dv_CmdPipelineBarrier(VkCommandBuffer commandBuffer,
    }
 }
 
-void
+VKAPI_ATTR void VKAPI_CALL
 v3dv_CmdBindVertexBuffers(VkCommandBuffer commandBuffer,
                           uint32_t firstBinding,
                           uint32_t bindingCount,
@@ -4180,6 +2742,9 @@ static uint32_t
 get_index_size(VkIndexType index_type)
 {
    switch (index_type) {
+   case VK_INDEX_TYPE_UINT8_EXT:
+      return 1;
+      break;
    case VK_INDEX_TYPE_UINT16:
       return 2;
       break;
@@ -4191,50 +2756,28 @@ get_index_size(VkIndexType index_type)
    }
 }
 
-void
+VKAPI_ATTR void VKAPI_CALL
 v3dv_CmdBindIndexBuffer(VkCommandBuffer commandBuffer,
                         VkBuffer buffer,
                         VkDeviceSize offset,
                         VkIndexType indexType)
 {
    V3DV_FROM_HANDLE(v3dv_cmd_buffer, cmd_buffer, commandBuffer);
-   V3DV_FROM_HANDLE(v3dv_buffer, ibuffer, buffer);
-
-   struct v3dv_job *job = cmd_buffer->state.job;
-   assert(job);
-
-   v3dv_cl_ensure_space_with_branch(
-      &job->bcl, cl_packet_length(INDEX_BUFFER_SETUP));
-   v3dv_return_if_oom(cmd_buffer, NULL);
 
    const uint32_t index_size = get_index_size(indexType);
-
-   /* If we have started a new job we always need to emit index buffer state.
-    * We know we are in that scenario because that is the only case where we
-    * set the dirty bit.
-    */
-   if (!(cmd_buffer->state.dirty & V3DV_CMD_DIRTY_INDEX_BUFFER)) {
-      if (buffer == cmd_buffer->state.index_buffer.buffer &&
-          offset == cmd_buffer->state.index_buffer.offset &&
-          index_size == cmd_buffer->state.index_buffer.index_size) {
-         return;
-      }
-   }
-
-   cl_emit(&job->bcl, INDEX_BUFFER_SETUP, ib) {
-      ib.address = v3dv_cl_address(ibuffer->mem->bo,
-                                   ibuffer->mem_offset + offset);
-      ib.size = ibuffer->mem->bo->size;
+   if (buffer == cmd_buffer->state.index_buffer.buffer &&
+       offset == cmd_buffer->state.index_buffer.offset &&
+       index_size == cmd_buffer->state.index_buffer.index_size) {
+      return;
    }
 
    cmd_buffer->state.index_buffer.buffer = buffer;
    cmd_buffer->state.index_buffer.offset = offset;
    cmd_buffer->state.index_buffer.index_size = index_size;
-
-   cmd_buffer->state.dirty &= ~V3DV_CMD_DIRTY_INDEX_BUFFER;
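+   /* Defer emitting INDEX_BUFFER_SETUP to draw time via the dirty bit */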
+   cmd_buffer->state.dirty |= V3DV_CMD_DIRTY_INDEX_BUFFER;
 }
 
-void
+VKAPI_ATTR void VKAPI_CALL
 v3dv_CmdSetStencilCompareMask(VkCommandBuffer commandBuffer,
                               VkStencilFaceFlags faceMask,
                               uint32_t compareMask)
@@ -4249,7 +2792,7 @@ v3dv_CmdSetStencilCompareMask(VkCommandBuffer commandBuffer,
    cmd_buffer->state.dirty |= V3DV_CMD_DIRTY_STENCIL_COMPARE_MASK;
 }
 
-void
+VKAPI_ATTR void VKAPI_CALL
 v3dv_CmdSetStencilWriteMask(VkCommandBuffer commandBuffer,
                             VkStencilFaceFlags faceMask,
                             uint32_t writeMask)
@@ -4264,7 +2807,7 @@ v3dv_CmdSetStencilWriteMask(VkCommandBuffer commandBuffer,
    cmd_buffer->state.dirty |= V3DV_CMD_DIRTY_STENCIL_WRITE_MASK;
 }
 
-void
+VKAPI_ATTR void VKAPI_CALL
 v3dv_CmdSetStencilReference(VkCommandBuffer commandBuffer,
                             VkStencilFaceFlags faceMask,
                             uint32_t reference)
@@ -4279,7 +2822,7 @@ v3dv_CmdSetStencilReference(VkCommandBuffer commandBuffer,
    cmd_buffer->state.dirty |= V3DV_CMD_DIRTY_STENCIL_REFERENCE;
 }
 
-void
+VKAPI_ATTR void VKAPI_CALL
 v3dv_CmdSetDepthBias(VkCommandBuffer commandBuffer,
                      float depthBiasConstantFactor,
                      float depthBiasClamp,
@@ -4288,11 +2831,12 @@ v3dv_CmdSetDepthBias(VkCommandBuffer commandBuffer,
    V3DV_FROM_HANDLE(v3dv_cmd_buffer, cmd_buffer, commandBuffer);
 
    cmd_buffer->state.dynamic.depth_bias.constant_factor = depthBiasConstantFactor;
+   cmd_buffer->state.dynamic.depth_bias.depth_bias_clamp = depthBiasClamp;
    cmd_buffer->state.dynamic.depth_bias.slope_factor = depthBiasSlopeFactor;
    cmd_buffer->state.dirty |= V3DV_CMD_DIRTY_DEPTH_BIAS;
 }
 
-void
+VKAPI_ATTR void VKAPI_CALL
 v3dv_CmdSetDepthBounds(VkCommandBuffer commandBuffer,
                        float minDepthBounds,
                        float maxDepthBounds)
@@ -4302,7 +2846,7 @@ v3dv_CmdSetDepthBounds(VkCommandBuffer commandBuffer,
     */
 }
 
-void
+VKAPI_ATTR void VKAPI_CALL
 v3dv_CmdSetLineWidth(VkCommandBuffer commandBuffer,
                      float lineWidth)
 {
@@ -4312,7 +2856,7 @@ v3dv_CmdSetLineWidth(VkCommandBuffer commandBuffer,
    cmd_buffer->state.dirty |= V3DV_CMD_DIRTY_LINE_WIDTH;
 }
 
-void
+VKAPI_ATTR void VKAPI_CALL
 v3dv_CmdBindDescriptorSets(VkCommandBuffer commandBuffer,
                            VkPipelineBindPoint pipelineBindPoint,
                            VkPipelineLayout _layout,
@@ -4330,20 +2874,20 @@ v3dv_CmdBindDescriptorSets(VkCommandBuffer commandBuffer,
    assert(firstSet + descriptorSetCount <= MAX_SETS);
 
    struct v3dv_descriptor_state *descriptor_state =
-      &cmd_buffer->state.descriptor_state[pipelineBindPoint];
+      pipelineBindPoint == VK_PIPELINE_BIND_POINT_COMPUTE ?
+      &cmd_buffer->state.compute.descriptor_state :
+      &cmd_buffer->state.gfx.descriptor_state;
 
+   VkShaderStageFlags dirty_stages = 0;
    bool descriptor_state_changed = false;
    for (uint32_t i = 0; i < descriptorSetCount; i++) {
       V3DV_FROM_HANDLE(v3dv_descriptor_set, set, pDescriptorSets[i]);
       uint32_t index = firstSet + i;
 
+      descriptor_state->valid |= (1u << index);
       if (descriptor_state->descriptor_sets[index] != set) {
          descriptor_state->descriptor_sets[index] = set;
-         descriptor_state_changed = true;
-      }
-
-      if (!(descriptor_state->valid & (1u << index))) {
-         descriptor_state->valid |= (1u << index);
+         dirty_stages |= set->layout->shader_stages;
          descriptor_state_changed = true;
       }
 
@@ -4352,20 +2896,24 @@ v3dv_CmdBindDescriptorSets(VkCommandBuffer commandBuffer,
 
          if (descriptor_state->dynamic_offsets[idx] != pDynamicOffsets[dyn_index]) {
             descriptor_state->dynamic_offsets[idx] = pDynamicOffsets[dyn_index];
+            dirty_stages |= set->layout->shader_stages;
             descriptor_state_changed = true;
          }
       }
    }
 
    if (descriptor_state_changed) {
-      if (pipelineBindPoint == VK_PIPELINE_BIND_POINT_GRAPHICS)
+      if (pipelineBindPoint == VK_PIPELINE_BIND_POINT_GRAPHICS) {
          cmd_buffer->state.dirty |= V3DV_CMD_DIRTY_DESCRIPTOR_SETS;
-      else
+         cmd_buffer->state.dirty_descriptor_stages |= dirty_stages & VK_SHADER_STAGE_ALL_GRAPHICS;
+      } else {
          cmd_buffer->state.dirty |= V3DV_CMD_DIRTY_COMPUTE_DESCRIPTOR_SETS;
+         cmd_buffer->state.dirty_descriptor_stages |= VK_SHADER_STAGE_COMPUTE_BIT;
+      }
    }
 }
 
-void
+VKAPI_ATTR void VKAPI_CALL
 v3dv_CmdPushConstants(VkCommandBuffer commandBuffer,
                       VkPipelineLayout layout,
                       VkShaderStageFlags stageFlags,
@@ -4375,15 +2923,16 @@ v3dv_CmdPushConstants(VkCommandBuffer commandBuffer,
 {
    V3DV_FROM_HANDLE(v3dv_cmd_buffer, cmd_buffer, commandBuffer);
 
-   if (!memcmp(cmd_buffer->push_constants_data + offset, pValues, size))
+   if (!memcmp((uint8_t *) cmd_buffer->push_constants_data + offset, pValues, size))
       return;
 
-   memcpy((void*) cmd_buffer->push_constants_data + offset, pValues, size);
+   memcpy((uint8_t *) cmd_buffer->push_constants_data + offset, pValues, size);
 
    cmd_buffer->state.dirty |= V3DV_CMD_DIRTY_PUSH_CONSTANTS;
+   cmd_buffer->state.dirty_push_constants_stages |= stageFlags;
 }
 
-void
+VKAPI_ATTR void VKAPI_CALL
 v3dv_CmdSetBlendConstants(VkCommandBuffer commandBuffer,
                           const float blendConstants[4])
 {
@@ -4401,6 +2950,26 @@ v3dv_CmdSetBlendConstants(VkCommandBuffer commandBuffer,
    cmd_buffer->state.dirty |= V3DV_CMD_DIRTY_BLEND_CONSTANTS;
 }
 
+VKAPI_ATTR void VKAPI_CALL
+v3dv_CmdSetColorWriteEnableEXT(VkCommandBuffer commandBuffer,
+                               uint32_t attachmentCount,
+                               const VkBool32 *pColorWriteEnables)
+{
+   V3DV_FROM_HANDLE(v3dv_cmd_buffer, cmd_buffer, commandBuffer);
+   struct v3dv_cmd_buffer_state *state = &cmd_buffer->state;
+   uint32_t color_write_enable = 0;
+
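+   /* Each attachment contributes a 4-bit per-channel (RGBA) write-enable mask */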
+   for (uint32_t i = 0; i < attachmentCount; i++)
+      color_write_enable |= pColorWriteEnables[i] ? (0xfu << (i * 4)) : 0;
+
+   if (state->dynamic.color_write_enable == color_write_enable)
+      return;
+
+   state->dynamic.color_write_enable = color_write_enable;
+
+   state->dirty |= V3DV_CMD_DIRTY_COLOR_WRITE_ENABLE;
+}
+
 void
 v3dv_cmd_buffer_reset_queries(struct v3dv_cmd_buffer *cmd_buffer,
                               struct v3dv_query_pool *pool,
@@ -4429,12 +2998,12 @@ v3dv_cmd_buffer_reset_queries(struct v3dv_cmd_buffer *cmd_buffer,
    list_addtail(&job->list_link, &cmd_buffer->jobs);
 }
 
-static void
-ensure_array_state(struct v3dv_cmd_buffer *cmd_buffer,
-                   uint32_t slot_size,
-                   uint32_t used_count,
-                   uint32_t *alloc_count,
-                   void **ptr)
+void
+v3dv_cmd_buffer_ensure_array_state(struct v3dv_cmd_buffer *cmd_buffer,
+                                   uint32_t slot_size,
+                                   uint32_t used_count,
+                                   uint32_t *alloc_count,
+                                   void **ptr)
 {
    if (used_count >= *alloc_count) {
       const uint32_t prev_slot_count = *alloc_count;
@@ -4442,7 +3011,7 @@ ensure_array_state(struct v3dv_cmd_buffer *cmd_buffer,
 
       const uint32_t new_slot_count = MAX2(*alloc_count * 2, 4);
       const uint32_t bytes = new_slot_count * slot_size;
-      *ptr = vk_alloc(&cmd_buffer->device->alloc, bytes, 8,
+      *ptr = vk_alloc(&cmd_buffer->device->vk.alloc, bytes, 8,
                       VK_SYSTEM_ALLOCATION_SCOPE_COMMAND);
       if (*ptr == NULL) {
          fprintf(stderr, "Error: failed to allocate CPU buffer for query.\n");
@@ -4463,10 +3032,11 @@ v3dv_cmd_buffer_begin_query(struct v3dv_cmd_buffer *cmd_buffer,
                             VkQueryControlFlags flags)
 {
    /* FIXME: we only support one active query for now */
-   assert(cmd_buffer->state.query.active_query == NULL);
+   assert(cmd_buffer->state.query.active_query.bo == NULL);
    assert(query < pool->query_count);
 
-   cmd_buffer->state.query.active_query = pool->queries[query].bo;
+   cmd_buffer->state.query.active_query.bo = pool->queries[query].bo;
+   cmd_buffer->state.query.active_query.offset = pool->queries[query].offset;
    cmd_buffer->state.dirty |= V3DV_CMD_DIRTY_OCCLUSION_QUERY;
 }
 
@@ -4476,7 +3046,7 @@ v3dv_cmd_buffer_end_query(struct v3dv_cmd_buffer *cmd_buffer,
                           uint32_t query)
 {
    assert(query < pool->query_count);
-   assert(cmd_buffer->state.query.active_query != NULL);
+   assert(cmd_buffer->state.query.active_query.bo != NULL);
 
    if  (cmd_buffer->state.pass) {
       /* Queue the EndQuery in the command buffer state, we will create a CPU
@@ -4484,11 +3054,11 @@ v3dv_cmd_buffer_end_query(struct v3dv_cmd_buffer *cmd_buffer,
        * render pass job in which they have been recorded.
        */
       struct v3dv_cmd_buffer_state *state = &cmd_buffer->state;
-      ensure_array_state(cmd_buffer,
-                         sizeof(struct v3dv_end_query_cpu_job_info),
-                         state->query.end.used_count,
-                         &state->query.end.alloc_count,
-                         (void **) &state->query.end.states);
+      v3dv_cmd_buffer_ensure_array_state(cmd_buffer,
+                                         sizeof(struct v3dv_end_query_cpu_job_info),
+                                         state->query.end.used_count,
+                                         &state->query.end.alloc_count,
+                                         (void **) &state->query.end.states);
       v3dv_return_if_oom(cmd_buffer, NULL);
 
       struct v3dv_end_query_cpu_job_info *info =
@@ -4496,6 +3066,27 @@ v3dv_cmd_buffer_end_query(struct v3dv_cmd_buffer *cmd_buffer,
 
       info->pool = pool;
       info->query = query;
+
+      /* From the Vulkan spec:
+       *
+       *   "If queries are used while executing a render pass instance that has
+       *    multiview enabled, the query uses N consecutive query indices in
+       *    the query pool (starting at query) where N is the number of bits set
+       *    in the view mask in the subpass the query is used in. How the
+       *    numerical results of the query are distributed among the queries is
+       *    implementation-dependent."
+       *
+       * In our case, only the first query is used but this means we still need
+       * to flag the other queries as available so we don't emit errors when
+       * the applications attempt to retrieve values from them.
+       */
+      struct v3dv_render_pass *pass = cmd_buffer->state.pass;
+      if (!pass->multiview_enabled) {
+         info->count = 1;
+      } else {
+         struct v3dv_subpass *subpass = &pass->subpasses[state->subpass_idx];
+         info->count = util_bitcount(subpass->view_mask);
+      }
    } else {
       /* Otherwise, schedule the CPU job immediately */
       struct v3dv_job *job =
@@ -4506,10 +3097,14 @@ v3dv_cmd_buffer_end_query(struct v3dv_cmd_buffer *cmd_buffer,
 
       job->cpu.query_end.pool = pool;
       job->cpu.query_end.query = query;
+
+      /* Multiview queries cannot cross subpass boundaries */
+      job->cpu.query_end.count = 1;
+
       list_addtail(&job->list_link, &cmd_buffer->jobs);
    }
 
-   cmd_buffer->state.query.active_query = NULL;
+   cmd_buffer->state.query.active_query.bo = NULL;
    cmd_buffer->state.dirty |= V3DV_CMD_DIRTY_OCCLUSION_QUERY;
 }
 
@@ -4554,7 +3149,7 @@ v3dv_cmd_buffer_add_tfu_job(struct v3dv_cmd_buffer *cmd_buffer,
                             struct drm_v3d_submit_tfu *tfu)
 {
    struct v3dv_device *device = cmd_buffer->device;
-   struct v3dv_job *job = vk_zalloc(&device->alloc,
+   struct v3dv_job *job = vk_zalloc(&device->vk.alloc,
                                     sizeof(struct v3dv_job), 8,
                                     VK_SYSTEM_ALLOCATION_SCOPE_COMMAND);
    if (!job) {
@@ -4567,7 +3162,7 @@ v3dv_cmd_buffer_add_tfu_job(struct v3dv_cmd_buffer *cmd_buffer,
    list_addtail(&job->list_link, &cmd_buffer->jobs);
 }
 
-void
+VKAPI_ATTR void VKAPI_CALL
 v3dv_CmdSetEvent(VkCommandBuffer commandBuffer,
                  VkEvent _event,
                  VkPipelineStageFlags stageMask)
@@ -4593,7 +3188,7 @@ v3dv_CmdSetEvent(VkCommandBuffer commandBuffer,
    list_addtail(&job->list_link, &cmd_buffer->jobs);
 }
 
-void
+VKAPI_ATTR void VKAPI_CALL
 v3dv_CmdResetEvent(VkCommandBuffer commandBuffer,
                    VkEvent _event,
                    VkPipelineStageFlags stageMask)
@@ -4619,7 +3214,7 @@ v3dv_CmdResetEvent(VkCommandBuffer commandBuffer,
    list_addtail(&job->list_link, &cmd_buffer->jobs);
 }
 
-void
+VKAPI_ATTR void VKAPI_CALL
 v3dv_CmdWaitEvents(VkCommandBuffer commandBuffer,
                    uint32_t eventCount,
                    const VkEvent *pEvents,
@@ -4645,7 +3240,7 @@ v3dv_CmdWaitEvents(VkCommandBuffer commandBuffer,
    const uint32_t event_list_size = sizeof(struct v3dv_event *) * eventCount;
 
    job->cpu.event_wait.events =
-      vk_alloc(&cmd_buffer->device->alloc, event_list_size, 8,
+      vk_alloc(&cmd_buffer->device->vk.alloc, event_list_size, 8,
                VK_SYSTEM_ALLOCATION_SCOPE_COMMAND);
    if (!job->cpu.event_wait.events) {
       v3dv_flag_oom(cmd_buffer, NULL);
@@ -4672,30 +3267,58 @@ v3dv_CmdWaitEvents(VkCommandBuffer commandBuffer,
    list_addtail(&job->list_link, &cmd_buffer->jobs);
 }
 
-void
+VKAPI_ATTR void VKAPI_CALL
 v3dv_CmdWriteTimestamp(VkCommandBuffer commandBuffer,
                        VkPipelineStageFlagBits pipelineStage,
                        VkQueryPool queryPool,
                        uint32_t query)
 {
-   unreachable("Timestamp queries are not supported.");
+   V3DV_FROM_HANDLE(v3dv_cmd_buffer, cmd_buffer, commandBuffer);
+   V3DV_FROM_HANDLE(v3dv_query_pool, query_pool, queryPool);
+
+   /* If this is called inside a render pass we need to finish the current
+    * job here...
+    */
+   struct v3dv_render_pass *pass = cmd_buffer->state.pass;
+   if (pass)
+      v3dv_cmd_buffer_finish_job(cmd_buffer);
+
+   struct v3dv_job *job =
+      v3dv_cmd_buffer_create_cpu_job(cmd_buffer->device,
+                                     V3DV_JOB_TYPE_CPU_TIMESTAMP_QUERY,
+                                     cmd_buffer, -1);
+   v3dv_return_if_oom(cmd_buffer, NULL);
+
+   job->cpu.query_timestamp.pool = query_pool;
+   job->cpu.query_timestamp.query = query;
+
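+   /* In a multiview subpass a query uses one slot per view in the view mask,
+    * so record how many slots this timestamp write covers.
+    */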
+   if (!pass || !pass->multiview_enabled) {
+      job->cpu.query_timestamp.count = 1;
+   } else {
+      struct v3dv_subpass *subpass =
+         &pass->subpasses[cmd_buffer->state.subpass_idx];
+      job->cpu.query_timestamp.count = util_bitcount(subpass->view_mask);
+   }
+
+   list_addtail(&job->list_link, &cmd_buffer->jobs);
+   cmd_buffer->state.job = NULL;
+
+   /* ...and resume the subpass after the timestamp */
+   if (cmd_buffer->state.pass)
+      v3dv_cmd_buffer_subpass_resume(cmd_buffer, cmd_buffer->state.subpass_idx);
 }
 
 static void
 cmd_buffer_emit_pre_dispatch(struct v3dv_cmd_buffer *cmd_buffer)
 {
-   assert(cmd_buffer->state.pipeline);
-   assert(cmd_buffer->state.pipeline->active_stages == VK_SHADER_STAGE_COMPUTE_BIT);
-
-   /* We may need to compile shader variants based on bound textures */
-   uint32_t *dirty = &cmd_buffer->state.dirty;
-   if (*dirty & (V3DV_CMD_DIRTY_PIPELINE |
-                 V3DV_CMD_DIRTY_COMPUTE_DESCRIPTOR_SETS)) {
-      update_pipeline_variants(cmd_buffer);
-   }
+   assert(cmd_buffer->state.compute.pipeline);
+   assert(cmd_buffer->state.compute.pipeline->active_stages ==
+          VK_SHADER_STAGE_COMPUTE_BIT);
 
-   *dirty &= ~(V3DV_CMD_DIRTY_PIPELINE |
-               V3DV_CMD_DIRTY_COMPUTE_DESCRIPTOR_SETS);
+   cmd_buffer->state.dirty &= ~(V3DV_CMD_DIRTY_COMPUTE_PIPELINE |
+                                V3DV_CMD_DIRTY_COMPUTE_DESCRIPTOR_SETS);
+   cmd_buffer->state.dirty_descriptor_stages &= ~VK_SHADER_STAGE_COMPUTE_BIT;
+   cmd_buffer->state.dirty_push_constants_stages &= ~VK_SHADER_STAGE_COMPUTE_BIT;
 }
 
 #define V3D_CSD_CFG012_WG_COUNT_SHIFT 16
@@ -4743,8 +3366,7 @@ v3dv_cmd_buffer_rewrite_indirect_csd_job(
       /* Make sure the GPU is not currently accessing the indirect CL for this
        * job, since we are about to overwrite some of the uniform data.
        */
-      const uint64_t infinite = 0xffffffffffffffffull;
-      v3dv_bo_wait(job->device, job->indirect.bo, infinite);
+      v3dv_bo_wait(job->device, job->indirect.bo, PIPE_TIMEOUT_INFINITE);
 
       for (uint32_t i = 0; i < 3; i++) {
          if (info->wg_uniform_offsets[i]) {
@@ -4761,16 +3383,21 @@ v3dv_cmd_buffer_rewrite_indirect_csd_job(
 
 static struct v3dv_job *
 cmd_buffer_create_csd_job(struct v3dv_cmd_buffer *cmd_buffer,
+                          uint32_t base_offset_x,
+                          uint32_t base_offset_y,
+                          uint32_t base_offset_z,
                           uint32_t group_count_x,
                           uint32_t group_count_y,
                           uint32_t group_count_z,
                           uint32_t **wg_uniform_offsets_out,
                           uint32_t *wg_size_out)
 {
-   struct v3dv_pipeline *pipeline = cmd_buffer->state.pipeline;
-   assert(pipeline && pipeline->cs && pipeline->cs->nir);
+   struct v3dv_pipeline *pipeline = cmd_buffer->state.compute.pipeline;
+   assert(pipeline && pipeline->shared_data->variants[BROADCOM_SHADER_COMPUTE]);
+   struct v3dv_shader_variant *cs_variant =
+      pipeline->shared_data->variants[BROADCOM_SHADER_COMPUTE];
 
-   struct v3dv_job *job = vk_zalloc(&cmd_buffer->device->alloc,
+   struct v3dv_job *job = vk_zalloc(&cmd_buffer->device->vk.alloc,
                                     sizeof(struct v3dv_job), 8,
                                     VK_SYSTEM_ALLOCATION_SCOPE_COMMAND);
    if (!job) {
@@ -4787,42 +3414,59 @@ cmd_buffer_create_csd_job(struct v3dv_cmd_buffer *cmd_buffer,
    job->csd.wg_count[1] = group_count_y;
    job->csd.wg_count[2] = group_count_z;
 
+   job->csd.wg_base[0] = base_offset_x;
+   job->csd.wg_base[1] = base_offset_y;
+   job->csd.wg_base[2] = base_offset_z;
+
    submit->cfg[0] |= group_count_x << V3D_CSD_CFG012_WG_COUNT_SHIFT;
    submit->cfg[1] |= group_count_y << V3D_CSD_CFG012_WG_COUNT_SHIFT;
    submit->cfg[2] |= group_count_z << V3D_CSD_CFG012_WG_COUNT_SHIFT;
 
-   const struct nir_shader *cs =  pipeline->cs->nir;
-
-   const uint32_t wgs_per_sg = 1; /* FIXME */
-   const uint32_t wg_size = cs->info.cs.local_size[0] *
-                            cs->info.cs.local_size[1] *
-                            cs->info.cs.local_size[2];
-   submit->cfg[3] |= wgs_per_sg << V3D_CSD_CFG3_WGS_PER_SG_SHIFT;
-   submit->cfg[3] |= ((DIV_ROUND_UP(wgs_per_sg * wg_size, 16) - 1) <<
-                     V3D_CSD_CFG3_BATCHES_PER_SG_M1_SHIFT);
+   const struct v3d_compute_prog_data *cpd =
+      cs_variant->prog_data.cs;
+
+   const uint32_t num_wgs = group_count_x * group_count_y * group_count_z;
+   const uint32_t wg_size = cpd->local_size[0] *
+                            cpd->local_size[1] *
+                            cpd->local_size[2];
+
+   uint32_t wgs_per_sg =
+      v3d_csd_choose_workgroups_per_supergroup(
+         &cmd_buffer->device->devinfo,
+         cs_variant->prog_data.cs->has_subgroups,
+         cs_variant->prog_data.cs->base.has_control_barrier,
+         cs_variant->prog_data.cs->base.threads,
+         num_wgs, wg_size);
+
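+   /* Each batch covers 16 invocations: whole supergroups take batches_per_sg
+    * batches each, and a trailing partial supergroup only needs enough
+    * batches to cover its remaining workgroups.
+    */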
+   uint32_t batches_per_sg = DIV_ROUND_UP(wgs_per_sg * wg_size, 16);
+   uint32_t whole_sgs = num_wgs / wgs_per_sg;
+   uint32_t rem_wgs = num_wgs - whole_sgs * wgs_per_sg;
+   uint32_t num_batches = batches_per_sg * whole_sgs +
+                          DIV_ROUND_UP(rem_wgs * wg_size, 16);
+
+   submit->cfg[3] |= (wgs_per_sg & 0xf) << V3D_CSD_CFG3_WGS_PER_SG_SHIFT;
+   submit->cfg[3] |= ((batches_per_sg - 1) <<
+                      V3D_CSD_CFG3_BATCHES_PER_SG_M1_SHIFT);
    submit->cfg[3] |= (wg_size & 0xff) << V3D_CSD_CFG3_WG_SIZE_SHIFT;
    if (wg_size_out)
       *wg_size_out = wg_size;
 
-   uint32_t batches_per_wg = DIV_ROUND_UP(wg_size, 16);
-   submit->cfg[4] = batches_per_wg *
-                    (group_count_x * group_count_y * group_count_z) - 1;
+   submit->cfg[4] = num_batches - 1;
    assert(submit->cfg[4] != ~0);
 
-   assert(pipeline->cs->current_variant &&
-          pipeline->cs->current_variant->assembly_bo);
-   const struct v3dv_shader_variant *variant = pipeline->cs->current_variant;
-   submit->cfg[5] = variant->assembly_bo->offset;
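+   /* The pipeline's assembly BO holds the code for all of its shader
+    * variants, so point the CSD at the compute variant's offset within it.
+    */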
+   assert(pipeline->shared_data->assembly_bo);
+   struct v3dv_bo *cs_assembly_bo = pipeline->shared_data->assembly_bo;
+
+   submit->cfg[5] = cs_assembly_bo->offset + cs_variant->assembly_offset;
    submit->cfg[5] |= V3D_CSD_CFG5_PROPAGATE_NANS;
-   if (variant->prog_data.base->single_seg)
+   if (cs_variant->prog_data.base->single_seg)
       submit->cfg[5] |= V3D_CSD_CFG5_SINGLE_SEG;
-   if (variant->prog_data.base->threads == 4)
+   if (cs_variant->prog_data.base->threads == 4)
       submit->cfg[5] |= V3D_CSD_CFG5_THREADING;
 
-   if (variant->prog_data.cs->shared_size > 0) {
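+   /* If the shader uses shared variables, allocate enough space for all the
+    * workgroups that can be packed into a single supergroup.
+    */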
+   if (cs_variant->prog_data.cs->shared_size > 0) {
       job->csd.shared_memory =
          v3dv_bo_alloc(cmd_buffer->device,
-                       variant->prog_data.cs->shared_size * wgs_per_sg,
+                       cs_variant->prog_data.cs->shared_size * wgs_per_sg,
                        "shared_vars", true);
       if (!job->csd.shared_memory) {
          v3dv_flag_oom(cmd_buffer, NULL);
@@ -4830,10 +3474,10 @@ cmd_buffer_create_csd_job(struct v3dv_cmd_buffer *cmd_buffer,
       }
    }
 
-   v3dv_job_add_bo(job, variant->assembly_bo);
-
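+   /* The job was just created and does not reference this BO yet, so it is
+    * safe to use the unchecked variant and skip the duplicate lookup.
+    */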
+   v3dv_job_add_bo_unchecked(job, cs_assembly_bo);
    struct v3dv_cl_reloc uniforms =
-      v3dv_write_uniforms_wg_offsets(cmd_buffer, pipeline->cs,
+      v3dv_write_uniforms_wg_offsets(cmd_buffer, pipeline,
+                                     cs_variant,
                                      wg_uniform_offsets_out);
    submit->cfg[6] = uniforms.bo->offset + uniforms.offset;
 
@@ -4844,6 +3488,9 @@ cmd_buffer_create_csd_job(struct v3dv_cmd_buffer *cmd_buffer,
 
 static void
 cmd_buffer_dispatch(struct v3dv_cmd_buffer *cmd_buffer,
+                    uint32_t base_offset_x,
+                    uint32_t base_offset_y,
+                    uint32_t base_offset_z,
                     uint32_t group_count_x,
                     uint32_t group_count_y,
                     uint32_t group_count_z)
@@ -4853,6 +3500,9 @@ cmd_buffer_dispatch(struct v3dv_cmd_buffer *cmd_buffer,
 
    struct v3dv_job *job =
       cmd_buffer_create_csd_job(cmd_buffer,
+                                base_offset_x,
+                                base_offset_y,
+                                base_offset_z,
                                 group_count_x,
                                 group_count_y,
                                 group_count_z,
@@ -4862,7 +3512,7 @@ cmd_buffer_dispatch(struct v3dv_cmd_buffer *cmd_buffer,
    cmd_buffer->state.job = NULL;
 }
 
-void
+VKAPI_ATTR void VKAPI_CALL
 v3dv_CmdDispatch(VkCommandBuffer commandBuffer,
                  uint32_t groupCountX,
                  uint32_t groupCountY,
@@ -4871,9 +3521,28 @@ v3dv_CmdDispatch(VkCommandBuffer commandBuffer,
    V3DV_FROM_HANDLE(v3dv_cmd_buffer, cmd_buffer, commandBuffer);
 
    cmd_buffer_emit_pre_dispatch(cmd_buffer);
-   cmd_buffer_dispatch(cmd_buffer, groupCountX, groupCountY, groupCountZ);
+   cmd_buffer_dispatch(cmd_buffer, 0, 0, 0,
+                       groupCountX, groupCountY, groupCountZ);
+}
+
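+/* From VK_KHR_device_group / Vulkan 1.1: like vkCmdDispatch, but workgroup
+ * IDs start at the given base group indices.
+ */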
+VKAPI_ATTR void VKAPI_CALL
+v3dv_CmdDispatchBase(VkCommandBuffer commandBuffer,
+                     uint32_t baseGroupX,
+                     uint32_t baseGroupY,
+                     uint32_t baseGroupZ,
+                     uint32_t groupCountX,
+                     uint32_t groupCountY,
+                     uint32_t groupCountZ)
+{
+   V3DV_FROM_HANDLE(v3dv_cmd_buffer, cmd_buffer, commandBuffer);
+
+   cmd_buffer_emit_pre_dispatch(cmd_buffer);
+   cmd_buffer_dispatch(cmd_buffer,
+                       baseGroupX, baseGroupY, baseGroupZ,
+                       groupCountX, groupCountY, groupCountZ);
 }
 
 static void
 cmd_buffer_dispatch_indirect(struct v3dv_cmd_buffer *cmd_buffer,
                              struct v3dv_buffer *buffer,
@@ -4898,6 +3567,7 @@ cmd_buffer_dispatch_indirect(struct v3dv_cmd_buffer *cmd_buffer,
     */
    struct v3dv_job *csd_job =
       cmd_buffer_create_csd_job(cmd_buffer,
+                                0, 0, 0,
                                 1, 1, 1,
                                 &job->cpu.csd_indirect.wg_uniform_offsets[0],
                                 &job->cpu.csd_indirect.wg_size);
@@ -4920,7 +3590,7 @@ cmd_buffer_dispatch_indirect(struct v3dv_cmd_buffer *cmd_buffer,
    cmd_buffer->state.job = NULL;
 }
 
-void
+VKAPI_ATTR void VKAPI_CALL
 v3dv_CmdDispatchIndirect(VkCommandBuffer commandBuffer,
                          VkBuffer _buffer,
                          VkDeviceSize offset)
@@ -4934,14 +3604,9 @@ v3dv_CmdDispatchIndirect(VkCommandBuffer commandBuffer,
    cmd_buffer_dispatch_indirect(cmd_buffer, buffer, offset);
 }
 
-void
-v3dv_CmdResolveImage(VkCommandBuffer commandBuffer,
-                     VkImage srcImage,
-                     VkImageLayout srcImageLayout,
-                     VkImage dstImage,
-                     VkImageLayout dstImageLayout,
-                     uint32_t regionCount,
-                     const VkImageResolve *pRegions)
+VKAPI_ATTR void VKAPI_CALL
+v3dv_CmdSetDeviceMask(VkCommandBuffer commandBuffer, uint32_t deviceMask)
 {
-   unreachable("vkCmdResolveImage not implemented");
+   /* Nothing to do here since we only support a single device */
+   assert(deviceMask == 0x1);
 }