v3dv: add the concept of a job
authorIago Toral Quiroga <itoral@igalia.com>
Wed, 8 Jan 2020 10:14:35 +0000 (11:14 +0100)
committerMarge Bot <eric+marge@anholt.net>
Tue, 13 Oct 2020 21:21:26 +0000 (21:21 +0000)
As we make progress towards more complex submissions we will need to split
our command buffers into smaller executable units (jobs) that we can
submit indepdently to the kernel. This will be required to implement
pipeline barriers, split subpasses that have depedencies on previous
subpasses, split render passes that use more than 4 render targets, etc.

For now we keep things simple and we only keep one job as current
recording target in the command buffer, and we generate a new one
with every subpass or with any commands we see outside of a render pass
(only vkCmdCopyImageToBuffer for now). In the future we probably want to
optimize this by merging subpasses into the same job when possible,
etc.

Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/6766>

src/broadcom/vulkan/v3dv_cl.c
src/broadcom/vulkan/v3dv_cl.h
src/broadcom/vulkan/v3dv_cmd_buffer.c
src/broadcom/vulkan/v3dv_meta_copy.c
src/broadcom/vulkan/v3dv_private.h
src/broadcom/vulkan/v3dv_queue.c
src/broadcom/vulkan/v3dv_uniforms.c

index d3494c5..e20e673 100644 (file)
 #include "broadcom/cle/v3dx_pack.h"
 
 void
-v3dv_cl_init(struct v3dv_cmd_buffer *cmd_buffer, struct v3dv_cl *cl)
+v3dv_cl_init(struct v3dv_job *job, struct v3dv_cl *cl)
 {
    cl->base = NULL;
    cl->next = cl->base;
    cl->bo = NULL;
    cl->size = 0;
-   cl->cmd_buffer = cmd_buffer;
+   cl->job = job;
 }
 
 void
 v3dv_cl_begin(struct v3dv_cl *cl)
 {
-   assert(!cl->cmd_buffer ||
-          cl->cmd_buffer->status == V3DV_CMD_BUFFER_STATUS_INITIALIZED);
    assert(v3dv_cl_offset(cl) == 0);
 }
 
@@ -48,15 +46,15 @@ v3dv_cl_reset(struct v3dv_cl *cl)
    /* FIXME: consider keeping the BO when the command buffer is reset with
     * flag VK_COMMAND_BUFFER_RESET_RELEASE_RESOURCES_BIT.
     */
-   v3dv_cl_init(cl->cmd_buffer, cl);
+   v3dv_cl_init(cl->job, cl);
 }
 
 void
 v3dv_cl_destroy(struct v3dv_cl *cl)
 {
    if (cl->bo) {
-      assert(cl->cmd_buffer);
-      v3dv_bo_free(cl->cmd_buffer->device, cl->bo);
+      assert(cl->job);
+      v3dv_bo_free(cl->job->cmd_buffer->device, cl->bo);
    }
 
    /* Leave the CL in a reset state to catch use after destroy instances */
@@ -73,15 +71,15 @@ v3dv_cl_ensure_space(struct v3dv_cl *cl, uint32_t space, uint32_t alignment)
       return offset;
    }
 
-   struct v3dv_bo *bo = v3dv_bo_alloc(cl->cmd_buffer->device, space);
+   struct v3dv_bo *bo = v3dv_bo_alloc(cl->job->cmd_buffer->device, space);
    if (!bo) {
       fprintf(stderr, "failed to allocate memory for command list");
       abort();
    }
 
-   v3dv_cmd_buffer_add_bo(cl->cmd_buffer, bo);
+   v3dv_job_add_bo(cl->job, bo);
 
-   bool ok = v3dv_bo_map(cl->cmd_buffer->device, bo, bo->size);
+   bool ok = v3dv_bo_map(cl->job->cmd_buffer->device, bo, bo->size);
    if (!ok) {
       fprintf(stderr, "failed to map command list buffer");
       abort();
@@ -102,7 +100,7 @@ v3dv_cl_ensure_space_with_branch(struct v3dv_cl *cl, uint32_t space)
    if (v3dv_cl_offset(cl) + space + cl_packet_length(BRANCH) <= cl->size)
       return;
 
-   struct v3dv_bo *bo = v3dv_bo_alloc(cl->cmd_buffer->device, space);
+   struct v3dv_bo *bo = v3dv_bo_alloc(cl->job->cmd_buffer->device, space);
    if (!bo) {
       fprintf(stderr, "failed to allocate memory for command list");
       abort();
@@ -115,9 +113,9 @@ v3dv_cl_ensure_space_with_branch(struct v3dv_cl *cl, uint32_t space)
       }
    }
 
-   v3dv_cmd_buffer_add_bo(cl->cmd_buffer, bo);
+   v3dv_job_add_bo(cl->job, bo);
 
-   bool ok = v3dv_bo_map(cl->cmd_buffer->device, bo, bo->size);
+   bool ok = v3dv_bo_map(cl->job->cmd_buffer->device, bo, bo->size);
    if (!ok) {
       fprintf(stderr, "failed to map command list buffer");
       abort();
index f58b2d5..c95110f 100644 (file)
 #include "broadcom/cle/v3d_packet_helpers.h"
 
 struct v3dv_bo;
-struct v3dv_cmd_buffer;
+struct v3dv_job;
 struct v3dv_cl;
 
-void v3dv_cmd_buffer_add_bo(struct v3dv_cmd_buffer *cmd_buffer, struct v3dv_bo *bo);
+void v3dv_job_add_bo(struct v3dv_job *job, struct v3dv_bo *bo);
 
 /**
  * Undefined structure, used for typechecking that you're passing the pointers
@@ -46,7 +46,7 @@ struct v3dv_cl_reloc {
 
 struct v3dv_cl {
    void *base;
-   struct v3dv_cmd_buffer *cmd_buffer;
+   struct v3dv_job *job;
    struct v3dv_cl_out *next;
    struct v3dv_bo *bo;
    uint32_t size;
@@ -82,7 +82,7 @@ v3dv_cl_get_address(struct v3dv_cl *cl)
    return (struct v3dv_cl_reloc){ .bo = cl->bo, .offset = v3dv_cl_offset(cl) };
 }
 
-void v3dv_cl_init(struct v3dv_cmd_buffer *cmd_buffer, struct v3dv_cl *cl);
+void v3dv_cl_init(struct v3dv_job *job, struct v3dv_cl *cl);
 void v3dv_cl_begin(struct v3dv_cl *cl);
 void v3dv_cl_reset(struct v3dv_cl *cl);
 void v3dv_cl_destroy(struct v3dv_cl *cl);
@@ -167,7 +167,7 @@ static inline void
 cl_pack_emit_reloc(struct v3dv_cl *cl, const struct v3dv_cl_reloc *reloc)
 {
         if (reloc->bo)
-                v3dv_cmd_buffer_add_bo(cl->cmd_buffer, reloc->bo);
+                v3dv_job_add_bo(cl->job, reloc->bo);
 }
 
 #endif /* V3DV_CL_H */
index ca302db..caf2d2a 100644 (file)
@@ -35,16 +35,16 @@ const struct v3dv_dynamic_state default_dynamic_state = {
 };
 
 void
-v3dv_cmd_buffer_add_bo(struct v3dv_cmd_buffer *cmd_buffer, struct v3dv_bo *bo)
+v3dv_job_add_bo(struct v3dv_job *job, struct v3dv_bo *bo)
 {
    if (!bo)
       return;
 
-   if (_mesa_set_search(cmd_buffer->bos, bo))
+   if (_mesa_set_search(job->bos, bo))
       return;
 
-   _mesa_set_add(cmd_buffer->bos, bo);
-   cmd_buffer->bo_count++;
+   _mesa_set_add(job->bos, bo);
+   job->bo_count++;
 }
 
 VkResult
@@ -94,13 +94,7 @@ cmd_buffer_create(struct v3dv_device *device,
    cmd_buffer->level = level;
    cmd_buffer->usage_flags = 0;
 
-   cmd_buffer->bos =
-      _mesa_set_create(NULL, _mesa_hash_pointer, _mesa_key_pointer_equal);
-   cmd_buffer->bo_count = 0;
-
-   v3dv_cl_init(cmd_buffer, &cmd_buffer->bcl);
-   v3dv_cl_init(cmd_buffer, &cmd_buffer->rcl);
-   v3dv_cl_init(cmd_buffer, &cmd_buffer->indirect);
+   list_inithead(&cmd_buffer->submit_jobs);
 
    cmd_buffer->status = V3DV_CMD_BUFFER_STATUS_NEW;
 
@@ -113,48 +107,114 @@ cmd_buffer_create(struct v3dv_device *device,
 }
 
 static void
-cmd_buffer_destroy(struct v3dv_cmd_buffer *cmd_buffer)
+job_destroy(struct v3dv_job *job)
 {
-   list_del(&cmd_buffer->pool_link);
+   assert(job);
+
+   list_del(&job->list_link);
 
-   v3dv_cl_destroy(&cmd_buffer->bcl);
-   v3dv_cl_destroy(&cmd_buffer->rcl);
-   v3dv_cl_destroy(&cmd_buffer->indirect);
+   v3dv_cl_destroy(&job->bcl);
+   v3dv_cl_destroy(&job->rcl);
+   v3dv_cl_destroy(&job->indirect);
 
    /* Since we don't ref BOs, when we add them to the command buffer, don't
     * unref them here either.
     */
 #if 0
-   set_foreach(cmd_buffer->bos, entry) {
+   set_foreach(job->bos, entry) {
       struct v3dv_bo *bo = (struct v3dv_bo *)entry->key;
       v3dv_bo_free(cmd_buffer->device, bo);
    }
 #endif
-   _mesa_set_destroy(cmd_buffer->bos, NULL);
+   _mesa_set_destroy(job->bos, NULL);
+
+   v3dv_bo_free(job->cmd_buffer->device, job->tile_alloc);
+   v3dv_bo_free(job->cmd_buffer->device, job->tile_state);
+}
+
+static void
+cmd_buffer_destroy(struct v3dv_cmd_buffer *cmd_buffer)
+{
+   list_del(&cmd_buffer->pool_link);
 
-   v3dv_bo_free(cmd_buffer->device, cmd_buffer->tile_alloc);
-   v3dv_bo_free(cmd_buffer->device, cmd_buffer->tile_state);
+   list_for_each_entry_safe(struct v3dv_job, job,
+                            &cmd_buffer->submit_jobs, list_link) {
+      job_destroy(job);
+   }
+
+   if (cmd_buffer->state.job)
+      job_destroy(cmd_buffer->state.job);
 
    vk_free(&cmd_buffer->pool->alloc, cmd_buffer);
 }
 
+static void
+emit_binning_flush(struct v3dv_job *job)
+{
+   assert(job);
+   v3dv_cl_ensure_space_with_branch(&job->bcl, cl_packet_length(FLUSH));
+   cl_emit(&job->bcl, FLUSH, flush);
+}
+
+void
+v3dv_cmd_buffer_finish_job(struct v3dv_cmd_buffer *cmd_buffer)
+{
+   struct v3dv_job *job = cmd_buffer->state.job;
+   assert(job);
+   assert(v3dv_cl_offset(&job->bcl) != 0);
+
+   list_addtail(&job->list_link, &cmd_buffer->submit_jobs);
+   cmd_buffer->state.job = NULL;
+}
+
+struct v3dv_job *
+v3dv_cmd_buffer_start_job(struct v3dv_cmd_buffer *cmd_buffer)
+{
+   /* Ensure we are not starting a new job without finishing a previous one */
+   if (cmd_buffer->state.job != NULL) {
+      emit_binning_flush(cmd_buffer->state.job);
+      v3dv_cmd_buffer_finish_job(cmd_buffer);
+   }
+
+   assert(cmd_buffer->state.job == NULL);
+   struct v3dv_job *job = vk_zalloc(&cmd_buffer->device->alloc,
+                                    sizeof(struct v3dv_job), 8,
+                                    VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
+   assert(job);
+
+   job->cmd_buffer = cmd_buffer;
+
+   job->bos =
+      _mesa_set_create(NULL, _mesa_hash_pointer, _mesa_key_pointer_equal);
+   job->bo_count = 0;
+
+   v3dv_cl_init(job, &job->bcl);
+   v3dv_cl_begin(&job->bcl);
+
+   v3dv_cl_init(job, &job->rcl);
+   v3dv_cl_begin(&job->rcl);
+
+   v3dv_cl_init(job, &job->indirect);
+   v3dv_cl_begin(&job->indirect);
+
+   cmd_buffer->state.job = job;
+   return job;
+}
+
 static VkResult
 cmd_buffer_reset(struct v3dv_cmd_buffer *cmd_buffer)
 {
    if (cmd_buffer->status != V3DV_CMD_BUFFER_STATUS_INITIALIZED) {
-      cmd_buffer->usage_flags = 0;
+      /* FIXME */
+      assert(cmd_buffer->status == V3DV_CMD_BUFFER_STATUS_NEW);
 
-      _mesa_set_clear(cmd_buffer->bos, NULL);
-      cmd_buffer->bo_count = 0;
-
-      v3dv_cl_reset(&cmd_buffer->bcl);
-      v3dv_cl_reset(&cmd_buffer->rcl);
-      v3dv_cl_reset(&cmd_buffer->indirect);
+      cmd_buffer->usage_flags = 0;
 
       struct v3dv_cmd_buffer_state *state = &cmd_buffer->state;
       state->pass = NULL;
       state->framebuffer = NULL;
       state->subpass_idx = 0;
+      state->job = NULL;
 
       cmd_buffer->status = V3DV_CMD_BUFFER_STATUS_INITIALIZED;
    }
@@ -248,19 +308,16 @@ v3dv_BeginCommandBuffer(VkCommandBuffer commandBuffer,
 
    cmd_buffer->usage_flags = pBeginInfo->flags;
 
-   v3dv_cl_begin(&cmd_buffer->bcl);
-   v3dv_cl_begin(&cmd_buffer->rcl);
-   v3dv_cl_begin(&cmd_buffer->indirect);
-
    cmd_buffer->status = V3DV_CMD_BUFFER_STATUS_RECORDING;
 
    return VK_SUCCESS;
 }
 
 static void
-emit_clip_window(struct v3dv_cmd_buffer *cmd_buffer, VkRect2D *rect)
+emit_clip_window(struct v3dv_job *job, const VkRect2D *rect)
 {
-   cl_emit(&cmd_buffer->bcl, CLIP_WINDOW, clip) {
+   assert(job);
+   cl_emit(&job->bcl, CLIP_WINDOW, clip) {
       clip.clip_window_left_pixel_coordinate = rect->offset.x;
       clip.clip_window_bottom_pixel_coordinate = rect->offset.y;
       clip.clip_window_width_in_pixels = rect->extent.width;
@@ -349,90 +406,12 @@ v3dv_CmdBeginRenderPass(VkCommandBuffer commandBuffer,
                                      pRenderPassBegin->clearValueCount,
                                      pRenderPassBegin->pClearValues);
 
-   v3dv_cl_ensure_space_with_branch(&cmd_buffer->bcl, 256);
-
-   /* The PTB will request the tile alloc initial size per tile at start
-    * of tile binning.
-    */
-   const uint32_t fb_layers = 1; /* FIXME */
-   uint32_t tile_alloc_size = 64 * MAX2(fb_layers, 1) *
-                              framebuffer->draw_tiles_x *
-                              framebuffer->draw_tiles_y;
-
-   /* The PTB allocates in aligned 4k chunks after the initial setup. */
-   tile_alloc_size = align(tile_alloc_size, 4096);
-
-   /* Include the first two chunk allocations that the PTB does so that
-    * we definitely clear the OOM condition before triggering one (the HW
-    * won't trigger OOM during the first allocations).
-    */
-   tile_alloc_size += 8192;
-
-   /* For performance, allocate some extra initial memory after the PTB's
-    * minimal allocations, so that we hopefully don't have to block the
-    * GPU on the kernel handling an OOM signal.
-    */
-   tile_alloc_size += 512 * 1024;
-
-   cmd_buffer->tile_alloc = v3dv_bo_alloc(cmd_buffer->device, tile_alloc_size);
-   v3dv_cmd_buffer_add_bo(cmd_buffer, cmd_buffer->tile_alloc);
-
-   const uint32_t tsda_per_tile_size = 256;
-   const uint32_t tile_state_size = MAX2(fb_layers, 1) *
-                                    framebuffer->draw_tiles_x *
-                                    framebuffer->draw_tiles_y *
-                                    tsda_per_tile_size;
-   cmd_buffer->tile_state = v3dv_bo_alloc(cmd_buffer->device, tile_state_size);
-   v3dv_cmd_buffer_add_bo(cmd_buffer, cmd_buffer->tile_state);
-
-   /* This must go before the binning mode configuration. It is
-    * required for layered framebuffers to work.
-    */
-   if (fb_layers > 0) {
-      cl_emit(&cmd_buffer->bcl, NUMBER_OF_LAYERS, config) {
-         config.number_of_layers = fb_layers;
-      }
-   }
-
-   cl_emit(&cmd_buffer->bcl, TILE_BINNING_MODE_CFG, config) {
-      config.width_in_pixels = framebuffer->width;
-      config.height_in_pixels = framebuffer->height;
-      config.number_of_render_targets = MAX2(framebuffer->attachment_count, 1);
-      config.multisample_mode_4x = false; /* FIXME */
-      config.maximum_bpp_of_all_render_targets = framebuffer->internal_bpp;
-   }
-
-   /* There's definitely nothing in the VCD cache we want. */
-   cl_emit(&cmd_buffer->bcl, FLUSH_VCD_CACHE, bin);
-
-   /* Disable any leftover OQ state from another job. */
-   cl_emit(&cmd_buffer->bcl, OCCLUSION_QUERY_COUNTER, counter);
-
-   /* "Binning mode lists must have a Start Tile Binning item (6) after
-    *  any prefix state data before the binning list proper starts."
-    */
-   cl_emit(&cmd_buffer->bcl, START_TILE_BINNING, bin);
-
    /* FIXME: probably need to align the render area to tile boundaries since
     *        the tile clears will render full tiles anyway.
     *        See vkGetRenderAreaGranularity().
     */
    state->render_area = pRenderPassBegin->renderArea;
 
-   /* If we don't have a scissor or viewport defined let's just use the render
-    * area as clip_window, as that would be required for a clear in any
-    * case. If we have that, it would be emitted as part of the pipeline
-    * dynamic state flush
-    *
-    * FIXME: this is mostly just needed for clear. radv has dedicated paths
-    * for them, so we could get that idea. In any case, need to revisit if
-    * this is the place to emit the clip window.
-    */
-   if (cmd_buffer->state.dynamic.scissor.count == 0 &&
-       cmd_buffer->state.dynamic.viewport.count == 0) {
-      emit_clip_window(cmd_buffer, &state->render_area);
-   }
-
    /* Setup for first subpass */
    state->subpass_idx = 0;
 }
@@ -627,10 +606,13 @@ emit_stores(struct v3dv_cmd_buffer *cmd_buffer,
 static void
 emit_generic_per_tile_list(struct v3dv_cmd_buffer *cmd_buffer, uint32_t layer)
 {
+   struct v3dv_job *job = cmd_buffer->state.job;
+   assert(job);
+
    /* Emit the generic list in our indirect state -- the rcl will just
     * have pointers into it.
     */
-   struct v3dv_cl *cl = &cmd_buffer->indirect;
+   struct v3dv_cl *cl = &job->indirect;
    v3dv_cl_ensure_space(cl, 200, 1);
    struct v3dv_cl_reloc tile_list_start = v3dv_cl_get_address(cl);
 
@@ -653,7 +635,7 @@ emit_generic_per_tile_list(struct v3dv_cmd_buffer *cmd_buffer, uint32_t layer)
 
    cl_emit(cl, RETURN_FROM_SUB_LIST, ret);
 
-   cl_emit(&cmd_buffer->rcl, START_ADDRESS_OF_GENERIC_TILE_LIST, branch) {
+   cl_emit(&job->rcl, START_ADDRESS_OF_GENERIC_TILE_LIST, branch) {
       branch.start = tile_list_start;
       branch.end = v3dv_cl_get_address(cl);
    }
@@ -665,7 +647,8 @@ emit_render_layer(struct v3dv_cmd_buffer *cmd_buffer, uint32_t layer)
    const struct v3dv_cmd_buffer_state *state = &cmd_buffer->state;
    const struct v3dv_framebuffer *framebuffer = state->framebuffer;
 
-   struct v3dv_cl *rcl = &cmd_buffer->rcl;
+   struct v3dv_job *job = cmd_buffer->state.job;
+   struct v3dv_cl *rcl = &job->rcl;
 
    /* If doing multicore binning, we would need to initialize each
     * core's tile list here.
@@ -673,7 +656,7 @@ emit_render_layer(struct v3dv_cmd_buffer *cmd_buffer, uint32_t layer)
    const uint32_t tile_alloc_offset =
       64 * layer * framebuffer->draw_tiles_x * framebuffer->draw_tiles_y;
    cl_emit(rcl, MULTICORE_RENDERING_TILE_LIST_SET_BASE, list) {
-      list.address = v3dv_cl_address(cmd_buffer->tile_alloc, tile_alloc_offset);
+      list.address = v3dv_cl_address(job->tile_alloc, tile_alloc_offset);
    }
 
    cl_emit(rcl, MULTICORE_RENDERING_SUPERTILE_CFG, config) {
@@ -758,10 +741,13 @@ emit_render_layer(struct v3dv_cmd_buffer *cmd_buffer, uint32_t layer)
 static void
 emit_rcl(struct v3dv_cmd_buffer *cmd_buffer)
 {
+   struct v3dv_job *job = cmd_buffer->state.job;
+   assert(job);
+
    /* FIXME */
    const uint32_t fb_layers = 1;
 
-   v3dv_cl_ensure_space_with_branch(&cmd_buffer->rcl, 200 +
+   v3dv_cl_ensure_space_with_branch(&job->rcl, 200 +
                                     MAX2(fb_layers, 1) * 256 *
                                     cl_packet_length(SUPERTILE_COORDINATES));
 
@@ -772,7 +758,7 @@ emit_rcl(struct v3dv_cmd_buffer *cmd_buffer)
    const struct v3dv_subpass *subpass =
       &state->pass->subpasses[state->subpass_idx];
 
-   struct v3dv_cl *rcl = &cmd_buffer->rcl;
+   struct v3dv_cl *rcl = &job->rcl;
 
    /* Comon config must be the first TILE_RENDERING_MODE_CFG and
     * Z_STENCIL_CLEAR_VALUES must be last. The ones in between are optional
@@ -892,7 +878,7 @@ subpass_start(struct v3dv_cmd_buffer *cmd_buffer)
    for (uint32_t i = 0; i < subpass->color_count; i++) {
       uint32_t rp_attachment_idx = subpass->color_attachments[i].attachment;
       const struct v3dv_render_pass_attachment *attachment =
-         &cmd_buffer->state.pass->attachments[rp_attachment_idx];
+         &state->pass->attachments[rp_attachment_idx];
 
       /* FIXME: if a previous subpass has alredy computed the hw clear color
        *        for this attachment we could skip this. We can just flag this
@@ -904,7 +890,7 @@ subpass_start(struct v3dv_cmd_buffer *cmd_buffer)
 
       const uint32_t sp_attachment_idx = i;
       const struct v3dv_image_view *iview =
-         cmd_buffer->state.framebuffer->attachments[sp_attachment_idx];
+         state->framebuffer->attachments[sp_attachment_idx];
 
       assert((iview->aspects &
               (VK_IMAGE_ASPECT_DEPTH_BIT | VK_IMAGE_ASPECT_STENCIL_BIT)) == 0);
@@ -917,20 +903,113 @@ subpass_start(struct v3dv_cmd_buffer *cmd_buffer)
                                                      clear_color);
       }
    }
+
+   /* FIXME: for now, each subpass goes into a separate job. In the future we
+    * might be able to merge subpasses that render to the same render targets
+    * so long as they don't render to more than 4 color attachments and there
+    * aren't other subpass dependencies preventing this.
+    */
+   struct v3dv_job *job = v3dv_cmd_buffer_start_job(cmd_buffer);
+
+   const struct v3dv_framebuffer *framebuffer = cmd_buffer->state.framebuffer;
+
+   /* Setup binning for this subpass.
+    *
+    * FIXME: For now we do this at the start each subpass but if we implement
+    * subpass merges in the future we would only want to emit this once per job.
+    */
+   v3dv_cl_ensure_space_with_branch(&job->bcl, 256);
+
+   /* The PTB will request the tile alloc initial size per tile at start
+    * of tile binning.
+    */
+   const uint32_t fb_layers = 1; /* FIXME */
+   uint32_t tile_alloc_size = 64 * MAX2(fb_layers, 1) *
+                              framebuffer->draw_tiles_x *
+                              framebuffer->draw_tiles_y;
+
+   /* The PTB allocates in aligned 4k chunks after the initial setup. */
+   tile_alloc_size = align(tile_alloc_size, 4096);
+
+   /* Include the first two chunk allocations that the PTB does so that
+    * we definitely clear the OOM condition before triggering one (the HW
+    * won't trigger OOM during the first allocations).
+    */
+   tile_alloc_size += 8192;
+
+   /* For performance, allocate some extra initial memory after the PTB's
+    * minimal allocations, so that we hopefully don't have to block the
+    * GPU on the kernel handling an OOM signal.
+    */
+   tile_alloc_size += 512 * 1024;
+
+   job->tile_alloc = v3dv_bo_alloc(cmd_buffer->device, tile_alloc_size);
+   v3dv_job_add_bo(job, job->tile_alloc);
+
+   const uint32_t tsda_per_tile_size = 256;
+   const uint32_t tile_state_size = MAX2(fb_layers, 1) *
+                                    framebuffer->draw_tiles_x *
+                                    framebuffer->draw_tiles_y *
+                                    tsda_per_tile_size;
+   job->tile_state = v3dv_bo_alloc(cmd_buffer->device, tile_state_size);
+   v3dv_job_add_bo(job, job->tile_state);
+
+   /* This must go before the binning mode configuration. It is
+    * required for layered framebuffers to work.
+    */
+   if (fb_layers > 0) {
+      cl_emit(&job->bcl, NUMBER_OF_LAYERS, config) {
+         config.number_of_layers = fb_layers;
+      }
+   }
+
+   cl_emit(&job->bcl, TILE_BINNING_MODE_CFG, config) {
+      config.width_in_pixels = framebuffer->width;
+      config.height_in_pixels = framebuffer->height;
+      config.number_of_render_targets = MAX2(framebuffer->attachment_count, 1);
+      config.multisample_mode_4x = false; /* FIXME */
+      config.maximum_bpp_of_all_render_targets = framebuffer->internal_bpp;
+   }
+
+   /* There's definitely nothing in the VCD cache we want. */
+   cl_emit(&job->bcl, FLUSH_VCD_CACHE, bin);
+
+   /* Disable any leftover OQ state from another job. */
+   cl_emit(&job->bcl, OCCLUSION_QUERY_COUNTER, counter);
+
+   /* "Binning mode lists must have a Start Tile Binning item (6) after
+    *  any prefix state data before the binning list proper starts."
+    */
+   cl_emit(&job->bcl, START_TILE_BINNING, bin);
+
+   /* If we don't have a scissor or viewport defined let's just use the render
+    * area as clip_window, as that would be required for a clear in any
+    * case. If we have that, it would be emitted as part of the pipeline
+    * dynamic state flush
+    *
+    * FIXME: this is mostly just needed for clear. radv has dedicated paths
+    * for them, so we could get that idea. In any case, need to revisit if
+    * this is the place to emit the clip window.
+    */
+   if (cmd_buffer->state.dynamic.scissor.count == 0 &&
+       cmd_buffer->state.dynamic.viewport.count == 0) {
+      emit_clip_window(job, &state->render_area);
+   }
 }
 
 static void
 subpass_finish(struct v3dv_cmd_buffer *cmd_buffer)
 {
-   v3dv_cl_ensure_space_with_branch(&cmd_buffer->bcl, cl_packet_length(FLUSH));
+   struct v3dv_job *job = cmd_buffer->state.job;
+   assert(job);
 
-   /* We need to emit a flush between binning jobs, so do this before we start
-    * recording the next subpass.
+   /* This finishes the binning job.
     *
     * FIXME: if the next subpass draws to the same RTs, we could skip this
     * and the binning setup for the next subpass.
     */
-   cl_emit(&cmd_buffer->bcl, FLUSH, flush);
+   emit_binning_flush(job);
+   v3dv_cmd_buffer_finish_job(cmd_buffer);
 }
 
 static void
@@ -961,11 +1040,18 @@ v3dv_EndCommandBuffer(VkCommandBuffer commandBuffer)
 {
    V3DV_FROM_HANDLE(v3dv_cmd_buffer, cmd_buffer, commandBuffer);
 
-   if (v3dv_cl_offset(&cmd_buffer->bcl) == 0)
-      return VK_SUCCESS; /* FIXME? */
-
    cmd_buffer->status = V3DV_CMD_BUFFER_STATUS_EXECUTABLE;
 
+   struct v3dv_job *job = cmd_buffer->state.job;
+   if (!job)
+      return VK_SUCCESS;
+
+   /* We get here if we recorded commands after the last render pass in the
+    * command buffer. Make sure we finish this last job. */
+   assert(v3dv_cl_offset(&job->bcl) != 0);
+   emit_binning_flush(job);
+   v3dv_cmd_buffer_finish_job(cmd_buffer);
+
    return VK_SUCCESS;
 }
 
@@ -1028,11 +1114,11 @@ v3dv_CmdBindPipeline(VkCommandBuffer commandBuffer,
 
       /* FIXME: is here the best moment to do that? or when drawing? */
       if (pipeline->vs->assembly_bo)
-         v3dv_cmd_buffer_add_bo(cmd_buffer, pipeline->vs->assembly_bo);
+         v3dv_job_add_bo(cmd_buffer->state.job, pipeline->vs->assembly_bo);
       if (pipeline->vs_bin->assembly_bo)
-         v3dv_cmd_buffer_add_bo(cmd_buffer, pipeline->vs_bin->assembly_bo);
+         v3dv_job_add_bo(cmd_buffer->state.job, pipeline->vs_bin->assembly_bo);
       if (pipeline->fs->assembly_bo)
-         v3dv_cmd_buffer_add_bo(cmd_buffer, pipeline->fs->assembly_bo);
+         v3dv_job_add_bo(cmd_buffer->state.job, pipeline->fs->assembly_bo);
 
       cmd_buffer->state.dirty |= V3DV_CMD_DIRTY_PIPELINE;
       break;
@@ -1181,7 +1267,7 @@ emit_scissor(struct v3dv_cmd_buffer *cmd_buffer)
    clip_window.extent.width = maxx - minx;
    clip_window.extent.height = maxy - miny;
 
-   emit_clip_window(cmd_buffer, &clip_window);
+   emit_clip_window(cmd_buffer->state.job, &clip_window);
 }
 
 static void
@@ -1194,23 +1280,26 @@ emit_viewport(struct v3dv_cmd_buffer *cmd_buffer)
    float *vptranslate = dynamic->viewport.translate[0];
    float *vpscale = dynamic->viewport.scale[0];
 
-   cl_emit(&cmd_buffer->bcl, CLIPPER_XY_SCALING, clip) {
+   struct v3dv_job *job = cmd_buffer->state.job;
+   assert(job);
+
+   cl_emit(&job->bcl, CLIPPER_XY_SCALING, clip) {
       clip.viewport_half_width_in_1_256th_of_pixel = vpscale[0] * 256.0f;
       clip.viewport_half_height_in_1_256th_of_pixel = vpscale[1] * 256.0f;
    }
 
-   cl_emit(&cmd_buffer->bcl, CLIPPER_Z_SCALE_AND_OFFSET, clip) {
+   cl_emit(&job->bcl, CLIPPER_Z_SCALE_AND_OFFSET, clip) {
       clip.viewport_z_offset_zc_to_zs = vptranslate[2];
       clip.viewport_z_scale_zc_to_zs = vpscale[2];
    }
-   cl_emit(&cmd_buffer->bcl, CLIPPER_Z_MIN_MAX_CLIPPING_PLANES, clip) {
+   cl_emit(&job->bcl, CLIPPER_Z_MIN_MAX_CLIPPING_PLANES, clip) {
       float z1 = (vptranslate[2] - vpscale[2]);
       float z2 = (vptranslate[2] + vpscale[2]);
       clip.minimum_zw = MIN2(z1, z2);
       clip.maximum_zw = MAX2(z1, z2);
    }
 
-   cl_emit(&cmd_buffer->bcl, VIEWPORT_OFFSET, vp) {
+   cl_emit(&job->bcl, VIEWPORT_OFFSET, vp) {
       vp.viewport_centre_x_coordinate = vptranslate[0];
       vp.viewport_centre_y_coordinate = vptranslate[1];
    }
@@ -1233,9 +1322,11 @@ struct vpm_config {
 static void
 cmd_buffer_emit_graphics_pipeline(struct v3dv_cmd_buffer *cmd_buffer)
 {
+   struct v3dv_job *job = cmd_buffer->state.job;
+   assert(job);
+
    struct v3dv_cmd_buffer_state *state = &cmd_buffer->state;
    struct v3dv_pipeline *pipeline = state->pipeline;
-
    assert(pipeline);
 
    /* Upload the uniforms to the indirect CL first */
@@ -1249,9 +1340,9 @@ cmd_buffer_emit_graphics_pipeline(struct v3dv_cmd_buffer *cmd_buffer)
       v3dv_write_uniforms(cmd_buffer, pipeline->vs_bin);
 
    /* Update the cache dirty flag based on the shader progs data */
-   state->tmu_dirty_rcl |= pipeline->vs_bin->prog_data.vs->base.tmu_dirty_rcl;
-   state->tmu_dirty_rcl |= pipeline->vs->prog_data.vs->base.tmu_dirty_rcl;
-   state->tmu_dirty_rcl |= pipeline->fs->prog_data.fs->base.tmu_dirty_rcl;
+   job->tmu_dirty_rcl |= pipeline->vs_bin->prog_data.vs->base.tmu_dirty_rcl;
+   job->tmu_dirty_rcl |= pipeline->vs->prog_data.vs->base.tmu_dirty_rcl;
+   job->tmu_dirty_rcl |= pipeline->fs->prog_data.fs->base.tmu_dirty_rcl;
 
    /* FIXME: fake vtx->num_elements, that is the vertex state that includes
     * data from the buffers used on the vertex. Such info is still not
@@ -1267,7 +1358,7 @@ cmd_buffer_emit_graphics_pipeline(struct v3dv_cmd_buffer *cmd_buffer)
    uint32_t num_elements_to_emit = MAX2(vtx_num_elements, 1);
 
    uint32_t shader_rec_offset =
-      v3dv_cl_ensure_space(&cmd_buffer->indirect,
+      v3dv_cl_ensure_space(&job->indirect,
                            cl_packet_length(GL_SHADER_STATE_RECORD) +
                            num_elements_to_emit *
                            cl_packet_length(GL_SHADER_STATE_ATTRIBUTE_RECORD),
@@ -1286,7 +1377,7 @@ cmd_buffer_emit_graphics_pipeline(struct v3dv_cmd_buffer *cmd_buffer)
    vpm_cfg.Ve = 0;
    vpm_cfg.Vc = pipeline->vs->prog_data.vs->vcm_cache_size;
 
-   cl_emit(&cmd_buffer->indirect, GL_SHADER_STATE_RECORD, shader) {
+   cl_emit(&job->indirect, GL_SHADER_STATE_RECORD, shader) {
       shader.enable_clipping = true;
 
       shader.point_size_in_shaded_vertex_data =
@@ -1400,9 +1491,9 @@ cmd_buffer_emit_graphics_pipeline(struct v3dv_cmd_buffer *cmd_buffer)
        * by CS and VS.  If we have no attributes being consumed by
        * the shader, set up a dummy to be loaded into the VPM.
        */
-      cl_emit(&cmd_buffer->indirect, GL_SHADER_STATE_ATTRIBUTE_RECORD, attr) {
+      cl_emit(&job->indirect, GL_SHADER_STATE_ATTRIBUTE_RECORD, attr) {
          /* Valid address of data whose value will be unused. */
-         attr.address = v3dv_cl_address(cmd_buffer->indirect.bo, 0);
+         attr.address = v3dv_cl_address(job->indirect.bo, 0);
 
          attr.type = ATTRIBUTE_FLOAT;
          attr.stride = 0;
@@ -1413,13 +1504,13 @@ cmd_buffer_emit_graphics_pipeline(struct v3dv_cmd_buffer *cmd_buffer)
       }
    }
 
-   cl_emit(&cmd_buffer->bcl, VCM_CACHE_SIZE, vcm) {
+   cl_emit(&job->bcl, VCM_CACHE_SIZE, vcm) {
       vcm.number_of_16_vertex_batches_for_binning = vpm_cfg_bin.Vc;
       vcm.number_of_16_vertex_batches_for_rendering = vpm_cfg.Vc;
    }
 
-   cl_emit(&cmd_buffer->bcl, GL_SHADER_STATE, state) {
-      state.address = v3dv_cl_address(cmd_buffer->indirect.bo,
+   cl_emit(&job->bcl, GL_SHADER_STATE, state) {
+      state.address = v3dv_cl_address(job->indirect.bo,
                                       shader_rec_offset);
       state.number_of_attribute_arrays = num_elements_to_emit;
    }
@@ -1462,6 +1553,9 @@ static void
 cmd_buffer_emit_draw_packets(struct v3dv_cmd_buffer *cmd_buffer,
                              struct v3dv_draw_info *info)
 {
+   struct v3dv_job *job = cmd_buffer->state.job;
+   assert(job);
+
    struct v3dv_cmd_buffer_state *state = &cmd_buffer->state;
    struct v3dv_pipeline *pipeline = state->pipeline;
 
@@ -1473,7 +1567,7 @@ cmd_buffer_emit_draw_packets(struct v3dv_cmd_buffer *cmd_buffer,
    /* FIXME: using VERTEX_ARRAY_PRIMS always as it fits our test caselist
     * right now. Need to be choosen based on the current case.
     */
-   cl_emit(&cmd_buffer->bcl, VERTEX_ARRAY_PRIMS, prim) {
+   cl_emit(&job->bcl, VERTEX_ARRAY_PRIMS, prim) {
       prim.mode = hw_prim_type | prim_tf_enable;
       prim.length = info->vertex_count;
       prim.index_of_first_vertex = info->first_vertex;
index a7728a2..fc55be8 100644 (file)
@@ -27,8 +27,7 @@
 #include "vk_format_info.h"
 
 static void
-emit_image_loads(struct v3dv_cmd_buffer *cmd_buffer,
-                 struct v3dv_cl *cl,
+emit_image_loads(struct v3dv_cl *cl,
                  struct v3dv_image *image,
                  uint32_t layer,
                  uint32_t mip_level)
@@ -67,8 +66,7 @@ emit_image_loads(struct v3dv_cmd_buffer *cmd_buffer,
 }
 
 static void
-emit_buffer_stores(struct v3dv_cmd_buffer *cmd_buffer,
-                   struct v3dv_cl *cl,
+emit_buffer_stores(struct v3dv_cl *cl,
                    struct v3dv_buffer *buffer,
                    struct v3dv_image *image,
                    uint32_t buffer_offset,
@@ -92,13 +90,13 @@ emit_buffer_stores(struct v3dv_cmd_buffer *cmd_buffer,
 }
 
 static void
-emit_copy_layer_to_buffer_per_tile_list(struct v3dv_cmd_buffer *cmd_buffer,
+emit_copy_layer_to_buffer_per_tile_list(struct v3dv_job *job,
                                         struct v3dv_buffer *buffer,
                                         struct v3dv_image *image,
                                         uint32_t layer,
                                         const VkBufferImageCopy *region)
 {
-   struct v3dv_cl *cl = &cmd_buffer->indirect;
+   struct v3dv_cl *cl = &job->indirect;
    v3dv_cl_ensure_space(cl, 200, 1);
    struct v3dv_cl_reloc tile_list_start = v3dv_cl_get_address(cl);
 
@@ -108,8 +106,7 @@ emit_copy_layer_to_buffer_per_tile_list(struct v3dv_cmd_buffer *cmd_buffer,
    assert(layer < imgrsc->layerCount);
 
    /* Load image to TLB */
-   emit_image_loads(cmd_buffer, cl, image,
-                    imgrsc->baseArrayLayer + layer, imgrsc->mipLevel);
+   emit_image_loads(cl, image, imgrsc->baseArrayLayer + layer, imgrsc->mipLevel);
 
    cl_emit(cl, PRIM_LIST_FORMAT, fmt) {
       fmt.primitive_type = LIST_TRIANGLES;
@@ -130,21 +127,20 @@ emit_copy_layer_to_buffer_per_tile_list(struct v3dv_cmd_buffer *cmd_buffer,
    uint32_t buffer_stride = width * image->cpp;
    uint32_t buffer_offset =
       region->bufferOffset + height * buffer_stride * layer;
-   emit_buffer_stores(cmd_buffer, cl, buffer, image,
-                      buffer_offset, buffer_stride);
+   emit_buffer_stores(cl, buffer, image, buffer_offset, buffer_stride);
 
    cl_emit(cl, END_OF_TILE_MARKER, end);
 
    cl_emit(cl, RETURN_FROM_SUB_LIST, ret);
 
-   cl_emit(&cmd_buffer->rcl, START_ADDRESS_OF_GENERIC_TILE_LIST, branch) {
+   cl_emit(&job->rcl, START_ADDRESS_OF_GENERIC_TILE_LIST, branch) {
       branch.start = tile_list_start;
       branch.end = v3dv_cl_get_address(cl);
    }
 }
 
 static void
-emit_copy_layer_to_buffer(struct v3dv_cmd_buffer *cmd_buffer,
+emit_copy_layer_to_buffer(struct v3dv_job *job,
                           uint32_t min_x_supertile,
                           uint32_t min_y_supertile,
                           uint32_t max_x_supertile,
@@ -155,12 +151,12 @@ emit_copy_layer_to_buffer(struct v3dv_cmd_buffer *cmd_buffer,
                           uint32_t layer,
                           const VkBufferImageCopy *region)
 {
-   struct v3dv_cl *rcl = &cmd_buffer->rcl;
+   struct v3dv_cl *rcl = &job->rcl;
 
    const uint32_t tile_alloc_offset =
       64 * layer * framebuffer->draw_tiles_x * framebuffer->draw_tiles_y;
    cl_emit(rcl, MULTICORE_RENDERING_TILE_LIST_SET_BASE, list) {
-      list.address = v3dv_cl_address(cmd_buffer->tile_alloc, tile_alloc_offset);
+      list.address = v3dv_cl_address(job->tile_alloc, tile_alloc_offset);
    }
 
    cl_emit(rcl, MULTICORE_RENDERING_SUPERTILE_CFG, config) {
@@ -189,8 +185,7 @@ emit_copy_layer_to_buffer(struct v3dv_cmd_buffer *cmd_buffer,
 
    cl_emit(rcl, FLUSH_VCD_CACHE, flush);
 
-   emit_copy_layer_to_buffer_per_tile_list(cmd_buffer, buffer, image,
-                                           layer, region);
+   emit_copy_layer_to_buffer_per_tile_list(job, buffer, image, layer, region);
 
    for (int y = min_y_supertile; y <= max_y_supertile; y++) {
       for (int x = min_x_supertile; x <= max_x_supertile; x++) {
@@ -203,7 +198,7 @@ emit_copy_layer_to_buffer(struct v3dv_cmd_buffer *cmd_buffer,
 }
 
 static void
-emit_copy_image_to_buffer_rcl(struct v3dv_cmd_buffer *cmd_buffer,
+emit_copy_image_to_buffer_rcl(struct v3dv_job *job,
                               struct v3dv_buffer *buffer,
                               struct v3dv_image *image,
                               struct v3dv_framebuffer *framebuffer,
@@ -212,7 +207,7 @@ emit_copy_image_to_buffer_rcl(struct v3dv_cmd_buffer *cmd_buffer,
 {
    const VkImageSubresourceLayers *imgrsc = &region->imageSubresource;
 
-   struct v3dv_cl *rcl = &cmd_buffer->rcl;
+   struct v3dv_cl *rcl = &job->rcl;
    v3dv_cl_ensure_space_with_branch(rcl, 200 +
                                     imgrsc->layerCount * 256 *
                                     cl_packet_length(SUPERTILE_COORDINATES));
@@ -263,7 +258,7 @@ emit_copy_image_to_buffer_rcl(struct v3dv_cmd_buffer *cmd_buffer,
    const uint32_t max_y_supertile = max_render_y / supertile_h_in_pixels;
 
    for (int layer = 0; layer < imgrsc->layerCount; layer++) {
-      emit_copy_layer_to_buffer(cmd_buffer,
+      emit_copy_layer_to_buffer(job,
                                 min_x_supertile, min_y_supertile,
                                 max_x_supertile, max_y_supertile,
                                 buffer, image, framebuffer,
@@ -275,17 +270,17 @@ emit_copy_image_to_buffer_rcl(struct v3dv_cmd_buffer *cmd_buffer,
 }
 
 static void
-emit_copy_image_to_buffer_bcl(struct v3dv_cmd_buffer *cmd_buffer,
+emit_copy_image_to_buffer_bcl(struct v3dv_job *job,
                               struct v3dv_framebuffer *framebuffer,
                               const VkBufferImageCopy *region)
 {
-   v3dv_cl_ensure_space_with_branch(&cmd_buffer->bcl, 256);
+   v3dv_cl_ensure_space_with_branch(&job->bcl, 256);
 
-   cl_emit(&cmd_buffer->bcl, NUMBER_OF_LAYERS, config) {
+   cl_emit(&job->bcl, NUMBER_OF_LAYERS, config) {
       config.number_of_layers = framebuffer->layers;
    }
 
-   cl_emit(&cmd_buffer->bcl, TILE_BINNING_MODE_CFG, config) {
+   cl_emit(&job->bcl, TILE_BINNING_MODE_CFG, config) {
       config.width_in_pixels = framebuffer->width;
       config.height_in_pixels = framebuffer->height;
       config.number_of_render_targets = 1;
@@ -293,20 +288,20 @@ emit_copy_image_to_buffer_bcl(struct v3dv_cmd_buffer *cmd_buffer,
       config.maximum_bpp_of_all_render_targets = framebuffer->internal_bpp;
    }
 
-   cl_emit(&cmd_buffer->bcl, FLUSH_VCD_CACHE, bin);
+   cl_emit(&job->bcl, FLUSH_VCD_CACHE, bin);
 
-   cl_emit(&cmd_buffer->bcl, OCCLUSION_QUERY_COUNTER, counter);
+   cl_emit(&job->bcl, OCCLUSION_QUERY_COUNTER, counter);
 
-   cl_emit(&cmd_buffer->bcl, START_TILE_BINNING, bin);
+   cl_emit(&job->bcl, START_TILE_BINNING, bin);
 
-   cl_emit(&cmd_buffer->bcl, CLIP_WINDOW, clip) {
+   cl_emit(&job->bcl, CLIP_WINDOW, clip) {
       clip.clip_window_left_pixel_coordinate = region->imageOffset.x;
       clip.clip_window_bottom_pixel_coordinate = region->imageOffset.y;
       clip.clip_window_width_in_pixels = region->imageExtent.width;
       clip.clip_window_height_in_pixels = region->imageExtent.height;
    }
 
-   cl_emit(&cmd_buffer->bcl, FLUSH, flush);
+   cl_emit(&job->bcl, FLUSH, flush);
 }
 
 /* Sets framebuffer dimensions and computes tile size parameters based on the
@@ -365,35 +360,30 @@ copy_image_to_buffer_tlb(struct v3dv_cmd_buffer *cmd_buffer,
       struct v3dv_framebuffer framebuffer;
       setup_framebuffer_params(&framebuffer, image, num_layers, internal_bpp);
 
-      /* FIXME: here we assume that we have a valid tile alloc/state setup,
-       *        which is usually the case for copy after render scenarios. The
-       *        code below simply checks and asserts this requirement,
-       *        however, a proper implementation should allocate new tile
-       *        alloc/state if we don't have one (for example if we haven't
-       *        recorded a render pass yet) or the one we have isn't large
-       *        enough. We still need to figure out how we want to handle
-       *        varying tile alloc/state requirements in a command buffer.
-       */
+      struct v3dv_job *job = v3dv_cmd_buffer_start_job(cmd_buffer);
+
       uint32_t tile_alloc_size = 64 * num_layers *
                                  framebuffer.draw_tiles_x *
                                  framebuffer.draw_tiles_y;
       tile_alloc_size = align(tile_alloc_size, 4096);
       tile_alloc_size += 8192;
       tile_alloc_size += 512 * 1024;
-      assert(cmd_buffer->tile_alloc &&
-             cmd_buffer->tile_alloc->size >= tile_alloc_size);
+      job->tile_alloc = v3dv_bo_alloc(cmd_buffer->device, tile_alloc_size);
+      v3dv_job_add_bo(job, job->tile_alloc);
 
       const uint32_t tsda_per_tile_size = 256;
       const uint32_t tile_state_size = num_layers *
                                        framebuffer.draw_tiles_x *
                                        framebuffer.draw_tiles_y *
                                        tsda_per_tile_size;
-      assert(cmd_buffer->tile_state &&
-             cmd_buffer->tile_state->size >= tile_state_size);
+      job->tile_state = v3dv_bo_alloc(cmd_buffer->device, tile_state_size);
+      v3dv_job_add_bo(job, job->tile_state);
 
-      emit_copy_image_to_buffer_bcl(cmd_buffer, &framebuffer, region);
-      emit_copy_image_to_buffer_rcl(cmd_buffer, buffer, image,
+      emit_copy_image_to_buffer_bcl(job, &framebuffer, region);
+      emit_copy_image_to_buffer_rcl(job, buffer, image,
                                     &framebuffer, internal_type, region);
+
+      v3dv_cmd_buffer_finish_job(cmd_buffer);
 }
 
 void
index 3ef1d14..d9c698f 100644 (file)
@@ -439,6 +439,30 @@ struct v3dv_dynamic_state {
 
 extern const struct v3dv_dynamic_state default_dynamic_state;
 
+struct v3dv_job {
+   struct list_head list_link;
+
+   struct v3dv_cmd_buffer *cmd_buffer;
+
+   struct v3dv_cl bcl;
+   struct v3dv_cl rcl;
+   struct v3dv_cl indirect;
+
+   /* Set of all BOs referenced by the job. This will be used for making
+    * the list of BOs that the kernel will need to have paged in to
+    * execute our job.
+    */
+   struct set *bos;
+   uint32_t bo_count;
+
+   struct v3dv_bo *tile_alloc;
+   struct v3dv_bo *tile_state;
+
+   bool tmu_dirty_rcl;
+};
+
+void v3dv_job_add_bo(struct v3dv_job *job, struct v3dv_bo *bo);
+
 struct v3dv_cmd_buffer_state {
    const struct v3dv_render_pass *pass;
    const struct v3dv_framebuffer *framebuffer;
@@ -456,8 +480,8 @@ struct v3dv_cmd_buffer_state {
    struct v3dv_dynamic_state dynamic;
    uint32_t dirty;
 
-   /* FIXME: here? */
-   bool tmu_dirty_rcl;
+   /* Current job being recorded */
+   struct v3dv_job *job;
 };
 
 struct v3dv_cmd_buffer {
@@ -471,26 +495,16 @@ struct v3dv_cmd_buffer {
    VkCommandBufferUsageFlags usage_flags;
    VkCommandBufferLevel level;
 
-   struct v3dv_cl bcl;
-   struct v3dv_cl rcl;
-   struct v3dv_cl indirect;
-
    enum v3dv_cmd_buffer_status status;
 
    struct v3dv_cmd_buffer_state state;
 
-   /* Set of all BOs referenced by the job. This will be used for making
-    * the list of BOs that the kernel will need to have paged in to
-    * execute our job.
-    */
-   struct set *bos;
-   uint32_t bo_count;
-
-   struct v3dv_bo *tile_alloc;
-   struct v3dv_bo *tile_state;
+   /* List of jobs to submit to the kernel */
+   struct list_head submit_jobs;
 };
 
-void v3dv_cmd_buffer_add_bo(struct v3dv_cmd_buffer *cmd_buffer, struct v3dv_bo *bo);
+struct v3dv_job *v3dv_cmd_buffer_start_job(struct v3dv_cmd_buffer *cmd_buffer);
+void v3dv_cmd_buffer_finish_job(struct v3dv_cmd_buffer *cmd_buffer);
 
 struct v3dv_shader_module {
    unsigned char sha1[20];
index 678bfb4..186c9f0 100644 (file)
 #include <errno.h>
 
 static void
-v3dv_clif_dump(struct v3dv_queue *queue,
-               struct v3dv_cmd_buffer *cmd_buffer,
+v3dv_clif_dump(struct v3dv_device *device,
+               struct v3dv_job *job,
                struct drm_v3d_submit_cl *submit)
 {
    if (!(V3D_DEBUG & (V3D_DEBUG_CL | V3D_DEBUG_CLIF)))
       return;
 
-   struct clif_dump *clif = clif_dump_init(&queue->device->devinfo,
+   struct clif_dump *clif = clif_dump_init(&device->devinfo,
                                            stderr,
                                            V3D_DEBUG & V3D_DEBUG_CL);
 
-   set_foreach(cmd_buffer->bos, entry) {
+   set_foreach(job->bos, entry) {
       struct v3dv_bo *bo = (void *)entry->key;
       char *name = ralloc_asprintf(NULL, "%s_0x%x",
                                    "" /* bo->name */ , bo->offset);
 
-      v3dv_bo_map(queue->device, bo, bo->size);
+      v3dv_bo_map(device, bo, bo->size);
       clif_dump_add_bo(clif, name, bo->offset, bo->size, bo->map);
 
       ralloc_free(name);
@@ -57,17 +57,9 @@ v3dv_clif_dump(struct v3dv_queue *queue,
 }
 
 static VkResult
-queue_submit(struct v3dv_queue *queue,
-             const VkSubmitInfo *pSubmit,
-             VkFence fence)
+job_submit(struct v3dv_job *job)
 {
-   /* FIXME */
-   assert(fence == 0);
-   assert(pSubmit->waitSemaphoreCount == 0);
-   assert(pSubmit->signalSemaphoreCount == 0);
-   assert(pSubmit->commandBufferCount == 1);
-
-   V3DV_FROM_HANDLE(v3dv_cmd_buffer, cmd_buffer, pSubmit->pCommandBuffers[0]);
+   assert(job);
 
    struct drm_v3d_submit_cl submit;
 
@@ -79,36 +71,37 @@ queue_submit(struct v3dv_queue *queue,
    /* Update the sync object for the last rendering by our context. */
    submit.out_sync = 0; /* FIXME */
 
-   submit.bcl_start = cmd_buffer->bcl.bo->offset;
-   submit.bcl_end = cmd_buffer->bcl.bo->offset + v3dv_cl_offset(&cmd_buffer->bcl);
-   submit.rcl_start = cmd_buffer->rcl.bo->offset;
-   submit.rcl_end = cmd_buffer->rcl.bo->offset + v3dv_cl_offset(&cmd_buffer->rcl);
+   submit.bcl_start = job->bcl.bo->offset;
+   submit.bcl_end = job->bcl.bo->offset + v3dv_cl_offset(&job->bcl);
+   submit.rcl_start = job->rcl.bo->offset;
+   submit.rcl_end = job->rcl.bo->offset + v3dv_cl_offset(&job->rcl);
 
    submit.flags = 0;
    /* FIXME: we already know that we support cache flushing, since we only
     * support hardware that supports it, but it would be better to query the
     * DRM device for this capability instead.
-   if (cmd_buffer->state.tmu_dirty_rcl)
+   if (job->tmu_dirty_rcl)
       submit.flags |= DRM_V3D_SUBMIT_CL_FLUSH_CACHE;
 
-   submit.qma = cmd_buffer->tile_alloc->offset;
-   submit.qms = cmd_buffer->tile_alloc->size;
-   submit.qts = cmd_buffer->tile_state->offset;
+   submit.qma = job->tile_alloc->offset;
+   submit.qms = job->tile_alloc->size;
+   submit.qts = job->tile_state->offset;
 
-   submit.bo_handle_count = cmd_buffer->bo_count;
+   submit.bo_handle_count = job->bo_count;
    uint32_t *bo_handles =
       (uint32_t *) malloc(sizeof(uint32_t) * MAX2(4, submit.bo_handle_count * 2));
    uint32_t bo_idx = 0;
-   set_foreach(cmd_buffer->bos, entry) {
+   set_foreach(job->bos, entry) {
       struct v3dv_bo *bo = (struct v3dv_bo *)entry->key;
       bo_handles[bo_idx++] = bo->handle;
    }
    assert(bo_idx == submit.bo_handle_count);
    submit.bo_handles = (uintptr_t)(void *)bo_handles;
 
-   v3dv_clif_dump(queue, cmd_buffer, &submit);
+   struct v3dv_device *device = job->cmd_buffer->device;
+   v3dv_clif_dump(device, job, &submit);
 
-   int ret = v3dv_ioctl(queue->device->fd, DRM_IOCTL_V3D_SUBMIT_CL, &submit);
+   int ret = v3dv_ioctl(device->fd, DRM_IOCTL_V3D_SUBMIT_CL, &submit);
    static bool warned = false;
    if (ret && !warned) {
       fprintf(stderr, "Draw call returned %s. Expect corruption.\n",
@@ -124,6 +117,29 @@ queue_submit(struct v3dv_queue *queue,
    return VK_SUCCESS;
 }
 
+static VkResult
+queue_submit(struct v3dv_queue *queue,
+             const VkSubmitInfo *pSubmit,
+             VkFence fence)
+{
+   /* FIXME */
+   assert(fence == 0);
+   assert(pSubmit->waitSemaphoreCount == 0);
+   assert(pSubmit->signalSemaphoreCount == 0);
+   assert(pSubmit->commandBufferCount == 1);
+
+   V3DV_FROM_HANDLE(v3dv_cmd_buffer, cmd_buffer, pSubmit->pCommandBuffers[0]);
+
+   list_for_each_entry_safe(struct v3dv_job, job,
+                            &cmd_buffer->submit_jobs, list_link) {
+      VkResult result = job_submit(job);
+      if (result != VK_SUCCESS)
+         return result;
+   }
+
+   return VK_SUCCESS;
+}
+
 VkResult
 v3dv_QueueSubmit(VkQueue _queue,
                  uint32_t submitCount,
index 109dbe4..0652753 100644 (file)
@@ -34,6 +34,9 @@ v3dv_write_uniforms(struct v3dv_cmd_buffer *cmd_buffer,
    struct v3d_uniform_list *uinfo = &p_stage->prog_data.base->uniforms;
    struct v3dv_dynamic_state *dynamic = &cmd_buffer->state.dynamic;
 
+   struct v3dv_job *job = cmd_buffer->state.job;
+   assert(job);
+
    /* The hardware always pre-fetches the next uniform (also when there
     * aren't any), so we always allocate space for an extra slot. This
     * fixes MMU exceptions reported since Linux kernel 5.4 when the
@@ -42,13 +45,11 @@ v3dv_write_uniforms(struct v3dv_cmd_buffer *cmd_buffer,
     * the last uniform it will read beyond the end of the page and trigger
     * the MMU exception.
     */
-   v3dv_cl_ensure_space(&cmd_buffer->indirect, (uinfo->count + 1) * 4, 4);
+   v3dv_cl_ensure_space(&job->indirect, (uinfo->count + 1) * 4, 4);
 
-   struct v3dv_cl_reloc uniform_stream =
-      v3dv_cl_get_address(&cmd_buffer->indirect);
+   struct v3dv_cl_reloc uniform_stream = v3dv_cl_get_address(&job->indirect);
 
-   struct v3dv_cl_out *uniforms =
-      cl_start(&cmd_buffer->indirect);
+   struct v3dv_cl_out *uniforms = cl_start(&job->indirect);
 
    for (int i = 0; i < uinfo->count; i++) {
       uint32_t data = uinfo->data[i];
@@ -79,7 +80,7 @@ v3dv_write_uniforms(struct v3dv_cmd_buffer *cmd_buffer,
       }
    }
 
-   cl_end(&cmd_buffer->indirect, uniforms);
+   cl_end(&job->indirect, uniforms);
 
    return uniform_stream;
 }