radv: Try to do a better job of dealing with L2 coherent images.
authorBas Nieuwenhuizen <bas@basnieuwenhuizen.nl>
Tue, 5 Oct 2021 01:10:20 +0000 (03:10 +0200)
committerMarge Bot <eric+marge@anholt.net>
Wed, 13 Oct 2021 14:07:05 +0000 (14:07 +0000)
Only try to invalidate L2 if we actually hit one of the incoherent images.

Note we may actually insert some extra flushes at the end of a command
buffer so that we may assume the caches are clean at the start of the next
command buffer. However, on average I think that case is uncommon
enough that being able to make assumptions at the start of a cmdbuffer
is beneficial. Especially since MSAA is somewhat rare in more recent
games.

Reviewed-by: Samuel Pitoiset <samuel.pitoiset@gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/13239>

src/amd/vulkan/radv_cmd_buffer.c
src/amd/vulkan/radv_private.h
src/amd/vulkan/si_cmd_buffer.c

index 57faae3..3fdd9bd 100644 (file)
@@ -3777,9 +3777,9 @@ radv_stage_flush(struct radv_cmd_buffer *cmd_buffer, VkPipelineStageFlags src_st
  * images. However, given the existence of memory barriers which do not specify
  * the image/buffer it often devolves to just VRAM/GTT anyway.
  *
- * In practice we can cheat a bit, since the INV_* operations include writebacks.
- * If we know that all the destinations that need the WB do an INV, then we can
- * skip the WB.
+ * To help reduce the invalidations for GPUs that have L2 coherency between the
+ * RB and the shader caches, we always invalidate L2 on the src side, as we can
+ * use our knowledge of past usage to optimize flushes away.
  */
 
 enum radv_cmd_flush_bits
@@ -3811,6 +3811,10 @@ radv_src_access_flush(struct radv_cmd_buffer *cmd_buffer, VkAccessFlags src_flag
                flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_CB;
             }
          }
+
+         /* This is valid even for the rb_noncoherent_dirty case, because with how we account for
+          * dirtiness, if it isn't dirty it doesn't contain the data at all and hence doesn't need
+          * invalidating. */
          if (!image_is_coherent)
             flush_bits |= RADV_CMD_FLAG_WB_L2;
          break;
@@ -3878,6 +3882,11 @@ radv_dst_access_flush(struct radv_cmd_buffer *cmd_buffer, VkAccessFlags dst_flag
          has_DB_meta = false;
    }
 
+   /* All the L2 invalidations below are not for the CB/DB. So if there are no incoherent images
+    * in the L2 cache in CB/DB mode then they are already usable from all the other L2 clients. */
+   image_is_coherent |= cmd_buffer->device->physical_device->rad_info.chip_class >= GFX9 &&
+                        !cmd_buffer->state.rb_noncoherent_dirty;
+
    u_foreach_bit(b, dst_flags)
    {
       switch ((VkAccessFlagBits)(1 << b)) {
@@ -4741,6 +4750,16 @@ radv_EndCommandBuffer(VkCommandBuffer commandBuffer)
        */
       cmd_buffer->state.flush_bits |= cmd_buffer->active_query_flush_bits;
 
+      /* Flush noncoherent images on GFX9+ so we can assume they're clean on the start of a
+       * command buffer.
+       */
+      if (cmd_buffer->state.rb_noncoherent_dirty &&
+          cmd_buffer->device->physical_device->rad_info.chip_class >= GFX9)
+         cmd_buffer->state.flush_bits |= radv_src_access_flush(
+            cmd_buffer,
+            VK_ACCESS_COLOR_ATTACHMENT_WRITE_BIT | VK_ACCESS_DEPTH_STENCIL_ATTACHMENT_WRITE_BIT,
+            NULL);
+
       /* Since NGG streamout uses GDS, we need to make GDS idle when
        * we leave the IB, otherwise another process might overwrite
        * it while our shaders are busy.
@@ -5735,10 +5754,37 @@ radv_cmd_buffer_begin_subpass(struct radv_cmd_buffer *cmd_buffer, uint32_t subpa
    assert(cmd_buffer->cs->cdw <= cdw_max);
 }
 
+static void
+radv_mark_noncoherent_rb(struct radv_cmd_buffer *cmd_buffer)
+{
+   const struct radv_subpass *subpass = cmd_buffer->state.subpass;
+
+   /* Have to be conservative in cmdbuffers with inherited attachments. */
+   if (!cmd_buffer->state.attachments) {
+      cmd_buffer->state.rb_noncoherent_dirty = true;
+      return;
+   }
+
+   for (uint32_t i = 0; i < subpass->color_count; ++i) {
+      const uint32_t a = subpass->color_attachments[i].attachment;
+      if (a == VK_ATTACHMENT_UNUSED)
+         continue;
+      if (!cmd_buffer->state.attachments[a].iview->image->l2_coherent) {
+         cmd_buffer->state.rb_noncoherent_dirty = true;
+         return;
+      }
+   }
+   if (subpass->depth_stencil_attachment &&
+       !cmd_buffer->state.attachments[subpass->depth_stencil_attachment->attachment]
+           .iview->image->l2_coherent)
+      cmd_buffer->state.rb_noncoherent_dirty = true;
+}
+
 void
 radv_cmd_buffer_restore_subpass(struct radv_cmd_buffer *cmd_buffer,
                                 const struct radv_subpass *subpass)
 {
+   radv_mark_noncoherent_rb(cmd_buffer);
    radv_cmd_buffer_set_subpass(cmd_buffer, subpass);
 }
 
@@ -5810,6 +5856,8 @@ radv_CmdNextSubpass2(VkCommandBuffer commandBuffer, const VkSubpassBeginInfo *pS
 {
    RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
 
+   radv_mark_noncoherent_rb(cmd_buffer);
+
    uint32_t prev_subpass = radv_get_subpass_id(cmd_buffer);
    radv_cmd_buffer_end_subpass(cmd_buffer);
    radv_cmd_buffer_begin_subpass(cmd_buffer, prev_subpass + 1);
@@ -7192,6 +7240,8 @@ radv_CmdEndRenderPass2(VkCommandBuffer commandBuffer, const VkSubpassEndInfo *pS
 {
    RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
 
+   radv_mark_noncoherent_rb(cmd_buffer);
+
    radv_emit_subpass_barrier(cmd_buffer, &cmd_buffer->state.pass->end_barrier);
 
    radv_cmd_buffer_end_subpass(cmd_buffer);
@@ -7574,6 +7624,9 @@ radv_barrier(struct radv_cmd_buffer *cmd_buffer, uint32_t memoryBarrierCount,
    enum radv_cmd_flush_bits src_flush_bits = 0;
    enum radv_cmd_flush_bits dst_flush_bits = 0;
 
+   if (cmd_buffer->state.subpass)
+      radv_mark_noncoherent_rb(cmd_buffer);
+
    radv_describe_barrier_start(cmd_buffer, info->reason);
 
    for (unsigned i = 0; i < info->eventCount; ++i) {
index 206f302..c8b8ce3 100644 (file)
@@ -1396,6 +1396,9 @@ struct radv_cmd_state {
    /* Whether CP DMA is busy/idle. */
    bool dma_is_busy;
 
+   /* Whether any images that are not L2 coherent are dirty from the CB. */
+   bool rb_noncoherent_dirty;
+
    /* Conditional rendering info. */
    uint8_t predication_op; /* 32-bit or 64-bit predicate value */
    int predication_type;   /* -1: disabled, 0: normal, 1: inverted */
index a672d6d..8826682 100644 (file)
@@ -1357,6 +1357,9 @@ si_emit_cache_flush(struct radv_cmd_buffer *cmd_buffer)
    if (unlikely(cmd_buffer->device->trace_bo))
       radv_cmd_buffer_trace_emit(cmd_buffer);
 
+   if (cmd_buffer->state.flush_bits & RADV_CMD_FLAG_INV_L2)
+      cmd_buffer->state.rb_noncoherent_dirty = false;
+
    /* Clear the caches that have been flushed to avoid syncing too much
     * when there is some pending active queries.
     */