radv: Try to do a better job of dealing with L2 coherent images.
authorBas Nieuwenhuizen <bas@basnieuwenhuizen.nl>
Tue, 5 Oct 2021 01:10:20 +0000 (03:10 +0200)
committerMarge Bot <eric+marge@anholt.net>
Wed, 13 Oct 2021 14:07:05 +0000 (14:07 +0000)
Only try to invalidate L2 if we actually hit one of the incoherent images.

Note we may actually insert some extra flushes at the end of a command
buffer so that we may assume the caches are clean at the start of the next
command buffer. However, on average I think that case is uncommon
enough that being able to make assumptions at the start of a cmdbuffer
is beneficial. Especially since MSAA is somewhat rare in more recent
games.

Reviewed-by: Samuel Pitoiset <samuel.pitoiset@gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/13239>

src/amd/vulkan/radv_cmd_buffer.c
src/amd/vulkan/radv_private.h
src/amd/vulkan/si_cmd_buffer.c

index 57faae3..3fdd9bd 100644 (file)
@@ -3777,9 +3777,9 @@ radv_stage_flush(struct radv_cmd_buffer *cmd_buffer, VkPipelineStageFlags src_st
  * images. However, given the existence of memory barriers which do not specify
  * the image/buffer it often devolves to just VRAM/GTT anyway.
  *
- * In practice we can cheat a bit, since the INV_* operations include writebacks.
- * If we know that all the destinations that need the WB do an INV, then we can
- * skip the WB.
+ * To help reduce the invalidations for GPUs that have L2 coherency between the
+ * RB and the shader caches, we always invalidate L2 on the src side, as we can
+ * use our knowledge of past usage to optimize flushes away.
  */
 
 enum radv_cmd_flush_bits
@@ -3811,6 +3811,10 @@ radv_src_access_flush(struct radv_cmd_buffer *cmd_buffer, VkAccessFlags src_flag
                flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_CB;
             }
          }
+
+         /* This is valid even for the rb_noncoherent_dirty case, because with how we account for
+          * dirtiness, if it isn't dirty it doesn't contain the data at all and hence doesn't need
+          * invalidating. */
          if (!image_is_coherent)
             flush_bits |= RADV_CMD_FLAG_WB_L2;
          break;
@@ -3878,6 +3882,11 @@ radv_dst_access_flush(struct radv_cmd_buffer *cmd_buffer, VkAccessFlags dst_flag
          has_DB_meta = false;
    }
 
+   /* All the L2 invalidations below are not for the CB/DB. So if there are no incoherent images
+    * in the L2 cache in CB/DB mode then they are already usable from all the other L2 clients. */
+   image_is_coherent |= cmd_buffer->device->physical_device->rad_info.chip_class >= GFX9 &&
+                        !cmd_buffer->state.rb_noncoherent_dirty;
+
    u_foreach_bit(b, dst_flags)
    {
       switch ((VkAccessFlagBits)(1 << b)) {
@@ -4741,6 +4750,16 @@ radv_EndCommandBuffer(VkCommandBuffer commandBuffer)
        */
       cmd_buffer->state.flush_bits |= cmd_buffer->active_query_flush_bits;
 
+      /* Flush noncoherent images on GFX9+ so we can assume they're clean on the start of a
+       * command buffer.
+       */
+      if (cmd_buffer->state.rb_noncoherent_dirty &&
+          cmd_buffer->device->physical_device->rad_info.chip_class >= GFX9)
+         cmd_buffer->state.flush_bits |= radv_src_access_flush(
+            cmd_buffer,
+            VK_ACCESS_COLOR_ATTACHMENT_WRITE_BIT | VK_ACCESS_DEPTH_STENCIL_ATTACHMENT_WRITE_BIT,
+            NULL);
+
       /* Since NGG streamout uses GDS, we need to make GDS idle when
        * we leave the IB, otherwise another process might overwrite
        * it while our shaders are busy.
@@ -5735,10 +5754,37 @@ radv_cmd_buffer_begin_subpass(struct radv_cmd_buffer *cmd_buffer, uint32_t subpa
    assert(cmd_buffer->cs->cdw <= cdw_max);
 }
 
+static void
+radv_mark_noncoherent_rb(struct radv_cmd_buffer *cmd_buffer)
+{
+   const struct radv_subpass *subpass = cmd_buffer->state.subpass;
+
+   /* Have to be conservative in cmdbuffers with inherited attachments. */
+   if (!cmd_buffer->state.attachments) {
+      cmd_buffer->state.rb_noncoherent_dirty = true;
+      return;
+   }
+
+   for (uint32_t i = 0; i < subpass->color_count; ++i) {
+      const uint32_t a = subpass->color_attachments[i].attachment;
+      if (a == VK_ATTACHMENT_UNUSED)
+         continue;
+      if (!cmd_buffer->state.attachments[a].iview->image->l2_coherent) {
+         cmd_buffer->state.rb_noncoherent_dirty = true;
+         return;
+      }
+   }
+   if (subpass->depth_stencil_attachment &&
+       !cmd_buffer->state.attachments[subpass->depth_stencil_attachment->attachment]
+           .iview->image->l2_coherent)
+      cmd_buffer->state.rb_noncoherent_dirty = true;
+}
+
 void
 radv_cmd_buffer_restore_subpass(struct radv_cmd_buffer *cmd_buffer,
                                 const struct radv_subpass *subpass)
 {
+   radv_mark_noncoherent_rb(cmd_buffer);
    radv_cmd_buffer_set_subpass(cmd_buffer, subpass);
 }
 
@@ -5810,6 +5856,8 @@ radv_CmdNextSubpass2(VkCommandBuffer commandBuffer, const VkSubpassBeginInfo *pS
 {
    RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
 
+   radv_mark_noncoherent_rb(cmd_buffer);
+
    uint32_t prev_subpass = radv_get_subpass_id(cmd_buffer);
    radv_cmd_buffer_end_subpass(cmd_buffer);
    radv_cmd_buffer_begin_subpass(cmd_buffer, prev_subpass + 1);
@@ -7192,6 +7240,8 @@ radv_CmdEndRenderPass2(VkCommandBuffer commandBuffer, const VkSubpassEndInfo *pS
 {
    RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
 
+   radv_mark_noncoherent_rb(cmd_buffer);
+
    radv_emit_subpass_barrier(cmd_buffer, &cmd_buffer->state.pass->end_barrier);
 
    radv_cmd_buffer_end_subpass(cmd_buffer);
@@ -7574,6 +7624,9 @@ radv_barrier(struct radv_cmd_buffer *cmd_buffer, uint32_t memoryBarrierCount,
    enum radv_cmd_flush_bits src_flush_bits = 0;
    enum radv_cmd_flush_bits dst_flush_bits = 0;
 
+   if (cmd_buffer->state.subpass)
+      radv_mark_noncoherent_rb(cmd_buffer);
+
    radv_describe_barrier_start(cmd_buffer, info->reason);
 
    for (unsigned i = 0; i < info->eventCount; ++i) {
index 206f302..c8b8ce3 100644 (file)
@@ -1396,6 +1396,9 @@ struct radv_cmd_state {
    /* Whether CP DMA is busy/idle. */
    bool dma_is_busy;
 
+   /* Whether any images that are not L2 coherent are dirty from the CB. */
+   bool rb_noncoherent_dirty;
+
    /* Conditional rendering info. */
    uint8_t predication_op; /* 32-bit or 64-bit predicate value */
    int predication_type;   /* -1: disabled, 0: normal, 1: inverted */
index a672d6d..8826682 100644 (file)
@@ -1357,6 +1357,9 @@ si_emit_cache_flush(struct radv_cmd_buffer *cmd_buffer)
    if (unlikely(cmd_buffer->device->trace_bo))
       radv_cmd_buffer_trace_emit(cmd_buffer);
 
+   if (cmd_buffer->state.flush_bits & RADV_CMD_FLAG_INV_L2)
+      cmd_buffer->state.rb_noncoherent_dirty = false;
+
    /* Clear the caches that have been flushed to avoid syncing too much
     * when there is some pending active queries.
     */