anv: reset query pools using blorp
authorFelix DeGrood <felix.j.degrood@intel.com>
Wed, 22 Mar 2023 22:08:09 +0000 (22:08 +0000)
committerMarge Bot <emma+marge@anholt.net>
Fri, 7 Apr 2023 15:51:20 +0000 (15:51 +0000)
Previously we used PC to set query data to 0 during
CmdResetQueryPool. This was slow when clearing large query pools.
Switching to blorp to clear pools is faster for large query pools.

Red Dead Redemption 2: +1.5% speedup

Reviewed-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/22178>

src/intel/vulkan/anv_device.c
src/intel/vulkan/anv_private.h
src/intel/vulkan/genX_cmd_buffer.c
src/intel/vulkan/genX_query.c
src/util/driconf.h

index 3abea8a..eae2797 100644 (file)
@@ -76,6 +76,7 @@ static const driOptionDescription anv_dri_options[] = {
       DRI_CONF_ANV_FP64_WORKAROUND_ENABLED(false)
       DRI_CONF_ANV_GENERATED_INDIRECT_THRESHOLD(4)
       DRI_CONF_NO_16BIT(false)
+      DRI_CONF_ANV_QUERY_CLEAR_WITH_BLORP_THRESHOLD(6)
    DRI_CONF_SECTION_END
 
    DRI_CONF_SECTION_DEBUG
@@ -1113,6 +1114,8 @@ anv_init_dri_options(struct anv_instance *instance)
             driQueryOptionb(&instance->dri_options, "fp64_workaround_enabled");
     instance->generated_indirect_threshold =
             driQueryOptioni(&instance->dri_options, "generated_indirect_threshold");
+    instance->query_clear_with_blorp_threshold =
+       driQueryOptioni(&instance->dri_options, "query_clear_with_blorp_threshold");
 }
 
 VkResult anv_CreateInstance(
index 9ac4def..c9ce3ea 100644 (file)
@@ -1029,6 +1029,7 @@ struct anv_instance {
     bool                                        fp64_workaround_enabled;
     float                                       lower_depth_range_rate;
     unsigned                                    generated_indirect_threshold;
+    unsigned                                    query_clear_with_blorp_threshold;
 
     /* HW workarounds */
     bool                                        no_16bit;
@@ -2085,6 +2086,11 @@ enum anv_pipe_bits {
     * implement a workaround for Gfx9.
     */
    ANV_PIPE_POST_SYNC_BIT                    = (1 << 25),
+
+   /* This bit does not exist directly in PIPE_CONTROL. It means that render
+    * target operations related to clearing of queries are ongoing.
+    */
+   ANV_PIPE_QUERY_CLEARS_BIT                 = (1 << 26),
 };
 
 #define ANV_PIPE_FLUSH_BITS ( \
@@ -2127,6 +2133,20 @@ enum anv_pipe_bits {
 #define ANV_PIPE_GPGPU_BITS ( \
    (GFX_VERx10 >= 125 ? ANV_PIPE_UNTYPED_DATAPORT_CACHE_FLUSH_BIT : 0))
 
+/* Things we need to flush before accessing query data using the command
+ * streamer.
+ *
+ * Prior to DG2 experiments show that the command streamer is not coherent
+ * with the tile cache so we need to flush it to make any data visible to CS.
+ *
+ * Otherwise we want to flush the RT cache which is where blorp writes, either
+ * for clearing the query buffer or for clearing the destination buffer in
+ * vkCopyQueryPoolResults().
+ */
+#define ANV_PIPE_QUERY_FLUSH_BITS ( \
+   (GFX_VERx10 < 125 ? ANV_PIPE_TILE_CACHE_FLUSH_BIT : 0) | \
+   ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT)
+
 enum intel_ds_stall_flag
 anv_pipe_flush_bit_to_ds_stall_flag(enum anv_pipe_bits bits);
 
index 684fd42..cb92cf3 100644 (file)
@@ -1752,7 +1752,16 @@ genX(emit_apply_pipe_flushes)(struct anv_batch *batch,
        * saying that render target writes are ongoing.
        */
       if (bits & ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT)
-         bits &= ~(ANV_PIPE_RENDER_TARGET_BUFFER_WRITES);
+         bits &= ~ANV_PIPE_RENDER_TARGET_BUFFER_WRITES;
+
+      /* If the conditions for flushing the query clears are met, we can
+       * toggle the bit off.
+       */
+      if ((bits & ANV_PIPE_QUERY_FLUSH_BITS) == ANV_PIPE_QUERY_FLUSH_BITS &&
+          (bits & (ANV_PIPE_END_OF_PIPE_SYNC_BIT |
+                   ANV_PIPE_CS_STALL_BIT))) {
+         bits &= ~ANV_PIPE_QUERY_CLEARS_BIT;
+      }
 
       bits &= ~(ANV_PIPE_FLUSH_BITS | ANV_PIPE_STALL_BITS |
                 ANV_PIPE_END_OF_PIPE_SYNC_BIT);
@@ -3803,6 +3812,16 @@ genX(EndCommandBuffer)(
       return VK_SUCCESS;
    }
 
+   /* Flush query clears using blorp so that secondary query writes do not
+    * race with the clear.
+    */
+   if (cmd_buffer->state.pending_pipe_bits & ANV_PIPE_QUERY_CLEARS_BIT) {
+      anv_add_pending_pipe_bits(cmd_buffer,
+                                ANV_PIPE_QUERY_FLUSH_BITS |
+                                ANV_PIPE_NEEDS_END_OF_PIPE_SYNC_BIT,
+                                "query clear flush prior command buffer end");
+   }
+
    genX(cmd_buffer_flush_generated_draws)(cmd_buffer);
 
    /* Turn on object level preemption if it is disabled to have it in known
@@ -3878,6 +3897,16 @@ genX(CmdExecuteCommands)(
     */
    genX(apply_task_urb_workaround)(primary);
 
+   /* Flush query clears using blorp so that secondary query writes do not
+    * race with the clear.
+    */
+   if (primary->state.pending_pipe_bits & ANV_PIPE_QUERY_CLEARS_BIT) {
+      anv_add_pending_pipe_bits(primary,
+                                ANV_PIPE_QUERY_FLUSH_BITS |
+                                ANV_PIPE_NEEDS_END_OF_PIPE_SYNC_BIT,
+                                "query clear flush prior to secondary buffer");
+   }
+
    /* The secondary command buffer doesn't know which textures etc. have been
     * flushed prior to their execution.  Apply those flushes now.
     */
@@ -6535,6 +6564,21 @@ genX(flush_pipeline_select)(struct anv_cmd_buffer *cmd_buffer,
       cmd_buffer->state.compute.pipeline_dirty = true;
 #endif
 
+
+#if GFX_VERx10 < 125
+   /* We apparently cannot flush the tile cache (color/depth) from the GPGPU
+    * pipeline. That means query clears will not be visible to query
+    * copy/write. So we need to flush it before going to GPGPU mode.
+    */
+   if (cmd_buffer->state.current_pipeline == _3D &&
+       (cmd_buffer->state.pending_pipe_bits & ANV_PIPE_QUERY_CLEARS_BIT)) {
+      anv_add_pending_pipe_bits(cmd_buffer,
+                                ANV_PIPE_QUERY_FLUSH_BITS |
+                                ANV_PIPE_END_OF_PIPE_SYNC_BIT,
+                                "query clear flush prior to GPGPU");
+   }
+#endif
+
 #if GFX_VER >= 12
    /* From Tigerlake PRM, Volume 2a, PIPELINE_SELECT:
     *
index afb1542..0f913c5 100644 (file)
@@ -181,7 +181,6 @@ VkResult genX(CreateQueryPool)(
    case VK_QUERY_TYPE_ACCELERATION_STRUCTURE_SERIALIZATION_BOTTOM_LEVEL_POINTERS_KHR:
       uint64s_per_slot = 1 + 2 /* availability + size (PostbuildInfoSerializationDesc) */;
       break;
-      break;
 
 #endif
    case VK_QUERY_TYPE_RESULT_STATUS_ONLY_KHR:
@@ -783,6 +782,19 @@ void genX(CmdResetQueryPool)(
 {
    ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
    ANV_FROM_HANDLE(anv_query_pool, pool, queryPool);
+   struct anv_physical_device *pdevice = cmd_buffer->device->physical;
+
+   if (queryCount >= pdevice->instance->query_clear_with_blorp_threshold) {
+      anv_cmd_buffer_fill_area(cmd_buffer,
+                               anv_query_address(pool, firstQuery),
+                               queryCount * pool->stride,
+                               0);
+
+      anv_add_pending_pipe_bits(cmd_buffer,
+                                ANV_PIPE_QUERY_CLEARS_BIT,
+                                "vkCmdResetQueryPool of timestamps");
+      return;
+   }
 
    switch (pool->type) {
    case VK_QUERY_TYPE_OCCLUSION:
@@ -963,6 +975,22 @@ emit_perf_intel_query(struct anv_cmd_buffer *cmd_buffer,
    }
 }
 
+static void
+emit_query_clear_flush(struct anv_cmd_buffer *cmd_buffer,
+                       struct anv_query_pool *pool,
+                       const char *reason)
+{
+   if ((cmd_buffer->state.pending_pipe_bits & ANV_PIPE_QUERY_CLEARS_BIT) == 0)
+      return;
+
+   anv_add_pending_pipe_bits(cmd_buffer,
+                             ANV_PIPE_QUERY_FLUSH_BITS |
+                             ANV_PIPE_END_OF_PIPE_SYNC_BIT,
+                             reason);
+   genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
+}
+
+
 void genX(CmdBeginQuery)(
     VkCommandBuffer                             commandBuffer,
     VkQueryPool                                 queryPool,
@@ -983,6 +1011,8 @@ void genX(CmdBeginQueryIndexedEXT)(
    ANV_FROM_HANDLE(anv_query_pool, pool, queryPool);
    struct anv_address query_addr = anv_query_address(pool, query);
 
+   emit_query_clear_flush(cmd_buffer, pool, "CmdBeginQuery* flush query clears");
+
    struct mi_builder b;
    mi_builder_init(&b, cmd_buffer->device->info, &cmd_buffer->batch);
 
@@ -1352,6 +1382,9 @@ void genX(CmdWriteTimestamp2)(
 
    assert(pool->type == VK_QUERY_TYPE_TIMESTAMP);
 
+   emit_query_clear_flush(cmd_buffer, pool,
+                          "CmdWriteTimestamp flush query clears");
+
    struct mi_builder b;
    mi_builder_init(&b, cmd_buffer->device->info, &cmd_buffer->batch);
 
@@ -1472,33 +1505,42 @@ void genX(CmdCopyQueryPoolResults)(
     * to ensure proper ordering of the commands from the 3d pipe and the
     * command streamer.
     */
-   if (cmd_buffer->state.pending_pipe_bits & ANV_PIPE_RENDER_TARGET_BUFFER_WRITES) {
+   const bool need_flushes =
+      (cmd_buffer->state.pending_pipe_bits &
+       (ANV_PIPE_RENDER_TARGET_BUFFER_WRITES |
+        ANV_PIPE_QUERY_CLEARS_BIT));
+
+   if (need_flushes) {
       anv_add_pending_pipe_bits(cmd_buffer,
-                                ANV_PIPE_TILE_CACHE_FLUSH_BIT |
-                                ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT,
+                                ANV_PIPE_QUERY_FLUSH_BITS | ANV_PIPE_CS_STALL_BIT,
                                 "CopyQueryPoolResults");
    }
 
-   if ((cmd_buffer->state.pending_pipe_bits & ANV_PIPE_FLUSH_BITS) ||
-       /* Occlusion & timestamp queries are written using a PIPE_CONTROL and
-        * because we're about to copy values from MI commands, we need to
-        * stall the command streamer to make sure the PIPE_CONTROL values have
-        * landed, otherwise we could see inconsistent values & availability.
-        *
-        *  From the vulkan spec:
-        *
-        *     "vkCmdCopyQueryPoolResults is guaranteed to see the effect of
-        *     previous uses of vkCmdResetQueryPool in the same queue, without
-        *     any additional synchronization."
-        */
-       pool->type == VK_QUERY_TYPE_OCCLUSION ||
-       pool->type == VK_QUERY_TYPE_TIMESTAMP) {
+   bool need_cs_stall =
+      (cmd_buffer->state.pending_pipe_bits & ANV_PIPE_FLUSH_BITS) ||
+      /* Occlusion & timestamp queries are written using a PIPE_CONTROL and
+       * because we're about to copy values from MI commands, we need to stall
+       * the command streamer to make sure the PIPE_CONTROL values have
+       * landed, otherwise we could see inconsistent values & availability.
+       *
+       *  From the vulkan spec:
+       *
+       *     "vkCmdCopyQueryPoolResults is guaranteed to see the effect of
+       *     previous uses of vkCmdResetQueryPool in the same queue, without
+       *     any additional synchronization."
+       */
+      pool->type == VK_QUERY_TYPE_OCCLUSION ||
+      pool->type == VK_QUERY_TYPE_TIMESTAMP;
+
+   if (need_cs_stall) {
       anv_add_pending_pipe_bits(cmd_buffer,
                                 ANV_PIPE_CS_STALL_BIT,
-                                "CopyQueryPoolResults");
-      genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
+                                "CopyQueryPoolResults stall");
    }
 
+   if (need_cs_stall || need_flushes)
+      genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
+
    struct anv_address dest_addr = anv_address_add(buffer->address, destOffset);
    for (uint32_t i = 0; i < queryCount; i++) {
       struct anv_address query_addr = anv_query_address(pool, firstQuery + i);
index 32be1d7..bf92b5d 100644 (file)
    DRI_CONF_OPT_I(generated_indirect_threshold, def, 0, INT32_MAX, \
                   "Indirect threshold count above which we start generating commands")
 
+#define DRI_CONF_ANV_QUERY_CLEAR_WITH_BLORP_THRESHOLD(def) \
+   DRI_CONF_OPT_I(query_clear_with_blorp_threshold, def, 0, INT32_MAX, \
+                  "Indirect threshold count above which we start generating commands")
+
 /**
  * \brief DZN specific configuration options
  */