radv: simplify the NGG vs legacy pipelinestat query path
author Samuel Pitoiset <samuel.pitoiset@gmail.com>
Wed, 19 Jul 2023 07:12:01 +0000 (09:12 +0200)
committer Samuel Pitoiset <samuel.pitoiset@gmail.com>
Thu, 27 Jul 2023 07:13:11 +0000 (09:13 +0200)
NGG is enabled by default on RDNA1-2, but the driver might fall back to
legacy GS for some reasons, such as XFB. On these generations, the number
of primitives generated by the GS needs to be emulated from the NGG shader
because the hw doesn't increment the related pipelinestat counter.

In order to support both NGG and legacy GS with that query (remember that
we can't know which pipelines will be used when starting/ending queries),
we used to reserve 2x 64-bit counters (begin/end) to store the GDS results,
and those results were accumulated into the pipelinestat result.

Now that legacy GS also uses GDS counters, we can simplify this path
and overwrite the pipelinestat counter directly instead of having two
separate counters.

Signed-off-by: Samuel Pitoiset <samuel.pitoiset@gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/24231>

src/amd/vulkan/radv_query.c

index f9d8e4e..d2d0819 100644 (file)
 static const unsigned pipeline_statistics_indices[] = {7, 6, 3, 4, 5, 2, 1, 0, 8, 9, 10};
 
 static unsigned
+radv_get_pipelinestat_query_offset(VkQueryPipelineStatisticFlagBits query)
+{
+   uint32_t idx = ffs(query) - 1;
+   return pipeline_statistics_indices[idx] * 8;
+}
+
+static unsigned
 radv_get_pipelinestat_query_size(struct radv_device *device)
 {
    unsigned num_results = device->physical_device->rad_info.gfx_level >= GFX11 ? 14 : 11;
@@ -276,25 +283,14 @@ build_pipeline_statistics_query_shader(struct radv_device *device)
    nir_ssa_def *flags = nir_load_push_constant(&b, 1, 32, nir_imm_int(&b, 0), .range = 4);
    nir_ssa_def *stats_mask = nir_load_push_constant(&b, 1, 32, nir_imm_int(&b, 8), .range = 12);
    nir_ssa_def *avail_offset = nir_load_push_constant(&b, 1, 32, nir_imm_int(&b, 12), .range = 16);
-   nir_ssa_def *uses_gds = nir_load_push_constant(&b, 1, 32, nir_imm_int(&b, 16), .range = 20);
 
    nir_ssa_def *dst_buf = radv_meta_load_descriptor(&b, 0, 0);
    nir_ssa_def *src_buf = radv_meta_load_descriptor(&b, 0, 1);
 
    nir_ssa_def *global_id = get_global_ids(&b, 1);
 
-   nir_variable *input_stride = nir_local_variable_create(b.impl, glsl_int_type(), "input_stride");
-   nir_push_if(&b, nir_ine_imm(&b, uses_gds, 0));
-   {
-      nir_store_var(&b, input_stride, nir_imm_int(&b, pipelinestat_block_size * 2 + 8 * 2), 0x1);
-   }
-   nir_push_else(&b, NULL);
-   {
-      nir_store_var(&b, input_stride, nir_imm_int(&b, pipelinestat_block_size * 2), 0x1);
-   }
-   nir_pop_if(&b, NULL);
-
-   nir_ssa_def *input_base = nir_imul(&b, nir_load_var(&b, input_stride), global_id);
+   nir_ssa_def *input_stride = nir_imm_int(&b, pipelinestat_block_size * 2);
+   nir_ssa_def *input_base = nir_imul(&b, input_stride, global_id);
    nir_ssa_def *output_stride = nir_load_push_constant(&b, 1, 32, nir_imm_int(&b, 4), .range = 8);
    nir_ssa_def *output_base = nir_imul(&b, output_stride, global_id);
 
@@ -324,23 +320,6 @@ build_pipeline_statistics_query_shader(struct radv_device *device)
 
       nir_store_var(&b, result, nir_isub(&b, end, start), 0x1);
 
-      nir_push_if(&b,
-                  nir_iand(&b, nir_i2b(&b, uses_gds),
-                           nir_imm_bool(&b, 1u << i == VK_QUERY_PIPELINE_STATISTIC_GEOMETRY_SHADER_PRIMITIVES_BIT)));
-      {
-         /* Compute the GDS result if needed. */
-         nir_ssa_def *gds_start_offset = nir_iadd(&b, input_base, nir_imm_int(&b, pipelinestat_block_size * 2));
-         nir_ssa_def *gds_start = nir_load_ssbo(&b, 1, 64, src_buf, gds_start_offset);
-
-         nir_ssa_def *gds_end_offset = nir_iadd(&b, input_base, nir_imm_int(&b, pipelinestat_block_size * 2 + 8));
-         nir_ssa_def *gds_end = nir_load_ssbo(&b, 1, 64, src_buf, gds_end_offset);
-
-         nir_ssa_def *ngg_gds_result = nir_isub(&b, gds_end, gds_start);
-
-         nir_store_var(&b, result, nir_iadd(&b, nir_load_var(&b, result), ngg_gds_result), 0x1);
-      }
-      nir_pop_if(&b, NULL);
-
       /* Store result */
       nir_push_if(&b, result_is_64bit);
 
@@ -1096,12 +1075,6 @@ radv_create_query_pool(struct radv_device *device, const VkQueryPoolCreateInfo *
       break;
    case VK_QUERY_TYPE_PIPELINE_STATISTICS:
       pool->stride = radv_get_pipelinestat_query_size(device) * 2;
-      if (pool->uses_gds) {
-         /* When the query pool needs GDS (for counting the number of primitives generated by a
-          * geometry shader with NGG), allocate 2x64-bit values for begin/end.
-          */
-         pool->stride += 8 * 2;
-      }
       break;
    case VK_QUERY_TYPE_TIMESTAMP:
    case VK_QUERY_TYPE_ACCELERATION_STRUCTURE_COMPACTED_SIZE_KHR:
@@ -1268,7 +1241,6 @@ radv_GetQueryPoolResults(VkDevice _device, VkQueryPool queryPool, uint32_t first
       case VK_QUERY_TYPE_PIPELINE_STATISTICS: {
          unsigned pipelinestat_block_size = radv_get_pipelinestat_query_size(device);
          const uint32_t *avail_ptr = (const uint32_t *)(pool->ptr + pool->availability_offset + 4 * query);
-         uint64_t ngg_gds_result = 0;
 
          do {
             available = p_atomic_read(avail_ptr);
@@ -1277,14 +1249,6 @@ radv_GetQueryPoolResults(VkDevice _device, VkQueryPool queryPool, uint32_t first
          if (!available && !(flags & VK_QUERY_RESULT_PARTIAL_BIT))
             result = VK_NOT_READY;
 
-         if (pool->uses_gds) {
-            /* Compute the result that was copied from GDS. */
-            const uint64_t *gds_start = (uint64_t *)(src + pipelinestat_block_size * 2);
-            const uint64_t *gds_stop = (uint64_t *)(src + pipelinestat_block_size * 2 + 8);
-
-            ngg_gds_result = gds_stop[0] - gds_start[0];
-         }
-
          const uint64_t *start = (uint64_t *)src;
          const uint64_t *stop = (uint64_t *)(src + pipelinestat_block_size);
          if (flags & VK_QUERY_RESULT_64_BIT) {
@@ -1294,10 +1258,6 @@ radv_GetQueryPoolResults(VkDevice _device, VkQueryPool queryPool, uint32_t first
                if (pool->pipeline_stats_mask & (1u << i)) {
                   if (available || (flags & VK_QUERY_RESULT_PARTIAL_BIT)) {
                      *dst = stop[pipeline_statistics_indices[i]] - start[pipeline_statistics_indices[i]];
-
-                     if (pool->uses_gds && (1u << i) == VK_QUERY_PIPELINE_STATISTIC_GEOMETRY_SHADER_PRIMITIVES_BIT) {
-                        *dst += ngg_gds_result;
-                     }
                   }
                   dst++;
                }
@@ -1310,10 +1270,6 @@ radv_GetQueryPoolResults(VkDevice _device, VkQueryPool queryPool, uint32_t first
                if (pool->pipeline_stats_mask & (1u << i)) {
                   if (available || (flags & VK_QUERY_RESULT_PARTIAL_BIT)) {
                      *dst = stop[pipeline_statistics_indices[i]] - start[pipeline_statistics_indices[i]];
-
-                     if (pool->uses_gds && (1u << i) == VK_QUERY_PIPELINE_STATISTIC_GEOMETRY_SHADER_PRIMITIVES_BIT) {
-                        *dst += ngg_gds_result;
-                     }
                   }
                   dst++;
                }
@@ -1538,7 +1494,7 @@ radv_CmdCopyQueryPoolResults(VkCommandBuffer commandBuffer, VkQueryPool queryPoo
       radv_query_shader(cmd_buffer, &cmd_buffer->device->meta_state.query.pipeline_statistics_query_pipeline, pool->bo,
                         dst_buffer->bo, firstQuery * pool->stride, dst_buffer->offset + dstOffset, pool->stride, stride,
                         dst_size, queryCount, flags, pool->pipeline_stats_mask,
-                        pool->availability_offset + 4 * firstQuery, pool->uses_gds);
+                        pool->availability_offset + 4 * firstQuery, false);
       break;
    case VK_QUERY_TYPE_TIMESTAMP:
    case VK_QUERY_TYPE_ACCELERATION_STRUCTURE_COMPACTED_SIZE_KHR:
@@ -1762,8 +1718,6 @@ emit_begin_query(struct radv_cmd_buffer *cmd_buffer, struct radv_query_pool *poo
       radeon_emit(cs, va >> 32);
       break;
    case VK_QUERY_TYPE_PIPELINE_STATISTICS: {
-      unsigned pipelinestat_block_size = radv_get_pipelinestat_query_size(cmd_buffer->device);
-
       radeon_check_space(cmd_buffer->device->ws, cs, 4);
 
       ++cmd_buffer->state.active_pipeline_queries;
@@ -1778,7 +1732,10 @@ emit_begin_query(struct radv_cmd_buffer *cmd_buffer, struct radv_query_pool *poo
       radeon_emit(cs, va >> 32);
 
       if (pool->uses_gds) {
-         va += pipelinestat_block_size * 2;
+         uint32_t gs_prim_offset =
+            radv_get_pipelinestat_query_offset(VK_QUERY_PIPELINE_STATISTIC_GEOMETRY_SHADER_PRIMITIVES_BIT);
+
+         va += gs_prim_offset;
 
          /* pipeline statistics counter for all streams */
          gfx10_copy_gds_query(cmd_buffer, RADV_SHADER_QUERY_PIPELINE_STAT_OFFSET, va);
@@ -1919,7 +1876,10 @@ emit_end_query(struct radv_cmd_buffer *cmd_buffer, struct radv_query_pool *pool,
                                  EOP_DATA_SEL_VALUE_32BIT, avail_va, 1, cmd_buffer->gfx9_eop_bug_va);
 
       if (pool->uses_gds) {
-         va += pipelinestat_block_size + 8;
+         uint32_t gs_prim_offset =
+            radv_get_pipelinestat_query_offset(VK_QUERY_PIPELINE_STATISTIC_GEOMETRY_SHADER_PRIMITIVES_BIT);
+
+         va += gs_prim_offset;
 
          /* pipeline statistics counter for all streams */
          gfx10_copy_gds_query(cmd_buffer, RADV_SHADER_QUERY_PIPELINE_STAT_OFFSET, va);