radv: fix a synchronization issue with primitives generated query on RDNA1-2
author: Samuel Pitoiset <samuel.pitoiset@gmail.com>
Tue, 3 Oct 2023 15:34:06 +0000 (17:34 +0200)
committer: Eric Engestrom <eric@engestrom.ch>
Mon, 30 Oct 2023 15:49:44 +0000 (15:49 +0000)
Only RDNA1-2 are affected because RADV needs to handle the legacy vs
NGG path for this query, and the NGG results are stored with 2 extra
64-bit values.

Fixes flakes with
dEQP-VK.transform_feedback.primitives_generated_query.* since VKCTS
1.3.7.0.

Cc: mesa-stable
Signed-off-by: Samuel Pitoiset <samuel.pitoiset@gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/25862>
(cherry picked from commit 7d96fe853e1b007293054645caf54ade77a77c90)

.pick_status.json
src/amd/ci/radv-navi21-aco-flakes.txt
src/amd/vulkan/radv_query.c

index edf59ed..12f4eed 100644 (file)
         "description": "radv: fix a synchronization issue with primitives generated query on RDNA1-2",
         "nominated": true,
         "nomination_type": 0,
-        "resolution": 0,
+        "resolution": 1,
         "main_sha": null,
         "because_sha": null,
         "notes": null
index 663dfca..b6e62fe 100644 (file)
@@ -19,6 +19,3 @@ dEQP-VK.ray_tracing_pipeline.pipeline_library.configurations.multithreaded_compi
 dEQP-VK.ray_tracing_pipeline.pipeline_library.configurations.multithreaded_compilation.*_check_capture_replay_handles
 dEQP-VK.ray_tracing_pipeline.pipeline_library.configurations.singlethreaded_compilation.*_check_all_handles
 dEQP-VK.ray_tracing_pipeline.pipeline_library.configurations.singlethreaded_compilation.*_check_capture_replay_handles
-
-# New CTS flakes in 1.3.7.0
-dEQP-VK.transform_feedback.primitives_generated_query.get.*
index 6596e93..7e43604 100644 (file)
@@ -584,7 +584,7 @@ build_timestamp_query_shader(struct radv_device *device)
 }
 
 #define RADV_PGQ_STRIDE     32
-#define RADV_PGQ_STRIDE_GDS (RADV_PGQ_STRIDE + 4 * 2)
+#define RADV_PGQ_STRIDE_GDS (RADV_PGQ_STRIDE + 8 * 2)
 
 static nir_shader *
 build_pg_query_shader(struct radv_device *device)
@@ -663,11 +663,21 @@ build_pg_query_shader(struct radv_device *device)
    nir_def *avails[2];
    avails[0] = nir_channel(&b, load1, 1);
    avails[1] = nir_channel(&b, load2, 1);
-   nir_def *result_is_available =
-      nir_i2b(&b, nir_iand(&b, nir_iand(&b, avails[0], avails[1]), nir_imm_int(&b, 0x80000000)));
+   nir_store_var(&b, available, nir_i2b(&b, nir_iand_imm(&b, nir_iand(&b, avails[0], avails[1]), 0x80000000)), 0x1);
+
+   nir_push_if(&b, uses_gds);
+   {
+      nir_def *gds_avail_start = nir_load_ssbo(&b, 1, 32, src_buf, nir_iadd_imm(&b, input_base, 36), .align_mul = 4);
+      nir_def *gds_avail_end = nir_load_ssbo(&b, 1, 32, src_buf, nir_iadd_imm(&b, input_base, 44), .align_mul = 4);
+      nir_def *gds_result_available =
+         nir_i2b(&b, nir_iand_imm(&b, nir_iand(&b, gds_avail_start, gds_avail_end), 0x80000000));
+
+      nir_store_var(&b, available, nir_iand(&b, nir_load_var(&b, available), gds_result_available), 0x1);
+   }
+   nir_pop_if(&b, NULL);
 
    /* Only compute result if available. */
-   nir_push_if(&b, result_is_available);
+   nir_push_if(&b, nir_load_var(&b, available));
 
    /* Pack values. */
    nir_def *packed64[2];
@@ -684,7 +694,7 @@ build_pg_query_shader(struct radv_device *device)
       nir_def *gds_start =
          nir_load_ssbo(&b, 1, 32, src_buf, nir_iadd(&b, input_base, nir_imm_int(&b, 32)), .align_mul = 4);
       nir_def *gds_end =
-         nir_load_ssbo(&b, 1, 32, src_buf, nir_iadd(&b, input_base, nir_imm_int(&b, 36)), .align_mul = 4);
+         nir_load_ssbo(&b, 1, 32, src_buf, nir_iadd(&b, input_base, nir_imm_int(&b, 40)), .align_mul = 4);
 
       nir_def *ngg_gds_result = nir_isub(&b, gds_end, gds_start);
 
@@ -692,8 +702,6 @@ build_pg_query_shader(struct radv_device *device)
    }
    nir_pop_if(&b, NULL);
 
-   nir_store_var(&b, available, nir_imm_true(&b), 0x1);
-
    nir_pop_if(&b, NULL);
 
    /* Determine if result is 64 or 32 bit. */
@@ -1087,7 +1095,7 @@ radv_create_query_pool(struct radv_device *device, const VkQueryPoolCreateInfo *
    case VK_QUERY_TYPE_PRIMITIVES_GENERATED_EXT:
       if (pool->uses_gds && device->physical_device->rad_info.gfx_level < GFX11) {
          /* When the hardware can use both the legacy and the NGG paths in the same begin/end pair,
-          * allocate 2x32-bit values for the GDS counters.
+          * allocate 2x64-bit values for the GDS counters.
           */
          pool->stride = RADV_PGQ_STRIDE_GDS;
       } else {
@@ -1318,6 +1326,7 @@ radv_GetQueryPoolResults(VkDevice _device, VkQueryPool queryPool, uint32_t first
          break;
       }
       case VK_QUERY_TYPE_PRIMITIVES_GENERATED_EXT: {
+         const bool uses_gds_query = pool->uses_gds && device->physical_device->rad_info.gfx_level < GFX11;
          p_atomic_uint64_t const *src64 = (p_atomic_uint64_t const *)src;
          uint64_t primitive_storage_needed;
 
@@ -1333,6 +1342,10 @@ radv_GetQueryPoolResults(VkDevice _device, VkQueryPool queryPool, uint32_t first
                 !(p_atomic_read(src64 + 2) & 0x8000000000000000UL)) {
                available = 0;
             }
+            if (uses_gds_query && (!(p_atomic_read(src64 + 4) & 0x8000000000000000UL) ||
+                                   !(p_atomic_read(src64 + 5) & 0x8000000000000000UL))) {
+               available = 0;
+            }
          } while (!available && (flags & VK_QUERY_RESULT_WAIT_BIT));
 
          if (!available && !(flags & VK_QUERY_RESULT_PARTIAL_BIT))
@@ -1340,11 +1353,9 @@ radv_GetQueryPoolResults(VkDevice _device, VkQueryPool queryPool, uint32_t first
 
          primitive_storage_needed = src64[2] - src64[0];
 
-         if (pool->uses_gds && device->physical_device->rad_info.gfx_level < GFX11) {
-            uint32_t const *src32 = (uint32_t const *)src;
-
+         if (uses_gds_query) {
             /* Accumulate the result that was copied from GDS in case NGG shader has been used. */
-            primitive_storage_needed += src32[9] - src32[8];
+            primitive_storage_needed += src64[5] - src64[4];
          }
 
          if (flags & VK_QUERY_RESULT_64_BIT) {
@@ -1539,15 +1550,22 @@ radv_CmdCopyQueryPoolResults(VkCommandBuffer commandBuffer, VkQueryPool queryPoo
       break;
    case VK_QUERY_TYPE_PRIMITIVES_GENERATED_EXT:
       if (flags & VK_QUERY_RESULT_WAIT_BIT) {
+         const bool uses_gds_query = pool->uses_gds && cmd_buffer->device->physical_device->rad_info.gfx_level < GFX11;
+
          for (unsigned i = 0; i < queryCount; i++) {
             unsigned query = firstQuery + i;
             uint64_t src_va = va + query * pool->stride;
 
-            radeon_check_space(cmd_buffer->device->ws, cs, 7 * 2);
+            radeon_check_space(cmd_buffer->device->ws, cs, 7 * 4);
 
             /* Wait on the upper word of the PrimitiveStorageNeeded result. */
             radv_cp_wait_mem(cs, cmd_buffer->qf, WAIT_REG_MEM_GREATER_OR_EQUAL, src_va + 4, 0x80000000, 0xffffffff);
             radv_cp_wait_mem(cs, cmd_buffer->qf, WAIT_REG_MEM_GREATER_OR_EQUAL, src_va + 20, 0x80000000, 0xffffffff);
+
+            if (uses_gds_query) {
+               radv_cp_wait_mem(cs, cmd_buffer->qf, WAIT_REG_MEM_GREATER_OR_EQUAL, src_va + 36, 0x80000000, 0xffffffff);
+               radv_cp_wait_mem(cs, cmd_buffer->qf, WAIT_REG_MEM_GREATER_OR_EQUAL, src_va + 44, 0x80000000, 0xffffffff);
+            }
          }
       }
 
@@ -1829,6 +1847,7 @@ emit_begin_query(struct radv_cmd_buffer *cmd_buffer, struct radv_query_pool *poo
          if (pool->uses_gds) {
             /* generated prim counter */
             gfx10_copy_gds_query(cmd_buffer, RADV_SHADER_QUERY_PRIM_GEN_OFFSET(index), va + 32);
+            radv_cs_write_data_imm(cs, V_370_ME, va + 36, 0x80000000);
 
             /* Record that the command buffer needs GDS. */
             cmd_buffer->gds_needed = true;
@@ -1983,7 +2002,8 @@ emit_end_query(struct radv_cmd_buffer *cmd_buffer, struct radv_query_pool *pool,
 
          if (pool->uses_gds) {
             /* generated prim counter */
-            gfx10_copy_gds_query(cmd_buffer, RADV_SHADER_QUERY_PRIM_GEN_OFFSET(index), va + 36);
+            gfx10_copy_gds_query(cmd_buffer, RADV_SHADER_QUERY_PRIM_GEN_OFFSET(index), va + 40);
+            radv_cs_write_data_imm(cs, V_370_ME, va + 44, 0x80000000);
 
             cmd_buffer->state.active_prims_gen_gds_queries--;