src/amd/vulkan/radv_query.c
1 /*
2  * Copyright 2016 Red Hat Inc.
3  * Based on anv:
4  * Copyright © 2015 Intel Corporation
5  *
6  * Permission is hereby granted, free of charge, to any person obtaining a
7  * copy of this software and associated documentation files (the "Software"),
8  * to deal in the Software without restriction, including without limitation
9  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
10  * and/or sell copies of the Software, and to permit persons to whom the
11  * Software is furnished to do so, subject to the following conditions:
12  *
13  * The above copyright notice and this permission notice (including the next
14  * paragraph) shall be included in all copies or substantial portions of the
15  * Software.
16  *
17  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
18  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
19  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
20  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
21  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
22  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
23  * IN THE SOFTWARE.
24  */
25
26 #include <assert.h>
27 #include <fcntl.h>
28 #include <stdbool.h>
29 #include <string.h>
30
31 #include "nir/nir_builder.h"
32 #include "util/u_atomic.h"
33 #include "vulkan/vulkan_core.h"
34 #include "radv_acceleration_structure.h"
35 #include "radv_cs.h"
36 #include "radv_meta.h"
37 #include "radv_private.h"
38 #include "sid.h"
39
40 #define TIMESTAMP_NOT_READY UINT64_MAX
41
42 /* TODO: Add support for mesh/task queries on GFX11 */
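/* Maps each VkQueryPipelineStatisticFlagBits position (bit i below) to the slot at which the
 * corresponding counter is written in the hardware's pipeline-statistics sample block, so that
 * results can be copied out in Vulkan bit order.
 */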
43 static const unsigned pipeline_statistics_indices[] = {7, 6, 3, 4, 5, 2, 1, 0, 8, 9, 10};
44
45 static unsigned
46 radv_get_pipelinestat_query_size(struct radv_device *device)
47 {
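   /* One 64-bit value per counter: 11 counters before GFX11, 14 on GFX11 (presumably including
    * the task/mesh counters mentioned in the TODO above).
    */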
48    unsigned num_results = device->physical_device->rad_info.gfx_level >= GFX11 ? 14 : 11;
49    return num_results * 8;
50 }
51
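/* Helper shared by the copy shaders below: when VK_QUERY_RESULT_WITH_AVAILABILITY_BIT is set,
 * store the 32-bit availability value at the given offset, zero-extended to 64 bits if
 * VK_QUERY_RESULT_64_BIT was also requested.
 */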
52 static void
53 radv_store_availability(nir_builder *b, nir_ssa_def *flags, nir_ssa_def *dst_buf,
54                         nir_ssa_def *offset, nir_ssa_def *value32)
55 {
56    nir_push_if(b, nir_test_mask(b, flags, VK_QUERY_RESULT_WITH_AVAILABILITY_BIT));
57
58    nir_push_if(b, nir_test_mask(b, flags, VK_QUERY_RESULT_64_BIT));
59
60    nir_store_ssbo(b, nir_vec2(b, value32, nir_imm_int(b, 0)), dst_buf, offset, .align_mul = 8);
61
62    nir_push_else(b, NULL);
63
64    nir_store_ssbo(b, value32, dst_buf, offset);
65
66    nir_pop_if(b, NULL);
67
68    nir_pop_if(b, NULL);
69 }
70
71 static nir_shader *
72 build_occlusion_query_shader(struct radv_device *device)
73 {
74    /* the shader this builds is roughly
75     *
76     * push constants {
77     *   uint32_t flags;
78     *   uint32_t dst_stride;
79     * };
80     *
81     * uint32_t src_stride = 16 * db_count;
82     *
83     * layout(binding = 0) buffer dst_buf;
84     * layout(binding = 1) buffer src_buf;
85     *
86     * void main() {
87     *   uint64_t result = 0;
88     *   uint64_t src_offset = src_stride * global_id.x;
89     *   uint64_t dst_offset = dst_stride * global_id.x;
90     *   bool available = true;
91     *   for (int i = 0; i < db_count; ++i) {
92     *           if (enabled_rb_mask & (1 << i)) {
93     *                   uint64_t start = src_buf[src_offset + 16 * i];
94     *                   uint64_t end = src_buf[src_offset + 16 * i + 8];
95     *                   if ((start & (1ull << 63)) && (end & (1ull << 63)))
96     *                           result += end - start;
97     *                   else
98     *                           available = false;
99     *           }
100     *   }
101     *   uint32_t elem_size = flags & VK_QUERY_RESULT_64_BIT ? 8 : 4;
102     *   if ((flags & VK_QUERY_RESULT_PARTIAL_BIT) || available) {
103     *           if (flags & VK_QUERY_RESULT_64_BIT)
104     *                   dst_buf[dst_offset] = result;
105     *           else
106     *                   dst_buf[dst_offset] = (uint32_t)result;
107     *   }
108     *   if (flags & VK_QUERY_RESULT_WITH_AVAILABILITY_BIT) {
109     *           dst_buf[dst_offset + elem_size] = available;
110     *   }
111     * }
112     */
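   /* Each enabled render backend writes a {begin, end} pair of 64-bit ZPASS counts into the query
    * slot (16 bytes per RB), with bit 63 set once the value has landed; that is what the sign-bit
    * checks below test.
    */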
113    nir_builder b = radv_meta_init_shader(device, MESA_SHADER_COMPUTE, "occlusion_query");
114    b.shader->info.workgroup_size[0] = 64;
115
116    nir_variable *result = nir_local_variable_create(b.impl, glsl_uint64_t_type(), "result");
117    nir_variable *outer_counter =
118       nir_local_variable_create(b.impl, glsl_int_type(), "outer_counter");
119    nir_variable *start = nir_local_variable_create(b.impl, glsl_uint64_t_type(), "start");
120    nir_variable *end = nir_local_variable_create(b.impl, glsl_uint64_t_type(), "end");
121    nir_variable *available = nir_local_variable_create(b.impl, glsl_bool_type(), "available");
122    unsigned enabled_rb_mask = device->physical_device->rad_info.enabled_rb_mask;
123    unsigned db_count = device->physical_device->rad_info.max_render_backends;
124
125    nir_ssa_def *flags = nir_load_push_constant(&b, 1, 32, nir_imm_int(&b, 0), .range = 4);
126
127    nir_ssa_def *dst_buf = radv_meta_load_descriptor(&b, 0, 0);
128    nir_ssa_def *src_buf = radv_meta_load_descriptor(&b, 0, 1);
129
130    nir_ssa_def *global_id = get_global_ids(&b, 1);
131
132    nir_ssa_def *input_stride = nir_imm_int(&b, db_count * 16);
133    nir_ssa_def *input_base = nir_imul(&b, input_stride, global_id);
134    nir_ssa_def *output_stride = nir_load_push_constant(&b, 1, 32, nir_imm_int(&b, 4), .range = 8);
135    nir_ssa_def *output_base = nir_imul(&b, output_stride, global_id);
136
137    nir_store_var(&b, result, nir_imm_int64(&b, 0), 0x1);
138    nir_store_var(&b, outer_counter, nir_imm_int(&b, 0), 0x1);
139    nir_store_var(&b, available, nir_imm_true(&b), 0x1);
140
141    nir_push_loop(&b);
142
143    nir_ssa_def *current_outer_count = nir_load_var(&b, outer_counter);
144    radv_break_on_count(&b, outer_counter, nir_imm_int(&b, db_count));
145
146    nir_ssa_def *enabled_cond =
147       nir_iand_imm(&b, nir_ishl(&b, nir_imm_int(&b, 1), current_outer_count), enabled_rb_mask);
148
149    nir_push_if(&b, nir_i2b(&b, enabled_cond));
150
151    nir_ssa_def *load_offset = nir_imul_imm(&b, current_outer_count, 16);
152    load_offset = nir_iadd(&b, input_base, load_offset);
153
154    nir_ssa_def *load = nir_load_ssbo(&b, 2, 64, src_buf, load_offset, .align_mul = 16);
155
156    nir_store_var(&b, start, nir_channel(&b, load, 0), 0x1);
157    nir_store_var(&b, end, nir_channel(&b, load, 1), 0x1);
158
159    nir_ssa_def *start_done = nir_ilt(&b, nir_load_var(&b, start), nir_imm_int64(&b, 0));
160    nir_ssa_def *end_done = nir_ilt(&b, nir_load_var(&b, end), nir_imm_int64(&b, 0));
161
162    nir_push_if(&b, nir_iand(&b, start_done, end_done));
163
164    nir_store_var(&b, result,
165                  nir_iadd(&b, nir_load_var(&b, result),
166                           nir_isub(&b, nir_load_var(&b, end), nir_load_var(&b, start))),
167                  0x1);
168
169    nir_push_else(&b, NULL);
170
171    nir_store_var(&b, available, nir_imm_false(&b), 0x1);
172
173    nir_pop_if(&b, NULL);
174    nir_pop_if(&b, NULL);
175    nir_pop_loop(&b, NULL);
176
177    /* Store the result if complete or if partial results have been requested. */
178
179    nir_ssa_def *result_is_64bit = nir_test_mask(&b, flags, VK_QUERY_RESULT_64_BIT);
180    nir_ssa_def *result_size =
181       nir_bcsel(&b, result_is_64bit, nir_imm_int(&b, 8), nir_imm_int(&b, 4));
182    nir_push_if(&b, nir_ior(&b, nir_test_mask(&b, flags, VK_QUERY_RESULT_PARTIAL_BIT),
183                            nir_load_var(&b, available)));
184
185    nir_push_if(&b, result_is_64bit);
186
187    nir_store_ssbo(&b, nir_load_var(&b, result), dst_buf, output_base, .align_mul = 8);
188
189    nir_push_else(&b, NULL);
190
191    nir_store_ssbo(&b, nir_u2u32(&b, nir_load_var(&b, result)), dst_buf, output_base,
192                   .align_mul = 8);
193
194    nir_pop_if(&b, NULL);
195    nir_pop_if(&b, NULL);
196
197    radv_store_availability(&b, flags, dst_buf, nir_iadd(&b, result_size, output_base),
198                            nir_b2i32(&b, nir_load_var(&b, available)));
199
200    return b.shader;
201 }
202
203 static nir_shader *
204 build_pipeline_statistics_query_shader(struct radv_device *device)
205 {
206    unsigned pipelinestat_block_size = radv_get_pipelinestat_query_size(device);
207
208    /* the shader this builds is roughly
209     *
210     * push constants {
211     *   uint32_t flags;
212     *   uint32_t dst_stride;
213     *   uint32_t stats_mask;
214     *   uint32_t avail_offset;
215     * };
216     *
217     * uint32_t src_stride = pipelinestat_block_size * 2;
218     *
219     * layout(binding = 0) buffer dst_buf;
220     * layout(binding = 1) buffer src_buf;
221     *
222     * void main() {
223     *   uint64_t src_offset = src_stride * global_id.x;
224     *   uint64_t dst_base = dst_stride * global_id.x;
225     *   uint64_t dst_offset = dst_base;
226     *   uint32_t elem_size = flags & VK_QUERY_RESULT_64_BIT ? 8 : 4;
227     *   uint32_t elem_count = stats_mask >> 16;
228     *   uint32_t available32 = src_buf[avail_offset + 4 * global_id.x];
229     *   if (flags & VK_QUERY_RESULT_WITH_AVAILABILITY_BIT) {
230     *           dst_buf[dst_offset + elem_count * elem_size] = available32;
231     *   }
232     *   if ((bool)available32) {
233     *           // repeat 11 times:
234     *           if (stats_mask & (1 << 0)) {
235     *                   uint64_t start = src_buf[src_offset + 8 * indices[0]];
236     *                   uint64_t end = src_buf[src_offset + 8 * indices[0] + pipelinestat_block_size];
237     *                   uint64_t result = end - start;
238     *                   if (flags & VK_QUERY_RESULT_64_BIT)
239     *                           dst_buf[dst_offset] = result;
240     *                   else dst_buf[dst_offset] = (uint32_t)result;
241     *                   dst_offset += elem_size;
242     *           }
243     *   } else if (flags & VK_QUERY_RESULT_PARTIAL_BIT) {
244     *              // Set everything to 0 as we don't know what is valid.
245     *           for (int i = 0; i < elem_count; ++i)
246     *                   dst_buf[dst_base + elem_size * i] = 0;
247     *   }
248     * }
249     */
250    nir_builder b = radv_meta_init_shader(device, MESA_SHADER_COMPUTE, "pipeline_statistics_query");
251    b.shader->info.workgroup_size[0] = 64;
252
253    nir_variable *output_offset =
254       nir_local_variable_create(b.impl, glsl_int_type(), "output_offset");
255    nir_variable *result =
256       nir_local_variable_create(b.impl, glsl_int64_t_type(), "result");
257
258    nir_ssa_def *flags = nir_load_push_constant(&b, 1, 32, nir_imm_int(&b, 0), .range = 4);
259    nir_ssa_def *stats_mask = nir_load_push_constant(&b, 1, 32, nir_imm_int(&b, 8), .range = 12);
260    nir_ssa_def *avail_offset = nir_load_push_constant(&b, 1, 32, nir_imm_int(&b, 12), .range = 16);
261    nir_ssa_def *uses_gds = nir_load_push_constant(&b, 1, 32, nir_imm_int(&b, 16), .range = 20);
262
263    nir_ssa_def *dst_buf = radv_meta_load_descriptor(&b, 0, 0);
264    nir_ssa_def *src_buf = radv_meta_load_descriptor(&b, 0, 1);
265
266    nir_ssa_def *global_id = get_global_ids(&b, 1);
267
268    nir_variable *input_stride = nir_local_variable_create(b.impl, glsl_int_type(), "input_stride");
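   /* Per-query source layout, matching the pool stride computed in radv_CreateQueryPool():
    * two pipeline-statistics blocks (begin/end), optionally followed by a begin/end pair of
    * 64-bit GDS counters when NGG is used, so the stride is selected from the uses_gds push
    * constant at runtime.
    */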
269    nir_push_if(&b, nir_ine(&b, uses_gds, nir_imm_int(&b, 0)));
270    {
271       nir_store_var(&b, input_stride, nir_imm_int(&b, pipelinestat_block_size * 2 + 8 * 2), 0x1);
272    }
273    nir_push_else(&b, NULL);
274    {
275       nir_store_var(&b, input_stride, nir_imm_int(&b, pipelinestat_block_size * 2), 0x1);
276    }
277    nir_pop_if(&b, NULL);
278
279    nir_ssa_def *input_base = nir_imul(&b, nir_load_var(&b, input_stride), global_id);
280    nir_ssa_def *output_stride = nir_load_push_constant(&b, 1, 32, nir_imm_int(&b, 4), .range = 8);
281    nir_ssa_def *output_base = nir_imul(&b, output_stride, global_id);
282
283    avail_offset = nir_iadd(&b, avail_offset, nir_imul_imm(&b, global_id, 4));
284
285    nir_ssa_def *available32 = nir_load_ssbo(&b, 1, 32, src_buf, avail_offset);
286
287    nir_ssa_def *result_is_64bit = nir_test_mask(&b, flags, VK_QUERY_RESULT_64_BIT);
288    nir_ssa_def *elem_size = nir_bcsel(&b, result_is_64bit, nir_imm_int(&b, 8), nir_imm_int(&b, 4));
289    nir_ssa_def *elem_count = nir_ushr_imm(&b, stats_mask, 16);
290
291    radv_store_availability(&b, flags, dst_buf,
292                            nir_iadd(&b, output_base, nir_imul(&b, elem_count, elem_size)),
293                            available32);
294
295    nir_push_if(&b, nir_i2b(&b, available32));
296
297    nir_store_var(&b, output_offset, output_base, 0x1);
298    for (int i = 0; i < ARRAY_SIZE(pipeline_statistics_indices); ++i) {
299       nir_push_if(&b, nir_test_mask(&b, stats_mask, BITFIELD64_BIT(i)));
300
301       nir_ssa_def *start_offset = nir_iadd_imm(&b, input_base, pipeline_statistics_indices[i] * 8);
302       nir_ssa_def *start = nir_load_ssbo(&b, 1, 64, src_buf, start_offset);
303
304       nir_ssa_def *end_offset =
305          nir_iadd_imm(&b, input_base, pipeline_statistics_indices[i] * 8 + pipelinestat_block_size);
306       nir_ssa_def *end = nir_load_ssbo(&b, 1, 64, src_buf, end_offset);
307
308       nir_store_var(&b, result, nir_isub(&b, end, start), 0x1);
309
310       nir_push_if(&b, nir_iand(&b, nir_i2b(&b, uses_gds),
311                                nir_ieq(&b, nir_imm_int(&b, 1u << i),
312                                        nir_imm_int(&b, VK_QUERY_PIPELINE_STATISTIC_GEOMETRY_SHADER_PRIMITIVES_BIT))));
313       {
314          /* Compute the GDS result if needed. */
315          nir_ssa_def *gds_start_offset =
316             nir_iadd(&b, input_base, nir_imm_int(&b, pipelinestat_block_size * 2));
317          nir_ssa_def *gds_start = nir_load_ssbo(&b, 1, 64, src_buf, gds_start_offset);
318
319          nir_ssa_def *gds_end_offset =
320             nir_iadd(&b, input_base, nir_imm_int(&b, pipelinestat_block_size * 2 + 8));
321          nir_ssa_def *gds_end = nir_load_ssbo(&b, 1, 64, src_buf, gds_end_offset);
322
323          nir_ssa_def *ngg_gds_result = nir_isub(&b, gds_end, gds_start);
324
325          nir_store_var(&b, result, nir_iadd(&b, nir_load_var(&b, result), ngg_gds_result), 0x1);
326       }
327       nir_pop_if(&b, NULL);
328
329       /* Store result */
330       nir_push_if(&b, result_is_64bit);
331
332       nir_store_ssbo(&b, nir_load_var(&b, result), dst_buf, nir_load_var(&b, output_offset));
333
334       nir_push_else(&b, NULL);
335
336       nir_store_ssbo(&b, nir_u2u32(&b, nir_load_var(&b, result)), dst_buf, nir_load_var(&b, output_offset));
337
338       nir_pop_if(&b, NULL);
339
340       nir_store_var(&b, output_offset, nir_iadd(&b, nir_load_var(&b, output_offset), elem_size),
341                     0x1);
342
343       nir_pop_if(&b, NULL);
344    }
345
346    nir_push_else(&b, NULL); /* nir_i2b(&b, available32) */
347
348    nir_push_if(&b, nir_test_mask(&b, flags, VK_QUERY_RESULT_PARTIAL_BIT));
349
350    /* Stores zeros in all outputs. */
351
352    nir_variable *counter = nir_local_variable_create(b.impl, glsl_int_type(), "counter");
353    nir_store_var(&b, counter, nir_imm_int(&b, 0), 0x1);
354
355    nir_loop *loop = nir_push_loop(&b);
356
357    nir_ssa_def *current_counter = nir_load_var(&b, counter);
358    radv_break_on_count(&b, counter, elem_count);
359
360    nir_ssa_def *output_elem = nir_iadd(&b, output_base, nir_imul(&b, elem_size, current_counter));
361    nir_push_if(&b, result_is_64bit);
362
363    nir_store_ssbo(&b, nir_imm_int64(&b, 0), dst_buf, output_elem);
364
365    nir_push_else(&b, NULL);
366
367    nir_store_ssbo(&b, nir_imm_int(&b, 0), dst_buf, output_elem);
368
369    nir_pop_if(&b, NULL);
370
371    nir_pop_loop(&b, loop);
372    nir_pop_if(&b, NULL); /* VK_QUERY_RESULT_PARTIAL_BIT */
373    nir_pop_if(&b, NULL); /* nir_i2b(&b, available32) */
374    return b.shader;
375 }
376
377 static nir_shader *
378 build_tfb_query_shader(struct radv_device *device)
379 {
380    /* the shader this builds is roughly
381     *
382     * uint32_t src_stride = 32;
383     *
384     * layout(binding = 0) buffer dst_buf;
385     * layout(binding = 1) buffer src_buf;
386     *
387     * void main() {
388     *   uint64_t result[2] = {};
389     *   bool available = false;
390     *   uint64_t src_offset = src_stride * global_id.x;
391     *   uint64_t dst_offset = dst_stride * global_id.x;
392     *   uint64_t *src_data = src_buf[src_offset];
393     *   uint32_t avail = (src_data[0] >> 32) &
394     *                    (src_data[1] >> 32) &
395     *                    (src_data[2] >> 32) &
396     *                    (src_data[3] >> 32);
397     *   if (avail & 0x80000000) {
398     *           result[0] = src_data[3] - src_data[1];
399     *           result[1] = src_data[2] - src_data[0];
400     *           available = true;
401     *   }
402     *   uint32_t result_size = flags & VK_QUERY_RESULT_64_BIT ? 16 : 8;
403     *   if ((flags & VK_QUERY_RESULT_PARTIAL_BIT) || available) {
404     *           if (flags & VK_QUERY_RESULT_64_BIT) {
405     *                   dst_buf[dst_offset] = result;
406     *           } else {
407     *                   dst_buf[dst_offset] = (uint32_t)result;
408     *           }
409     *   }
410     *   if (flags & VK_QUERY_RESULT_WITH_AVAILABILITY_BIT) {
411     *           dst_buf[dst_offset + result_size] = available;
412     *   }
413     * }
414     */
415    nir_builder b = radv_meta_init_shader(device, MESA_SHADER_COMPUTE, "tfb_query");
416    b.shader->info.workgroup_size[0] = 64;
417
418    /* Create and initialize local variables. */
419    nir_variable *result =
420       nir_local_variable_create(b.impl, glsl_vector_type(GLSL_TYPE_UINT64, 2), "result");
421    nir_variable *available = nir_local_variable_create(b.impl, glsl_bool_type(), "available");
422
423    nir_store_var(&b, result, nir_vec2(&b, nir_imm_int64(&b, 0), nir_imm_int64(&b, 0)), 0x3);
424    nir_store_var(&b, available, nir_imm_false(&b), 0x1);
425
426    nir_ssa_def *flags = nir_load_push_constant(&b, 1, 32, nir_imm_int(&b, 0), .range = 4);
427
428    /* Load resources. */
429    nir_ssa_def *dst_buf = radv_meta_load_descriptor(&b, 0, 0);
430    nir_ssa_def *src_buf = radv_meta_load_descriptor(&b, 0, 1);
431
432    /* Compute global ID. */
433    nir_ssa_def *global_id = get_global_ids(&b, 1);
434
435    /* Compute src/dst strides. */
436    nir_ssa_def *input_stride = nir_imm_int(&b, 32);
437    nir_ssa_def *input_base = nir_imul(&b, input_stride, global_id);
438    nir_ssa_def *output_stride = nir_load_push_constant(&b, 1, 32, nir_imm_int(&b, 4), .range = 8);
439    nir_ssa_def *output_base = nir_imul(&b, output_stride, global_id);
440
441    /* Load data from the query pool. */
442    nir_ssa_def *load1 = nir_load_ssbo(&b, 4, 32, src_buf, input_base, .align_mul = 32);
443    nir_ssa_def *load2 =
444       nir_load_ssbo(&b, 4, 32, src_buf, nir_iadd_imm(&b, input_base, 16), .align_mul = 16);
445
446    /* Check if result is available. */
447    nir_ssa_def *avails[2];
448    avails[0] = nir_iand(&b, nir_channel(&b, load1, 1), nir_channel(&b, load1, 3));
449    avails[1] = nir_iand(&b, nir_channel(&b, load2, 1), nir_channel(&b, load2, 3));
450    nir_ssa_def *result_is_available =
451       nir_test_mask(&b, nir_iand(&b, avails[0], avails[1]), 0x80000000);
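   /* The hardware sets the top bit of the high dword of each streamout counter once the value has
    * been written, so the result is only available when bit 31 survives the AND across all four
    * counters.
    */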
452
453    /* Only compute result if available. */
454    nir_push_if(&b, result_is_available);
455
456    /* Pack values. */
457    nir_ssa_def *packed64[4];
458    packed64[0] =
459       nir_pack_64_2x32(&b, nir_vec2(&b, nir_channel(&b, load1, 0), nir_channel(&b, load1, 1)));
460    packed64[1] =
461       nir_pack_64_2x32(&b, nir_vec2(&b, nir_channel(&b, load1, 2), nir_channel(&b, load1, 3)));
462    packed64[2] =
463       nir_pack_64_2x32(&b, nir_vec2(&b, nir_channel(&b, load2, 0), nir_channel(&b, load2, 1)));
464    packed64[3] =
465       nir_pack_64_2x32(&b, nir_vec2(&b, nir_channel(&b, load2, 2), nir_channel(&b, load2, 3)));
466
467    /* Compute result. */
468    nir_ssa_def *num_primitive_written = nir_isub(&b, packed64[3], packed64[1]);
469    nir_ssa_def *primitive_storage_needed = nir_isub(&b, packed64[2], packed64[0]);
470
471    nir_store_var(&b, result, nir_vec2(&b, num_primitive_written, primitive_storage_needed), 0x3);
472    nir_store_var(&b, available, nir_imm_true(&b), 0x1);
473
474    nir_pop_if(&b, NULL);
475
476    /* Determine if result is 64 or 32 bit. */
477    nir_ssa_def *result_is_64bit = nir_test_mask(&b, flags, VK_QUERY_RESULT_64_BIT);
478    nir_ssa_def *result_size =
479       nir_bcsel(&b, result_is_64bit, nir_imm_int(&b, 16), nir_imm_int(&b, 8));
480
481    /* Store the result if complete or partial results have been requested. */
482    nir_push_if(&b, nir_ior(&b, nir_test_mask(&b, flags, VK_QUERY_RESULT_PARTIAL_BIT),
483                            nir_load_var(&b, available)));
484
485    /* Store result. */
486    nir_push_if(&b, result_is_64bit);
487
488    nir_store_ssbo(&b, nir_load_var(&b, result), dst_buf, output_base);
489
490    nir_push_else(&b, NULL);
491
492    nir_store_ssbo(&b, nir_u2u32(&b, nir_load_var(&b, result)), dst_buf, output_base);
493
494    nir_pop_if(&b, NULL);
495    nir_pop_if(&b, NULL);
496
497    radv_store_availability(&b, flags, dst_buf, nir_iadd(&b, result_size, output_base),
498                            nir_b2i32(&b, nir_load_var(&b, available)));
499
500    return b.shader;
501 }
502
503 static nir_shader *
504 build_timestamp_query_shader(struct radv_device *device)
505 {
506    /* the shader this builds is roughly
507     *
508     * uint32_t src_stride = 8;
509     *
510     * layout(binding = 0) buffer dst_buf;
511     * layout(binding = 1) buffer src_buf;
512     *
513     * void main() {
514     *   uint64_t result = 0;
515     *   bool available = false;
516     *   uint64_t src_offset = src_stride * global_id.x;
517     *   uint64_t dst_offset = dst_stride * global_id.x;
518     *   uint64_t timestamp = src_buf[src_offset];
519     *   if (timestamp != TIMESTAMP_NOT_READY) {
520     *           result = timestamp;
521     *           available = true;
522     *   }
523     *   uint32_t result_size = flags & VK_QUERY_RESULT_64_BIT ? 8 : 4;
524     *   if ((flags & VK_QUERY_RESULT_PARTIAL_BIT) || available) {
525     *           if (flags & VK_QUERY_RESULT_64_BIT) {
526     *                   dst_buf[dst_offset] = result;
527     *           } else {
528     *                   dst_buf[dst_offset] = (uint32_t)result;
529     *           }
530     *   }
531     *   if (flags & VK_QUERY_RESULT_WITH_AVAILABILITY_BIT) {
532     *           dst_buf[dst_offset + result_size] = available;
533     *   }
534     * }
535     */
536    nir_builder b = radv_meta_init_shader(device, MESA_SHADER_COMPUTE, "timestamp_query");
537    b.shader->info.workgroup_size[0] = 64;
538
539    /* Create and initialize local variables. */
540    nir_variable *result = nir_local_variable_create(b.impl, glsl_uint64_t_type(), "result");
541    nir_variable *available = nir_local_variable_create(b.impl, glsl_bool_type(), "available");
542
543    nir_store_var(&b, result, nir_imm_int64(&b, 0), 0x1);
544    nir_store_var(&b, available, nir_imm_false(&b), 0x1);
545
546    nir_ssa_def *flags = nir_load_push_constant(&b, 1, 32, nir_imm_int(&b, 0), .range = 4);
547
548    /* Load resources. */
549    nir_ssa_def *dst_buf = radv_meta_load_descriptor(&b, 0, 0);
550    nir_ssa_def *src_buf = radv_meta_load_descriptor(&b, 0, 1);
551
552    /* Compute global ID. */
553    nir_ssa_def *global_id = get_global_ids(&b, 1);
554
555    /* Compute src/dst strides. */
556    nir_ssa_def *input_stride = nir_imm_int(&b, 8);
557    nir_ssa_def *input_base = nir_imul(&b, input_stride, global_id);
558    nir_ssa_def *output_stride = nir_load_push_constant(&b, 1, 32, nir_imm_int(&b, 4), .range = 8);
559    nir_ssa_def *output_base = nir_imul(&b, output_stride, global_id);
560
561    /* Load data from the query pool. */
562    nir_ssa_def *load = nir_load_ssbo(&b, 2, 32, src_buf, input_base, .align_mul = 8);
563
564    /* Pack the timestamp. */
565    nir_ssa_def *timestamp;
566    timestamp =
567       nir_pack_64_2x32(&b, nir_vec2(&b, nir_channel(&b, load, 0), nir_channel(&b, load, 1)));
568
569    /* Check if result is available. */
570    nir_ssa_def *result_is_available = nir_i2b(&b, nir_ine_imm(&b, timestamp, TIMESTAMP_NOT_READY));
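   /* Timestamp slots are filled with TIMESTAMP_NOT_READY (UINT64_MAX) when the pool is reset, so
    * any other value means the GPU has written the timestamp.
    */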
571
572    /* Only store result if available. */
573    nir_push_if(&b, result_is_available);
574
575    nir_store_var(&b, result, timestamp, 0x1);
576    nir_store_var(&b, available, nir_imm_true(&b), 0x1);
577
578    nir_pop_if(&b, NULL);
579
580    /* Determine if result is 64 or 32 bit. */
581    nir_ssa_def *result_is_64bit = nir_test_mask(&b, flags, VK_QUERY_RESULT_64_BIT);
582    nir_ssa_def *result_size =
583       nir_bcsel(&b, result_is_64bit, nir_imm_int(&b, 8), nir_imm_int(&b, 4));
584
585    /* Store the result if complete or partial results have been requested. */
586    nir_push_if(&b, nir_ior(&b, nir_test_mask(&b, flags, VK_QUERY_RESULT_PARTIAL_BIT),
587                            nir_load_var(&b, available)));
588
589    /* Store result. */
590    nir_push_if(&b, result_is_64bit);
591
592    nir_store_ssbo(&b, nir_load_var(&b, result), dst_buf, output_base);
593
594    nir_push_else(&b, NULL);
595
596    nir_store_ssbo(&b, nir_u2u32(&b, nir_load_var(&b, result)), dst_buf, output_base);
597
598    nir_pop_if(&b, NULL);
599
600    nir_pop_if(&b, NULL);
601
602    radv_store_availability(&b, flags, dst_buf, nir_iadd(&b, result_size, output_base),
603                            nir_b2i32(&b, nir_load_var(&b, available)));
604
605    return b.shader;
606 }
607
608 static nir_shader *
609 build_pg_query_shader(struct radv_device *device)
610 {
611    /* the shader this builds is roughly
612     *
613     * uint32_t src_stride = 32;
614     *
615     * layout(binding = 0) buffer dst_buf;
616     * layout(binding = 1) buffer src_buf;
617     *
618     * void main() {
619     *   uint64_t result = {};
620     *   bool available = false;
621     *   uint64_t src_offset = src_stride * global_id.x;
622     *   uint64_t dst_offset = dst_stride * global_id.x;
623     *   uint64_t *src_data = src_buf[src_offset];
624     *   uint32_t avail = (src_data[0] >> 32) &
625     *                    (src_data[2] >> 32);
626     *   if (avail & 0x80000000) {
627     *           result = src_data[2] - src_data[0];
628     *           if (use_gds) {
629     *                   uint32_t ngg_gds_result = 0;
630     *                   ngg_gds_result += src_data[9] - src_data[8];
631     *                   result += (uint64_t)ngg_gds_result;
632     *           }
633     *           available = true;
634     *   }
635     *   uint32_t result_size = flags & VK_QUERY_RESULT_64_BIT ? 16 : 8;
636     *   if ((flags & VK_QUERY_RESULT_PARTIAL_BIT) || available) {
637     *           if (flags & VK_QUERY_RESULT_64_BIT) {
638     *                   dst_buf[dst_offset] = result;
639     *           } else {
640     *                   dst_buf[dst_offset] = (uint32_t)result;
641     *           }
642     *   }
643     *   if (flags & VK_QUERY_RESULT_WITH_AVAILABILITY_BIT) {
644     *           dst_buf[dst_offset + result_size] = available;
645     *   }
646     * }
647     */
648    nir_builder b = radv_meta_init_shader(device, MESA_SHADER_COMPUTE, "pg_query");
649    b.shader->info.workgroup_size[0] = 64;
650
651    /* Create and initialize local variables. */
652    nir_variable *result =
653       nir_local_variable_create(b.impl, glsl_uint64_t_type(), "result");
654    nir_variable *available = nir_local_variable_create(b.impl, glsl_bool_type(), "available");
655
656    nir_store_var(&b, result, nir_imm_int64(&b, 0), 0x1);
657    nir_store_var(&b, available, nir_imm_false(&b), 0x1);
658
659    nir_ssa_def *flags = nir_load_push_constant(&b, 1, 32, nir_imm_int(&b, 0), .range = 16);
660
661    /* Load resources. */
662    nir_ssa_def *dst_buf = radv_meta_load_descriptor(&b, 0, 0);
663    nir_ssa_def *src_buf = radv_meta_load_descriptor(&b, 0, 1);
664
665    /* Compute global ID. */
666    nir_ssa_def *global_id = get_global_ids(&b, 1);
667
668    /* Compute src/dst strides. */
669    nir_ssa_def *input_stride = nir_imm_int(&b, 32);
670    nir_ssa_def *input_base = nir_imul(&b, input_stride, global_id);
671    nir_ssa_def *output_stride = nir_load_push_constant(&b, 1, 32, nir_imm_int(&b, 4), .range = 16);
672    nir_ssa_def *output_base = nir_imul(&b, output_stride, global_id);
673
674    /* Load data from the query pool. */
675    nir_ssa_def *load1 = nir_load_ssbo(&b, 2, 32, src_buf, input_base, .align_mul = 32);
676    nir_ssa_def *load2 = nir_load_ssbo(
677       &b, 2, 32, src_buf, nir_iadd(&b, input_base, nir_imm_int(&b, 16)), .align_mul = 16);
678
679    /* Check if result is available. */
680    nir_ssa_def *avails[2];
681    avails[0] = nir_channel(&b, load1, 1);
682    avails[1] = nir_channel(&b, load2, 1);
683    nir_ssa_def *result_is_available =
684       nir_i2b(&b, nir_iand(&b, nir_iand(&b, avails[0], avails[1]), nir_imm_int(&b, 0x80000000)));
685
686    /* Only compute result if available. */
687    nir_push_if(&b, result_is_available);
688
689    /* Pack values. */
690    nir_ssa_def *packed64[2];
691    packed64[0] =
692       nir_pack_64_2x32(&b, nir_vec2(&b, nir_channel(&b, load1, 0), nir_channel(&b, load1, 1)));
693    packed64[1] =
694       nir_pack_64_2x32(&b, nir_vec2(&b, nir_channel(&b, load2, 0), nir_channel(&b, load2, 1)));
695
696    /* Compute result. */
697    nir_ssa_def *primitive_storage_needed = nir_isub(&b, packed64[1], packed64[0]);
698
699    nir_store_var(&b, result, primitive_storage_needed, 0x1);
700
701    nir_ssa_def *uses_gds = nir_load_push_constant(&b, 1, 32, nir_imm_int(&b, 16), .range = 20);
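   /* With NGG, primitives generated by the shader are accumulated in a pair of 32-bit GDS counters
    * appended to the query slot (bytes 32..39, see radv_CreateQueryPool()); their delta is added
    * on top of the streamout-based result.
    */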
702    nir_push_if(&b, nir_i2b(&b, uses_gds));
703    {
704       nir_ssa_def *gds_start =
705          nir_load_ssbo(&b, 1, 32, src_buf, nir_iadd(&b, input_base, nir_imm_int(&b, 32)), .align_mul = 4);
706       nir_ssa_def *gds_end =
707          nir_load_ssbo(&b, 1, 32, src_buf, nir_iadd(&b, input_base, nir_imm_int(&b, 36)), .align_mul = 4);
708
709       nir_ssa_def *ngg_gds_result = nir_isub(&b, gds_end, gds_start);
710
711       nir_store_var(&b, result, nir_iadd(&b, nir_load_var(&b, result), nir_u2u64(&b, ngg_gds_result)), 0x1);
712    }
713    nir_pop_if(&b, NULL);
714
715    nir_store_var(&b, available, nir_imm_true(&b), 0x1);
716
717    nir_pop_if(&b, NULL);
718
719    /* Determine if result is 64 or 32 bit. */
720    nir_ssa_def *result_is_64bit = nir_test_mask(&b, flags, VK_QUERY_RESULT_64_BIT);
721    nir_ssa_def *result_size =
722       nir_bcsel(&b, result_is_64bit, nir_imm_int(&b, 16), nir_imm_int(&b, 8));
723
724    /* Store the result if complete or partial results have been requested. */
725    nir_push_if(&b, nir_ior(&b, nir_test_mask(&b, flags, VK_QUERY_RESULT_PARTIAL_BIT),
726                            nir_load_var(&b, available)));
727
728    /* Store result. */
729    nir_push_if(&b, result_is_64bit);
730
731    nir_store_ssbo(&b, nir_load_var(&b, result), dst_buf, output_base);
732
733    nir_push_else(&b, NULL);
734
735    nir_store_ssbo(&b, nir_u2u32(&b, nir_load_var(&b, result)), dst_buf, output_base);
736
737    nir_pop_if(&b, NULL);
738    nir_pop_if(&b, NULL);
739
740    radv_store_availability(&b, flags, dst_buf, nir_iadd(&b, result_size, output_base),
741                            nir_b2i32(&b, nir_load_var(&b, available)));
742
743    return b.shader;
744 }
745
746 static VkResult
747 radv_device_init_meta_query_state_internal(struct radv_device *device)
748 {
749    VkResult result;
750    nir_shader *occlusion_cs = NULL;
751    nir_shader *pipeline_statistics_cs = NULL;
752    nir_shader *tfb_cs = NULL;
753    nir_shader *timestamp_cs = NULL;
754    nir_shader *pg_cs = NULL;
755
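   /* The query pipelines can also be created lazily from radv_query_shader(), so creation is
    * guarded by the meta mutex and re-checked after taking it.
    */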
756    mtx_lock(&device->meta_state.mtx);
757    if (device->meta_state.query.pipeline_statistics_query_pipeline) {
758       mtx_unlock(&device->meta_state.mtx);
759       return VK_SUCCESS;
760    }
761    occlusion_cs = build_occlusion_query_shader(device);
762    pipeline_statistics_cs = build_pipeline_statistics_query_shader(device);
763    tfb_cs = build_tfb_query_shader(device);
764    timestamp_cs = build_timestamp_query_shader(device);
765    pg_cs = build_pg_query_shader(device);
766
767    VkDescriptorSetLayoutCreateInfo occlusion_ds_create_info = {
768       .sType = VK_STRUCTURE_TYPE_DESCRIPTOR_SET_LAYOUT_CREATE_INFO,
769       .flags = VK_DESCRIPTOR_SET_LAYOUT_CREATE_PUSH_DESCRIPTOR_BIT_KHR,
770       .bindingCount = 2,
771       .pBindings = (VkDescriptorSetLayoutBinding[]){
772          {.binding = 0,
773           .descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
774           .descriptorCount = 1,
775           .stageFlags = VK_SHADER_STAGE_COMPUTE_BIT,
776           .pImmutableSamplers = NULL},
777          {.binding = 1,
778           .descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
779           .descriptorCount = 1,
780           .stageFlags = VK_SHADER_STAGE_COMPUTE_BIT,
781           .pImmutableSamplers = NULL},
782       }};
783
784    result = radv_CreateDescriptorSetLayout(radv_device_to_handle(device), &occlusion_ds_create_info,
785                                            &device->meta_state.alloc,
786                                            &device->meta_state.query.ds_layout);
787    if (result != VK_SUCCESS)
788       goto fail;
789
790    VkPipelineLayoutCreateInfo occlusion_pl_create_info = {
791       .sType = VK_STRUCTURE_TYPE_PIPELINE_LAYOUT_CREATE_INFO,
792       .setLayoutCount = 1,
793       .pSetLayouts = &device->meta_state.query.ds_layout,
794       .pushConstantRangeCount = 1,
795       .pPushConstantRanges = &(VkPushConstantRange){VK_SHADER_STAGE_COMPUTE_BIT, 0, 20},
796    };
797
798    result =
799       radv_CreatePipelineLayout(radv_device_to_handle(device), &occlusion_pl_create_info,
800                                 &device->meta_state.alloc, &device->meta_state.query.p_layout);
801    if (result != VK_SUCCESS)
802       goto fail;
803
804    VkPipelineShaderStageCreateInfo occlusion_pipeline_shader_stage = {
805       .sType = VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO,
806       .stage = VK_SHADER_STAGE_COMPUTE_BIT,
807       .module = vk_shader_module_handle_from_nir(occlusion_cs),
808       .pName = "main",
809       .pSpecializationInfo = NULL,
810    };
811
812    VkComputePipelineCreateInfo occlusion_vk_pipeline_info = {
813       .sType = VK_STRUCTURE_TYPE_COMPUTE_PIPELINE_CREATE_INFO,
814       .stage = occlusion_pipeline_shader_stage,
815       .flags = 0,
816       .layout = device->meta_state.query.p_layout,
817    };
818
819    result = radv_CreateComputePipelines(
820       radv_device_to_handle(device), device->meta_state.cache, 1,
821       &occlusion_vk_pipeline_info, NULL, &device->meta_state.query.occlusion_query_pipeline);
822    if (result != VK_SUCCESS)
823       goto fail;
824
825    VkPipelineShaderStageCreateInfo pipeline_statistics_pipeline_shader_stage = {
826       .sType = VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO,
827       .stage = VK_SHADER_STAGE_COMPUTE_BIT,
828       .module = vk_shader_module_handle_from_nir(pipeline_statistics_cs),
829       .pName = "main",
830       .pSpecializationInfo = NULL,
831    };
832
833    VkComputePipelineCreateInfo pipeline_statistics_vk_pipeline_info = {
834       .sType = VK_STRUCTURE_TYPE_COMPUTE_PIPELINE_CREATE_INFO,
835       .stage = pipeline_statistics_pipeline_shader_stage,
836       .flags = 0,
837       .layout = device->meta_state.query.p_layout,
838    };
839
840    result = radv_CreateComputePipelines(
841       radv_device_to_handle(device), device->meta_state.cache, 1,
842       &pipeline_statistics_vk_pipeline_info, NULL,
843       &device->meta_state.query.pipeline_statistics_query_pipeline);
844    if (result != VK_SUCCESS)
845       goto fail;
846
847    VkPipelineShaderStageCreateInfo tfb_pipeline_shader_stage = {
848       .sType = VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO,
849       .stage = VK_SHADER_STAGE_COMPUTE_BIT,
850       .module = vk_shader_module_handle_from_nir(tfb_cs),
851       .pName = "main",
852       .pSpecializationInfo = NULL,
853    };
854
855    VkComputePipelineCreateInfo tfb_pipeline_info = {
856       .sType = VK_STRUCTURE_TYPE_COMPUTE_PIPELINE_CREATE_INFO,
857       .stage = tfb_pipeline_shader_stage,
858       .flags = 0,
859       .layout = device->meta_state.query.p_layout,
860    };
861
862    result = radv_CreateComputePipelines(
863       radv_device_to_handle(device), device->meta_state.cache, 1,
864       &tfb_pipeline_info, NULL, &device->meta_state.query.tfb_query_pipeline);
865    if (result != VK_SUCCESS)
866       goto fail;
867
868    VkPipelineShaderStageCreateInfo timestamp_pipeline_shader_stage = {
869       .sType = VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO,
870       .stage = VK_SHADER_STAGE_COMPUTE_BIT,
871       .module = vk_shader_module_handle_from_nir(timestamp_cs),
872       .pName = "main",
873       .pSpecializationInfo = NULL,
874    };
875
876    VkComputePipelineCreateInfo timestamp_pipeline_info = {
877       .sType = VK_STRUCTURE_TYPE_COMPUTE_PIPELINE_CREATE_INFO,
878       .stage = timestamp_pipeline_shader_stage,
879       .flags = 0,
880       .layout = device->meta_state.query.p_layout,
881    };
882
883    result = radv_CreateComputePipelines(
884       radv_device_to_handle(device), device->meta_state.cache, 1,
885       &timestamp_pipeline_info, NULL, &device->meta_state.query.timestamp_query_pipeline);
886    if (result != VK_SUCCESS)
887       goto fail;
888
889    VkPipelineShaderStageCreateInfo pg_pipeline_shader_stage = {
890       .sType = VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO,
891       .stage = VK_SHADER_STAGE_COMPUTE_BIT,
892       .module = vk_shader_module_handle_from_nir(pg_cs),
893       .pName = "main",
894       .pSpecializationInfo = NULL,
895    };
896
897    VkComputePipelineCreateInfo pg_pipeline_info = {
898       .sType = VK_STRUCTURE_TYPE_COMPUTE_PIPELINE_CREATE_INFO,
899       .stage = pg_pipeline_shader_stage,
900       .flags = 0,
901       .layout = device->meta_state.query.p_layout,
902    };
903
904    result = radv_CreateComputePipelines(
905       radv_device_to_handle(device), device->meta_state.cache, 1,
906       &pg_pipeline_info, NULL, &device->meta_state.query.pg_query_pipeline);
907
908 fail:
909    ralloc_free(occlusion_cs);
910    ralloc_free(pipeline_statistics_cs);
911    ralloc_free(tfb_cs);
912    ralloc_free(pg_cs);
913    ralloc_free(timestamp_cs);
914    mtx_unlock(&device->meta_state.mtx);
915    return result;
916 }
917
918 VkResult
919 radv_device_init_meta_query_state(struct radv_device *device, bool on_demand)
920 {
921    if (on_demand)
922       return VK_SUCCESS;
923
924    return radv_device_init_meta_query_state_internal(device);
925 }
926
927 void
928 radv_device_finish_meta_query_state(struct radv_device *device)
929 {
930    if (device->meta_state.query.tfb_query_pipeline)
931       radv_DestroyPipeline(radv_device_to_handle(device),
932                            device->meta_state.query.tfb_query_pipeline, &device->meta_state.alloc);
933
934    if (device->meta_state.query.pipeline_statistics_query_pipeline)
935       radv_DestroyPipeline(radv_device_to_handle(device),
936                            device->meta_state.query.pipeline_statistics_query_pipeline,
937                            &device->meta_state.alloc);
938
939    if (device->meta_state.query.occlusion_query_pipeline)
940       radv_DestroyPipeline(radv_device_to_handle(device),
941                            device->meta_state.query.occlusion_query_pipeline,
942                            &device->meta_state.alloc);
943
944    if (device->meta_state.query.timestamp_query_pipeline)
945       radv_DestroyPipeline(radv_device_to_handle(device),
946                            device->meta_state.query.timestamp_query_pipeline,
947                            &device->meta_state.alloc);
948
949    if (device->meta_state.query.pg_query_pipeline)
950       radv_DestroyPipeline(radv_device_to_handle(device),
951                            device->meta_state.query.pg_query_pipeline, &device->meta_state.alloc);
952
953    if (device->meta_state.query.p_layout)
954       radv_DestroyPipelineLayout(radv_device_to_handle(device), device->meta_state.query.p_layout,
955                                  &device->meta_state.alloc);
956
957    if (device->meta_state.query.ds_layout)
958       device->vk.dispatch_table.DestroyDescriptorSetLayout(radv_device_to_handle(device),
959                                                            device->meta_state.query.ds_layout,
960                                                            &device->meta_state.alloc);
961 }
962
963 static void
964 radv_query_shader(struct radv_cmd_buffer *cmd_buffer, VkPipeline *pipeline,
965                   struct radeon_winsys_bo *src_bo, struct radeon_winsys_bo *dst_bo,
966                   uint64_t src_offset, uint64_t dst_offset, uint32_t src_stride,
967                   uint32_t dst_stride, size_t dst_size, uint32_t count, uint32_t flags,
968                   uint32_t pipeline_stats_mask, uint32_t avail_offset, bool uses_gds)
969 {
970    struct radv_device *device = cmd_buffer->device;
971    struct radv_meta_saved_state saved_state;
972    struct radv_buffer src_buffer, dst_buffer;
973
974    if (!*pipeline) {
975       VkResult ret = radv_device_init_meta_query_state_internal(device);
976       if (ret != VK_SUCCESS) {
977          vk_command_buffer_set_error(&cmd_buffer->vk, ret);
978          return;
979       }
980    }
981
982    /* VK_EXT_conditional_rendering says that copy commands should not be
983     * affected by conditional rendering.
984     */
985    radv_meta_save(&saved_state, cmd_buffer,
986                   RADV_META_SAVE_COMPUTE_PIPELINE | RADV_META_SAVE_CONSTANTS |
987                      RADV_META_SAVE_DESCRIPTORS | RADV_META_SUSPEND_PREDICATING);
988
989    uint64_t src_buffer_size = MAX2(src_stride * count, avail_offset + 4 * count - src_offset);
990    uint64_t dst_buffer_size = dst_stride * (count - 1) + dst_size;
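   /* The source view has to cover both the per-query results and, for pipeline statistics, the
    * availability dwords; the destination view covers count results spaced dst_stride apart, the
    * last of which occupies dst_size bytes.
    */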
991
992    radv_buffer_init(&src_buffer, device, src_bo, src_buffer_size, src_offset);
993    radv_buffer_init(&dst_buffer, device, dst_bo, dst_buffer_size, dst_offset);
994
995    radv_CmdBindPipeline(radv_cmd_buffer_to_handle(cmd_buffer), VK_PIPELINE_BIND_POINT_COMPUTE,
996                         *pipeline);
997
998    radv_meta_push_descriptor_set(
999       cmd_buffer, VK_PIPELINE_BIND_POINT_COMPUTE, device->meta_state.query.p_layout, 0, /* set */
1000       2, /* descriptorWriteCount */
1001       (VkWriteDescriptorSet[]){
1002          {.sType = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET,
1003           .dstBinding = 0,
1004           .dstArrayElement = 0,
1005           .descriptorCount = 1,
1006           .descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
1007           .pBufferInfo = &(VkDescriptorBufferInfo){.buffer = radv_buffer_to_handle(&dst_buffer),
1008                                                    .offset = 0,
1009                                                    .range = VK_WHOLE_SIZE}},
1010          {.sType = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET,
1011           .dstBinding = 1,
1012           .dstArrayElement = 0,
1013           .descriptorCount = 1,
1014           .descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
1015           .pBufferInfo = &(VkDescriptorBufferInfo){.buffer = radv_buffer_to_handle(&src_buffer),
1016                                                    .offset = 0,
1017                                                    .range = VK_WHOLE_SIZE}}});
1018
1019    /* Encode the number of elements for easy access by the shader. */
1020    pipeline_stats_mask &= 0x7ff;
1021    pipeline_stats_mask |= util_bitcount(pipeline_stats_mask) << 16;
1022
1023    avail_offset -= src_offset;
1024
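   /* This layout must match the push-constant offsets loaded by the copy shaders:
    * flags at 0, dst_stride at 4, stats_mask at 8, avail_offset at 12, uses_gds at 16.
    */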
1025    struct {
1026       uint32_t flags;
1027       uint32_t dst_stride;
1028       uint32_t pipeline_stats_mask;
1029       uint32_t avail_offset;
1030       uint32_t uses_gds;
1031    } push_constants = {flags, dst_stride, pipeline_stats_mask, avail_offset, uses_gds};
1032
1033    radv_CmdPushConstants(radv_cmd_buffer_to_handle(cmd_buffer), device->meta_state.query.p_layout,
1034                          VK_SHADER_STAGE_COMPUTE_BIT, 0, sizeof(push_constants), &push_constants);
1035
1036    cmd_buffer->state.flush_bits |= RADV_CMD_FLAG_INV_L2 | RADV_CMD_FLAG_INV_VCACHE;
1037
1038    if (flags & VK_QUERY_RESULT_WAIT_BIT)
1039       cmd_buffer->state.flush_bits |= RADV_CMD_FLUSH_AND_INV_FRAMEBUFFER;
1040
1041    radv_unaligned_dispatch(cmd_buffer, count, 1, 1);
1042
1043    /* Ensure that the query copy dispatch is complete before a potential vkCmdResetPool because
1044     * there is an implicit execution dependency from each such query command to all query commands
1045     * previously submitted to the same queue.
1046     */
1047    cmd_buffer->active_query_flush_bits |=
1048       RADV_CMD_FLAG_CS_PARTIAL_FLUSH | RADV_CMD_FLAG_INV_L2 | RADV_CMD_FLAG_INV_VCACHE;
1049
1050    radv_buffer_finish(&src_buffer);
1051    radv_buffer_finish(&dst_buffer);
1052
1053    radv_meta_restore(&saved_state, cmd_buffer);
1054 }
1055
1056 static void
1057 radv_destroy_query_pool(struct radv_device *device, const VkAllocationCallbacks *pAllocator,
1058                         struct radv_query_pool *pool)
1059 {
1060    if (pool->type == VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR)
1061       radv_pc_deinit_query_pool((struct radv_pc_query_pool *)pool);
1062
1063    if (pool->bo)
1064       device->ws->buffer_destroy(device->ws, pool->bo);
1065    vk_object_base_finish(&pool->base);
1066    vk_free2(&device->vk.alloc, pAllocator, pool);
1067 }
1068
1069 VKAPI_ATTR VkResult VKAPI_CALL
1070 radv_CreateQueryPool(VkDevice _device, const VkQueryPoolCreateInfo *pCreateInfo,
1071                      const VkAllocationCallbacks *pAllocator, VkQueryPool *pQueryPool)
1072 {
1073    RADV_FROM_HANDLE(radv_device, device, _device);
1074    VkResult result;
1075    size_t pool_struct_size = pCreateInfo->queryType == VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR
1076                                 ? sizeof(struct radv_pc_query_pool)
1077                                 : sizeof(struct radv_query_pool);
1078
1079    struct radv_query_pool *pool = vk_alloc2(&device->vk.alloc, pAllocator, pool_struct_size, 8,
1080                                             VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
1081
1082    if (!pool)
1083       return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
1084
1085    vk_object_base_init(&device->vk, &pool->base, VK_OBJECT_TYPE_QUERY_POOL);
1086
1087    pool->type = pCreateInfo->queryType;
1088    pool->pipeline_stats_mask = pCreateInfo->pipelineStatistics;
1089
1090    /* The number of primitives generated by geometry shader invocations is only counted by the
1091     * hardware if GS uses the legacy path. When NGG GS is used, the hardware can't know the number
1092     * of generated primitives and we have to increment it from the shader using a plain GDS atomic.
1093     */
1094    pool->uses_gds = device->physical_device->use_ngg &&
1095                     ((pool->pipeline_stats_mask & VK_QUERY_PIPELINE_STATISTIC_GEOMETRY_SHADER_PRIMITIVES_BIT) ||
1096                      pCreateInfo->queryType == VK_QUERY_TYPE_PRIMITIVES_GENERATED_EXT);
1097
1098    switch (pCreateInfo->queryType) {
1099    case VK_QUERY_TYPE_OCCLUSION:
1100       pool->stride = 16 * device->physical_device->rad_info.max_render_backends;
1101       break;
1102    case VK_QUERY_TYPE_PIPELINE_STATISTICS:
1103       pool->stride = radv_get_pipelinestat_query_size(device) * 2;
1104       if (pool->uses_gds) {
1105          /* When the query pool needs GDS (for counting the number of primitives generated by a
1106           * geometry shader with NGG), allocate 2x64-bit values for begin/end.
1107           */
1108          pool->stride += 8 * 2;
1109       }
1110       break;
1111    case VK_QUERY_TYPE_TIMESTAMP:
1112    case VK_QUERY_TYPE_ACCELERATION_STRUCTURE_COMPACTED_SIZE_KHR:
1113    case VK_QUERY_TYPE_ACCELERATION_STRUCTURE_SERIALIZATION_SIZE_KHR:
1114    case VK_QUERY_TYPE_ACCELERATION_STRUCTURE_SERIALIZATION_BOTTOM_LEVEL_POINTERS_KHR:
1115    case VK_QUERY_TYPE_ACCELERATION_STRUCTURE_SIZE_KHR:
1116       pool->stride = 8;
1117       break;
1118    case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT:
1119       pool->stride = 32;
1120       break;
1121    case VK_QUERY_TYPE_PRIMITIVES_GENERATED_EXT:
1122       pool->stride = 32;
1123       if (pool->uses_gds && device->physical_device->rad_info.gfx_level < GFX11) {
1124          /* When the hardware can use both the legacy and the NGG paths in the same begin/end pair,
1125           * allocate 2x32-bit values for the GDS counters.
1126           */
1127          pool->stride += 4 * 2;
1128       }
1129       break;
1130    case VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR: {
1131       result = radv_pc_init_query_pool(device->physical_device, pCreateInfo,
1132                                        (struct radv_pc_query_pool *)pool);
1133
1134       if (result != VK_SUCCESS) {
1135          radv_destroy_query_pool(device, pAllocator, pool);
1136          return vk_error(device, result);
1137       }
1138       break;
1139    }
1140    default:
1141       unreachable("creating unhandled query type");
1142    }
1143
1144    pool->availability_offset = pool->stride * pCreateInfo->queryCount;
1145    pool->size = pool->availability_offset;
1146    if (pCreateInfo->queryType == VK_QUERY_TYPE_PIPELINE_STATISTICS)
1147       pool->size += 4 * pCreateInfo->queryCount;
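   /* Pipeline-statistics pools keep one 32-bit availability word per query after the results,
    * starting at availability_offset; both the copy shader and the CPU readback path read it from
    * there.
    */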
1148
1149    result = device->ws->buffer_create(device->ws, pool->size, 64, RADEON_DOMAIN_GTT,
1150                                       RADEON_FLAG_NO_INTERPROCESS_SHARING,
1151                                       RADV_BO_PRIORITY_QUERY_POOL, 0, &pool->bo);
1152    if (result != VK_SUCCESS) {
1153       radv_destroy_query_pool(device, pAllocator, pool);
1154       return vk_error(device, result);
1155    }
1156
1157    pool->ptr = device->ws->buffer_map(pool->bo);
1158    if (!pool->ptr) {
1159       radv_destroy_query_pool(device, pAllocator, pool);
1160       return vk_error(device, VK_ERROR_OUT_OF_DEVICE_MEMORY);
1161    }
1162
1163    *pQueryPool = radv_query_pool_to_handle(pool);
1164    return VK_SUCCESS;
1165 }
1166
1167 VKAPI_ATTR void VKAPI_CALL
1168 radv_DestroyQueryPool(VkDevice _device, VkQueryPool _pool, const VkAllocationCallbacks *pAllocator)
1169 {
1170    RADV_FROM_HANDLE(radv_device, device, _device);
1171    RADV_FROM_HANDLE(radv_query_pool, pool, _pool);
1172
1173    if (!pool)
1174       return;
1175
1176    radv_destroy_query_pool(device, pAllocator, pool);
1177 }
1178
1179 VKAPI_ATTR VkResult VKAPI_CALL
1180 radv_GetQueryPoolResults(VkDevice _device, VkQueryPool queryPool, uint32_t firstQuery,
1181                          uint32_t queryCount, size_t dataSize, void *pData, VkDeviceSize stride,
1182                          VkQueryResultFlags flags)
1183 {
1184    RADV_FROM_HANDLE(radv_device, device, _device);
1185    RADV_FROM_HANDLE(radv_query_pool, pool, queryPool);
1186    char *data = pData;
1187    VkResult result = VK_SUCCESS;
1188
1189    if (vk_device_is_lost(&device->vk))
1190       return VK_ERROR_DEVICE_LOST;
1191
1192    for (unsigned query_idx = 0; query_idx < queryCount; ++query_idx, data += stride) {
1193       char *dest = data;
1194       unsigned query = firstQuery + query_idx;
1195       char *src = pool->ptr + query * pool->stride;
1196       uint32_t available;
1197
1198       switch (pool->type) {
1199       case VK_QUERY_TYPE_TIMESTAMP:
1200       case VK_QUERY_TYPE_ACCELERATION_STRUCTURE_COMPACTED_SIZE_KHR:
1201       case VK_QUERY_TYPE_ACCELERATION_STRUCTURE_SERIALIZATION_SIZE_KHR:
1202       case VK_QUERY_TYPE_ACCELERATION_STRUCTURE_SERIALIZATION_BOTTOM_LEVEL_POINTERS_KHR:
1203       case VK_QUERY_TYPE_ACCELERATION_STRUCTURE_SIZE_KHR: {
1204          uint64_t const *src64 = (uint64_t const *)src;
1205          uint64_t value;
1206
1207          do {
1208             value = p_atomic_read(src64);
1209          } while (value == TIMESTAMP_NOT_READY && (flags & VK_QUERY_RESULT_WAIT_BIT));
1210
1211          available = value != TIMESTAMP_NOT_READY;
1212
1213          if (!available && !(flags & VK_QUERY_RESULT_PARTIAL_BIT))
1214             result = VK_NOT_READY;
1215
1216          if (flags & VK_QUERY_RESULT_64_BIT) {
1217             if (available || (flags & VK_QUERY_RESULT_PARTIAL_BIT))
1218                *(uint64_t *)dest = value;
1219             dest += 8;
1220          } else {
1221             if (available || (flags & VK_QUERY_RESULT_PARTIAL_BIT))
1222                *(uint32_t *)dest = (uint32_t)value;
1223             dest += 4;
1224          }
1225          break;
1226       }
1227       case VK_QUERY_TYPE_OCCLUSION: {
1228          uint64_t const *src64 = (uint64_t const *)src;
1229          uint32_t db_count = device->physical_device->rad_info.max_render_backends;
1230          uint32_t enabled_rb_mask = device->physical_device->rad_info.enabled_rb_mask;
1231          uint64_t sample_count = 0;
1232          available = 1;
1233
1234          for (int i = 0; i < db_count; ++i) {
1235             uint64_t start, end;
1236
1237             if (!(enabled_rb_mask & (1 << i)))
1238                continue;
1239
1240             do {
1241                start = p_atomic_read(src64 + 2 * i);
1242                end = p_atomic_read(src64 + 2 * i + 1);
1243             } while ((!(start & (1ull << 63)) || !(end & (1ull << 63))) &&
1244                      (flags & VK_QUERY_RESULT_WAIT_BIT));
1245
1246             if (!(start & (1ull << 63)) || !(end & (1ull << 63)))
1247                available = 0;
1248             else {
1249                sample_count += end - start;
1250             }
1251          }
1252
1253          if (!available && !(flags & VK_QUERY_RESULT_PARTIAL_BIT))
1254             result = VK_NOT_READY;
1255
1256          if (flags & VK_QUERY_RESULT_64_BIT) {
1257             if (available || (flags & VK_QUERY_RESULT_PARTIAL_BIT))
1258                *(uint64_t *)dest = sample_count;
1259             dest += 8;
1260          } else {
1261             if (available || (flags & VK_QUERY_RESULT_PARTIAL_BIT))
1262                *(uint32_t *)dest = sample_count;
1263             dest += 4;
1264          }
1265          break;
1266       }
1267       case VK_QUERY_TYPE_PIPELINE_STATISTICS: {
1268          unsigned pipelinestat_block_size = radv_get_pipelinestat_query_size(device);
1269          const uint32_t *avail_ptr =
1270             (const uint32_t *)(pool->ptr + pool->availability_offset + 4 * query);
1271          uint64_t ngg_gds_result = 0;
1272
1273          do {
1274             available = p_atomic_read(avail_ptr);
1275          } while (!available && (flags & VK_QUERY_RESULT_WAIT_BIT));
1276
1277          if (!available && !(flags & VK_QUERY_RESULT_PARTIAL_BIT))
1278             result = VK_NOT_READY;
1279
1280          if (pool->uses_gds) {
1281             /* Compute the result that was copied from GDS. */
1282             const uint64_t *gds_start = (uint64_t *)(src + pipelinestat_block_size * 2);
1283             const uint64_t *gds_stop = (uint64_t *)(src + pipelinestat_block_size * 2 + 8);
1284
1285             ngg_gds_result = gds_stop[0] - gds_start[0];
1286          }
1287
1288          const uint64_t *start = (uint64_t *)src;
1289          const uint64_t *stop = (uint64_t *)(src + pipelinestat_block_size);
1290          if (flags & VK_QUERY_RESULT_64_BIT) {
1291             uint64_t *dst = (uint64_t *)dest;
1292             dest += util_bitcount(pool->pipeline_stats_mask) * 8;
1293             for (int i = 0; i < ARRAY_SIZE(pipeline_statistics_indices); ++i) {
1294                if (pool->pipeline_stats_mask & (1u << i)) {
1295                   if (available || (flags & VK_QUERY_RESULT_PARTIAL_BIT)) {
1296                      *dst = stop[pipeline_statistics_indices[i]] -
1297                             start[pipeline_statistics_indices[i]];
1298
1299                      if (pool->uses_gds &&
1300                          (1u << i) == VK_QUERY_PIPELINE_STATISTIC_GEOMETRY_SHADER_PRIMITIVES_BIT) {
1301                         *dst += ngg_gds_result;
1302                      }
1303                   }
1304                   dst++;
1305                }
1306             }
1307
1308          } else {
1309             uint32_t *dst = (uint32_t *)dest;
1310             dest += util_bitcount(pool->pipeline_stats_mask) * 4;
1311             for (int i = 0; i < ARRAY_SIZE(pipeline_statistics_indices); ++i) {
1312                if (pool->pipeline_stats_mask & (1u << i)) {
1313                   if (available || (flags & VK_QUERY_RESULT_PARTIAL_BIT)) {
1314                      *dst = stop[pipeline_statistics_indices[i]] -
1315                             start[pipeline_statistics_indices[i]];
1316
1317                      if (pool->uses_gds &&
1318                          (1u << i) == VK_QUERY_PIPELINE_STATISTIC_GEOMETRY_SHADER_PRIMITIVES_BIT) {
1319                         *dst += ngg_gds_result;
1320                      }
1321                   }
1322                   dst++;
1323                }
1324             }
1325          }
1326          break;
1327       }
1328       case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT: {
1329          uint64_t const *src64 = (uint64_t const *)src;
1330          uint64_t num_primitives_written;
1331          uint64_t primitive_storage_needed;
1332
1333          /* SAMPLE_STREAMOUTSTATS stores this structure:
1334           * {
1335           *     u64 NumPrimitivesWritten;
1336           *     u64 PrimitiveStorageNeeded;
1337           * }
1338           */
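              /* Bit 63 of each of the four counters acts as its availability flag. */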
1339          available = 1;
1340          for (int j = 0; j < 4; j++) {
1341             if (!(p_atomic_read(src64 + j) & 0x8000000000000000UL))
1342                available = 0;
1343          }
1344
1345          if (!available && !(flags & VK_QUERY_RESULT_PARTIAL_BIT))
1346             result = VK_NOT_READY;
1347
1348          num_primitives_written = src64[3] - src64[1];
1349          primitive_storage_needed = src64[2] - src64[0];
1350
1351          if (flags & VK_QUERY_RESULT_64_BIT) {
1352             if (available || (flags & VK_QUERY_RESULT_PARTIAL_BIT))
1353                *(uint64_t *)dest = num_primitives_written;
1354             dest += 8;
1355             if (available || (flags & VK_QUERY_RESULT_PARTIAL_BIT))
1356                *(uint64_t *)dest = primitive_storage_needed;
1357             dest += 8;
1358          } else {
1359             if (available || (flags & VK_QUERY_RESULT_PARTIAL_BIT))
1360                *(uint32_t *)dest = num_primitives_written;
1361             dest += 4;
1362             if (available || (flags & VK_QUERY_RESULT_PARTIAL_BIT))
1363                *(uint32_t *)dest = primitive_storage_needed;
1364             dest += 4;
1365          }
1366          break;
1367       }
1368       case VK_QUERY_TYPE_PRIMITIVES_GENERATED_EXT: {
1369          uint64_t const *src64 = (uint64_t const *)src;
1370          uint64_t primitive_storage_needed;
1371
1372          /* SAMPLE_STREAMOUTSTATS stores this structure:
1373           * {
1374           *     u64 NumPrimitivesWritten;
1375           *     u64 PrimitiveStorageNeeded;
1376           * }
1377           */
1378          available = 1;
1379          if (!(p_atomic_read(src64 + 0) & 0x8000000000000000UL) ||
1380              !(p_atomic_read(src64 + 2) & 0x8000000000000000UL)) {
1381             available = 0;
1382          }
1383
1384          if (!available && !(flags & VK_QUERY_RESULT_PARTIAL_BIT))
1385             result = VK_NOT_READY;
1386
1387          primitive_storage_needed = src64[2] - src64[0];
1388
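              /* The GDS prim-gen counter is copied right after the streamout results:
               * begin value at byte offset 32, end value at byte offset 36.
               */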
1389          if (pool->uses_gds && device->physical_device->rad_info.gfx_level < GFX11) {
1390             uint32_t const *src32 = (uint32_t const *)src;
1391
1392             /* Accumulate the result that was copied from GDS in case an NGG shader has been used. */
1393             primitive_storage_needed += src32[9] - src32[8];
1394          }
1395
1396          if (flags & VK_QUERY_RESULT_64_BIT) {
1397             if (available || (flags & VK_QUERY_RESULT_PARTIAL_BIT))
1398                *(uint64_t *)dest = primitive_storage_needed;
1399             dest += 8;
1400          } else {
1401             if (available || (flags & VK_QUERY_RESULT_PARTIAL_BIT))
1402                *(uint32_t *)dest = primitive_storage_needed;
1403             dest += 4;
1404          }
1405          break;
1406       }
1407       case VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR: {
1408          struct radv_pc_query_pool *pc_pool = (struct radv_pc_query_pool *)pool;
1409          const uint64_t *src64 = (const uint64_t *)src;
1410          bool avail;
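              /* One 64-bit availability word per pass is stored at the end of the
               * query slot; all of them must be non-zero before the results are valid.
               */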
1411          do {
1412             avail = true;
1413             for (unsigned i = 0; i < pc_pool->num_passes; ++i)
1414                if (!p_atomic_read(src64 + pool->stride / 8 - i - 1))
1415                   avail = false;
1416          } while (!avail && (flags & VK_QUERY_RESULT_WAIT_BIT));
1417
1418          available = avail;
1419
1420          radv_pc_get_results(pc_pool, src64, dest);
1421          dest += pc_pool->num_counters * sizeof(union VkPerformanceCounterResultKHR);
1422          break;
1423       }
1424       default:
1425          unreachable("trying to get results of unhandled query type");
1426       }
1427
1428       if (flags & VK_QUERY_RESULT_WITH_AVAILABILITY_BIT) {
1429          if (flags & VK_QUERY_RESULT_64_BIT) {
1430             *(uint64_t *)dest = available;
1431          } else {
1432             *(uint32_t *)dest = available;
1433          }
1434       }
1435    }
1436
1437    return result;
1438 }
1439
1440 static void
1441 emit_query_flush(struct radv_cmd_buffer *cmd_buffer, struct radv_query_pool *pool)
1442 {
1443    if (cmd_buffer->pending_reset_query) {
1444       if (pool->size >= RADV_BUFFER_OPS_CS_THRESHOLD) {
1445          /* Only need to flush caches if the query pool size is
1446           * large enough to be reset using the compute shader
1447           * path. Small pools don't need any cache flushes
1448           * because we use a CP DMA clear.
1449           */
1450          si_emit_cache_flush(cmd_buffer);
1451       }
1452    }
1453 }
1454
1455 static size_t
1456 radv_query_result_size(const struct radv_query_pool *pool, VkQueryResultFlags flags)
1457 {
1458    unsigned values = (flags & VK_QUERY_RESULT_WITH_AVAILABILITY_BIT) ? 1 : 0;
1459    switch (pool->type) {
1460    case VK_QUERY_TYPE_TIMESTAMP:
1461    case VK_QUERY_TYPE_ACCELERATION_STRUCTURE_COMPACTED_SIZE_KHR:
1462    case VK_QUERY_TYPE_ACCELERATION_STRUCTURE_SERIALIZATION_SIZE_KHR:
1463    case VK_QUERY_TYPE_ACCELERATION_STRUCTURE_SERIALIZATION_BOTTOM_LEVEL_POINTERS_KHR:
1464    case VK_QUERY_TYPE_ACCELERATION_STRUCTURE_SIZE_KHR:
1465    case VK_QUERY_TYPE_OCCLUSION:
1466       values += 1;
1467       break;
1468    case VK_QUERY_TYPE_PIPELINE_STATISTICS:
1469       values += util_bitcount(pool->pipeline_stats_mask);
1470       break;
1471    case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT:
1472       values += 2;
1473       break;
1474    case VK_QUERY_TYPE_PRIMITIVES_GENERATED_EXT:
1475       values += 1;
1476       break;
1477    default:
1478       unreachable("trying to get size of unhandled query type");
1479    }
1480    return values * ((flags & VK_QUERY_RESULT_64_BIT) ? 8 : 4);
1481 }
1482
1483 VKAPI_ATTR void VKAPI_CALL
1484 radv_CmdCopyQueryPoolResults(VkCommandBuffer commandBuffer, VkQueryPool queryPool,
1485                              uint32_t firstQuery, uint32_t queryCount, VkBuffer dstBuffer,
1486                              VkDeviceSize dstOffset, VkDeviceSize stride, VkQueryResultFlags flags)
1487 {
1488    RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
1489    RADV_FROM_HANDLE(radv_query_pool, pool, queryPool);
1490    RADV_FROM_HANDLE(radv_buffer, dst_buffer, dstBuffer);
1491    struct radeon_cmdbuf *cs = cmd_buffer->cs;
1492    uint64_t va = radv_buffer_get_va(pool->bo);
1493    uint64_t dest_va = radv_buffer_get_va(dst_buffer->bo);
1494    size_t dst_size = radv_query_result_size(pool, flags);
1495    dest_va += dst_buffer->offset + dstOffset;
1496
1497    if (!queryCount)
1498       return;
1499
1500    radv_cs_add_buffer(cmd_buffer->device->ws, cmd_buffer->cs, pool->bo);
1501    radv_cs_add_buffer(cmd_buffer->device->ws, cmd_buffer->cs, dst_buffer->bo);
1502
1503    /* Work around engines that forget to properly specify WAIT_BIT because some drivers implicitly
1504     * synchronize before the query copy.
1505     */
1506    if (cmd_buffer->device->instance->flush_before_query_copy)
1507       cmd_buffer->state.flush_bits |= cmd_buffer->active_query_flush_bits;
1508
1509    /* From the Vulkan spec 1.1.108:
1510     *
1511     * "vkCmdCopyQueryPoolResults is guaranteed to see the effect of
1512     *  previous uses of vkCmdResetQueryPool in the same queue, without any
1513     *  additional synchronization."
1514     *
1515     * So, we have to flush the caches if the compute shader path was used.
1516     */
1517    emit_query_flush(cmd_buffer, pool);
1518
1519    switch (pool->type) {
1520    case VK_QUERY_TYPE_OCCLUSION:
1521       if (flags & VK_QUERY_RESULT_WAIT_BIT) {
1522          unsigned enabled_rb_mask = cmd_buffer->device->physical_device->rad_info.enabled_rb_mask;
1523          uint32_t rb_avail_offset = 16 * util_last_bit(enabled_rb_mask) - 4;
1524          for (unsigned i = 0; i < queryCount; ++i, dest_va += stride) {
1525             unsigned query = firstQuery + i;
1526             uint64_t src_va = va + query * pool->stride + rb_avail_offset;
1527
1528             radeon_check_space(cmd_buffer->device->ws, cs, 7);
1529
1530             /* Wait on the upper word of the last DB entry. */
1531             radv_cp_wait_mem(cs, WAIT_REG_MEM_GREATER_OR_EQUAL, src_va, 0x80000000, 0xffffffff);
1532          }
1533       }
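           /* Resolve and copy the results on the GPU with the occlusion query meta shader. */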
1534       radv_query_shader(cmd_buffer, &cmd_buffer->device->meta_state.query.occlusion_query_pipeline,
1535                         pool->bo, dst_buffer->bo, firstQuery * pool->stride,
1536                         dst_buffer->offset + dstOffset, pool->stride, stride, dst_size, queryCount,
1537                         flags, 0, 0, false);
1538       break;
1539    case VK_QUERY_TYPE_PIPELINE_STATISTICS:
1540       if (flags & VK_QUERY_RESULT_WAIT_BIT) {
1541          for (unsigned i = 0; i < queryCount; ++i, dest_va += stride) {
1542             unsigned query = firstQuery + i;
1543
1544             radeon_check_space(cmd_buffer->device->ws, cs, 7);
1545
1546             uint64_t avail_va = va + pool->availability_offset + 4 * query;
1547
1548             /* This waits on the ME. All copies below are done on the ME. */
1549             radv_cp_wait_mem(cs, WAIT_REG_MEM_EQUAL, avail_va, 1, 0xffffffff);
1550          }
1551       }
1552       radv_query_shader(
1553          cmd_buffer, &cmd_buffer->device->meta_state.query.pipeline_statistics_query_pipeline,
1554          pool->bo, dst_buffer->bo, firstQuery * pool->stride, dst_buffer->offset + dstOffset,
1555          pool->stride, stride, dst_size, queryCount, flags, pool->pipeline_stats_mask,
1556          pool->availability_offset + 4 * firstQuery, pool->uses_gds);
1557       break;
1558    case VK_QUERY_TYPE_TIMESTAMP:
1559    case VK_QUERY_TYPE_ACCELERATION_STRUCTURE_COMPACTED_SIZE_KHR:
1560    case VK_QUERY_TYPE_ACCELERATION_STRUCTURE_SERIALIZATION_SIZE_KHR:
1561    case VK_QUERY_TYPE_ACCELERATION_STRUCTURE_SERIALIZATION_BOTTOM_LEVEL_POINTERS_KHR:
1562    case VK_QUERY_TYPE_ACCELERATION_STRUCTURE_SIZE_KHR:
1563       if (flags & VK_QUERY_RESULT_WAIT_BIT) {
1564          for (unsigned i = 0; i < queryCount; ++i, dest_va += stride) {
1565             unsigned query = firstQuery + i;
1566             uint64_t local_src_va = va + query * pool->stride;
1567
1568             radeon_check_space(cmd_buffer->device->ws, cs, 7);
1569
1570             /* Wait on the high 32 bits of the timestamp in
1571              * case the low part is 0xffffffff.
1572              */
1573             radv_cp_wait_mem(cs, WAIT_REG_MEM_NOT_EQUAL, local_src_va + 4,
1574                              TIMESTAMP_NOT_READY >> 32, 0xffffffff);
1575          }
1576       }
1577
1578       radv_query_shader(cmd_buffer, &cmd_buffer->device->meta_state.query.timestamp_query_pipeline,
1579                         pool->bo, dst_buffer->bo, firstQuery * pool->stride,
1580                         dst_buffer->offset + dstOffset, pool->stride, stride, dst_size, queryCount,
1581                         flags, 0, 0, false);
1582       break;
1583    case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT:
1584       if (flags & VK_QUERY_RESULT_WAIT_BIT) {
1585          for (unsigned i = 0; i < queryCount; i++) {
1586             unsigned query = firstQuery + i;
1587             uint64_t src_va = va + query * pool->stride;
1588
1589             radeon_check_space(cmd_buffer->device->ws, cs, 7 * 4);
1590
1591             /* Wait on the upper word of all results. */
1592             for (unsigned j = 0; j < 4; j++, src_va += 8) {
1593                radv_cp_wait_mem(cs, WAIT_REG_MEM_GREATER_OR_EQUAL, src_va + 4, 0x80000000,
1594                                 0xffffffff);
1595             }
1596          }
1597       }
1598
1599       radv_query_shader(cmd_buffer, &cmd_buffer->device->meta_state.query.tfb_query_pipeline,
1600                         pool->bo, dst_buffer->bo, firstQuery * pool->stride,
1601                         dst_buffer->offset + dstOffset, pool->stride, stride, dst_size, queryCount,
1602                         flags, 0, 0, false);
1603       break;
1604    case VK_QUERY_TYPE_PRIMITIVES_GENERATED_EXT:
1605       if (flags & VK_QUERY_RESULT_WAIT_BIT) {
1606          for (unsigned i = 0; i < queryCount; i++) {
1607             unsigned query = firstQuery + i;
1608             uint64_t src_va = va + query * pool->stride;
1609
1610             radeon_check_space(cmd_buffer->device->ws, cs, 7 * 2);
1611
1612             /* Wait on the upper word of the begin and end PrimitiveStorageNeeded results. */
1613             radv_cp_wait_mem(cs, WAIT_REG_MEM_GREATER_OR_EQUAL, src_va + 4, 0x80000000, 0xffffffff);
1614             radv_cp_wait_mem(cs, WAIT_REG_MEM_GREATER_OR_EQUAL, src_va + 20, 0x80000000, 0xffffffff);
1615          }
1616       }
1617
1618       radv_query_shader(cmd_buffer, &cmd_buffer->device->meta_state.query.pg_query_pipeline,
1619                         pool->bo, dst_buffer->bo, firstQuery * pool->stride,
1620                         dst_buffer->offset + dstOffset, pool->stride, stride, dst_size, queryCount,
1621                         flags, 0, 0, pool->uses_gds && cmd_buffer->device->physical_device->rad_info.gfx_level < GFX11);
1622       break;
1623    default:
1624       unreachable("trying to get results of unhandled query type");
1625    }
1626 }
1627
1628 static uint32_t
1629 query_clear_value(VkQueryType type)
1630 {
1631    switch (type) {
1632    case VK_QUERY_TYPE_TIMESTAMP:
1633    case VK_QUERY_TYPE_ACCELERATION_STRUCTURE_COMPACTED_SIZE_KHR:
1634    case VK_QUERY_TYPE_ACCELERATION_STRUCTURE_SERIALIZATION_SIZE_KHR:
1635    case VK_QUERY_TYPE_ACCELERATION_STRUCTURE_SERIALIZATION_BOTTOM_LEVEL_POINTERS_KHR:
1636    case VK_QUERY_TYPE_ACCELERATION_STRUCTURE_SIZE_KHR:
1637       return (uint32_t)TIMESTAMP_NOT_READY;
1638    default:
1639       return 0;
1640    }
1641 }
1642
1643 VKAPI_ATTR void VKAPI_CALL
1644 radv_CmdResetQueryPool(VkCommandBuffer commandBuffer, VkQueryPool queryPool, uint32_t firstQuery,
1645                        uint32_t queryCount)
1646 {
1647    RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
1648    RADV_FROM_HANDLE(radv_query_pool, pool, queryPool);
1649    uint32_t value = query_clear_value(pool->type);
1650    uint32_t flush_bits = 0;
1651
1652    /* Make sure to sync all previous work if the given command buffer has
1653     * pending active queries. Otherwise the GPU might write query data
1654     * after the reset operation.
1655     */
1656    cmd_buffer->state.flush_bits |= cmd_buffer->active_query_flush_bits;
1657
1658    flush_bits |= radv_fill_buffer(cmd_buffer, NULL, pool->bo,
1659                                   radv_buffer_get_va(pool->bo) + firstQuery * pool->stride,
1660                                   queryCount * pool->stride, value);
1661
1662    if (pool->type == VK_QUERY_TYPE_PIPELINE_STATISTICS) {
1663       flush_bits |=
1664          radv_fill_buffer(cmd_buffer, NULL, pool->bo,
1665                           radv_buffer_get_va(pool->bo) + pool->availability_offset + firstQuery * 4,
1666                           queryCount * 4, 0);
1667    }
1668
1669    if (flush_bits) {
1670       /* Only need to flush caches for the compute shader path. */
1671       cmd_buffer->pending_reset_query = true;
1672       cmd_buffer->state.flush_bits |= flush_bits;
1673    }
1674 }
1675
1676 VKAPI_ATTR void VKAPI_CALL
1677 radv_ResetQueryPool(VkDevice _device, VkQueryPool queryPool, uint32_t firstQuery,
1678                     uint32_t queryCount)
1679 {
1680    RADV_FROM_HANDLE(radv_query_pool, pool, queryPool);
1681
1682    uint32_t value = query_clear_value(pool->type);
1683    uint32_t *data = (uint32_t *)(pool->ptr + firstQuery * pool->stride);
1684    uint32_t *data_end = (uint32_t *)(pool->ptr + (firstQuery + queryCount) * pool->stride);
1685
1686    for (uint32_t *p = data; p != data_end; ++p)
1687       *p = value;
1688
1689    if (pool->type == VK_QUERY_TYPE_PIPELINE_STATISTICS) {
1690       memset(pool->ptr + pool->availability_offset + firstQuery * 4, 0, queryCount * 4);
1691    }
1692 }
1693
1694 static unsigned
1695 event_type_for_stream(unsigned stream)
1696 {
1697    switch (stream) {
1698    default:
1699    case 0:
1700       return V_028A90_SAMPLE_STREAMOUTSTATS;
1701    case 1:
1702       return V_028A90_SAMPLE_STREAMOUTSTATS1;
1703    case 2:
1704       return V_028A90_SAMPLE_STREAMOUTSTATS2;
1705    case 3:
1706       return V_028A90_SAMPLE_STREAMOUTSTATS3;
1707    }
1708 }
1709
1710 static void
1711 emit_sample_streamout(struct radv_cmd_buffer *cmd_buffer, uint64_t va, uint32_t index)
1712 {
1713    struct radeon_cmdbuf *cs = cmd_buffer->cs;
1714
1715    radeon_check_space(cmd_buffer->device->ws, cs, 4);
1716
1717    assert(index < MAX_SO_STREAMS);
1718
1719    radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 2, 0));
1720    radeon_emit(cs, EVENT_TYPE(event_type_for_stream(index)) | EVENT_INDEX(3));
1721    radeon_emit(cs, va);
1722    radeon_emit(cs, va >> 32);
1723 }
1724
1725 static void
1726 gfx10_copy_gds_query(struct radv_cmd_buffer *cmd_buffer, uint32_t gds_offset, uint64_t va)
1727 {
1728    struct radeon_cmdbuf *cs = cmd_buffer->cs;
1729
1730    /* Make sure GDS is idle before copying the value. */
1731    cmd_buffer->state.flush_bits |= RADV_CMD_FLAG_PS_PARTIAL_FLUSH | RADV_CMD_FLAG_INV_L2;
1732    si_emit_cache_flush(cmd_buffer);
1733
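        /* Copy the 32-bit GDS value to memory; WR_CONFIRM makes the CP wait for the
         * write to land before continuing.
         */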
1734    radeon_emit(cs, PKT3(PKT3_COPY_DATA, 4, 0));
1735    radeon_emit(cs, COPY_DATA_SRC_SEL(COPY_DATA_GDS) | COPY_DATA_DST_SEL(COPY_DATA_DST_MEM) |
1736                    COPY_DATA_WR_CONFIRM);
1737    radeon_emit(cs, gds_offset);
1738    radeon_emit(cs, 0);
1739    radeon_emit(cs, va);
1740    radeon_emit(cs, va >> 32);
1741 }
1742
1743 static void
1744 emit_begin_query(struct radv_cmd_buffer *cmd_buffer, struct radv_query_pool *pool, uint64_t va,
1745                  VkQueryType query_type, VkQueryControlFlags flags, uint32_t index)
1746 {
1747    struct radeon_cmdbuf *cs = cmd_buffer->cs;
1748    switch (query_type) {
1749    case VK_QUERY_TYPE_OCCLUSION:
1750       radeon_check_space(cmd_buffer->device->ws, cs, 7);
1751
1752       ++cmd_buffer->state.active_occlusion_queries;
1753       if (cmd_buffer->state.active_occlusion_queries == 1) {
1754          if (flags & VK_QUERY_CONTROL_PRECISE_BIT) {
1755             /* This is the first occlusion query; enable
1756              * the hint if the precision bit is set.
1757              */
1758             cmd_buffer->state.perfect_occlusion_queries_enabled = true;
1759          }
1760
1761          radv_set_db_count_control(cmd_buffer, true);
1762       } else {
1763          if ((flags & VK_QUERY_CONTROL_PRECISE_BIT) &&
1764              !cmd_buffer->state.perfect_occlusion_queries_enabled) {
1765             /* This is not the first query, but this one
1766              * needs to enable precision, so DB_COUNT_CONTROL
1767              * has to be updated accordingly.
1768              */
1769             cmd_buffer->state.perfect_occlusion_queries_enabled = true;
1770
1771             radv_set_db_count_control(cmd_buffer, true);
1772          }
1773       }
1774
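           /* Select which RB instances will dump their counters before emitting
            * PIXEL_PIPE_STAT_DUMP below.
            */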
1775       if (cmd_buffer->device->physical_device->rad_info.gfx_level >= GFX11) {
1776          uint64_t rb_mask =
1777             BITFIELD64_MASK(cmd_buffer->device->physical_device->rad_info.max_render_backends);
1778
1779          radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 2, 0));
1780          radeon_emit(cs, EVENT_TYPE(V_028A90_PIXEL_PIPE_STAT_CONTROL) | EVENT_INDEX(1));
1781          radeon_emit(cs, PIXEL_PIPE_STATE_CNTL_COUNTER_ID(0) |
1782                          PIXEL_PIPE_STATE_CNTL_STRIDE(2) |
1783                          PIXEL_PIPE_STATE_CNTL_INSTANCE_EN_LO(rb_mask));
1784          radeon_emit(cs, PIXEL_PIPE_STATE_CNTL_INSTANCE_EN_HI(rb_mask));
1785       }
1786
1787       radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 2, 0));
1788
1789       if (cmd_buffer->device->physical_device->rad_info.gfx_level >= GFX11) {
1790          radeon_emit(cs, EVENT_TYPE(V_028A90_PIXEL_PIPE_STAT_DUMP) | EVENT_INDEX(1));
1791       } else {
1792          radeon_emit(cs, EVENT_TYPE(V_028A90_ZPASS_DONE) | EVENT_INDEX(1));
1793       }
1794
1795       radeon_emit(cs, va);
1796       radeon_emit(cs, va >> 32);
1797       break;
1798    case VK_QUERY_TYPE_PIPELINE_STATISTICS: {
1799       unsigned pipelinestat_block_size = radv_get_pipelinestat_query_size(cmd_buffer->device);
1800
1801       radeon_check_space(cmd_buffer->device->ws, cs, 4);
1802
1803       ++cmd_buffer->state.active_pipeline_queries;
1804       if (cmd_buffer->state.active_pipeline_queries == 1) {
1805          cmd_buffer->state.flush_bits &= ~RADV_CMD_FLAG_STOP_PIPELINE_STATS;
1806          cmd_buffer->state.flush_bits |= RADV_CMD_FLAG_START_PIPELINE_STATS;
1807       }
1808
1809       radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 2, 0));
1810       radeon_emit(cs, EVENT_TYPE(V_028A90_SAMPLE_PIPELINESTAT) | EVENT_INDEX(2));
1811       radeon_emit(cs, va);
1812       radeon_emit(cs, va >> 32);
1813
1814       if (pool->uses_gds) {
1815          va += pipelinestat_block_size * 2;
1816
1817          /* pipeline statistics counter for all streams */
1818          gfx10_copy_gds_query(cmd_buffer, RADV_NGG_QUERY_PIPELINE_STAT_OFFSET, va);
1819
1820          /* Record that the command buffer needs GDS. */
1821          cmd_buffer->gds_needed = true;
1822
1823          cmd_buffer->state.active_pipeline_gds_queries++;
1824       }
1825       break;
1826    }
1827    case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT:
1828       if (cmd_buffer->device->physical_device->use_ngg_streamout) {
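              /* gfx10_copy_gds_query() only writes the low dword of each 64-bit slot;
               * the WRITE_DATA of 0x80000000 to the high dword sets bit 63, which the
               * resolve path treats as the availability flag.
               */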
1829          /* generated prim counter */
1830          gfx10_copy_gds_query(cmd_buffer, RADV_NGG_QUERY_PRIM_GEN_OFFSET(index), va);
1831          radv_emit_write_data_imm(cs, V_370_ME, va + 4, 0x80000000);
1832
1833          /* written prim counter */
1834          gfx10_copy_gds_query(cmd_buffer, RADV_NGG_QUERY_PRIM_XFB_OFFSET(index), va + 8);
1835          radv_emit_write_data_imm(cs, V_370_ME, va + 12, 0x80000000);
1836
1837          cmd_buffer->state.active_prims_xfb_gds_queries++;
1838       } else {
1839          emit_sample_streamout(cmd_buffer, va, index);
1840       }
1841       break;
1842    case VK_QUERY_TYPE_PRIMITIVES_GENERATED_EXT: {
1843       if (cmd_buffer->device->physical_device->rad_info.gfx_level >= GFX11) {
1844          /* On GFX11+, primitives generated queries always use GDS. */
1845          gfx10_copy_gds_query(cmd_buffer, RADV_NGG_QUERY_PRIM_GEN_OFFSET(index), va);
1846          radv_emit_write_data_imm(cs, V_370_ME, va + 4, 0x80000000);
1847
1848          /* Record that the command buffer needs GDS. */
1849          cmd_buffer->gds_needed = true;
1850
1851          cmd_buffer->state.active_prims_gen_gds_queries++;
1852       } else {
1853          if (!cmd_buffer->state.active_prims_gen_queries) {
1854             bool old_streamout_enabled = radv_is_streamout_enabled(cmd_buffer);
1855
1856             cmd_buffer->state.active_prims_gen_queries++;
1857
1858             if (old_streamout_enabled != radv_is_streamout_enabled(cmd_buffer)) {
1859                radv_emit_streamout_enable(cmd_buffer);
1860             }
1861          } else {
1862             cmd_buffer->state.active_prims_gen_queries++;
1863          }
1864
1865          emit_sample_streamout(cmd_buffer, va, index);
1866
1867          if (pool->uses_gds) {
1868             /* generated prim counter */
1869             gfx10_copy_gds_query(cmd_buffer, RADV_NGG_QUERY_PRIM_GEN_OFFSET(index), va + 32);
1870
1871             /* Record that the command buffer needs GDS. */
1872             cmd_buffer->gds_needed = true;
1873
1874             cmd_buffer->state.active_prims_gen_gds_queries++;
1875          }
1876       }
1877       break;
1878    }
1879    case VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR: {
1880       radv_pc_begin_query(cmd_buffer, (struct radv_pc_query_pool *)pool, va);
1881       break;
1882    }
1883    default:
1884       unreachable("beginning unhandled query type");
1885    }
1886 }
1887
1888 static void
1889 emit_end_query(struct radv_cmd_buffer *cmd_buffer, struct radv_query_pool *pool, uint64_t va,
1890                uint64_t avail_va, VkQueryType query_type, uint32_t index)
1891 {
1892    struct radeon_cmdbuf *cs = cmd_buffer->cs;
1893    switch (query_type) {
1894    case VK_QUERY_TYPE_OCCLUSION:
1895       radeon_check_space(cmd_buffer->device->ws, cs, 14);
1896
1897       cmd_buffer->state.active_occlusion_queries--;
1898       if (cmd_buffer->state.active_occlusion_queries == 0) {
1899          radv_set_db_count_control(cmd_buffer, false);
1900
1901          /* Reset the perfect occlusion queries hint now that no
1902           * queries are active.
1903           */
1904          cmd_buffer->state.perfect_occlusion_queries_enabled = false;
1905       }
1906
1907       radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 2, 0));
1908       if (cmd_buffer->device->physical_device->rad_info.gfx_level >= GFX11) {
1909          radeon_emit(cs, EVENT_TYPE(V_028A90_PIXEL_PIPE_STAT_DUMP) | EVENT_INDEX(1));
1910       } else {
1911          radeon_emit(cs, EVENT_TYPE(V_028A90_ZPASS_DONE) | EVENT_INDEX(1));
1912       }
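           /* The end counts are written 8 bytes after the begin counts within each
            * 16-byte RB pair.
            */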
1913       radeon_emit(cs, va + 8);
1914       radeon_emit(cs, (va + 8) >> 32);
1915
1916       break;
1917    case VK_QUERY_TYPE_PIPELINE_STATISTICS: {
1918       unsigned pipelinestat_block_size = radv_get_pipelinestat_query_size(cmd_buffer->device);
1919
1920       radeon_check_space(cmd_buffer->device->ws, cs, 16);
1921
1922       cmd_buffer->state.active_pipeline_queries--;
1923       if (cmd_buffer->state.active_pipeline_queries == 0) {
1924          cmd_buffer->state.flush_bits &= ~RADV_CMD_FLAG_START_PIPELINE_STATS;
1925          cmd_buffer->state.flush_bits |= RADV_CMD_FLAG_STOP_PIPELINE_STATS;
1926       }
1927       va += pipelinestat_block_size;
1928
1929       radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 2, 0));
1930       radeon_emit(cs, EVENT_TYPE(V_028A90_SAMPLE_PIPELINESTAT) | EVENT_INDEX(2));
1931       radeon_emit(cs, va);
1932       radeon_emit(cs, va >> 32);
1933
1934       si_cs_emit_write_event_eop(cs, cmd_buffer->device->physical_device->rad_info.gfx_level,
1935                                  radv_cmd_buffer_uses_mec(cmd_buffer), V_028A90_BOTTOM_OF_PIPE_TS,
1936                                  0, EOP_DST_SEL_MEM, EOP_DATA_SEL_VALUE_32BIT, avail_va, 1,
1937                                  cmd_buffer->gfx9_eop_bug_va);
1938
1939       if (pool->uses_gds) {
1940          va += pipelinestat_block_size + 8;
1941
1942          /* pipeline statistics counter for all streams */
1943          gfx10_copy_gds_query(cmd_buffer, RADV_NGG_QUERY_PIPELINE_STAT_OFFSET, va);
1944
1945          cmd_buffer->state.active_pipeline_gds_queries--;
1946       }
1947       break;
1948    }
1949    case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT:
1950       if (cmd_buffer->device->physical_device->use_ngg_streamout) {
1951          /* generated prim counter */
1952          gfx10_copy_gds_query(cmd_buffer, RADV_NGG_QUERY_PRIM_GEN_OFFSET(index), va + 16);
1953          radv_emit_write_data_imm(cs, V_370_ME, va + 20, 0x80000000);
1954
1955          /* written prim counter */
1956          gfx10_copy_gds_query(cmd_buffer, RADV_NGG_QUERY_PRIM_XFB_OFFSET(index), va + 24);
1957          radv_emit_write_data_imm(cs, V_370_ME, va + 28, 0x80000000);
1958
1959          cmd_buffer->state.active_prims_xfb_gds_queries--;
1960       } else {
1961          emit_sample_streamout(cmd_buffer, va + 16, index);
1962       }
1963       break;
1964    case VK_QUERY_TYPE_PRIMITIVES_GENERATED_EXT: {
1965       if (cmd_buffer->device->physical_device->rad_info.gfx_level >= GFX11) {
1966          /* On GFX11+, primitives generated queries always use GDS. */
1967          gfx10_copy_gds_query(cmd_buffer, RADV_NGG_QUERY_PRIM_GEN_OFFSET(index), va + 16);
1968          radv_emit_write_data_imm(cs, V_370_ME, va + 20, 0x80000000);
1969
1970          cmd_buffer->state.active_prims_gen_gds_queries--;
1971       } else {
1972          if (cmd_buffer->state.active_prims_gen_queries == 1) {
1973             bool old_streamout_enabled = radv_is_streamout_enabled(cmd_buffer);
1974
1975             cmd_buffer->state.active_prims_gen_queries--;
1976
1977             if (old_streamout_enabled != radv_is_streamout_enabled(cmd_buffer)) {
1978                radv_emit_streamout_enable(cmd_buffer);
1979             }
1980          } else {
1981             cmd_buffer->state.active_prims_gen_queries--;
1982          }
1983
1984          emit_sample_streamout(cmd_buffer, va + 16, index);
1985
1986          if (pool->uses_gds) {
1987             /* generated prim counter */
1988             gfx10_copy_gds_query(cmd_buffer, RADV_NGG_QUERY_PRIM_GEN_OFFSET(index), va + 36);
1989
1990             cmd_buffer->state.active_prims_gen_gds_queries--;
1991          }
1992       }
1993       break;
1994    }
1995    case VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR: {
1996       radv_pc_end_query(cmd_buffer, (struct radv_pc_query_pool *)pool, va);
1997       break;
1998    }
1999    default:
2000       unreachable("ending unhandled query type");
2001    }
2002
2003    cmd_buffer->active_query_flush_bits |= RADV_CMD_FLAG_PS_PARTIAL_FLUSH |
2004                                           RADV_CMD_FLAG_CS_PARTIAL_FLUSH | RADV_CMD_FLAG_INV_L2 |
2005                                           RADV_CMD_FLAG_INV_VCACHE;
2006    if (cmd_buffer->device->physical_device->rad_info.gfx_level >= GFX9) {
2007       cmd_buffer->active_query_flush_bits |=
2008          RADV_CMD_FLAG_FLUSH_AND_INV_CB | RADV_CMD_FLAG_FLUSH_AND_INV_DB;
2009    }
2010 }
2011
2012 VKAPI_ATTR void VKAPI_CALL
2013 radv_CmdBeginQueryIndexedEXT(VkCommandBuffer commandBuffer, VkQueryPool queryPool, uint32_t query,
2014                              VkQueryControlFlags flags, uint32_t index)
2015 {
2016    RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
2017    RADV_FROM_HANDLE(radv_query_pool, pool, queryPool);
2018    struct radeon_cmdbuf *cs = cmd_buffer->cs;
2019    uint64_t va = radv_buffer_get_va(pool->bo);
2020
2021    radv_cs_add_buffer(cmd_buffer->device->ws, cs, pool->bo);
2022
2023    emit_query_flush(cmd_buffer, pool);
2024
2025    va += pool->stride * query;
2026
2027    emit_begin_query(cmd_buffer, pool, va, pool->type, flags, index);
2028 }
2029
2030 VKAPI_ATTR void VKAPI_CALL
2031 radv_CmdBeginQuery(VkCommandBuffer commandBuffer, VkQueryPool queryPool, uint32_t query,
2032                    VkQueryControlFlags flags)
2033 {
2034    radv_CmdBeginQueryIndexedEXT(commandBuffer, queryPool, query, flags, 0);
2035 }
2036
2037 VKAPI_ATTR void VKAPI_CALL
2038 radv_CmdEndQueryIndexedEXT(VkCommandBuffer commandBuffer, VkQueryPool queryPool, uint32_t query,
2039                            uint32_t index)
2040 {
2041    RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
2042    RADV_FROM_HANDLE(radv_query_pool, pool, queryPool);
2043    uint64_t va = radv_buffer_get_va(pool->bo);
2044    uint64_t avail_va = va + pool->availability_offset + 4 * query;
2045    va += pool->stride * query;
2046
2047    /* We do not need to add the pool BO to the list because the query must
2048     * currently be active, which means the BO is already in the list.
2049     */
2050    emit_end_query(cmd_buffer, pool, va, avail_va, pool->type, index);
2051
2052    /*
2053     * For multiview we have to emit a query for each bit in the mask;
2054     * however, the first query we emit will get the totals for all the
2055     * operations, so we don't want to get a real value in the other
2056     * queries. This emits a fake begin/end sequence so the waiting
2057     * code gets a completed query value and doesn't hang, but the
2058     * query returns 0.
2059     */
2060    if (cmd_buffer->state.render.view_mask) {
2061       for (unsigned i = 1; i < util_bitcount(cmd_buffer->state.render.view_mask); i++) {
2062          va += pool->stride;
2063          avail_va += 4;
2064          emit_begin_query(cmd_buffer, pool, va, pool->type, 0, 0);
2065          emit_end_query(cmd_buffer, pool, va, avail_va, pool->type, 0);
2066       }
2067    }
2068 }
2069
2070 VKAPI_ATTR void VKAPI_CALL
2071 radv_CmdEndQuery(VkCommandBuffer commandBuffer, VkQueryPool queryPool, uint32_t query)
2072 {
2073    radv_CmdEndQueryIndexedEXT(commandBuffer, queryPool, query, 0);
2074 }
2075
2076 VKAPI_ATTR void VKAPI_CALL
2077 radv_CmdWriteTimestamp2(VkCommandBuffer commandBuffer, VkPipelineStageFlags2 stage,
2078                         VkQueryPool queryPool, uint32_t query)
2079 {
2080    RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
2081    RADV_FROM_HANDLE(radv_query_pool, pool, queryPool);
2082    bool mec = radv_cmd_buffer_uses_mec(cmd_buffer);
2083    struct radeon_cmdbuf *cs = cmd_buffer->cs;
2084    uint64_t va = radv_buffer_get_va(pool->bo);
2085    uint64_t query_va = va + pool->stride * query;
2086
2087    radv_cs_add_buffer(cmd_buffer->device->ws, cs, pool->bo);
2088
2089    emit_query_flush(cmd_buffer, pool);
2090
2091    int num_queries = 1;
2092    if (cmd_buffer->state.render.view_mask)
2093       num_queries = util_bitcount(cmd_buffer->state.render.view_mask);
2094
2095    ASSERTED unsigned cdw_max = radeon_check_space(cmd_buffer->device->ws, cs, 28 * num_queries);
2096
2097    for (unsigned i = 0; i < num_queries; i++) {
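           /* TOP_OF_PIPE timestamps are copied immediately with COPY_DATA; any other
            * stage uses a bottom-of-pipe EOP event so the timestamp is written after
            * prior work has completed.
            */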
2098       if (stage == VK_PIPELINE_STAGE_2_TOP_OF_PIPE_BIT) {
2099          radeon_emit(cs, PKT3(PKT3_COPY_DATA, 4, 0));
2100          radeon_emit(cs, COPY_DATA_COUNT_SEL | COPY_DATA_WR_CONFIRM |
2101                             COPY_DATA_SRC_SEL(COPY_DATA_TIMESTAMP) | COPY_DATA_DST_SEL(V_370_MEM));
2102          radeon_emit(cs, 0);
2103          radeon_emit(cs, 0);
2104          radeon_emit(cs, query_va);
2105          radeon_emit(cs, query_va >> 32);
2106       } else {
2107          si_cs_emit_write_event_eop(cs, cmd_buffer->device->physical_device->rad_info.gfx_level,
2108                                     mec, V_028A90_BOTTOM_OF_PIPE_TS, 0, EOP_DST_SEL_MEM,
2109                                     EOP_DATA_SEL_TIMESTAMP, query_va, 0,
2110                                     cmd_buffer->gfx9_eop_bug_va);
2111       }
2112       query_va += pool->stride;
2113    }
2114
2115    cmd_buffer->active_query_flush_bits |= RADV_CMD_FLAG_PS_PARTIAL_FLUSH |
2116                                           RADV_CMD_FLAG_CS_PARTIAL_FLUSH | RADV_CMD_FLAG_INV_L2 |
2117                                           RADV_CMD_FLAG_INV_VCACHE;
2118    if (cmd_buffer->device->physical_device->rad_info.gfx_level >= GFX9) {
2119       cmd_buffer->active_query_flush_bits |=
2120          RADV_CMD_FLAG_FLUSH_AND_INV_CB | RADV_CMD_FLAG_FLUSH_AND_INV_DB;
2121    }
2122
2123    assert(cmd_buffer->cs->cdw <= cdw_max);
2124 }
2125
2126 VKAPI_ATTR void VKAPI_CALL
2127 radv_CmdWriteAccelerationStructuresPropertiesKHR(
2128    VkCommandBuffer commandBuffer, uint32_t accelerationStructureCount,
2129    const VkAccelerationStructureKHR *pAccelerationStructures, VkQueryType queryType,
2130    VkQueryPool queryPool, uint32_t firstQuery)
2131 {
2132    RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
2133    RADV_FROM_HANDLE(radv_query_pool, pool, queryPool);
2134    struct radeon_cmdbuf *cs = cmd_buffer->cs;
2135    uint64_t pool_va = radv_buffer_get_va(pool->bo);
2136    uint64_t query_va = pool_va + pool->stride * firstQuery;
2137
2138    radv_cs_add_buffer(cmd_buffer->device->ws, cs, pool->bo);
2139
2140    emit_query_flush(cmd_buffer, pool);
2141
2142    ASSERTED unsigned cdw_max =
2143       radeon_check_space(cmd_buffer->device->ws, cs, 6 * accelerationStructureCount);
2144
2145    for (uint32_t i = 0; i < accelerationStructureCount; ++i) {
2146       RADV_FROM_HANDLE(radv_acceleration_structure, accel_struct, pAccelerationStructures[i]);
2147       uint64_t va = accel_struct->va;
2148
2149       switch (queryType) {
2150       case VK_QUERY_TYPE_ACCELERATION_STRUCTURE_COMPACTED_SIZE_KHR:
2151          va += offsetof(struct radv_accel_struct_header, compacted_size);
2152          break;
2153       case VK_QUERY_TYPE_ACCELERATION_STRUCTURE_SERIALIZATION_SIZE_KHR:
2154          va += offsetof(struct radv_accel_struct_header, serialization_size);
2155          break;
2156       case VK_QUERY_TYPE_ACCELERATION_STRUCTURE_SERIALIZATION_BOTTOM_LEVEL_POINTERS_KHR:
2157          va += offsetof(struct radv_accel_struct_header, instance_count);
2158          break;
2159       case VK_QUERY_TYPE_ACCELERATION_STRUCTURE_SIZE_KHR:
2160          va += offsetof(struct radv_accel_struct_header, size);
2161          break;
2162       default:
2163          unreachable("Unhandled accel struct query type.");
2164       }
2165
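           /* Copy the selected 64-bit header field straight into the query slot
            * (COUNT_SEL requests a 64-bit copy).
            */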
2166       radeon_emit(cs, PKT3(PKT3_COPY_DATA, 4, 0));
2167       radeon_emit(cs, COPY_DATA_SRC_SEL(COPY_DATA_SRC_MEM) | COPY_DATA_DST_SEL(COPY_DATA_DST_MEM) |
2168                          COPY_DATA_COUNT_SEL | COPY_DATA_WR_CONFIRM);
2169       radeon_emit(cs, va);
2170       radeon_emit(cs, va >> 32);
2171       radeon_emit(cs, query_va);
2172       radeon_emit(cs, query_va >> 32);
2173
2174       query_va += pool->stride;
2175    }
2176
2177    assert(cmd_buffer->cs->cdw <= cdw_max);
2178 }