radv: move to using common record_result.
src/amd/vulkan/radv_cmd_buffer.c (platform/upstream/mesa.git)
1 /*
2  * Copyright © 2016 Red Hat.
3  * Copyright © 2016 Bas Nieuwenhuizen
4  *
5  * based in part on anv driver which is:
6  * Copyright © 2015 Intel Corporation
7  *
8  * Permission is hereby granted, free of charge, to any person obtaining a
9  * copy of this software and associated documentation files (the "Software"),
10  * to deal in the Software without restriction, including without limitation
11  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
12  * and/or sell copies of the Software, and to permit persons to whom the
13  * Software is furnished to do so, subject to the following conditions:
14  *
15  * The above copyright notice and this permission notice (including the next
16  * paragraph) shall be included in all copies or substantial portions of the
17  * Software.
18  *
19  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
20  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
21  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
22  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
23  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
24  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
25  * IN THE SOFTWARE.
26  */
27
28 #include "radv_cs.h"
29 #include "radv_debug.h"
30 #include "radv_meta.h"
31 #include "radv_private.h"
32 #include "radv_radeon_winsys.h"
33 #include "radv_shader.h"
34 #include "sid.h"
35 #include "vk_format.h"
36 #include "vk_util.h"
37 #include "vk_enum_defines.h"
38 #include "vk_common_entrypoints.h"
39 #include "vk_render_pass.h"
40
41 #include "ac_debug.h"
42 #include "ac_shader_args.h"
43
44 #include "util/fast_idiv_by_const.h"
45
46 enum {
47    RADV_PREFETCH_VBO_DESCRIPTORS = (1 << 0),
48    RADV_PREFETCH_VS = (1 << 1),
49    RADV_PREFETCH_TCS = (1 << 2),
50    RADV_PREFETCH_TES = (1 << 3),
51    RADV_PREFETCH_GS = (1 << 4),
52    RADV_PREFETCH_PS = (1 << 5),
53    RADV_PREFETCH_MS = (1 << 6),
54    RADV_PREFETCH_SHADERS = (RADV_PREFETCH_VS | RADV_PREFETCH_TCS | RADV_PREFETCH_TES |
55                             RADV_PREFETCH_GS | RADV_PREFETCH_PS | RADV_PREFETCH_MS)
56 };
57
58 static void radv_handle_image_transition(struct radv_cmd_buffer *cmd_buffer,
59                                          struct radv_image *image,
60                                          VkImageLayout src_layout, VkImageLayout dst_layout,
61                                          uint32_t src_family_index, uint32_t dst_family_index,
62                                          const VkImageSubresourceRange *range,
63                                          struct radv_sample_locations_state *sample_locs);
64
65 static void radv_set_rt_stack_size(struct radv_cmd_buffer *cmd_buffer, uint32_t size);
66
67 const struct radv_dynamic_state default_dynamic_state = {
68    .viewport =
69       {
70          .count = 0,
71       },
72    .scissor =
73       {
74          .count = 0,
75       },
76    .line_width = 1.0f,
77    .depth_bias =
78       {
79          .bias = 0.0f,
80          .clamp = 0.0f,
81          .slope = 0.0f,
82       },
83    .blend_constants = {0.0f, 0.0f, 0.0f, 0.0f},
84    .depth_bounds =
85       {
86          .min = 0.0f,
87          .max = 1.0f,
88       },
89    .stencil_compare_mask =
90       {
91          .front = ~0u,
92          .back = ~0u,
93       },
94    .stencil_write_mask =
95       {
96          .front = ~0u,
97          .back = ~0u,
98       },
99    .stencil_reference =
100       {
101          .front = 0u,
102          .back = 0u,
103       },
104    .line_stipple =
105       {
106          .factor = 0u,
107          .pattern = 0u,
108       },
109    .cull_mode = 0u,
110    .front_face = 0u,
111    .primitive_topology = 0u,
112    .fragment_shading_rate =
113       {
114          .size = {1u, 1u},
115          .combiner_ops = {VK_FRAGMENT_SHADING_RATE_COMBINER_OP_KEEP_KHR,
116                           VK_FRAGMENT_SHADING_RATE_COMBINER_OP_KEEP_KHR},
117       },
118    .depth_bias_enable = 0u,
119    .primitive_restart_enable = 0u,
120    .rasterizer_discard_enable = 0u,
121    .logic_op = 0u,
122    .color_write_enable = 0u,
123 };
124
125 static void
126 radv_bind_dynamic_state(struct radv_cmd_buffer *cmd_buffer, const struct radv_dynamic_state *src)
127 {
128    struct radv_dynamic_state *dest = &cmd_buffer->state.dynamic;
129    uint64_t copy_mask = src->mask;
130    uint64_t dest_mask = 0;
131
132    dest->discard_rectangle.count = src->discard_rectangle.count;
133    dest->sample_location.count = src->sample_location.count;
134
135    if (copy_mask & RADV_DYNAMIC_VIEWPORT) {
136       if (dest->viewport.count != src->viewport.count) {
137          dest->viewport.count = src->viewport.count;
138          dest_mask |= RADV_DYNAMIC_VIEWPORT;
139       }
140
141       if (memcmp(&dest->viewport.viewports, &src->viewport.viewports,
142                  src->viewport.count * sizeof(VkViewport))) {
143          typed_memcpy(dest->viewport.viewports, src->viewport.viewports, src->viewport.count);
144          typed_memcpy(dest->viewport.xform, src->viewport.xform, src->viewport.count);
145          dest_mask |= RADV_DYNAMIC_VIEWPORT;
146       }
147    }
148
149    if (copy_mask & RADV_DYNAMIC_SCISSOR) {
150       if (dest->scissor.count != src->scissor.count) {
151          dest->scissor.count = src->scissor.count;
152          dest_mask |= RADV_DYNAMIC_SCISSOR;
153       }
154
155       if (memcmp(&dest->scissor.scissors, &src->scissor.scissors,
156                  src->scissor.count * sizeof(VkRect2D))) {
157          typed_memcpy(dest->scissor.scissors, src->scissor.scissors, src->scissor.count);
158          dest_mask |= RADV_DYNAMIC_SCISSOR;
159       }
160    }
161
162    if (copy_mask & RADV_DYNAMIC_BLEND_CONSTANTS) {
163       if (memcmp(&dest->blend_constants, &src->blend_constants, sizeof(src->blend_constants))) {
164          typed_memcpy(dest->blend_constants, src->blend_constants, 4);
165          dest_mask |= RADV_DYNAMIC_BLEND_CONSTANTS;
166       }
167    }
168
169    if (copy_mask & RADV_DYNAMIC_DISCARD_RECTANGLE) {
170       if (memcmp(&dest->discard_rectangle.rectangles, &src->discard_rectangle.rectangles,
171                  src->discard_rectangle.count * sizeof(VkRect2D))) {
172          typed_memcpy(dest->discard_rectangle.rectangles, src->discard_rectangle.rectangles,
173                       src->discard_rectangle.count);
174          dest_mask |= RADV_DYNAMIC_DISCARD_RECTANGLE;
175       }
176    }
177
178    if (copy_mask & RADV_DYNAMIC_SAMPLE_LOCATIONS) {
179       if (dest->sample_location.per_pixel != src->sample_location.per_pixel ||
180           dest->sample_location.grid_size.width != src->sample_location.grid_size.width ||
181           dest->sample_location.grid_size.height != src->sample_location.grid_size.height ||
182           memcmp(&dest->sample_location.locations, &src->sample_location.locations,
183                  src->sample_location.count * sizeof(VkSampleLocationEXT))) {
184          dest->sample_location.per_pixel = src->sample_location.per_pixel;
185          dest->sample_location.grid_size = src->sample_location.grid_size;
186          typed_memcpy(dest->sample_location.locations, src->sample_location.locations,
187                       src->sample_location.count);
188          dest_mask |= RADV_DYNAMIC_SAMPLE_LOCATIONS;
189       }
190    }
191
192 #define RADV_CMP_COPY(field, flag)                                \
193    if (copy_mask & flag) {                                        \
194       if (dest->field != src->field) {                            \
195          dest->field = src->field;                                \
196          dest_mask |= flag;                                       \
197       }                                                           \
198    }
199
200    RADV_CMP_COPY(line_width, RADV_DYNAMIC_LINE_WIDTH);
201
202    RADV_CMP_COPY(depth_bias.bias, RADV_DYNAMIC_DEPTH_BIAS);
203    RADV_CMP_COPY(depth_bias.clamp, RADV_DYNAMIC_DEPTH_BIAS);
204    RADV_CMP_COPY(depth_bias.slope, RADV_DYNAMIC_DEPTH_BIAS);
205
206    RADV_CMP_COPY(depth_bounds.min, RADV_DYNAMIC_DEPTH_BOUNDS);
207    RADV_CMP_COPY(depth_bounds.max, RADV_DYNAMIC_DEPTH_BOUNDS);
208
209    RADV_CMP_COPY(stencil_compare_mask.front, RADV_DYNAMIC_STENCIL_COMPARE_MASK);
210    RADV_CMP_COPY(stencil_compare_mask.back, RADV_DYNAMIC_STENCIL_COMPARE_MASK);
211
212    RADV_CMP_COPY(stencil_write_mask.front, RADV_DYNAMIC_STENCIL_WRITE_MASK);
213    RADV_CMP_COPY(stencil_write_mask.back, RADV_DYNAMIC_STENCIL_WRITE_MASK);
214
215    RADV_CMP_COPY(stencil_reference.front, RADV_DYNAMIC_STENCIL_REFERENCE);
216    RADV_CMP_COPY(stencil_reference.back, RADV_DYNAMIC_STENCIL_REFERENCE);
217
218    RADV_CMP_COPY(line_stipple.factor, RADV_DYNAMIC_LINE_STIPPLE);
219    RADV_CMP_COPY(line_stipple.pattern, RADV_DYNAMIC_LINE_STIPPLE);
220
221    RADV_CMP_COPY(cull_mode, RADV_DYNAMIC_CULL_MODE);
222    RADV_CMP_COPY(front_face, RADV_DYNAMIC_FRONT_FACE);
223    RADV_CMP_COPY(primitive_topology, RADV_DYNAMIC_PRIMITIVE_TOPOLOGY);
224    RADV_CMP_COPY(depth_test_enable, RADV_DYNAMIC_DEPTH_TEST_ENABLE);
225    RADV_CMP_COPY(depth_write_enable, RADV_DYNAMIC_DEPTH_WRITE_ENABLE);
226    RADV_CMP_COPY(depth_compare_op, RADV_DYNAMIC_DEPTH_COMPARE_OP);
227    RADV_CMP_COPY(depth_bounds_test_enable, RADV_DYNAMIC_DEPTH_BOUNDS_TEST_ENABLE);
228    RADV_CMP_COPY(stencil_test_enable, RADV_DYNAMIC_STENCIL_TEST_ENABLE);
229
230    RADV_CMP_COPY(stencil_op.front.fail_op, RADV_DYNAMIC_STENCIL_OP);
231    RADV_CMP_COPY(stencil_op.front.pass_op, RADV_DYNAMIC_STENCIL_OP);
232    RADV_CMP_COPY(stencil_op.front.depth_fail_op, RADV_DYNAMIC_STENCIL_OP);
233    RADV_CMP_COPY(stencil_op.front.compare_op, RADV_DYNAMIC_STENCIL_OP);
234    RADV_CMP_COPY(stencil_op.back.fail_op, RADV_DYNAMIC_STENCIL_OP);
235    RADV_CMP_COPY(stencil_op.back.pass_op, RADV_DYNAMIC_STENCIL_OP);
236    RADV_CMP_COPY(stencil_op.back.depth_fail_op, RADV_DYNAMIC_STENCIL_OP);
237    RADV_CMP_COPY(stencil_op.back.compare_op, RADV_DYNAMIC_STENCIL_OP);
238
239    RADV_CMP_COPY(fragment_shading_rate.size.width, RADV_DYNAMIC_FRAGMENT_SHADING_RATE);
240    RADV_CMP_COPY(fragment_shading_rate.size.height, RADV_DYNAMIC_FRAGMENT_SHADING_RATE);
241    RADV_CMP_COPY(fragment_shading_rate.combiner_ops[0], RADV_DYNAMIC_FRAGMENT_SHADING_RATE);
242    RADV_CMP_COPY(fragment_shading_rate.combiner_ops[1], RADV_DYNAMIC_FRAGMENT_SHADING_RATE);
243
244    RADV_CMP_COPY(depth_bias_enable, RADV_DYNAMIC_DEPTH_BIAS_ENABLE);
245
246    RADV_CMP_COPY(primitive_restart_enable, RADV_DYNAMIC_PRIMITIVE_RESTART_ENABLE);
247
248    RADV_CMP_COPY(rasterizer_discard_enable, RADV_DYNAMIC_RASTERIZER_DISCARD_ENABLE);
249
250    RADV_CMP_COPY(logic_op, RADV_DYNAMIC_LOGIC_OP);
251
252    RADV_CMP_COPY(color_write_enable, RADV_DYNAMIC_COLOR_WRITE_ENABLE);
253
254 #undef RADV_CMP_COPY
255
256    cmd_buffer->state.dirty |= dest_mask;
257 }
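
/* For reference: RADV_CMP_COPY(line_width, RADV_DYNAMIC_LINE_WIDTH) above
 * expands to roughly the following, i.e. a field is only copied (and its
 * dirty bit set) when the incoming value differs from the bound one:
 *
 *    if (copy_mask & RADV_DYNAMIC_LINE_WIDTH) {
 *       if (dest->line_width != src->line_width) {
 *          dest->line_width = src->line_width;
 *          dest_mask |= RADV_DYNAMIC_LINE_WIDTH;
 *       }
 *    }
 */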
258
259 bool
260 radv_cmd_buffer_uses_mec(struct radv_cmd_buffer *cmd_buffer)
261 {
262    return cmd_buffer->qf == RADV_QUEUE_COMPUTE &&
263           cmd_buffer->device->physical_device->rad_info.gfx_level >= GFX7;
264 }
265
266 enum amd_ip_type
267 radv_queue_family_to_ring(struct radv_physical_device *physical_device,
268                           enum radv_queue_family f)
269 {
270    switch (f) {
271    case RADV_QUEUE_GENERAL:
272       return AMD_IP_GFX;
273    case RADV_QUEUE_COMPUTE:
274       return AMD_IP_COMPUTE;
275    case RADV_QUEUE_TRANSFER:
276       return AMD_IP_SDMA;
277    default:
278       unreachable("Unknown queue family");
279    }
280 }
281
282 static void
283 radv_emit_write_data_packet(struct radv_cmd_buffer *cmd_buffer, unsigned engine_sel, uint64_t va,
284                             unsigned count, const uint32_t *data)
285 {
286    struct radeon_cmdbuf *cs = cmd_buffer->cs;
287
288    radeon_check_space(cmd_buffer->device->ws, cs, 4 + count);
289
290    radeon_emit(cs, PKT3(PKT3_WRITE_DATA, 2 + count, 0));
291    radeon_emit(cs, S_370_DST_SEL(V_370_MEM) | S_370_WR_CONFIRM(1) | S_370_ENGINE_SEL(engine_sel));
292    radeon_emit(cs, va);
293    radeon_emit(cs, va >> 32);
294    radeon_emit_array(cs, data, count);
295 }
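
/* Sketch of the packet emitted above (4 + count dwords, matching the
 * radeon_check_space() call):
 *
 *    dw0:   PKT3(PKT3_WRITE_DATA, 2 + count, 0)    -- packet header
 *    dw1:   DST_SEL=MEM | WR_CONFIRM | ENGINE_SEL  -- control word
 *    dw2:   va & 0xffffffff                        -- destination VA, low
 *    dw3:   va >> 32                               -- destination VA, high
 *    dw4..: count payload dwords
 */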
296
297 static void
298 radv_emit_clear_data(struct radv_cmd_buffer *cmd_buffer, unsigned engine_sel, uint64_t va,
299                      unsigned size)
300 {
301    uint32_t *zeroes = alloca(size);
302    memset(zeroes, 0, size);
303    radv_emit_write_data_packet(cmd_buffer, engine_sel, va, size / 4, zeroes);
304 }
305
306 static void
307 radv_destroy_cmd_buffer(struct radv_cmd_buffer *cmd_buffer)
308 {
309    list_del(&cmd_buffer->pool_link);
310
311    list_for_each_entry_safe(struct radv_cmd_buffer_upload, up, &cmd_buffer->upload.list, list)
312    {
313       cmd_buffer->device->ws->buffer_destroy(cmd_buffer->device->ws, up->upload_bo);
314       list_del(&up->list);
315       free(up);
316    }
317
318    if (cmd_buffer->upload.upload_bo)
319       cmd_buffer->device->ws->buffer_destroy(cmd_buffer->device->ws, cmd_buffer->upload.upload_bo);
320
321    if (cmd_buffer->state.own_render_pass) {
322       radv_DestroyRenderPass(radv_device_to_handle(cmd_buffer->device),
323                              radv_render_pass_to_handle(cmd_buffer->state.pass), NULL);
324       cmd_buffer->state.own_render_pass = false;
325    }
326
327    if (cmd_buffer->cs)
328       cmd_buffer->device->ws->cs_destroy(cmd_buffer->cs);
329    if (cmd_buffer->ace_internal.cs)
330       cmd_buffer->device->ws->cs_destroy(cmd_buffer->ace_internal.cs);
331
332    for (unsigned i = 0; i < MAX_BIND_POINTS; i++) {
333       struct radv_descriptor_set_header *set = &cmd_buffer->descriptors[i].push_set.set;
334       free(set->mapped_ptr);
335       if (set->layout)
336          vk_descriptor_set_layout_unref(&cmd_buffer->device->vk, &set->layout->vk);
337       vk_object_base_finish(&set->base);
338    }
339
340    vk_object_base_finish(&cmd_buffer->meta_push_descriptors.base);
341
342    vk_command_buffer_finish(&cmd_buffer->vk);
343    vk_free(&cmd_buffer->pool->vk.alloc, cmd_buffer);
344 }
345
346 static VkResult
347 radv_create_cmd_buffer(struct radv_device *device, struct radv_cmd_pool *pool,
348                        VkCommandBufferLevel level, VkCommandBuffer *pCommandBuffer)
349 {
350    struct radv_cmd_buffer *cmd_buffer;
351    unsigned ring;
352    cmd_buffer = vk_zalloc(&pool->vk.alloc, sizeof(*cmd_buffer), 8,
353                           VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
354    if (cmd_buffer == NULL)
355       return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
356
357    VkResult result =
358       vk_command_buffer_init(&cmd_buffer->vk, &pool->vk, level);
359    if (result != VK_SUCCESS) {
360       vk_free(&cmd_buffer->pool->vk.alloc, cmd_buffer);
361       return result;
362    }
363
364    cmd_buffer->device = device;
365    cmd_buffer->pool = pool;
366
367    list_addtail(&cmd_buffer->pool_link, &pool->cmd_buffers);
368    cmd_buffer->qf = vk_queue_to_radv(device->physical_device, pool->vk.queue_family_index);
369
370    ring = radv_queue_family_to_ring(device->physical_device, cmd_buffer->qf);
371
372    cmd_buffer->cs = device->ws->cs_create(device->ws, ring);
373    if (!cmd_buffer->cs) {
374       radv_destroy_cmd_buffer(cmd_buffer);
375       return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
376    }
377
378    vk_object_base_init(&device->vk, &cmd_buffer->meta_push_descriptors.base,
379                        VK_OBJECT_TYPE_DESCRIPTOR_SET);
380
381    for (unsigned i = 0; i < MAX_BIND_POINTS; i++)
382       vk_object_base_init(&device->vk, &cmd_buffer->descriptors[i].push_set.set.base,
383                           VK_OBJECT_TYPE_DESCRIPTOR_SET);
384
385    *pCommandBuffer = radv_cmd_buffer_to_handle(cmd_buffer);
386
387    list_inithead(&cmd_buffer->upload.list);
388
389    return VK_SUCCESS;
390 }
391
392 static VkResult
393 radv_reset_cmd_buffer(struct radv_cmd_buffer *cmd_buffer)
394 {
395    vk_command_buffer_reset(&cmd_buffer->vk);
396
397    cmd_buffer->device->ws->cs_reset(cmd_buffer->cs);
398    if (cmd_buffer->ace_internal.cs)
399       cmd_buffer->device->ws->cs_reset(cmd_buffer->ace_internal.cs);
400
401    list_for_each_entry_safe(struct radv_cmd_buffer_upload, up, &cmd_buffer->upload.list, list)
402    {
403       cmd_buffer->device->ws->buffer_destroy(cmd_buffer->device->ws, up->upload_bo);
404       list_del(&up->list);
405       free(up);
406    }
407
408    if (cmd_buffer->state.own_render_pass) {
409       radv_DestroyRenderPass(radv_device_to_handle(cmd_buffer->device),
410                              radv_render_pass_to_handle(cmd_buffer->state.pass), NULL);
411       cmd_buffer->state.own_render_pass = false;
412    }
413
414    cmd_buffer->push_constant_stages = 0;
415    cmd_buffer->scratch_size_per_wave_needed = 0;
416    cmd_buffer->scratch_waves_wanted = 0;
417    cmd_buffer->compute_scratch_size_per_wave_needed = 0;
418    cmd_buffer->compute_scratch_waves_wanted = 0;
419    cmd_buffer->esgs_ring_size_needed = 0;
420    cmd_buffer->gsvs_ring_size_needed = 0;
421    cmd_buffer->tess_rings_needed = false;
422    cmd_buffer->task_rings_needed = false;
423    cmd_buffer->mesh_scratch_ring_needed = false;
424    cmd_buffer->gds_needed = false;
425    cmd_buffer->gds_oa_needed = false;
426    cmd_buffer->sample_positions_needed = false;
427    cmd_buffer->ace_internal.sem.gfx2ace_value = 0;
428    cmd_buffer->ace_internal.sem.emitted_gfx2ace_value = 0;
429    cmd_buffer->ace_internal.sem.va = 0;
430
431    if (cmd_buffer->upload.upload_bo)
432       radv_cs_add_buffer(cmd_buffer->device->ws, cmd_buffer->cs, cmd_buffer->upload.upload_bo);
433    cmd_buffer->upload.offset = 0;
434
435    memset(cmd_buffer->vertex_binding_buffers, 0, sizeof(struct radv_buffer *) * cmd_buffer->used_vertex_bindings);
436    cmd_buffer->used_vertex_bindings = 0;
437
438    for (unsigned i = 0; i < MAX_BIND_POINTS; i++) {
439       cmd_buffer->descriptors[i].dirty = 0;
440       cmd_buffer->descriptors[i].valid = 0;
441       cmd_buffer->descriptors[i].push_dirty = false;
442    }
443
444    if (cmd_buffer->device->physical_device->rad_info.gfx_level >= GFX7) {
445       uint32_t pred_value = 0;
446       uint32_t pred_offset;
447       if (!radv_cmd_buffer_upload_data(cmd_buffer, 4, &pred_value, &pred_offset))
448          vk_command_buffer_set_error(&cmd_buffer->vk, VK_ERROR_OUT_OF_HOST_MEMORY);
449
450       cmd_buffer->mec_inv_pred_emitted = false;
451       cmd_buffer->mec_inv_pred_va = radv_buffer_get_va(cmd_buffer->upload.upload_bo) + pred_offset;
452    }
453
454    if (cmd_buffer->device->physical_device->rad_info.gfx_level >= GFX9 &&
455        cmd_buffer->qf == RADV_QUEUE_GENERAL) {
456       unsigned num_db = cmd_buffer->device->physical_device->rad_info.max_render_backends;
457       unsigned fence_offset, eop_bug_offset;
458       void *fence_ptr;
459
460       radv_cmd_buffer_upload_alloc(cmd_buffer, 8, &fence_offset, &fence_ptr);
461       memset(fence_ptr, 0, 8);
462
463       cmd_buffer->gfx9_fence_va = radv_buffer_get_va(cmd_buffer->upload.upload_bo);
464       cmd_buffer->gfx9_fence_va += fence_offset;
465
466       radv_emit_clear_data(cmd_buffer, V_370_PFP, cmd_buffer->gfx9_fence_va, 8);
467
468       if (cmd_buffer->device->physical_device->rad_info.gfx_level == GFX9) {
469          /* Allocate a buffer for the EOP bug on GFX9. */
470          radv_cmd_buffer_upload_alloc(cmd_buffer, 16 * num_db, &eop_bug_offset, &fence_ptr);
471          memset(fence_ptr, 0, 16 * num_db);
472          cmd_buffer->gfx9_eop_bug_va = radv_buffer_get_va(cmd_buffer->upload.upload_bo);
473          cmd_buffer->gfx9_eop_bug_va += eop_bug_offset;
474
475          radv_emit_clear_data(cmd_buffer, V_370_PFP, cmd_buffer->gfx9_eop_bug_va, 16 * num_db);
476       }
477    }
478
479    cmd_buffer->status = RADV_CMD_BUFFER_STATUS_INITIAL;
480
481    return vk_command_buffer_get_record_result(&cmd_buffer->vk);
482 }
483
484 static bool
485 radv_cmd_buffer_resize_upload_buf(struct radv_cmd_buffer *cmd_buffer, uint64_t min_needed)
486 {
487    uint64_t new_size;
488    struct radeon_winsys_bo *bo = NULL;
489    struct radv_cmd_buffer_upload *upload;
490    struct radv_device *device = cmd_buffer->device;
491
492    new_size = MAX2(min_needed, 16 * 1024);
493    new_size = MAX2(new_size, 2 * cmd_buffer->upload.size);
494
495    VkResult result =
496       device->ws->buffer_create(device->ws, new_size, 4096, device->ws->cs_domain(device->ws),
497                                 RADEON_FLAG_CPU_ACCESS | RADEON_FLAG_NO_INTERPROCESS_SHARING |
498                                    RADEON_FLAG_32BIT | RADEON_FLAG_GTT_WC,
499                                 RADV_BO_PRIORITY_UPLOAD_BUFFER, 0, &bo);
500
501    if (result != VK_SUCCESS) {
502       vk_command_buffer_set_error(&cmd_buffer->vk, result);
503       return false;
504    }
505
506    radv_cs_add_buffer(device->ws, cmd_buffer->cs, bo);
507    if (cmd_buffer->upload.upload_bo) {
508       upload = malloc(sizeof(*upload));
509
510       if (!upload) {
511          vk_command_buffer_set_error(&cmd_buffer->vk, VK_ERROR_OUT_OF_HOST_MEMORY);
512          device->ws->buffer_destroy(device->ws, bo);
513          return false;
514       }
515
516       memcpy(upload, &cmd_buffer->upload, sizeof(*upload));
517       list_add(&upload->list, &cmd_buffer->upload.list);
518    }
519
520    cmd_buffer->upload.upload_bo = bo;
521    cmd_buffer->upload.size = new_size;
522    cmd_buffer->upload.offset = 0;
523    cmd_buffer->upload.map = device->ws->buffer_map(cmd_buffer->upload.upload_bo);
524
525    if (!cmd_buffer->upload.map) {
526       vk_command_buffer_set_error(&cmd_buffer->vk, VK_ERROR_OUT_OF_DEVICE_MEMORY);
527       return false;
528    }
529
530    return true;
531 }
532
533 bool
534 radv_cmd_buffer_upload_alloc(struct radv_cmd_buffer *cmd_buffer, unsigned size,
535                              unsigned *out_offset, void **ptr)
536 {
537    assert(size % 4 == 0);
538
539    struct radeon_info *rad_info = &cmd_buffer->device->physical_device->rad_info;
540
541    /* Align to the scalar cache line size if it results in this allocation
542     * being placed in fewer of them.
543     */
544    unsigned offset = cmd_buffer->upload.offset;
545    unsigned line_size = rad_info->gfx_level >= GFX10 ? 64 : 32;
546    unsigned gap = align(offset, line_size) - offset;
547    if ((size & (line_size - 1)) > gap)
548       offset = align(offset, line_size);
549
550    if (offset + size > cmd_buffer->upload.size) {
551       if (!radv_cmd_buffer_resize_upload_buf(cmd_buffer, size))
552          return false;
553       offset = 0;
554    }
555
556    *out_offset = offset;
557    *ptr = cmd_buffer->upload.map + offset;
558
559    cmd_buffer->upload.offset = offset + size;
560    return true;
561 }
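
/* Worked example of the alignment heuristic above, assuming GFX10+
 * (line_size = 64): with upload.offset = 100 and size = 12, gap =
 * align(100, 64) - 100 = 28 and (12 & 63) = 12 <= 28, so the allocation
 * stays at offset 100 and still fits in the partially used cache line.
 * With size = 40 instead, (40 & 63) = 40 > 28, so the offset is bumped to
 * 128 to avoid touching an extra cache line.
 */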
562
563 bool
564 radv_cmd_buffer_upload_data(struct radv_cmd_buffer *cmd_buffer, unsigned size, const void *data,
565                             unsigned *out_offset)
566 {
567    uint8_t *ptr;
568
569    if (!radv_cmd_buffer_upload_alloc(cmd_buffer, size, out_offset, (void **)&ptr))
570       return false;
571    assert(ptr);
572
573    memcpy(ptr, data, size);
574    return true;
575 }
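
/* Typical usage pattern (the same shape appears further down in this file,
 * e.g. for the MEC inverted-predication dword and the GFX->ACE semaphore):
 *
 *    uint32_t value = 0, offset;
 *    if (!radv_cmd_buffer_upload_data(cmd_buffer, 4, &value, &offset))
 *       vk_command_buffer_set_error(&cmd_buffer->vk, VK_ERROR_OUT_OF_HOST_MEMORY);
 *    uint64_t va = radv_buffer_get_va(cmd_buffer->upload.upload_bo) + offset;
 */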
576
577 void
578 radv_cmd_buffer_trace_emit(struct radv_cmd_buffer *cmd_buffer)
579 {
580    struct radv_device *device = cmd_buffer->device;
581    struct radeon_cmdbuf *cs = cmd_buffer->cs;
582    uint64_t va;
583
584    va = radv_buffer_get_va(device->trace_bo);
585    if (cmd_buffer->vk.level == VK_COMMAND_BUFFER_LEVEL_SECONDARY)
586       va += 4;
587
588    ++cmd_buffer->state.trace_id;
589    radv_emit_write_data_packet(cmd_buffer, V_370_ME, va, 1, &cmd_buffer->state.trace_id);
590
591    radeon_check_space(cmd_buffer->device->ws, cs, 2);
592
593    radeon_emit(cs, PKT3(PKT3_NOP, 0, 0));
594    radeon_emit(cs, AC_ENCODE_TRACE_POINT(cmd_buffer->state.trace_id));
595 }
596
597 static void
598 radv_ace_internal_barrier(struct radv_cmd_buffer *cmd_buffer, VkPipelineStageFlags2 src_stage_mask,
599                           VkPipelineStageFlags2 dst_stage_mask)
600 {
601    /* Update flush bits from the main cmdbuf, except the stage flush. */
602    cmd_buffer->ace_internal.flush_bits |=
603       cmd_buffer->state.flush_bits & RADV_CMD_FLUSH_ALL_COMPUTE & ~RADV_CMD_FLAG_CS_PARTIAL_FLUSH;
604
605    /* Add stage flush only when necessary. */
606    if (src_stage_mask &
607        (VK_PIPELINE_STAGE_2_TASK_SHADER_BIT_NV | VK_PIPELINE_STAGE_2_TRANSFER_BIT |
608         VK_PIPELINE_STAGE_2_BOTTOM_OF_PIPE_BIT | VK_PIPELINE_STAGE_2_ALL_COMMANDS_BIT))
609       cmd_buffer->ace_internal.flush_bits |= RADV_CMD_FLAG_CS_PARTIAL_FLUSH;
610
611    /* Block task shaders when we have to wait for CP DMA on the GFX cmdbuf. */
612    if (src_stage_mask &
613        (VK_PIPELINE_STAGE_2_COPY_BIT | VK_PIPELINE_STAGE_2_CLEAR_BIT |
614         VK_PIPELINE_STAGE_2_ALL_TRANSFER_BIT | VK_PIPELINE_STAGE_2_BOTTOM_OF_PIPE_BIT |
615         VK_PIPELINE_STAGE_2_ALL_COMMANDS_BIT))
616       dst_stage_mask |= cmd_buffer->state.dma_is_busy ? VK_PIPELINE_STAGE_2_TASK_SHADER_BIT_NV : 0;
617
618    /* Increment the GFX/ACE semaphore when task shaders are blocked. */
619    if (dst_stage_mask &
620        (VK_PIPELINE_STAGE_2_TOP_OF_PIPE_BIT_KHR | VK_PIPELINE_STAGE_2_DRAW_INDIRECT_BIT |
621         VK_PIPELINE_STAGE_2_TASK_SHADER_BIT_NV))
622       cmd_buffer->ace_internal.sem.gfx2ace_value++;
623 }
624
625 static void
626 radv_ace_internal_cache_flush(struct radv_cmd_buffer *cmd_buffer)
627 {
628    struct radeon_cmdbuf *ace_cs = cmd_buffer->ace_internal.cs;
629    const uint32_t flush_bits = cmd_buffer->ace_internal.flush_bits;
630    enum rgp_flush_bits sqtt_flush_bits = 0;
631
632    si_cs_emit_cache_flush(ace_cs, cmd_buffer->device->physical_device->rad_info.gfx_level, NULL, 0,
633                           true, flush_bits, &sqtt_flush_bits, 0);
634
635    cmd_buffer->ace_internal.flush_bits = 0;
636 }
637
638 static uint64_t
639 radv_ace_internal_sem_create(struct radv_cmd_buffer *cmd_buffer)
640 {
641    /* DWORD 0: GFX->ACE semaphore (GFX blocks ACE, i.e. ACE waits for GFX)
642     * DWORD 1: ACE->GFX semaphore
643     */
644    uint64_t sem_init = 0;
645    uint32_t va_off = 0;
646    if (!radv_cmd_buffer_upload_data(cmd_buffer, sizeof(uint64_t), &sem_init, &va_off)) {
647       vk_command_buffer_set_error(&cmd_buffer->vk, VK_ERROR_OUT_OF_HOST_MEMORY);
648       return 0;
649    }
650
651    return radv_buffer_get_va(cmd_buffer->upload.upload_bo) + va_off;
652 }
653
654 static bool
655 radv_ace_internal_sem_dirty(const struct radv_cmd_buffer *cmd_buffer)
656 {
657    return cmd_buffer->ace_internal.sem.gfx2ace_value !=
658           cmd_buffer->ace_internal.sem.emitted_gfx2ace_value;
659 }
660
661 ALWAYS_INLINE static bool
662 radv_flush_gfx2ace_semaphore(struct radv_cmd_buffer *cmd_buffer)
663 {
664    if (!radv_ace_internal_sem_dirty(cmd_buffer))
665       return false;
666
667    if (!cmd_buffer->ace_internal.sem.va) {
668       cmd_buffer->ace_internal.sem.va = radv_ace_internal_sem_create(cmd_buffer);
669       if (!cmd_buffer->ace_internal.sem.va)
670          return false;
671    }
672
673    /* GFX writes a value to the semaphore which ACE can wait for. */
674    si_cs_emit_write_event_eop(
675       cmd_buffer->cs, cmd_buffer->device->physical_device->rad_info.gfx_level,
676       radv_cmd_buffer_uses_mec(cmd_buffer), V_028A90_BOTTOM_OF_PIPE_TS, 0, EOP_DST_SEL_MEM,
677       EOP_DATA_SEL_VALUE_32BIT, cmd_buffer->ace_internal.sem.va,
678       cmd_buffer->ace_internal.sem.gfx2ace_value, cmd_buffer->gfx9_eop_bug_va);
679
680    cmd_buffer->ace_internal.sem.emitted_gfx2ace_value = cmd_buffer->ace_internal.sem.gfx2ace_value;
681    return true;
682 }
683
684 ALWAYS_INLINE static void
685 radv_wait_gfx2ace_semaphore(struct radv_cmd_buffer *cmd_buffer)
686 {
687    assert(cmd_buffer->ace_internal.sem.va);
688    struct radeon_cmdbuf *ace_cs = cmd_buffer->ace_internal.cs;
689    radeon_check_space(cmd_buffer->device->ws, ace_cs, 7);
690
691    /* ACE waits for the semaphore which GFX wrote. */
692    radv_cp_wait_mem(ace_cs, WAIT_REG_MEM_GREATER_OR_EQUAL, cmd_buffer->ace_internal.sem.va,
693                     cmd_buffer->ace_internal.sem.gfx2ace_value, 0xffffffff);
694 }
695
696 static struct radeon_cmdbuf *
697 radv_ace_internal_create(struct radv_cmd_buffer *cmd_buffer)
698 {
699    assert(!cmd_buffer->ace_internal.cs);
700    struct radv_device *device = cmd_buffer->device;
701    struct radeon_cmdbuf *ace_cs = device->ws->cs_create(device->ws, AMD_IP_COMPUTE);
702
703    if (!ace_cs)
704       vk_command_buffer_set_error(&cmd_buffer->vk, VK_ERROR_OUT_OF_HOST_MEMORY);
705
706    return ace_cs;
707 }
708
709 static VkResult
710 radv_ace_internal_finalize(struct radv_cmd_buffer *cmd_buffer)
711 {
712    assert(cmd_buffer->ace_internal.cs);
713    struct radv_device *device = cmd_buffer->device;
714    struct radeon_cmdbuf *ace_cs = cmd_buffer->ace_internal.cs;
715
716    /* Emit pending cache flush. */
717    radv_ace_internal_cache_flush(cmd_buffer);
718
719    /* Clear the ACE semaphore if it exists.
720     * This is necessary in case the same cmd buffer is submitted again in the future.
721     */
722    if (cmd_buffer->ace_internal.sem.va) {
723       struct radeon_cmdbuf *main_cs = cmd_buffer->cs;
724       uint64_t gfx2ace_va = cmd_buffer->ace_internal.sem.va;
725       uint64_t ace2gfx_va = cmd_buffer->ace_internal.sem.va + 4;
726
727       /* ACE: write 1 to the ACE->GFX semaphore. */
728       si_cs_emit_write_event_eop(ace_cs, cmd_buffer->device->physical_device->rad_info.gfx_level,
729                                  true, V_028A90_BOTTOM_OF_PIPE_TS, 0, EOP_DST_SEL_MEM,
730                                  EOP_DATA_SEL_VALUE_32BIT, ace2gfx_va, 1,
731                                  cmd_buffer->gfx9_eop_bug_va);
732
733       /* Wait for ACE to finish, otherwise we may risk writing 0 to the semaphore
734        * when ACE is still waiting for it. This may not happen in practice, but
735        * better safe than sorry.
736        */
737       radv_cp_wait_mem(main_cs, WAIT_REG_MEM_GREATER_OR_EQUAL, ace2gfx_va, 1, 0xffffffff);
738
739       /* GFX: clear GFX->ACE and ACE->GFX semaphores. */
740       radv_emit_clear_data(cmd_buffer, V_370_ME, gfx2ace_va, 8);
741    }
742
743    device->ws->cs_add_buffers(ace_cs, cmd_buffer->cs);
744    return device->ws->cs_finalize(ace_cs);
745 }
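
/* Summary of the GFX<->ACE handshake implemented above: the GFX cmdbuf bumps
 * sem.gfx2ace_value whenever task shaders must be blocked and writes it to
 * DWORD 0 of the semaphore with an EOP event (radv_flush_gfx2ace_semaphore);
 * the internal ACE cmdbuf waits for that value with WAIT_REG_MEM
 * (radv_wait_gfx2ace_semaphore). At finalize time ACE writes 1 to DWORD 1,
 * GFX waits for it and then clears both dwords so the command buffer can be
 * safely resubmitted.
 */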
746
747 static void
748 radv_cmd_buffer_after_draw(struct radv_cmd_buffer *cmd_buffer, enum radv_cmd_flush_bits flags)
749 {
750    if (unlikely(cmd_buffer->device->thread_trace.bo)) {
751       radeon_emit(cmd_buffer->cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
752       radeon_emit(cmd_buffer->cs, EVENT_TYPE(V_028A90_THREAD_TRACE_MARKER) | EVENT_INDEX(0));
753    }
754
755    if (cmd_buffer->device->instance->debug_flags & RADV_DEBUG_SYNC_SHADERS) {
756       enum rgp_flush_bits sqtt_flush_bits = 0;
757       assert(flags & (RADV_CMD_FLAG_PS_PARTIAL_FLUSH | RADV_CMD_FLAG_CS_PARTIAL_FLUSH));
758
759       radeon_check_space(cmd_buffer->device->ws, cmd_buffer->cs, 4);
760
761       /* Force wait for graphics or compute engines to be idle. */
762       si_cs_emit_cache_flush(cmd_buffer->cs,
763                              cmd_buffer->device->physical_device->rad_info.gfx_level,
764                              &cmd_buffer->gfx9_fence_idx, cmd_buffer->gfx9_fence_va,
765                              radv_cmd_buffer_uses_mec(cmd_buffer), flags, &sqtt_flush_bits,
766                              cmd_buffer->gfx9_eop_bug_va);
767
768       if (cmd_buffer->state.graphics_pipeline && (flags & RADV_CMD_FLAG_PS_PARTIAL_FLUSH) &&
769           radv_pipeline_has_stage(cmd_buffer->state.graphics_pipeline, MESA_SHADER_TASK)) {
770          /* Force wait for compute engines to be idle on the internal cmdbuf. */
771          si_cs_emit_cache_flush(cmd_buffer->ace_internal.cs,
772                                 cmd_buffer->device->physical_device->rad_info.gfx_level, NULL, 0,
773                                 true, RADV_CMD_FLAG_CS_PARTIAL_FLUSH, &sqtt_flush_bits, 0);
774       }
775    }
776
777    if (unlikely(cmd_buffer->device->trace_bo))
778       radv_cmd_buffer_trace_emit(cmd_buffer);
779 }
780
781 static void
782 radv_save_pipeline(struct radv_cmd_buffer *cmd_buffer, struct radv_pipeline *pipeline)
783 {
784    struct radv_device *device = cmd_buffer->device;
785    enum amd_ip_type ring;
786    uint32_t data[2];
787    uint64_t va;
788
789    va = radv_buffer_get_va(device->trace_bo);
790
791    ring = radv_queue_family_to_ring(device->physical_device, cmd_buffer->qf);
792
793    switch (ring) {
794    case AMD_IP_GFX:
795       va += 8;
796       break;
797    case AMD_IP_COMPUTE:
798       va += 16;
799       break;
800    default:
801       assert(!"invalid IP type");
802    }
803
804    uint64_t pipeline_address = (uintptr_t)pipeline;
805    data[0] = pipeline_address;
806    data[1] = pipeline_address >> 32;
807
808    radv_emit_write_data_packet(cmd_buffer, V_370_ME, va, 2, data);
809 }
810
811 static void
812 radv_save_vertex_descriptors(struct radv_cmd_buffer *cmd_buffer, uint64_t vb_ptr)
813 {
814    struct radv_device *device = cmd_buffer->device;
815    uint32_t data[2];
816    uint64_t va;
817
818    va = radv_buffer_get_va(device->trace_bo);
819    va += 24;
820
821    data[0] = vb_ptr;
822    data[1] = vb_ptr >> 32;
823
824    radv_emit_write_data_packet(cmd_buffer, V_370_ME, va, 2, data);
825 }
826
827 static void
828 radv_save_vs_prolog(struct radv_cmd_buffer *cmd_buffer, const struct radv_shader_part *prolog)
829 {
830    struct radv_device *device = cmd_buffer->device;
831    uint32_t data[2];
832    uint64_t va;
833
834    va = radv_buffer_get_va(device->trace_bo);
835    va += 32;
836
837    uint64_t prolog_address = (uintptr_t)prolog;
838    data[0] = prolog_address;
839    data[1] = prolog_address >> 32;
840
841    radv_emit_write_data_packet(cmd_buffer, V_370_ME, va, 2, data);
842 }
843
844 void
845 radv_set_descriptor_set(struct radv_cmd_buffer *cmd_buffer, VkPipelineBindPoint bind_point,
846                         struct radv_descriptor_set *set, unsigned idx)
847 {
848    struct radv_descriptor_state *descriptors_state =
849       radv_get_descriptors_state(cmd_buffer, bind_point);
850
851    descriptors_state->sets[idx] = set;
852
853    descriptors_state->valid |= (1u << idx); /* active descriptors */
854    descriptors_state->dirty |= (1u << idx);
855 }
856
857 static void
858 radv_save_descriptors(struct radv_cmd_buffer *cmd_buffer, VkPipelineBindPoint bind_point)
859 {
860    struct radv_descriptor_state *descriptors_state =
861       radv_get_descriptors_state(cmd_buffer, bind_point);
862    struct radv_device *device = cmd_buffer->device;
863    uint32_t data[MAX_SETS * 2] = {0};
864    uint64_t va;
865    va = radv_buffer_get_va(device->trace_bo) + 40;
866
867    u_foreach_bit(i, descriptors_state->valid)
868    {
869       struct radv_descriptor_set *set = descriptors_state->sets[i];
870       data[i * 2] = (uint64_t)(uintptr_t)set;
871       data[i * 2 + 1] = (uint64_t)(uintptr_t)set >> 32;
872    }
873
874    radv_emit_write_data_packet(cmd_buffer, V_370_ME, va, MAX_SETS * 2, data);
875 }
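
/* Layout of the per-device trace BO as written by the helpers above:
 *
 *    +0  : trace id (primary cmdbuf)         +4  : trace id (secondary cmdbuf)
 *    +8  : bound graphics pipeline pointer   +16 : bound compute pipeline pointer
 *    +24 : vertex descriptor pointer         +32 : VS prolog pointer
 *    +40 : MAX_SETS * 2 dwords of descriptor set pointers
 */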
876
877 struct radv_userdata_info *
878 radv_lookup_user_sgpr(struct radv_pipeline *pipeline, gl_shader_stage stage, int idx)
879 {
880    struct radv_shader *shader = radv_get_shader(pipeline, stage);
881    return &shader->info.user_sgprs_locs.shader_data[idx];
882 }
883
884 static void
885 radv_emit_userdata_address(struct radv_device *device, struct radeon_cmdbuf *cs,
886                            struct radv_pipeline *pipeline, gl_shader_stage stage, int idx,
887                            uint64_t va)
888 {
889    struct radv_userdata_info *loc = radv_lookup_user_sgpr(pipeline, stage, idx);
890    uint32_t base_reg = pipeline->user_data_0[stage];
891    if (loc->sgpr_idx == -1)
892       return;
893
894    assert(loc->num_sgprs == 1);
895
896    radv_emit_shader_pointer(device, cs, base_reg + loc->sgpr_idx * 4, va, false);
897 }
898
899 static void
900 radv_emit_descriptor_pointers(struct radv_device *device, struct radeon_cmdbuf *cs,
901                               struct radv_pipeline *pipeline,
902                               struct radv_descriptor_state *descriptors_state,
903                               gl_shader_stage stage)
904 {
905    uint32_t sh_base = pipeline->user_data_0[stage];
906    struct radv_userdata_locations *locs = &pipeline->shaders[stage]->info.user_sgprs_locs;
907    unsigned mask = locs->descriptor_sets_enabled;
908
909    mask &= descriptors_state->dirty & descriptors_state->valid;
910
911    while (mask) {
912       int start, count;
913
914       u_bit_scan_consecutive_range(&mask, &start, &count);
915
916       struct radv_userdata_info *loc = &locs->descriptor_sets[start];
917       unsigned sh_offset = sh_base + loc->sgpr_idx * 4;
918
919       radv_emit_shader_pointer_head(cs, sh_offset, count, true);
920       for (int i = 0; i < count; i++) {
921          struct radv_descriptor_set *set = descriptors_state->sets[start + i];
922
923          radv_emit_shader_pointer_body(device, cs, set->header.va, true);
924       }
925    }
926 }
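
/* Note on the loop above: u_bit_scan_consecutive_range() extracts runs of
 * consecutive set bits and clears them from the mask. For example, with
 * mask = 0b1101 it should yield (start=0, count=1) and then (start=2,
 * count=2), so sets 2 and 3 get their pointers emitted under a single
 * shader-pointer header.
 */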
927
928 /**
929  * Convert the user sample locations to hardware sample locations (the values
930  * that will be emitted by PA_SC_AA_SAMPLE_LOCS_PIXEL_*).
931  */
932 static void
933 radv_convert_user_sample_locs(struct radv_sample_locations_state *state, uint32_t x, uint32_t y,
934                               VkOffset2D *sample_locs)
935 {
936    uint32_t x_offset = x % state->grid_size.width;
937    uint32_t y_offset = y % state->grid_size.height;
938    uint32_t num_samples = (uint32_t)state->per_pixel;
939    VkSampleLocationEXT *user_locs;
940    uint32_t pixel_offset;
941
942    pixel_offset = (x_offset + y_offset * state->grid_size.width) * num_samples;
943
944    assert(pixel_offset <= MAX_SAMPLE_LOCATIONS);
945    user_locs = &state->locations[pixel_offset];
946
947    for (uint32_t i = 0; i < num_samples; i++) {
948       float shifted_pos_x = user_locs[i].x - 0.5;
949       float shifted_pos_y = user_locs[i].y - 0.5;
950
951       int32_t scaled_pos_x = floorf(shifted_pos_x * 16);
952       int32_t scaled_pos_y = floorf(shifted_pos_y * 16);
953
954       sample_locs[i].x = CLAMP(scaled_pos_x, -8, 7);
955       sample_locs[i].y = CLAMP(scaled_pos_y, -8, 7);
956    }
957 }
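
/* Example of the conversion above: a user location of (0.75, 0.25) becomes
 * (0.25, -0.25) relative to the pixel center, scales to (4, -4) in 1/16th
 * units, and is clamped to the hardware range [-8, 7], giving the 4-bit
 * signed values programmed into PA_SC_AA_SAMPLE_LOCS_PIXEL_*.
 */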
958
959 /**
960  * Compute the PA_SC_AA_SAMPLE_LOCS_PIXEL_* mask based on hardware sample
961  * locations.
962  */
963 static void
964 radv_compute_sample_locs_pixel(uint32_t num_samples, VkOffset2D *sample_locs,
965                                uint32_t *sample_locs_pixel)
966 {
967    for (uint32_t i = 0; i < num_samples; i++) {
968       uint32_t sample_reg_idx = i / 4;
969       uint32_t sample_loc_idx = i % 4;
970       int32_t pos_x = sample_locs[i].x;
971       int32_t pos_y = sample_locs[i].y;
972
973       uint32_t shift_x = 8 * sample_loc_idx;
974       uint32_t shift_y = shift_x + 4;
975
976       sample_locs_pixel[sample_reg_idx] |= (pos_x & 0xf) << shift_x;
977       sample_locs_pixel[sample_reg_idx] |= (pos_y & 0xf) << shift_y;
978    }
979 }
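
/* Packing example: sample 5 lands in PA_SC_AA_SAMPLE_LOCS_PIXEL_*_1
 * (sample_reg_idx = 1) at sample_loc_idx = 1, so its X nibble is written at
 * bit 8 and its Y nibble at bit 12; each register therefore holds four
 * (x, y) pairs of 4-bit signed offsets.
 */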
980
981 /**
982  * Compute the PA_SC_CENTROID_PRIORITY_* mask based on the top left hardware
983  * sample locations.
984  */
985 static uint64_t
986 radv_compute_centroid_priority(struct radv_cmd_buffer *cmd_buffer, VkOffset2D *sample_locs,
987                                uint32_t num_samples)
988 {
989    uint32_t *centroid_priorities = alloca(num_samples * sizeof(*centroid_priorities));
990    uint32_t sample_mask = num_samples - 1;
991    uint32_t *distances = alloca(num_samples * sizeof(*distances));
992    uint64_t centroid_priority = 0;
993
994    /* Compute the distances from center for each sample. */
995    for (int i = 0; i < num_samples; i++) {
996       distances[i] = (sample_locs[i].x * sample_locs[i].x) + (sample_locs[i].y * sample_locs[i].y);
997    }
998
999    /* Compute the centroid priorities by looking at the distances array. */
1000    for (int i = 0; i < num_samples; i++) {
1001       uint32_t min_idx = 0;
1002
1003       for (int j = 1; j < num_samples; j++) {
1004          if (distances[j] < distances[min_idx])
1005             min_idx = j;
1006       }
1007
1008       centroid_priorities[i] = min_idx;
1009       distances[min_idx] = 0xffffffff;
1010    }
1011
1012    /* Compute the final centroid priority. */
1013    for (int i = 0; i < 8; i++) {
1014       centroid_priority |= centroid_priorities[i & sample_mask] << (i * 4);
1015    }
1016
1017    return centroid_priority << 32 | centroid_priority;
1018 }
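
/* Example with 4 samples: if the squared distances are {5, 2, 2, 9}, the
 * nearest-first order is samples 1, 2, 0, 3, so centroid_priorities =
 * {1, 2, 0, 3}. The 8 nibbles of PA_SC_CENTROID_PRIORITY_0 repeat that
 * order twice (i & sample_mask wraps at num_samples), and the same 32-bit
 * value is mirrored into the high half for PA_SC_CENTROID_PRIORITY_1.
 */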
1019
1020 /**
1021  * Emit the sample locations that are specified with VK_EXT_sample_locations.
1022  */
1023 static void
1024 radv_emit_sample_locations(struct radv_cmd_buffer *cmd_buffer)
1025 {
1026    struct radv_sample_locations_state *sample_location = &cmd_buffer->state.dynamic.sample_location;
1027    struct radv_graphics_pipeline *pipeline = cmd_buffer->state.graphics_pipeline;
1028    uint32_t num_samples = (uint32_t)sample_location->per_pixel;
1029    unsigned pa_sc_aa_config = pipeline->ms.pa_sc_aa_config;
1030    struct radeon_cmdbuf *cs = cmd_buffer->cs;
1031    uint32_t sample_locs_pixel[4][2] = {0};
1032    VkOffset2D sample_locs[4][8]; /* 8 is the max. sample count supported */
1033    uint32_t max_sample_dist = 0;
1034    uint64_t centroid_priority;
1035
1036    if (!cmd_buffer->state.dynamic.sample_location.count)
1037       return;
1038
1039    /* Convert the user sample locations to hardware sample locations. */
1040    radv_convert_user_sample_locs(sample_location, 0, 0, sample_locs[0]);
1041    radv_convert_user_sample_locs(sample_location, 1, 0, sample_locs[1]);
1042    radv_convert_user_sample_locs(sample_location, 0, 1, sample_locs[2]);
1043    radv_convert_user_sample_locs(sample_location, 1, 1, sample_locs[3]);
1044
1045    /* Compute the PA_SC_AA_SAMPLE_LOCS_PIXEL_* mask. */
1046    for (uint32_t i = 0; i < 4; i++) {
1047       radv_compute_sample_locs_pixel(num_samples, sample_locs[i], sample_locs_pixel[i]);
1048    }
1049
1050    /* Compute the PA_SC_CENTROID_PRIORITY_* mask. */
1051    centroid_priority = radv_compute_centroid_priority(cmd_buffer, sample_locs[0], num_samples);
1052
1053    /* Compute the maximum sample distance from the specified locations. */
1054    for (unsigned i = 0; i < 4; ++i) {
1055       for (uint32_t j = 0; j < num_samples; j++) {
1056          VkOffset2D offset = sample_locs[i][j];
1057          max_sample_dist = MAX2(max_sample_dist, MAX2(abs(offset.x), abs(offset.y)));
1058       }
1059    }
1060
1061    /* Emit the specified user sample locations. */
1062    switch (num_samples) {
1063    case 2:
1064    case 4:
1065       radeon_set_context_reg(cs, R_028BF8_PA_SC_AA_SAMPLE_LOCS_PIXEL_X0Y0_0,
1066                              sample_locs_pixel[0][0]);
1067       radeon_set_context_reg(cs, R_028C08_PA_SC_AA_SAMPLE_LOCS_PIXEL_X1Y0_0,
1068                              sample_locs_pixel[1][0]);
1069       radeon_set_context_reg(cs, R_028C18_PA_SC_AA_SAMPLE_LOCS_PIXEL_X0Y1_0,
1070                              sample_locs_pixel[2][0]);
1071       radeon_set_context_reg(cs, R_028C28_PA_SC_AA_SAMPLE_LOCS_PIXEL_X1Y1_0,
1072                              sample_locs_pixel[3][0]);
1073       break;
1074    case 8:
1075       radeon_set_context_reg(cs, R_028BF8_PA_SC_AA_SAMPLE_LOCS_PIXEL_X0Y0_0,
1076                              sample_locs_pixel[0][0]);
1077       radeon_set_context_reg(cs, R_028C08_PA_SC_AA_SAMPLE_LOCS_PIXEL_X1Y0_0,
1078                              sample_locs_pixel[1][0]);
1079       radeon_set_context_reg(cs, R_028C18_PA_SC_AA_SAMPLE_LOCS_PIXEL_X0Y1_0,
1080                              sample_locs_pixel[2][0]);
1081       radeon_set_context_reg(cs, R_028C28_PA_SC_AA_SAMPLE_LOCS_PIXEL_X1Y1_0,
1082                              sample_locs_pixel[3][0]);
1083       radeon_set_context_reg(cs, R_028BFC_PA_SC_AA_SAMPLE_LOCS_PIXEL_X0Y0_1,
1084                              sample_locs_pixel[0][1]);
1085       radeon_set_context_reg(cs, R_028C0C_PA_SC_AA_SAMPLE_LOCS_PIXEL_X1Y0_1,
1086                              sample_locs_pixel[1][1]);
1087       radeon_set_context_reg(cs, R_028C1C_PA_SC_AA_SAMPLE_LOCS_PIXEL_X0Y1_1,
1088                              sample_locs_pixel[2][1]);
1089       radeon_set_context_reg(cs, R_028C2C_PA_SC_AA_SAMPLE_LOCS_PIXEL_X1Y1_1,
1090                              sample_locs_pixel[3][1]);
1091       break;
1092    default:
1093       unreachable("invalid number of samples");
1094    }
1095
1096    /* Emit the maximum sample distance and the centroid priority. */
1097    pa_sc_aa_config &= C_028BE0_MAX_SAMPLE_DIST;
1098    pa_sc_aa_config |= S_028BE0_MAX_SAMPLE_DIST(max_sample_dist);
1099
1100    radeon_set_context_reg(cs, R_028BE0_PA_SC_AA_CONFIG, pa_sc_aa_config);
1101
1102    radeon_set_context_reg_seq(cs, R_028BD4_PA_SC_CENTROID_PRIORITY_0, 2);
1103    radeon_emit(cs, centroid_priority);
1104    radeon_emit(cs, centroid_priority >> 32);
1105
1106    cmd_buffer->state.context_roll_without_scissor_emitted = true;
1107 }
1108
1109 static void
1110 radv_emit_inline_push_consts(struct radv_device *device, struct radeon_cmdbuf *cs,
1111                              struct radv_pipeline *pipeline, gl_shader_stage stage, int idx,
1112                              uint32_t *values)
1113 {
1114    struct radv_userdata_info *loc = radv_lookup_user_sgpr(pipeline, stage, idx);
1115    uint32_t base_reg = pipeline->user_data_0[stage];
1116    if (loc->sgpr_idx == -1)
1117       return;
1118
1119    radeon_check_space(device->ws, cs, 2 + loc->num_sgprs);
1120
1121    radeon_set_sh_reg_seq(cs, base_reg + loc->sgpr_idx * 4, loc->num_sgprs);
1122    radeon_emit_array(cs, values, loc->num_sgprs);
1123 }
1124
1125 static void
1126 radv_update_multisample_state(struct radv_cmd_buffer *cmd_buffer,
1127                               struct radv_graphics_pipeline *pipeline)
1128 {
1129    int num_samples = pipeline->ms.num_samples;
1130    struct radv_graphics_pipeline *old_pipeline = cmd_buffer->state.emitted_graphics_pipeline;
1131
1132    if (pipeline->base.shaders[MESA_SHADER_FRAGMENT]->info.ps.needs_sample_positions)
1133       cmd_buffer->sample_positions_needed = true;
1134
1135    if (old_pipeline && num_samples == old_pipeline->ms.num_samples)
1136       return;
1137
1138    radv_emit_default_sample_locations(cmd_buffer->cs, num_samples);
1139
1140    cmd_buffer->state.context_roll_without_scissor_emitted = true;
1141 }
1142
1143 static void
1144 radv_update_binning_state(struct radv_cmd_buffer *cmd_buffer,
1145                           struct radv_graphics_pipeline *pipeline)
1146 {
1147    const struct radv_graphics_pipeline *old_pipeline = cmd_buffer->state.emitted_graphics_pipeline;
1148
1149    if (pipeline->base.device->physical_device->rad_info.gfx_level < GFX9)
1150       return;
1151
1152    if (old_pipeline &&
1153        old_pipeline->binning.pa_sc_binner_cntl_0 ==
1154           pipeline->binning.pa_sc_binner_cntl_0)
1155       return;
1156
1157    bool binning_flush = false;
1158    if (cmd_buffer->device->physical_device->rad_info.family == CHIP_VEGA12 ||
1159        cmd_buffer->device->physical_device->rad_info.family == CHIP_VEGA20 ||
1160        cmd_buffer->device->physical_device->rad_info.family == CHIP_RAVEN2 ||
1161        cmd_buffer->device->physical_device->rad_info.gfx_level >= GFX10) {
1162       binning_flush = !old_pipeline ||
1163                       G_028C44_BINNING_MODE(old_pipeline->binning.pa_sc_binner_cntl_0) !=
1164                          G_028C44_BINNING_MODE(pipeline->binning.pa_sc_binner_cntl_0);
1165    }
1166
1167    radeon_set_context_reg(cmd_buffer->cs, R_028C44_PA_SC_BINNER_CNTL_0,
1168                           pipeline->binning.pa_sc_binner_cntl_0 |
1169                              S_028C44_FLUSH_ON_BINNING_TRANSITION(!!binning_flush));
1170
1171    cmd_buffer->state.context_roll_without_scissor_emitted = true;
1172 }
1173
1174 static void
1175 radv_emit_shader_prefetch(struct radv_cmd_buffer *cmd_buffer, struct radv_shader *shader)
1176 {
1177    uint64_t va;
1178
1179    if (!shader)
1180       return;
1181
1182    va = radv_shader_get_va(shader);
1183
1184    si_cp_dma_prefetch(cmd_buffer, va, shader->code_size);
1185 }
1186
1187 static void
1188 radv_emit_prefetch_L2(struct radv_cmd_buffer *cmd_buffer,
1189                       struct radv_graphics_pipeline *pipeline, bool first_stage_only)
1190 {
1191    struct radv_cmd_state *state = &cmd_buffer->state;
1192    uint32_t mask = state->prefetch_L2_mask;
1193
1194    /* Fast prefetch path for starting draws as soon as possible. */
1195    if (first_stage_only)
1196       mask &= RADV_PREFETCH_VS | RADV_PREFETCH_VBO_DESCRIPTORS | RADV_PREFETCH_MS;
1197
1198    if (mask & RADV_PREFETCH_VS)
1199       radv_emit_shader_prefetch(cmd_buffer, pipeline->base.shaders[MESA_SHADER_VERTEX]);
1200
1201    if (mask & RADV_PREFETCH_MS)
1202       radv_emit_shader_prefetch(cmd_buffer, pipeline->base.shaders[MESA_SHADER_MESH]);
1203
1204    if (mask & RADV_PREFETCH_VBO_DESCRIPTORS)
1205       si_cp_dma_prefetch(cmd_buffer, state->vb_va, pipeline->vb_desc_alloc_size);
1206
1207    if (mask & RADV_PREFETCH_TCS)
1208       radv_emit_shader_prefetch(cmd_buffer, pipeline->base.shaders[MESA_SHADER_TESS_CTRL]);
1209
1210    if (mask & RADV_PREFETCH_TES)
1211       radv_emit_shader_prefetch(cmd_buffer, pipeline->base.shaders[MESA_SHADER_TESS_EVAL]);
1212
1213    if (mask & RADV_PREFETCH_GS) {
1214       radv_emit_shader_prefetch(cmd_buffer, pipeline->base.shaders[MESA_SHADER_GEOMETRY]);
1215       if (radv_pipeline_has_gs_copy_shader(&pipeline->base))
1216          radv_emit_shader_prefetch(cmd_buffer, pipeline->base.gs_copy_shader);
1217    }
1218
1219    if (mask & RADV_PREFETCH_PS)
1220       radv_emit_shader_prefetch(cmd_buffer, pipeline->base.shaders[MESA_SHADER_FRAGMENT]);
1221
1222    state->prefetch_L2_mask &= ~mask;
1223 }
1224
1225 static void
1226 radv_emit_rbplus_state(struct radv_cmd_buffer *cmd_buffer)
1227 {
1228    if (!cmd_buffer->device->physical_device->rad_info.rbplus_allowed)
1229       return;
1230
1231    struct radv_graphics_pipeline *pipeline = cmd_buffer->state.graphics_pipeline;
1232    const struct radv_subpass *subpass = cmd_buffer->state.subpass;
1233
1234    unsigned sx_ps_downconvert = 0;
1235    unsigned sx_blend_opt_epsilon = 0;
1236    unsigned sx_blend_opt_control = 0;
1237
1238    for (unsigned i = 0; i < subpass->color_count; ++i) {
1239       unsigned format, swap;
1240       bool has_alpha, has_rgb;
1241       if (subpass->color_attachments[i].attachment == VK_ATTACHMENT_UNUSED) {
1242          /* We don't set the DISABLE bits, because the HW can't have holes,
1243           * so the SPI color format is set to 32-bit 1-component. */
1244          sx_ps_downconvert |= V_028754_SX_RT_EXPORT_32_R << (i * 4);
1245          continue;
1246       }
1247
1248       int idx = subpass->color_attachments[i].attachment;
1249       if (cmd_buffer->state.attachments) {
1250          struct radv_color_buffer_info *cb = &cmd_buffer->state.attachments[idx].cb;
1251
1252          format = cmd_buffer->device->physical_device->rad_info.gfx_level >= GFX11
1253                      ? G_028C70_FORMAT_GFX11(cb->cb_color_info)
1254                      : G_028C70_FORMAT_GFX6(cb->cb_color_info);
1255          swap = G_028C70_COMP_SWAP(cb->cb_color_info);
1256          has_alpha = cmd_buffer->device->physical_device->rad_info.gfx_level >= GFX11
1257                         ? !G_028C74_FORCE_DST_ALPHA_1_GFX11(cb->cb_color_attrib)
1258                         : !G_028C74_FORCE_DST_ALPHA_1_GFX6(cb->cb_color_attrib);
1259       } else {
1260          VkFormat fmt = cmd_buffer->state.pass->attachments[idx].format;
1261          format = radv_translate_colorformat(fmt);
1262          swap = radv_translate_colorswap(fmt, false);
1263          has_alpha = vk_format_description(fmt)->swizzle[3] != PIPE_SWIZZLE_1;
1264       }
1265
1266       uint32_t spi_format = (pipeline->col_format >> (i * 4)) & 0xf;
1267       uint32_t colormask = (pipeline->cb_target_mask >> (i * 4)) & 0xf;
1268
1269       if (format == V_028C70_COLOR_8 || format == V_028C70_COLOR_16 || format == V_028C70_COLOR_32)
1270          has_rgb = !has_alpha;
1271       else
1272          has_rgb = true;
1273
1274       /* Check the colormask and export format. */
1275       if (!(colormask & 0x7))
1276          has_rgb = false;
1277       if (!(colormask & 0x8))
1278          has_alpha = false;
1279
1280       if (spi_format == V_028714_SPI_SHADER_ZERO) {
1281          has_rgb = false;
1282          has_alpha = false;
1283       }
1284
1285       /* The HW doesn't quite blend correctly with rgb9e5 if we disable the alpha
1286        * optimization, even though it has no alpha. */
1287       if (has_rgb && format == V_028C70_COLOR_5_9_9_9)
1288          has_alpha = true;
1289
1290       /* Disable value checking for disabled channels. */
1291       if (!has_rgb)
1292          sx_blend_opt_control |= S_02875C_MRT0_COLOR_OPT_DISABLE(1) << (i * 4);
1293       if (!has_alpha)
1294          sx_blend_opt_control |= S_02875C_MRT0_ALPHA_OPT_DISABLE(1) << (i * 4);
1295
1296       /* Enable down-conversion for 32bpp and smaller formats. */
1297       switch (format) {
1298       case V_028C70_COLOR_8:
1299       case V_028C70_COLOR_8_8:
1300       case V_028C70_COLOR_8_8_8_8:
1301          /* For 1 and 2-channel formats, use the superset thereof. */
1302          if (spi_format == V_028714_SPI_SHADER_FP16_ABGR ||
1303              spi_format == V_028714_SPI_SHADER_UINT16_ABGR ||
1304              spi_format == V_028714_SPI_SHADER_SINT16_ABGR) {
1305             sx_ps_downconvert |= V_028754_SX_RT_EXPORT_8_8_8_8 << (i * 4);
1306             sx_blend_opt_epsilon |= V_028758_8BIT_FORMAT << (i * 4);
1307          }
1308          break;
1309
1310       case V_028C70_COLOR_5_6_5:
1311          if (spi_format == V_028714_SPI_SHADER_FP16_ABGR) {
1312             sx_ps_downconvert |= V_028754_SX_RT_EXPORT_5_6_5 << (i * 4);
1313             sx_blend_opt_epsilon |= V_028758_6BIT_FORMAT << (i * 4);
1314          }
1315          break;
1316
1317       case V_028C70_COLOR_1_5_5_5:
1318          if (spi_format == V_028714_SPI_SHADER_FP16_ABGR) {
1319             sx_ps_downconvert |= V_028754_SX_RT_EXPORT_1_5_5_5 << (i * 4);
1320             sx_blend_opt_epsilon |= V_028758_5BIT_FORMAT << (i * 4);
1321          }
1322          break;
1323
1324       case V_028C70_COLOR_4_4_4_4:
1325          if (spi_format == V_028714_SPI_SHADER_FP16_ABGR) {
1326             sx_ps_downconvert |= V_028754_SX_RT_EXPORT_4_4_4_4 << (i * 4);
1327             sx_blend_opt_epsilon |= V_028758_4BIT_FORMAT << (i * 4);
1328          }
1329          break;
1330
1331       case V_028C70_COLOR_32:
1332          if (swap == V_028C70_SWAP_STD && spi_format == V_028714_SPI_SHADER_32_R)
1333             sx_ps_downconvert |= V_028754_SX_RT_EXPORT_32_R << (i * 4);
1334          else if (swap == V_028C70_SWAP_ALT_REV && spi_format == V_028714_SPI_SHADER_32_AR)
1335             sx_ps_downconvert |= V_028754_SX_RT_EXPORT_32_A << (i * 4);
1336          break;
1337
1338       case V_028C70_COLOR_16:
1339       case V_028C70_COLOR_16_16:
1340          /* For 1-channel formats, use the superset thereof. */
1341          if (spi_format == V_028714_SPI_SHADER_UNORM16_ABGR ||
1342              spi_format == V_028714_SPI_SHADER_SNORM16_ABGR ||
1343              spi_format == V_028714_SPI_SHADER_UINT16_ABGR ||
1344              spi_format == V_028714_SPI_SHADER_SINT16_ABGR) {
1345             if (swap == V_028C70_SWAP_STD || swap == V_028C70_SWAP_STD_REV)
1346                sx_ps_downconvert |= V_028754_SX_RT_EXPORT_16_16_GR << (i * 4);
1347             else
1348                sx_ps_downconvert |= V_028754_SX_RT_EXPORT_16_16_AR << (i * 4);
1349          }
1350          break;
1351
1352       case V_028C70_COLOR_10_11_11:
1353          if (spi_format == V_028714_SPI_SHADER_FP16_ABGR)
1354             sx_ps_downconvert |= V_028754_SX_RT_EXPORT_10_11_11 << (i * 4);
1355          break;
1356
1357       case V_028C70_COLOR_2_10_10_10:
1358          if (spi_format == V_028714_SPI_SHADER_FP16_ABGR) {
1359             sx_ps_downconvert |= V_028754_SX_RT_EXPORT_2_10_10_10 << (i * 4);
1360             sx_blend_opt_epsilon |= V_028758_10BIT_FORMAT << (i * 4);
1361          }
1362          break;
1363       case V_028C70_COLOR_5_9_9_9:
1364          if (spi_format == V_028714_SPI_SHADER_FP16_ABGR)
1365             sx_ps_downconvert |= V_028754_SX_RT_EXPORT_9_9_9_E5 << (i * 4);
1366          break;
1367       }
1368    }
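   /* Note (descriptive): SX_PS_DOWNCONVERT, SX_BLEND_OPT_EPSILON and
    * SX_BLEND_OPT_CONTROL each pack one 4-bit field per MRT, which is why all of
    * the values above are shifted by (i * 4).
    */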
1369
1370    /* Do not set the DISABLE bits for the unused attachments, as that
1371     * breaks dual source blending in SkQP and does not seem to improve
1372     * performance. */
1373
1374    if (sx_ps_downconvert == cmd_buffer->state.last_sx_ps_downconvert &&
1375        sx_blend_opt_epsilon == cmd_buffer->state.last_sx_blend_opt_epsilon &&
1376        sx_blend_opt_control == cmd_buffer->state.last_sx_blend_opt_control)
1377       return;
1378
1379    radeon_set_context_reg_seq(cmd_buffer->cs, R_028754_SX_PS_DOWNCONVERT, 3);
1380    radeon_emit(cmd_buffer->cs, sx_ps_downconvert);
1381    radeon_emit(cmd_buffer->cs, sx_blend_opt_epsilon);
1382    radeon_emit(cmd_buffer->cs, sx_blend_opt_control);
1383
1384    cmd_buffer->state.context_roll_without_scissor_emitted = true;
1385
1386    cmd_buffer->state.last_sx_ps_downconvert = sx_ps_downconvert;
1387    cmd_buffer->state.last_sx_blend_opt_epsilon = sx_blend_opt_epsilon;
1388    cmd_buffer->state.last_sx_blend_opt_control = sx_blend_opt_control;
1389 }
1390
1391 static void
1392 radv_emit_graphics_pipeline(struct radv_cmd_buffer *cmd_buffer)
1393 {
1394    struct radv_graphics_pipeline *pipeline = cmd_buffer->state.graphics_pipeline;
1395    const struct radv_device *device = cmd_buffer->device;
1396
1397    if (cmd_buffer->state.emitted_graphics_pipeline == pipeline)
1398       return;
1399
1400    radv_update_multisample_state(cmd_buffer, pipeline);
1401    radv_update_binning_state(cmd_buffer, pipeline);
1402
1403    cmd_buffer->scratch_size_per_wave_needed =
1404       MAX2(cmd_buffer->scratch_size_per_wave_needed, pipeline->base.scratch_bytes_per_wave);
1405    cmd_buffer->scratch_waves_wanted = MAX2(cmd_buffer->scratch_waves_wanted, pipeline->base.max_waves);
1406
1407    if (!cmd_buffer->state.emitted_graphics_pipeline)
1408       cmd_buffer->state.dirty |= RADV_CMD_DIRTY_DYNAMIC_PRIMITIVE_TOPOLOGY |
1409                                  RADV_CMD_DIRTY_DYNAMIC_DEPTH_BIAS |
1410                                  RADV_CMD_DIRTY_DYNAMIC_DEPTH_BOUNDS |
1411                                  RADV_CMD_DIRTY_DYNAMIC_PRIMITIVE_RESTART_ENABLE |
1412                                  RADV_CMD_DIRTY_DYNAMIC_DEPTH_TEST_ENABLE |
1413                                  RADV_CMD_DIRTY_DYNAMIC_DEPTH_WRITE_ENABLE |
1414                                  RADV_CMD_DIRTY_DYNAMIC_DEPTH_COMPARE_OP |
1415                                  RADV_CMD_DIRTY_DYNAMIC_DEPTH_BOUNDS_TEST_ENABLE |
1416                                  RADV_CMD_DIRTY_DYNAMIC_STENCIL_TEST_ENABLE |
1417                                  RADV_CMD_DIRTY_DYNAMIC_STENCIL_OP;
1418
1419    if (!cmd_buffer->state.emitted_graphics_pipeline ||
1420        cmd_buffer->state.emitted_graphics_pipeline->negative_one_to_one != pipeline->negative_one_to_one ||
1421        cmd_buffer->state.emitted_graphics_pipeline->depth_clamp_mode != pipeline->depth_clamp_mode)
1422       cmd_buffer->state.dirty |= RADV_CMD_DIRTY_DYNAMIC_VIEWPORT;
1423
1424    if (!cmd_buffer->state.emitted_graphics_pipeline ||
1425        radv_rast_prim_is_points_or_lines(cmd_buffer->state.emitted_graphics_pipeline->rast_prim) != radv_rast_prim_is_points_or_lines(pipeline->rast_prim))
1426       cmd_buffer->state.dirty |= RADV_CMD_DIRTY_GUARDBAND;
1427
1428    if (!cmd_buffer->state.emitted_graphics_pipeline ||
1429        cmd_buffer->state.emitted_graphics_pipeline->pa_su_sc_mode_cntl != pipeline->pa_su_sc_mode_cntl)
1430       cmd_buffer->state.dirty |= RADV_CMD_DIRTY_DYNAMIC_CULL_MODE |
1431                                  RADV_CMD_DIRTY_DYNAMIC_FRONT_FACE |
1432                                  RADV_CMD_DIRTY_DYNAMIC_DEPTH_BIAS;
1433
1434    if (!cmd_buffer->state.emitted_graphics_pipeline ||
1435        cmd_buffer->state.emitted_graphics_pipeline->pa_cl_clip_cntl != pipeline->pa_cl_clip_cntl)
1436       cmd_buffer->state.dirty |= RADV_CMD_DIRTY_DYNAMIC_RASTERIZER_DISCARD_ENABLE;
1437
1438    if (!cmd_buffer->state.emitted_graphics_pipeline ||
1439        cmd_buffer->state.emitted_graphics_pipeline->cb_color_control != pipeline->cb_color_control)
1440       cmd_buffer->state.dirty |= RADV_CMD_DIRTY_DYNAMIC_LOGIC_OP;
1441
1442    if (!cmd_buffer->state.emitted_graphics_pipeline ||
1443        cmd_buffer->state.emitted_graphics_pipeline->cb_target_mask != pipeline->cb_target_mask)
1444       cmd_buffer->state.dirty |= RADV_CMD_DIRTY_DYNAMIC_COLOR_WRITE_ENABLE;
1445
1446    radeon_emit_array(cmd_buffer->cs, pipeline->base.cs.buf, pipeline->base.cs.cdw);
1447
1448    if (pipeline->has_ngg_culling &&
1449        pipeline->last_vgt_api_stage != MESA_SHADER_GEOMETRY &&
1450        !cmd_buffer->state.last_nggc_settings) {
1451       /* The already emitted RSRC2 contains the LDS required for NGG culling.
1452        * Culling is currently disabled, so re-emit RSRC2 to reduce LDS usage.
1453        * API GS always needs LDS, so this isn't useful there.
1454        */
1455       struct radv_shader *v = pipeline->base.shaders[pipeline->last_vgt_api_stage];
1456       radeon_set_sh_reg(cmd_buffer->cs, R_00B22C_SPI_SHADER_PGM_RSRC2_GS,
1457                         (v->config.rsrc2 & C_00B22C_LDS_SIZE) |
1458                         S_00B22C_LDS_SIZE(v->info.num_lds_blocks_when_not_culling));
1459    }
1460
1461    if (!cmd_buffer->state.emitted_graphics_pipeline ||
1462        cmd_buffer->state.emitted_graphics_pipeline->base.ctx_cs.cdw != pipeline->base.ctx_cs.cdw ||
1463        cmd_buffer->state.emitted_graphics_pipeline->base.ctx_cs_hash != pipeline->base.ctx_cs_hash ||
1464        memcmp(cmd_buffer->state.emitted_graphics_pipeline->base.ctx_cs.buf, pipeline->base.ctx_cs.buf,
1465               pipeline->base.ctx_cs.cdw * 4)) {
1466       radeon_emit_array(cmd_buffer->cs, pipeline->base.ctx_cs.buf, pipeline->base.ctx_cs.cdw);
1467       cmd_buffer->state.context_roll_without_scissor_emitted = true;
1468    }
1469
1470    if (device->pbb_allowed) {
1471       struct radv_binning_settings *settings = &device->physical_device->binning_settings;
1472
1473       if ((!cmd_buffer->state.emitted_graphics_pipeline ||
1474            cmd_buffer->state.emitted_graphics_pipeline->base.shaders[MESA_SHADER_FRAGMENT] !=
1475            cmd_buffer->state.graphics_pipeline->base.shaders[MESA_SHADER_FRAGMENT]) &&
1476           (settings->context_states_per_bin > 1 || settings->persistent_states_per_bin > 1)) {
1477          /* Break the batch on PS changes. */
1478          radeon_emit(cmd_buffer->cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
1479          radeon_emit(cmd_buffer->cs, EVENT_TYPE(V_028A90_BREAK_BATCH) | EVENT_INDEX(0));
1480       }
1481    }
1482
1483    radv_cs_add_buffer(cmd_buffer->device->ws, cmd_buffer->cs, pipeline->base.slab_bo);
1484
1485    if (unlikely(cmd_buffer->device->trace_bo))
1486       radv_save_pipeline(cmd_buffer, &pipeline->base);
1487
1488    cmd_buffer->state.emitted_graphics_pipeline = pipeline;
1489
1490    cmd_buffer->state.dirty &= ~RADV_CMD_DIRTY_PIPELINE;
1491 }
1492
1493 static void
1494 radv_emit_viewport(struct radv_cmd_buffer *cmd_buffer)
1495 {
1496    const struct radv_graphics_pipeline *pipeline = cmd_buffer->state.graphics_pipeline;
1497    const struct radv_viewport_state *viewport = &cmd_buffer->state.dynamic.viewport;
1498    int i;
1499    const unsigned count = viewport->count;
1500
1501    assert(count);
1502    radeon_set_context_reg_seq(cmd_buffer->cs, R_02843C_PA_CL_VPORT_XSCALE, count * 6);
1503
1504    for (i = 0; i < count; i++) {
1505       radeon_emit(cmd_buffer->cs, fui(viewport->xform[i].scale[0]));
1506       radeon_emit(cmd_buffer->cs, fui(viewport->xform[i].translate[0]));
1507       radeon_emit(cmd_buffer->cs, fui(viewport->xform[i].scale[1]));
1508       radeon_emit(cmd_buffer->cs, fui(viewport->xform[i].translate[1]));
1509
1510       double scale_z, translate_z;
1511       if (pipeline->negative_one_to_one) {
1512          scale_z = viewport->xform[i].scale[2] * 0.5f;
1513          translate_z = (viewport->xform[i].translate[2] + viewport->viewports[i].maxDepth) * 0.5f;
1514       } else {
1515          scale_z = viewport->xform[i].scale[2];
1516          translate_z = viewport->xform[i].translate[2];
1517
1518       }
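      /* Illustrative example (assuming the usual Vulkan viewport transform where
       * xform.scale[2] = maxDepth - minDepth and xform.translate[2] = minDepth):
       * with negative_one_to_one, clip-space Z in [-1, 1] is mapped as
       *    z' = z * 0.5 * (maxDepth - minDepth) + 0.5 * (minDepth + maxDepth)
       * so z = -1 lands on minDepth and z = +1 on maxDepth.
       */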
1519       radeon_emit(cmd_buffer->cs, fui(scale_z));
1520       radeon_emit(cmd_buffer->cs, fui(translate_z));
1521    }
1522
1523    radeon_set_context_reg_seq(cmd_buffer->cs, R_0282D0_PA_SC_VPORT_ZMIN_0, count * 2);
1524    for (i = 0; i < count; i++) {
1525       float zmin, zmax;
1526
1527       if (pipeline->depth_clamp_mode == RADV_DEPTH_CLAMP_MODE_ZERO_TO_ONE) {
1528          zmin = 0.0f;
1529          zmax = 1.0f;
1530       } else {
1531          zmin = MIN2(viewport->viewports[i].minDepth, viewport->viewports[i].maxDepth);
1532          zmax = MAX2(viewport->viewports[i].minDepth, viewport->viewports[i].maxDepth);
1533       }
1534
1535       radeon_emit(cmd_buffer->cs, fui(zmin));
1536       radeon_emit(cmd_buffer->cs, fui(zmax));
1537    }
1538 }
1539
1540 void
1541 radv_write_scissors(struct radv_cmd_buffer *cmd_buffer, struct radeon_cmdbuf *cs)
1542 {
1543    uint32_t count = cmd_buffer->state.dynamic.scissor.count;
1544
1545    si_write_scissors(cs, count, cmd_buffer->state.dynamic.scissor.scissors,
1546                      cmd_buffer->state.dynamic.viewport.viewports);
1547 }
1548
1549 static void
1550 radv_emit_scissor(struct radv_cmd_buffer *cmd_buffer)
1551 {
1552    radv_write_scissors(cmd_buffer, cmd_buffer->cs);
1553
1554    cmd_buffer->state.context_roll_without_scissor_emitted = false;
1555 }
1556
1557 static void
1558 radv_emit_discard_rectangle(struct radv_cmd_buffer *cmd_buffer)
1559 {
1560    if (!cmd_buffer->state.dynamic.discard_rectangle.count)
1561       return;
1562
1563    radeon_set_context_reg_seq(cmd_buffer->cs, R_028210_PA_SC_CLIPRECT_0_TL,
1564                               cmd_buffer->state.dynamic.discard_rectangle.count * 2);
1565    for (unsigned i = 0; i < cmd_buffer->state.dynamic.discard_rectangle.count; ++i) {
1566       VkRect2D rect = cmd_buffer->state.dynamic.discard_rectangle.rectangles[i];
1567       radeon_emit(cmd_buffer->cs, S_028210_TL_X(rect.offset.x) | S_028210_TL_Y(rect.offset.y));
1568       radeon_emit(cmd_buffer->cs, S_028214_BR_X(rect.offset.x + rect.extent.width) |
1569                                      S_028214_BR_Y(rect.offset.y + rect.extent.height));
1570    }
1571 }
1572
1573 static void
1574 radv_emit_line_width(struct radv_cmd_buffer *cmd_buffer)
1575 {
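   /* The API line width is converted to the hardware's fixed-point encoding
    * (hence the multiplication by 8) and clamped to the 16-bit WIDTH field.
    */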
1576    unsigned width = cmd_buffer->state.dynamic.line_width * 8;
1577
1578    radeon_set_context_reg(cmd_buffer->cs, R_028A08_PA_SU_LINE_CNTL,
1579                           S_028A08_WIDTH(CLAMP(width, 0, 0xFFFF)));
1580 }
1581
1582 static void
1583 radv_emit_blend_constants(struct radv_cmd_buffer *cmd_buffer)
1584 {
1585    struct radv_dynamic_state *d = &cmd_buffer->state.dynamic;
1586
1587    radeon_set_context_reg_seq(cmd_buffer->cs, R_028414_CB_BLEND_RED, 4);
1588    radeon_emit_array(cmd_buffer->cs, (uint32_t *)d->blend_constants, 4);
1589 }
1590
1591 static void
1592 radv_emit_stencil(struct radv_cmd_buffer *cmd_buffer)
1593 {
1594    struct radv_dynamic_state *d = &cmd_buffer->state.dynamic;
1595
1596    radeon_set_context_reg_seq(cmd_buffer->cs, R_028430_DB_STENCILREFMASK, 2);
1597    radeon_emit(cmd_buffer->cs, S_028430_STENCILTESTVAL(d->stencil_reference.front) |
1598                                   S_028430_STENCILMASK(d->stencil_compare_mask.front) |
1599                                   S_028430_STENCILWRITEMASK(d->stencil_write_mask.front) |
1600                                   S_028430_STENCILOPVAL(1));
1601    radeon_emit(cmd_buffer->cs, S_028434_STENCILTESTVAL_BF(d->stencil_reference.back) |
1602                                   S_028434_STENCILMASK_BF(d->stencil_compare_mask.back) |
1603                                   S_028434_STENCILWRITEMASK_BF(d->stencil_write_mask.back) |
1604                                   S_028434_STENCILOPVAL_BF(1));
1605 }
1606
1607 static void
1608 radv_emit_depth_bounds(struct radv_cmd_buffer *cmd_buffer)
1609 {
1610    struct radv_dynamic_state *d = &cmd_buffer->state.dynamic;
1611
1612    radeon_set_context_reg_seq(cmd_buffer->cs, R_028020_DB_DEPTH_BOUNDS_MIN, 2);
1613    radeon_emit(cmd_buffer->cs, fui(d->depth_bounds.min));
1614    radeon_emit(cmd_buffer->cs, fui(d->depth_bounds.max));
1615 }
1616
1617 static void
1618 radv_emit_depth_bias(struct radv_cmd_buffer *cmd_buffer)
1619 {
1620    struct radv_dynamic_state *d = &cmd_buffer->state.dynamic;
1621    unsigned slope = fui(d->depth_bias.slope * 16.0f);
1622
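   /* The slope scale is pre-multiplied by 16 to match the hardware encoding, and
    * the same scale/offset pair is programmed for both front- and back-facing
    * polygons below.
    */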
1623    radeon_set_context_reg_seq(cmd_buffer->cs, R_028B7C_PA_SU_POLY_OFFSET_CLAMP, 5);
1624    radeon_emit(cmd_buffer->cs, fui(d->depth_bias.clamp)); /* CLAMP */
1625    radeon_emit(cmd_buffer->cs, slope);                    /* FRONT SCALE */
1626    radeon_emit(cmd_buffer->cs, fui(d->depth_bias.bias));  /* FRONT OFFSET */
1627    radeon_emit(cmd_buffer->cs, slope);                    /* BACK SCALE */
1628    radeon_emit(cmd_buffer->cs, fui(d->depth_bias.bias));  /* BACK OFFSET */
1629 }
1630
1631 static void
1632 radv_emit_line_stipple(struct radv_cmd_buffer *cmd_buffer)
1633 {
1634    struct radv_dynamic_state *d = &cmd_buffer->state.dynamic;
1635    uint32_t auto_reset_cntl = 1;
1636
1637    if (d->primitive_topology == V_008958_DI_PT_LINESTRIP)
1638       auto_reset_cntl = 2;
1639
1640    radeon_set_context_reg(cmd_buffer->cs, R_028A0C_PA_SC_LINE_STIPPLE,
1641                           S_028A0C_LINE_PATTERN(d->line_stipple.pattern) |
1642                              S_028A0C_REPEAT_COUNT(d->line_stipple.factor - 1) |
1643                              S_028A0C_AUTO_RESET_CNTL(auto_reset_cntl));
1644 }
1645
1646 uint32_t
1647 radv_get_pa_su_sc_mode_cntl(const struct radv_cmd_buffer *cmd_buffer)
1648 {
1649    unsigned pa_su_sc_mode_cntl = cmd_buffer->state.graphics_pipeline->pa_su_sc_mode_cntl;
1650    const struct radv_dynamic_state *d = &cmd_buffer->state.dynamic;
1651
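   /* The dynamic cull mode, front face and depth bias enable bits are ORed on top
    * of the static PA_SU_SC_MODE_CNTL value baked into the pipeline.
    */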
1652    pa_su_sc_mode_cntl |= S_028814_CULL_FRONT(!!(d->cull_mode & VK_CULL_MODE_FRONT_BIT)) |
1653                          S_028814_CULL_BACK(!!(d->cull_mode & VK_CULL_MODE_BACK_BIT)) |
1654                          S_028814_FACE(d->front_face) |
1655                          S_028814_POLY_OFFSET_FRONT_ENABLE(d->depth_bias_enable) |
1656                          S_028814_POLY_OFFSET_BACK_ENABLE(d->depth_bias_enable) |
1657                          S_028814_POLY_OFFSET_PARA_ENABLE(d->depth_bias_enable);
1658    return pa_su_sc_mode_cntl;
1659 }
1660
1661 static void
1662 radv_emit_culling(struct radv_cmd_buffer *cmd_buffer)
1663 {
1664    unsigned pa_su_sc_mode_cntl = radv_get_pa_su_sc_mode_cntl(cmd_buffer);
1665
1666    radeon_set_context_reg(cmd_buffer->cs, R_028814_PA_SU_SC_MODE_CNTL, pa_su_sc_mode_cntl);
1667 }
1668
1669 static void
1670 radv_emit_primitive_topology(struct radv_cmd_buffer *cmd_buffer)
1671 {
1672    struct radv_dynamic_state *d = &cmd_buffer->state.dynamic;
1673
1674    assert(!cmd_buffer->state.mesh_shading);
1675
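   /* On GFX7+, VGT_PRIMITIVE_TYPE is a uconfig register and can be set with the
    * indexed SET_UCONFIG_REG packet; older chips program it as a config register.
    */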
1676    if (cmd_buffer->device->physical_device->rad_info.gfx_level >= GFX7) {
1677       radeon_set_uconfig_reg_idx(cmd_buffer->device->physical_device, cmd_buffer->cs,
1678                                  R_030908_VGT_PRIMITIVE_TYPE, 1, d->primitive_topology);
1679    } else {
1680       radeon_set_config_reg(cmd_buffer->cs, R_008958_VGT_PRIMITIVE_TYPE, d->primitive_topology);
1681    }
1682 }
1683
1684 static void
1685 radv_emit_depth_control(struct radv_cmd_buffer *cmd_buffer)
1686 {
1687    struct radv_dynamic_state *d = &cmd_buffer->state.dynamic;
1688
1689    radeon_set_context_reg(cmd_buffer->cs, R_028800_DB_DEPTH_CONTROL,
1690                           S_028800_Z_ENABLE(d->depth_test_enable ? 1 : 0) |
1691                           S_028800_Z_WRITE_ENABLE(d->depth_write_enable ? 1 : 0) |
1692                           S_028800_ZFUNC(d->depth_compare_op) |
1693                           S_028800_DEPTH_BOUNDS_ENABLE(d->depth_bounds_test_enable ? 1 : 0) |
1694                           S_028800_STENCIL_ENABLE(d->stencil_test_enable ? 1 : 0) |
1695                           S_028800_BACKFACE_ENABLE(d->stencil_test_enable ? 1 : 0) |
1696                           S_028800_STENCILFUNC(d->stencil_op.front.compare_op) |
1697                           S_028800_STENCILFUNC_BF(d->stencil_op.back.compare_op));
1698 }
1699
1700 static void
1701 radv_emit_stencil_control(struct radv_cmd_buffer *cmd_buffer)
1702 {
1703    struct radv_dynamic_state *d = &cmd_buffer->state.dynamic;
1704
1705    radeon_set_context_reg(
1706       cmd_buffer->cs, R_02842C_DB_STENCIL_CONTROL,
1707       S_02842C_STENCILFAIL(si_translate_stencil_op(d->stencil_op.front.fail_op)) |
1708          S_02842C_STENCILZPASS(si_translate_stencil_op(d->stencil_op.front.pass_op)) |
1709          S_02842C_STENCILZFAIL(si_translate_stencil_op(d->stencil_op.front.depth_fail_op)) |
1710          S_02842C_STENCILFAIL_BF(si_translate_stencil_op(d->stencil_op.back.fail_op)) |
1711          S_02842C_STENCILZPASS_BF(si_translate_stencil_op(d->stencil_op.back.pass_op)) |
1712          S_02842C_STENCILZFAIL_BF(si_translate_stencil_op(d->stencil_op.back.depth_fail_op)));
1713 }
1714
1715 static void
1716 radv_emit_fragment_shading_rate(struct radv_cmd_buffer *cmd_buffer)
1717 {
1718    struct radv_graphics_pipeline *pipeline = cmd_buffer->state.graphics_pipeline;
1719    const struct radv_subpass *subpass = cmd_buffer->state.subpass;
1720    struct radv_dynamic_state *d = &cmd_buffer->state.dynamic;
1721    uint32_t rate_x = MIN2(2, d->fragment_shading_rate.size.width) - 1;
1722    uint32_t rate_y = MIN2(2, d->fragment_shading_rate.size.height) - 1;
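   /* The per-draw rate in GE_VRS_RATE is encoded as log2 of the fragment size per
    * axis, clamped to 2x2 here: a size of 1 maps to 0, a size of 2 (or larger)
    * maps to 1.
    */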
1723    uint32_t pa_cl_vrs_cntl = pipeline->vrs.pa_cl_vrs_cntl;
1724    uint32_t pipeline_comb_mode = d->fragment_shading_rate.combiner_ops[0];
1725    uint32_t htile_comb_mode = d->fragment_shading_rate.combiner_ops[1];
1726
1727    assert(cmd_buffer->device->physical_device->rad_info.gfx_level >= GFX10_3);
1728
1729    if (subpass && !subpass->vrs_attachment) {
1730       /* When the current subpass has no VRS attachment, the VRS rates are expected to be 1x1, so we
1731        * can cheat by tweaking the different combiner modes.
1732        */
1733       switch (htile_comb_mode) {
1734       case VK_FRAGMENT_SHADING_RATE_COMBINER_OP_MIN_KHR:
1735          /* The result of min(A, 1x1) is always 1x1. */
1736          FALLTHROUGH;
1737       case VK_FRAGMENT_SHADING_RATE_COMBINER_OP_REPLACE_KHR:
1738          /* Force the per-draw VRS rate to 1x1. */
1739          rate_x = rate_y = 0;
1740
1741          /* Since the result of min(A, 1x1) or replace(A, 1x1) is always 1x1, set the vertex
1742           * rate combiner mode to passthrough.
1743           */
1744          pipeline_comb_mode = V_028848_VRS_COMB_MODE_PASSTHRU;
1745          break;
1746       case VK_FRAGMENT_SHADING_RATE_COMBINER_OP_MAX_KHR:
1747          /* The result of max(A, 1x1) is always A. */
1748          FALLTHROUGH;
1749       case VK_FRAGMENT_SHADING_RATE_COMBINER_OP_KEEP_KHR:
1750          /* Nothing to do here because the SAMPLE_ITER combiner mode should already be passthrough. */
1751          break;
1752       default:
1753          break;
1754       }
1755    }
1756
1757    /* Emit per-draw VRS rate which is the first combiner. */
1758    radeon_set_uconfig_reg(cmd_buffer->cs, R_03098C_GE_VRS_RATE,
1759                           S_03098C_RATE_X(rate_x) | S_03098C_RATE_Y(rate_y));
1760
1761    /* VERTEX_RATE_COMBINER_MODE controls the combiner mode between the
1762     * draw rate and the vertex rate.
1763     */
1764    if (cmd_buffer->state.mesh_shading) {
1765       pa_cl_vrs_cntl |= S_028848_VERTEX_RATE_COMBINER_MODE(V_028848_VRS_COMB_MODE_PASSTHRU) |
1766                         S_028848_PRIMITIVE_RATE_COMBINER_MODE(pipeline_comb_mode);
1767    } else {
1768       pa_cl_vrs_cntl |= S_028848_VERTEX_RATE_COMBINER_MODE(pipeline_comb_mode) |
1769                         S_028848_PRIMITIVE_RATE_COMBINER_MODE(V_028848_VRS_COMB_MODE_PASSTHRU);
1770    }
1771
1772    /* HTILE_RATE_COMBINER_MODE controls the combiner mode between the primitive rate and the HTILE
1773     * rate.
1774     */
1775    pa_cl_vrs_cntl |= S_028848_HTILE_RATE_COMBINER_MODE(htile_comb_mode);
1776
1777    radeon_set_context_reg(cmd_buffer->cs, R_028848_PA_CL_VRS_CNTL, pa_cl_vrs_cntl);
1778 }
1779
1780 static void
1781 radv_emit_primitive_restart_enable(struct radv_cmd_buffer *cmd_buffer)
1782 {
1783    struct radv_dynamic_state *d = &cmd_buffer->state.dynamic;
1784
1785    if (cmd_buffer->device->physical_device->rad_info.gfx_level >= GFX11) {
1786       radeon_set_uconfig_reg(cmd_buffer->cs, R_03092C_GE_MULTI_PRIM_IB_RESET_EN,
1787                              d->primitive_restart_enable);
1788    } else if (cmd_buffer->device->physical_device->rad_info.gfx_level >= GFX9) {
1789       radeon_set_uconfig_reg(cmd_buffer->cs, R_03092C_VGT_MULTI_PRIM_IB_RESET_EN,
1790                              d->primitive_restart_enable);
1791    } else {
1792       radeon_set_context_reg(cmd_buffer->cs, R_028A94_VGT_MULTI_PRIM_IB_RESET_EN,
1793                              d->primitive_restart_enable);
1794    }
1795 }
1796
1797 static void
1798 radv_emit_rasterizer_discard_enable(struct radv_cmd_buffer *cmd_buffer)
1799 {
1800    unsigned pa_cl_clip_cntl = cmd_buffer->state.graphics_pipeline->pa_cl_clip_cntl;
1801    struct radv_dynamic_state *d = &cmd_buffer->state.dynamic;
1802
1803    pa_cl_clip_cntl |= S_028810_DX_RASTERIZATION_KILL(d->rasterizer_discard_enable);
1804
1805    radeon_set_context_reg(cmd_buffer->cs, R_028810_PA_CL_CLIP_CNTL, pa_cl_clip_cntl);
1806 }
1807
1808 static void
1809 radv_emit_logic_op(struct radv_cmd_buffer *cmd_buffer)
1810 {
1811    unsigned cb_color_control = cmd_buffer->state.graphics_pipeline->cb_color_control;
1812    struct radv_dynamic_state *d = &cmd_buffer->state.dynamic;
1813
1814    cb_color_control |= S_028808_ROP3(d->logic_op);
1815
1816    radeon_set_context_reg(cmd_buffer->cs, R_028808_CB_COLOR_CONTROL, cb_color_control);
1817 }
1818
1819 static void
1820 radv_emit_color_write_enable(struct radv_cmd_buffer *cmd_buffer)
1821 {
1822    struct radv_graphics_pipeline *pipeline = cmd_buffer->state.graphics_pipeline;
1823    struct radv_dynamic_state *d = &cmd_buffer->state.dynamic;
1824
1825    radeon_set_context_reg(cmd_buffer->cs, R_028238_CB_TARGET_MASK,
1826                           pipeline->cb_target_mask & d->color_write_enable);
1827 }
1828
1829 static void
1830 radv_emit_fb_color_state(struct radv_cmd_buffer *cmd_buffer, int index,
1831                          struct radv_color_buffer_info *cb, struct radv_image_view *iview,
1832                          VkImageLayout layout)
1833 {
1834    bool is_vi = cmd_buffer->device->physical_device->rad_info.gfx_level >= GFX8;
1835    uint32_t cb_fdcc_control = cb->cb_dcc_control;
1836    uint32_t cb_color_info = cb->cb_color_info;
1837    struct radv_image *image = iview->image;
1838
1839    if (!radv_layout_dcc_compressed(
1840           cmd_buffer->device, image, iview->vk.base_mip_level, layout,
1841           radv_image_queue_family_mask(image, cmd_buffer->qf,
1842                                        cmd_buffer->qf))) {
1843       if (cmd_buffer->device->physical_device->rad_info.gfx_level >= GFX11) {
1844          cb_fdcc_control &= C_028C78_FDCC_ENABLE;
1845       } else {
1846          cb_color_info &= C_028C70_DCC_ENABLE;
1847       }
1848    }
1849
1850    if (!radv_layout_fmask_compressed(
1851           cmd_buffer->device, image, layout,
1852           radv_image_queue_family_mask(image, cmd_buffer->qf,
1853                                        cmd_buffer->qf))) {
1854       cb_color_info &= C_028C70_COMPRESSION;
1855    }
1856
1857    if (radv_image_is_tc_compat_cmask(image) && (radv_is_fmask_decompress_pipeline(cmd_buffer) ||
1858                                                 radv_is_dcc_decompress_pipeline(cmd_buffer))) {
1859       /* If this bit is set, the FMASK decompression operation
1860        * doesn't occur (DCC_COMPRESS also implies FMASK_DECOMPRESS).
1861        */
1862       cb_color_info &= C_028C70_FMASK_COMPRESS_1FRAG_ONLY;
1863    }
1864
1865    if (cmd_buffer->device->physical_device->rad_info.gfx_level >= GFX11) {
1866       radeon_set_context_reg_seq(cmd_buffer->cs, R_028C6C_CB_COLOR0_VIEW + index * 0x3c, 4);
1867       radeon_emit(cmd_buffer->cs, cb->cb_color_view);                      /* CB_COLOR0_VIEW */
1868       radeon_emit(cmd_buffer->cs, cb->cb_color_info);                      /* CB_COLOR0_INFO */
1869       radeon_emit(cmd_buffer->cs, cb->cb_color_attrib);                    /* CB_COLOR0_ATTRIB */
1870       radeon_emit(cmd_buffer->cs, cb_fdcc_control);                        /* CB_COLOR0_FDCC_CONTROL */
1871
1872       radeon_set_context_reg(cmd_buffer->cs, R_028C60_CB_COLOR0_BASE + index * 0x3c, cb->cb_color_base);
1873       radeon_set_context_reg(cmd_buffer->cs, R_028E40_CB_COLOR0_BASE_EXT + index * 4, cb->cb_color_base >> 32);
1874       radeon_set_context_reg(cmd_buffer->cs, R_028C94_CB_COLOR0_DCC_BASE + index * 0x3c, cb->cb_dcc_base);
1875       radeon_set_context_reg(cmd_buffer->cs, R_028EA0_CB_COLOR0_DCC_BASE_EXT + index * 4, cb->cb_dcc_base >> 32);
1876       radeon_set_context_reg(cmd_buffer->cs, R_028EC0_CB_COLOR0_ATTRIB2 + index * 4, cb->cb_color_attrib2);
1877       radeon_set_context_reg(cmd_buffer->cs, R_028EE0_CB_COLOR0_ATTRIB3 + index * 4, cb->cb_color_attrib3);
1878    } else if (cmd_buffer->device->physical_device->rad_info.gfx_level >= GFX10) {
1879       radeon_set_context_reg_seq(cmd_buffer->cs, R_028C60_CB_COLOR0_BASE + index * 0x3c, 11);
1880       radeon_emit(cmd_buffer->cs, cb->cb_color_base);
1881       radeon_emit(cmd_buffer->cs, 0);
1882       radeon_emit(cmd_buffer->cs, 0);
1883       radeon_emit(cmd_buffer->cs, cb->cb_color_view);
1884       radeon_emit(cmd_buffer->cs, cb_color_info);
1885       radeon_emit(cmd_buffer->cs, cb->cb_color_attrib);
1886       radeon_emit(cmd_buffer->cs, cb->cb_dcc_control);
1887       radeon_emit(cmd_buffer->cs, cb->cb_color_cmask);
1888       radeon_emit(cmd_buffer->cs, 0);
1889       radeon_emit(cmd_buffer->cs, cb->cb_color_fmask);
1890       radeon_emit(cmd_buffer->cs, 0);
1891
1892       radeon_set_context_reg(cmd_buffer->cs, R_028C94_CB_COLOR0_DCC_BASE + index * 0x3c, cb->cb_dcc_base);
1893
1894       radeon_set_context_reg(cmd_buffer->cs, R_028E40_CB_COLOR0_BASE_EXT + index * 4,
1895                              cb->cb_color_base >> 32);
1896       radeon_set_context_reg(cmd_buffer->cs, R_028E60_CB_COLOR0_CMASK_BASE_EXT + index * 4,
1897                              cb->cb_color_cmask >> 32);
1898       radeon_set_context_reg(cmd_buffer->cs, R_028E80_CB_COLOR0_FMASK_BASE_EXT + index * 4,
1899                              cb->cb_color_fmask >> 32);
1900       radeon_set_context_reg(cmd_buffer->cs, R_028EA0_CB_COLOR0_DCC_BASE_EXT + index * 4,
1901                              cb->cb_dcc_base >> 32);
1902       radeon_set_context_reg(cmd_buffer->cs, R_028EC0_CB_COLOR0_ATTRIB2 + index * 4,
1903                              cb->cb_color_attrib2);
1904       radeon_set_context_reg(cmd_buffer->cs, R_028EE0_CB_COLOR0_ATTRIB3 + index * 4,
1905                              cb->cb_color_attrib3);
1906    } else if (cmd_buffer->device->physical_device->rad_info.gfx_level == GFX9) {
1907       radeon_set_context_reg_seq(cmd_buffer->cs, R_028C60_CB_COLOR0_BASE + index * 0x3c, 11);
1908       radeon_emit(cmd_buffer->cs, cb->cb_color_base);
1909       radeon_emit(cmd_buffer->cs, S_028C64_BASE_256B(cb->cb_color_base >> 32));
1910       radeon_emit(cmd_buffer->cs, cb->cb_color_attrib2);
1911       radeon_emit(cmd_buffer->cs, cb->cb_color_view);
1912       radeon_emit(cmd_buffer->cs, cb_color_info);
1913       radeon_emit(cmd_buffer->cs, cb->cb_color_attrib);
1914       radeon_emit(cmd_buffer->cs, cb->cb_dcc_control);
1915       radeon_emit(cmd_buffer->cs, cb->cb_color_cmask);
1916       radeon_emit(cmd_buffer->cs, S_028C80_BASE_256B(cb->cb_color_cmask >> 32));
1917       radeon_emit(cmd_buffer->cs, cb->cb_color_fmask);
1918       radeon_emit(cmd_buffer->cs, S_028C88_BASE_256B(cb->cb_color_fmask >> 32));
1919
1920       radeon_set_context_reg_seq(cmd_buffer->cs, R_028C94_CB_COLOR0_DCC_BASE + index * 0x3c, 2);
1921       radeon_emit(cmd_buffer->cs, cb->cb_dcc_base);
1922       radeon_emit(cmd_buffer->cs, S_028C98_BASE_256B(cb->cb_dcc_base >> 32));
1923
1924       radeon_set_context_reg(cmd_buffer->cs, R_0287A0_CB_MRT0_EPITCH + index * 4,
1925                              cb->cb_mrt_epitch);
1926    } else {
1927       radeon_set_context_reg_seq(cmd_buffer->cs, R_028C60_CB_COLOR0_BASE + index * 0x3c, 11);
1928       radeon_emit(cmd_buffer->cs, cb->cb_color_base);
1929       radeon_emit(cmd_buffer->cs, cb->cb_color_pitch);
1930       radeon_emit(cmd_buffer->cs, cb->cb_color_slice);
1931       radeon_emit(cmd_buffer->cs, cb->cb_color_view);
1932       radeon_emit(cmd_buffer->cs, cb_color_info);
1933       radeon_emit(cmd_buffer->cs, cb->cb_color_attrib);
1934       radeon_emit(cmd_buffer->cs, cb->cb_dcc_control);
1935       radeon_emit(cmd_buffer->cs, cb->cb_color_cmask);
1936       radeon_emit(cmd_buffer->cs, cb->cb_color_cmask_slice);
1937       radeon_emit(cmd_buffer->cs, cb->cb_color_fmask);
1938       radeon_emit(cmd_buffer->cs, cb->cb_color_fmask_slice);
1939
1940       if (is_vi) { /* DCC BASE */
1941          radeon_set_context_reg(cmd_buffer->cs, R_028C94_CB_COLOR0_DCC_BASE + index * 0x3c,
1942                                 cb->cb_dcc_base);
1943       }
1944    }
1945
1946    if (G_028C70_DCC_ENABLE(cb_color_info)) {
1947       /* Drawing with DCC enabled also compresses colorbuffers. */
1948       VkImageSubresourceRange range = {
1949          .aspectMask = iview->vk.aspects,
1950          .baseMipLevel = iview->vk.base_mip_level,
1951          .levelCount = iview->vk.level_count,
1952          .baseArrayLayer = iview->vk.base_array_layer,
1953          .layerCount = iview->vk.layer_count,
1954       };
1955
1956       radv_update_dcc_metadata(cmd_buffer, image, &range, true);
1957    }
1958 }
1959
1960 static void
1961 radv_update_zrange_precision(struct radv_cmd_buffer *cmd_buffer, struct radv_ds_buffer_info *ds,
1962                              const struct radv_image_view *iview, VkImageLayout layout,
1963                              bool requires_cond_exec)
1964 {
1965    const struct radv_image *image = iview->image;
1966    uint32_t db_z_info = ds->db_z_info;
1967    uint32_t db_z_info_reg;
1968
1969    if (!cmd_buffer->device->physical_device->rad_info.has_tc_compat_zrange_bug ||
1970        !radv_image_is_tc_compat_htile(image))
1971       return;
1972
1973    if (!radv_layout_is_htile_compressed(
1974           cmd_buffer->device, image, layout,
1975           radv_image_queue_family_mask(image, cmd_buffer->qf,
1976                                        cmd_buffer->qf))) {
1977       db_z_info &= C_028040_TILE_SURFACE_ENABLE;
1978    }
1979
1980    db_z_info &= C_028040_ZRANGE_PRECISION;
1981
1982    if (cmd_buffer->device->physical_device->rad_info.gfx_level == GFX9) {
1983       db_z_info_reg = R_028038_DB_Z_INFO;
1984    } else {
1985       db_z_info_reg = R_028040_DB_Z_INFO;
1986    }
1987
1988    /* When we don't know the last fast clear value, we need to emit a
1989     * conditional packet that may skip the following
1990     * SET_CONTEXT_REG packet.
1991     */
1992    if (requires_cond_exec) {
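      /* The COND_EXEC packet makes the CP skip the next 3 dwords (the
       * SET_CONTEXT_REG packet emitted below) when the value at 'va' is zero, so
       * DB_Z_INFO is only rewritten with ZRANGE_PRECISION=0 when the last fast
       * depth clear used 0.0f (see radv_update_tc_compat_zrange_metadata).
       */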
1993       uint64_t va = radv_get_tc_compat_zrange_va(image, iview->vk.base_mip_level);
1994
1995       radeon_emit(cmd_buffer->cs, PKT3(PKT3_COND_EXEC, 3, 0));
1996       radeon_emit(cmd_buffer->cs, va);
1997       radeon_emit(cmd_buffer->cs, va >> 32);
1998       radeon_emit(cmd_buffer->cs, 0);
1999       radeon_emit(cmd_buffer->cs, 3); /* SET_CONTEXT_REG size */
2000    }
2001
2002    radeon_set_context_reg(cmd_buffer->cs, db_z_info_reg, db_z_info);
2003 }
2004
2005 static void
2006 radv_emit_fb_ds_state(struct radv_cmd_buffer *cmd_buffer, struct radv_ds_buffer_info *ds,
2007                       struct radv_image_view *iview, VkImageLayout layout)
2008 {
2009    const struct radv_image *image = iview->image;
2010    uint32_t db_z_info = ds->db_z_info;
2011    uint32_t db_stencil_info = ds->db_stencil_info;
2012    uint32_t db_htile_surface = ds->db_htile_surface;
2013
2014    if (!radv_layout_is_htile_compressed(
2015           cmd_buffer->device, image, layout,
2016           radv_image_queue_family_mask(image, cmd_buffer->qf,
2017                                        cmd_buffer->qf))) {
2018       db_z_info &= C_028040_TILE_SURFACE_ENABLE;
2019       db_stencil_info |= S_028044_TILE_STENCIL_DISABLE(1);
2020    }
2021
2022    if (cmd_buffer->device->physical_device->rad_info.gfx_level >= GFX10_3 &&
2023        !cmd_buffer->state.subpass->vrs_attachment) {
2024       db_htile_surface &= C_028ABC_VRS_HTILE_ENCODING;
2025    }
2026
2027    radeon_set_context_reg(cmd_buffer->cs, R_028008_DB_DEPTH_VIEW, ds->db_depth_view);
2028    radeon_set_context_reg(cmd_buffer->cs, R_028ABC_DB_HTILE_SURFACE, db_htile_surface);
2029
2030    if (cmd_buffer->device->physical_device->rad_info.gfx_level >= GFX10) {
2031       radeon_set_context_reg(cmd_buffer->cs, R_028014_DB_HTILE_DATA_BASE, ds->db_htile_data_base);
2032       radeon_set_context_reg(cmd_buffer->cs, R_02801C_DB_DEPTH_SIZE_XY, ds->db_depth_size);
2033
2034       if (cmd_buffer->device->physical_device->rad_info.gfx_level >= GFX11) {
2035          radeon_set_context_reg_seq(cmd_buffer->cs, R_028040_DB_Z_INFO, 6);
2036       } else {
2037          radeon_set_context_reg_seq(cmd_buffer->cs, R_02803C_DB_DEPTH_INFO, 7);
2038          radeon_emit(cmd_buffer->cs, S_02803C_RESOURCE_LEVEL(1));
2039       }
2040       radeon_emit(cmd_buffer->cs, db_z_info);
2041       radeon_emit(cmd_buffer->cs, db_stencil_info);
2042       radeon_emit(cmd_buffer->cs, ds->db_z_read_base);
2043       radeon_emit(cmd_buffer->cs, ds->db_stencil_read_base);
2044       radeon_emit(cmd_buffer->cs, ds->db_z_read_base);
2045       radeon_emit(cmd_buffer->cs, ds->db_stencil_read_base);
2046
2047       radeon_set_context_reg_seq(cmd_buffer->cs, R_028068_DB_Z_READ_BASE_HI, 5);
2048       radeon_emit(cmd_buffer->cs, ds->db_z_read_base >> 32);
2049       radeon_emit(cmd_buffer->cs, ds->db_stencil_read_base >> 32);
2050       radeon_emit(cmd_buffer->cs, ds->db_z_read_base >> 32);
2051       radeon_emit(cmd_buffer->cs, ds->db_stencil_read_base >> 32);
2052       radeon_emit(cmd_buffer->cs, ds->db_htile_data_base >> 32);
2053    } else if (cmd_buffer->device->physical_device->rad_info.gfx_level == GFX9) {
2054       radeon_set_context_reg_seq(cmd_buffer->cs, R_028014_DB_HTILE_DATA_BASE, 3);
2055       radeon_emit(cmd_buffer->cs, ds->db_htile_data_base);
2056       radeon_emit(cmd_buffer->cs, S_028018_BASE_HI(ds->db_htile_data_base >> 32));
2057       radeon_emit(cmd_buffer->cs, ds->db_depth_size);
2058
2059       radeon_set_context_reg_seq(cmd_buffer->cs, R_028038_DB_Z_INFO, 10);
2060       radeon_emit(cmd_buffer->cs, db_z_info);          /* DB_Z_INFO */
2061       radeon_emit(cmd_buffer->cs, db_stencil_info);    /* DB_STENCIL_INFO */
2062       radeon_emit(cmd_buffer->cs, ds->db_z_read_base); /* DB_Z_READ_BASE */
2063       radeon_emit(cmd_buffer->cs,
2064                   S_028044_BASE_HI(ds->db_z_read_base >> 32)); /* DB_Z_READ_BASE_HI */
2065       radeon_emit(cmd_buffer->cs, ds->db_stencil_read_base);   /* DB_STENCIL_READ_BASE */
2066       radeon_emit(cmd_buffer->cs,
2067                   S_02804C_BASE_HI(ds->db_stencil_read_base >> 32)); /* DB_STENCIL_READ_BASE_HI */
2068       radeon_emit(cmd_buffer->cs, ds->db_z_write_base);              /* DB_Z_WRITE_BASE */
2069       radeon_emit(cmd_buffer->cs,
2070                   S_028054_BASE_HI(ds->db_z_write_base >> 32)); /* DB_Z_WRITE_BASE_HI */
2071       radeon_emit(cmd_buffer->cs, ds->db_stencil_write_base);   /* DB_STENCIL_WRITE_BASE */
2072       radeon_emit(cmd_buffer->cs,
2073                   S_02805C_BASE_HI(ds->db_stencil_write_base >> 32)); /* DB_STENCIL_WRITE_BASE_HI */
2074
2075       radeon_set_context_reg_seq(cmd_buffer->cs, R_028068_DB_Z_INFO2, 2);
2076       radeon_emit(cmd_buffer->cs, ds->db_z_info2);
2077       radeon_emit(cmd_buffer->cs, ds->db_stencil_info2);
2078    } else {
2079       radeon_set_context_reg(cmd_buffer->cs, R_028014_DB_HTILE_DATA_BASE, ds->db_htile_data_base);
2080
2081       radeon_set_context_reg_seq(cmd_buffer->cs, R_02803C_DB_DEPTH_INFO, 9);
2082       radeon_emit(cmd_buffer->cs, ds->db_depth_info);         /* R_02803C_DB_DEPTH_INFO */
2083       radeon_emit(cmd_buffer->cs, db_z_info);                 /* R_028040_DB_Z_INFO */
2084       radeon_emit(cmd_buffer->cs, db_stencil_info);           /* R_028044_DB_STENCIL_INFO */
2085       radeon_emit(cmd_buffer->cs, ds->db_z_read_base);        /* R_028048_DB_Z_READ_BASE */
2086       radeon_emit(cmd_buffer->cs, ds->db_stencil_read_base);  /* R_02804C_DB_STENCIL_READ_BASE */
2087       radeon_emit(cmd_buffer->cs, ds->db_z_write_base);       /* R_028050_DB_Z_WRITE_BASE */
2088       radeon_emit(cmd_buffer->cs, ds->db_stencil_write_base); /* R_028054_DB_STENCIL_WRITE_BASE */
2089       radeon_emit(cmd_buffer->cs, ds->db_depth_size);         /* R_028058_DB_DEPTH_SIZE */
2090       radeon_emit(cmd_buffer->cs, ds->db_depth_slice);        /* R_02805C_DB_DEPTH_SLICE */
2091    }
2092
2093    /* Update the ZRANGE_PRECISION value for the TC-compat bug. */
2094    radv_update_zrange_precision(cmd_buffer, ds, iview, layout, true);
2095
2096    radeon_set_context_reg(cmd_buffer->cs, R_028B78_PA_SU_POLY_OFFSET_DB_FMT_CNTL,
2097                           ds->pa_su_poly_offset_db_fmt_cntl);
2098 }
2099
2100 /**
2101  * Update the fast clear depth/stencil values if the image is bound as a
2102  * depth/stencil buffer.
2103  */
2104 static void
2105 radv_update_bound_fast_clear_ds(struct radv_cmd_buffer *cmd_buffer,
2106                                 const struct radv_image_view *iview,
2107                                 VkClearDepthStencilValue ds_clear_value, VkImageAspectFlags aspects)
2108 {
2109    const struct radv_subpass *subpass = cmd_buffer->state.subpass;
2110    const struct radv_image *image = iview->image;
2111    struct radeon_cmdbuf *cs = cmd_buffer->cs;
2112    uint32_t att_idx;
2113
2114    if (!cmd_buffer->state.attachments || !subpass)
2115       return;
2116
2117    if (!subpass->depth_stencil_attachment)
2118       return;
2119
2120    att_idx = subpass->depth_stencil_attachment->attachment;
2121    if (cmd_buffer->state.attachments[att_idx].iview->image != image)
2122       return;
2123
2124    if (aspects == (VK_IMAGE_ASPECT_DEPTH_BIT | VK_IMAGE_ASPECT_STENCIL_BIT)) {
2125       radeon_set_context_reg_seq(cs, R_028028_DB_STENCIL_CLEAR, 2);
2126       radeon_emit(cs, ds_clear_value.stencil);
2127       radeon_emit(cs, fui(ds_clear_value.depth));
2128    } else if (aspects == VK_IMAGE_ASPECT_DEPTH_BIT) {
2129       radeon_set_context_reg(cs, R_02802C_DB_DEPTH_CLEAR, fui(ds_clear_value.depth));
2130    } else {
2131       assert(aspects == VK_IMAGE_ASPECT_STENCIL_BIT);
2132       radeon_set_context_reg(cs, R_028028_DB_STENCIL_CLEAR, ds_clear_value.stencil);
2133    }
2134
2135    /* Update the ZRANGE_PRECISION value for the TC-compat bug. This is
2136     * only needed when clearing Z to 0.0.
2137     */
2138    if ((aspects & VK_IMAGE_ASPECT_DEPTH_BIT) && ds_clear_value.depth == 0.0) {
2139       VkImageLayout layout = subpass->depth_stencil_attachment->layout;
2140
2141       radv_update_zrange_precision(cmd_buffer, &cmd_buffer->state.attachments[att_idx].ds, iview,
2142                                    layout, false);
2143    }
2144
2145    cmd_buffer->state.context_roll_without_scissor_emitted = true;
2146 }
2147
2148 /**
2149  * Set the clear depth/stencil values to the image's metadata.
2150  */
2151 static void
2152 radv_set_ds_clear_metadata(struct radv_cmd_buffer *cmd_buffer, struct radv_image *image,
2153                            const VkImageSubresourceRange *range,
2154                            VkClearDepthStencilValue ds_clear_value, VkImageAspectFlags aspects)
2155 {
2156    struct radeon_cmdbuf *cs = cmd_buffer->cs;
2157    uint32_t level_count = radv_get_levelCount(image, range);
2158
2159    if (aspects == (VK_IMAGE_ASPECT_DEPTH_BIT | VK_IMAGE_ASPECT_STENCIL_BIT)) {
2160       uint64_t va = radv_get_ds_clear_value_va(image, range->baseMipLevel);
2161
2162       /* Use the fastest way when both aspects are used. */
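      /* A single WRITE_DATA packet can update every level because the per-level
       * metadata stores the stencil and depth clear values contiguously (one
       * dword each).
       */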
2163       radeon_emit(cs, PKT3(PKT3_WRITE_DATA, 2 + 2 * level_count, cmd_buffer->state.predicating));
2164       radeon_emit(cs, S_370_DST_SEL(V_370_MEM) | S_370_WR_CONFIRM(1) | S_370_ENGINE_SEL(V_370_PFP));
2165       radeon_emit(cs, va);
2166       radeon_emit(cs, va >> 32);
2167
2168       for (uint32_t l = 0; l < level_count; l++) {
2169          radeon_emit(cs, ds_clear_value.stencil);
2170          radeon_emit(cs, fui(ds_clear_value.depth));
2171       }
2172    } else {
2173       /* Otherwise we need one WRITE_DATA packet per level. */
2174       for (uint32_t l = 0; l < level_count; l++) {
2175          uint64_t va = radv_get_ds_clear_value_va(image, range->baseMipLevel + l);
2176          unsigned value;
2177
2178          if (aspects == VK_IMAGE_ASPECT_DEPTH_BIT) {
2179             value = fui(ds_clear_value.depth);
2180             va += 4;
2181          } else {
2182             assert(aspects == VK_IMAGE_ASPECT_STENCIL_BIT);
2183             value = ds_clear_value.stencil;
2184          }
2185
2186          radeon_emit(cs, PKT3(PKT3_WRITE_DATA, 3, cmd_buffer->state.predicating));
2187          radeon_emit(cs,
2188                      S_370_DST_SEL(V_370_MEM) | S_370_WR_CONFIRM(1) | S_370_ENGINE_SEL(V_370_PFP));
2189          radeon_emit(cs, va);
2190          radeon_emit(cs, va >> 32);
2191          radeon_emit(cs, value);
2192       }
2193    }
2194 }
2195
2196 /**
2197  * Update the TC-compat metadata value for this image.
2198  */
2199 static void
2200 radv_set_tc_compat_zrange_metadata(struct radv_cmd_buffer *cmd_buffer, struct radv_image *image,
2201                                    const VkImageSubresourceRange *range, uint32_t value)
2202 {
2203    struct radeon_cmdbuf *cs = cmd_buffer->cs;
2204
2205    if (!cmd_buffer->device->physical_device->rad_info.has_tc_compat_zrange_bug)
2206       return;
2207
2208    uint64_t va = radv_get_tc_compat_zrange_va(image, range->baseMipLevel);
2209    uint32_t level_count = radv_get_levelCount(image, range);
2210
2211    radeon_emit(cs, PKT3(PKT3_WRITE_DATA, 2 + level_count, cmd_buffer->state.predicating));
2212    radeon_emit(cs, S_370_DST_SEL(V_370_MEM) | S_370_WR_CONFIRM(1) | S_370_ENGINE_SEL(V_370_PFP));
2213    radeon_emit(cs, va);
2214    radeon_emit(cs, va >> 32);
2215
2216    for (uint32_t l = 0; l < level_count; l++)
2217       radeon_emit(cs, value);
2218 }
2219
2220 static void
2221 radv_update_tc_compat_zrange_metadata(struct radv_cmd_buffer *cmd_buffer,
2222                                       const struct radv_image_view *iview,
2223                                       VkClearDepthStencilValue ds_clear_value)
2224 {
2225    VkImageSubresourceRange range = {
2226       .aspectMask = iview->vk.aspects,
2227       .baseMipLevel = iview->vk.base_mip_level,
2228       .levelCount = iview->vk.level_count,
2229       .baseArrayLayer = iview->vk.base_array_layer,
2230       .layerCount = iview->vk.layer_count,
2231    };
2232    uint32_t cond_val;
2233
2234    /* Conditionally set DB_Z_INFO.ZRANGE_PRECISION to 0 when the last
2235     * depth clear value is 0.0f.
2236     */
2237    cond_val = ds_clear_value.depth == 0.0f ? UINT_MAX : 0;
2238
2239    radv_set_tc_compat_zrange_metadata(cmd_buffer, iview->image, &range, cond_val);
2240 }
2241
2242 /**
2243  * Update the clear depth/stencil values for this image.
2244  */
2245 void
2246 radv_update_ds_clear_metadata(struct radv_cmd_buffer *cmd_buffer,
2247                               const struct radv_image_view *iview,
2248                               VkClearDepthStencilValue ds_clear_value, VkImageAspectFlags aspects)
2249 {
2250    VkImageSubresourceRange range = {
2251       .aspectMask = iview->vk.aspects,
2252       .baseMipLevel = iview->vk.base_mip_level,
2253       .levelCount = iview->vk.level_count,
2254       .baseArrayLayer = iview->vk.base_array_layer,
2255       .layerCount = iview->vk.layer_count,
2256    };
2257    struct radv_image *image = iview->image;
2258
2259    assert(radv_htile_enabled(image, range.baseMipLevel));
2260
2261    radv_set_ds_clear_metadata(cmd_buffer, iview->image, &range, ds_clear_value, aspects);
2262
2263    if (radv_image_is_tc_compat_htile(image) && (aspects & VK_IMAGE_ASPECT_DEPTH_BIT)) {
2264       radv_update_tc_compat_zrange_metadata(cmd_buffer, iview, ds_clear_value);
2265    }
2266
2267    radv_update_bound_fast_clear_ds(cmd_buffer, iview, ds_clear_value, aspects);
2268 }
2269
2270 /**
2271  * Load the clear depth/stencil values from the image's metadata.
2272  */
2273 static void
2274 radv_load_ds_clear_metadata(struct radv_cmd_buffer *cmd_buffer, const struct radv_image_view *iview)
2275 {
2276    struct radeon_cmdbuf *cs = cmd_buffer->cs;
2277    const struct radv_image *image = iview->image;
2278    VkImageAspectFlags aspects = vk_format_aspects(image->vk.format);
2279    uint64_t va = radv_get_ds_clear_value_va(image, iview->vk.base_mip_level);
2280    unsigned reg_offset = 0, reg_count = 0;
2281
2282    assert(radv_image_has_htile(image));
2283
2284    if (aspects & VK_IMAGE_ASPECT_STENCIL_BIT) {
2285       ++reg_count;
2286    } else {
2287       ++reg_offset;
2288       va += 4;
2289    }
2290    if (aspects & VK_IMAGE_ASPECT_DEPTH_BIT)
2291       ++reg_count;
2292
2293    uint32_t reg = R_028028_DB_STENCIL_CLEAR + 4 * reg_offset;
2294
2295    if (cmd_buffer->device->physical_device->rad_info.has_load_ctx_reg_pkt) {
2296       radeon_emit(cs, PKT3(PKT3_LOAD_CONTEXT_REG_INDEX, 3, 0));
2297       radeon_emit(cs, va);
2298       radeon_emit(cs, va >> 32);
2299       radeon_emit(cs, (reg - SI_CONTEXT_REG_OFFSET) >> 2);
2300       radeon_emit(cs, reg_count);
2301    } else {
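      /* Without the LOAD_CONTEXT_REG packet, fall back to COPY_DATA from memory
       * into the clear registers; PFP_SYNC_ME then stalls the PFP so later
       * commands aren't processed ahead of the register update.
       */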
2302       radeon_emit(cs, PKT3(PKT3_COPY_DATA, 4, 0));
2303       radeon_emit(cs, COPY_DATA_SRC_SEL(COPY_DATA_SRC_MEM) | COPY_DATA_DST_SEL(COPY_DATA_REG) |
2304                          (reg_count == 2 ? COPY_DATA_COUNT_SEL : 0));
2305       radeon_emit(cs, va);
2306       radeon_emit(cs, va >> 32);
2307       radeon_emit(cs, reg >> 2);
2308       radeon_emit(cs, 0);
2309
2310       radeon_emit(cs, PKT3(PKT3_PFP_SYNC_ME, 0, 0));
2311       radeon_emit(cs, 0);
2312    }
2313 }
2314
2315 /*
2316  * With DCC, some color buffers don't require CMASK elimination before
2317  * being used as a texture. This sets a predicate value that determines
2318  * whether the CMASK eliminate pass is required.
2319  */
2320 void
2321 radv_update_fce_metadata(struct radv_cmd_buffer *cmd_buffer, struct radv_image *image,
2322                          const VkImageSubresourceRange *range, bool value)
2323 {
2324    if (!image->fce_pred_offset)
2325       return;
2326
2327    uint64_t pred_val = value;
2328    uint64_t va = radv_image_get_fce_pred_va(image, range->baseMipLevel);
2329    uint32_t level_count = radv_get_levelCount(image, range);
2330    uint32_t count = 2 * level_count;
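   /* Each mip level stores a 64-bit predicate value, hence two dwords per level
    * in the WRITE_DATA payload below.
    */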
2331
2332    radeon_emit(cmd_buffer->cs, PKT3(PKT3_WRITE_DATA, 2 + count, 0));
2333    radeon_emit(cmd_buffer->cs,
2334                S_370_DST_SEL(V_370_MEM) | S_370_WR_CONFIRM(1) | S_370_ENGINE_SEL(V_370_PFP));
2335    radeon_emit(cmd_buffer->cs, va);
2336    radeon_emit(cmd_buffer->cs, va >> 32);
2337
2338    for (uint32_t l = 0; l < level_count; l++) {
2339       radeon_emit(cmd_buffer->cs, pred_val);
2340       radeon_emit(cmd_buffer->cs, pred_val >> 32);
2341    }
2342 }
2343
2344 /**
2345  * Update the DCC predicate to reflect the compression state.
2346  */
2347 void
2348 radv_update_dcc_metadata(struct radv_cmd_buffer *cmd_buffer, struct radv_image *image,
2349                          const VkImageSubresourceRange *range, bool value)
2350 {
2351    if (image->dcc_pred_offset == 0)
2352       return;
2353
2354    uint64_t pred_val = value;
2355    uint64_t va = radv_image_get_dcc_pred_va(image, range->baseMipLevel);
2356    uint32_t level_count = radv_get_levelCount(image, range);
2357    uint32_t count = 2 * level_count;
2358
2359    assert(radv_dcc_enabled(image, range->baseMipLevel));
2360
2361    radeon_emit(cmd_buffer->cs, PKT3(PKT3_WRITE_DATA, 2 + count, 0));
2362    radeon_emit(cmd_buffer->cs,
2363                S_370_DST_SEL(V_370_MEM) | S_370_WR_CONFIRM(1) | S_370_ENGINE_SEL(V_370_PFP));
2364    radeon_emit(cmd_buffer->cs, va);
2365    radeon_emit(cmd_buffer->cs, va >> 32);
2366
2367    for (uint32_t l = 0; l < level_count; l++) {
2368       radeon_emit(cmd_buffer->cs, pred_val);
2369       radeon_emit(cmd_buffer->cs, pred_val >> 32);
2370    }
2371 }
2372
2373 /**
2374  * Update the fast clear color values if the image is bound as a color buffer.
2375  */
2376 static void
2377 radv_update_bound_fast_clear_color(struct radv_cmd_buffer *cmd_buffer, struct radv_image *image,
2378                                    int cb_idx, uint32_t color_values[2])
2379 {
2380    const struct radv_subpass *subpass = cmd_buffer->state.subpass;
2381    struct radeon_cmdbuf *cs = cmd_buffer->cs;
2382    uint32_t att_idx;
2383
2384    if (!cmd_buffer->state.attachments || !subpass)
2385       return;
2386
2387    att_idx = subpass->color_attachments[cb_idx].attachment;
2388    if (att_idx == VK_ATTACHMENT_UNUSED)
2389       return;
2390
2391    if (cmd_buffer->state.attachments[att_idx].iview->image != image)
2392       return;
2393
2394    radeon_set_context_reg_seq(cs, R_028C8C_CB_COLOR0_CLEAR_WORD0 + cb_idx * 0x3c, 2);
2395    radeon_emit(cs, color_values[0]);
2396    radeon_emit(cs, color_values[1]);
2397
2398    cmd_buffer->state.context_roll_without_scissor_emitted = true;
2399 }
2400
2401 /**
2402  * Set the clear color values to the image's metadata.
2403  */
2404 static void
2405 radv_set_color_clear_metadata(struct radv_cmd_buffer *cmd_buffer, struct radv_image *image,
2406                               const VkImageSubresourceRange *range, uint32_t color_values[2])
2407 {
2408    struct radeon_cmdbuf *cs = cmd_buffer->cs;
2409    uint32_t level_count = radv_get_levelCount(image, range);
2410    uint32_t count = 2 * level_count;
2411
2412    assert(radv_image_has_cmask(image) || radv_dcc_enabled(image, range->baseMipLevel));
2413
2414    if (radv_image_has_clear_value(image)) {
2415       uint64_t va = radv_image_get_fast_clear_va(image, range->baseMipLevel);
2416
2417       radeon_emit(cs, PKT3(PKT3_WRITE_DATA, 2 + count, cmd_buffer->state.predicating));
2418       radeon_emit(cs, S_370_DST_SEL(V_370_MEM) | S_370_WR_CONFIRM(1) | S_370_ENGINE_SEL(V_370_PFP));
2419       radeon_emit(cs, va);
2420       radeon_emit(cs, va >> 32);
2421
2422       for (uint32_t l = 0; l < level_count; l++) {
2423          radeon_emit(cs, color_values[0]);
2424          radeon_emit(cs, color_values[1]);
2425       }
2426    } else {
2427       /* No clear value metadata to update; only the default (zero) values are expected here. */
2428       assert(color_values[0] == 0 && color_values[1] == 0);
2429    }
2430 }
2431
2432 /**
2433  * Update the clear color values for this image.
2434  */
2435 void
2436 radv_update_color_clear_metadata(struct radv_cmd_buffer *cmd_buffer,
2437                                  const struct radv_image_view *iview, int cb_idx,
2438                                  uint32_t color_values[2])
2439 {
2440    struct radv_image *image = iview->image;
2441    VkImageSubresourceRange range = {
2442       .aspectMask = iview->vk.aspects,
2443       .baseMipLevel = iview->vk.base_mip_level,
2444       .levelCount = iview->vk.level_count,
2445       .baseArrayLayer = iview->vk.base_array_layer,
2446       .layerCount = iview->vk.layer_count,
2447    };
2448
2449    assert(radv_image_has_cmask(image) || radv_dcc_enabled(image, iview->vk.base_mip_level));
2450
2451    /* There is no need to update the clear value for images that are fast cleared with the
2452     * comp-to-single mode because the hardware reads the value from the image directly.
2453     */
2454    if (iview->image->support_comp_to_single)
2455       return;
2456
2457    radv_set_color_clear_metadata(cmd_buffer, image, &range, color_values);
2458
2459    radv_update_bound_fast_clear_color(cmd_buffer, image, cb_idx, color_values);
2460 }
2461
2462 /**
2463  * Load the clear color values from the image's metadata.
2464  */
2465 static void
2466 radv_load_color_clear_metadata(struct radv_cmd_buffer *cmd_buffer, struct radv_image_view *iview,
2467                                int cb_idx)
2468 {
2469    struct radeon_cmdbuf *cs = cmd_buffer->cs;
2470    struct radv_image *image = iview->image;
2471
2472    if (!radv_image_has_cmask(image) && !radv_dcc_enabled(image, iview->vk.base_mip_level))
2473       return;
2474
2475    if (iview->image->support_comp_to_single)
2476       return;
2477
2478    if (!radv_image_has_clear_value(image)) {
2479       uint32_t color_values[2] = {0, 0};
2480       radv_update_bound_fast_clear_color(cmd_buffer, image, cb_idx, color_values);
2481       return;
2482    }
2483
2484    uint64_t va = radv_image_get_fast_clear_va(image, iview->vk.base_mip_level);
2485    uint32_t reg = R_028C8C_CB_COLOR0_CLEAR_WORD0 + cb_idx * 0x3c;
2486
2487    if (cmd_buffer->device->physical_device->rad_info.has_load_ctx_reg_pkt) {
2488       radeon_emit(cs, PKT3(PKT3_LOAD_CONTEXT_REG_INDEX, 3, cmd_buffer->state.predicating));
2489       radeon_emit(cs, va);
2490       radeon_emit(cs, va >> 32);
2491       radeon_emit(cs, (reg - SI_CONTEXT_REG_OFFSET) >> 2);
2492       radeon_emit(cs, 2);
2493    } else {
2494       radeon_emit(cs, PKT3(PKT3_COPY_DATA, 4, cmd_buffer->state.predicating));
2495       radeon_emit(cs, COPY_DATA_SRC_SEL(COPY_DATA_SRC_MEM) | COPY_DATA_DST_SEL(COPY_DATA_REG) |
2496                          COPY_DATA_COUNT_SEL);
2497       radeon_emit(cs, va);
2498       radeon_emit(cs, va >> 32);
2499       radeon_emit(cs, reg >> 2);
2500       radeon_emit(cs, 0);
2501
2502       radeon_emit(cs, PKT3(PKT3_PFP_SYNC_ME, 0, cmd_buffer->state.predicating));
2503       radeon_emit(cs, 0);
2504    }
2505 }
2506
2507 /* GFX9+ metadata cache flushing workaround. Metadata cache coherency is
2508  * broken if the CB caches data of multiple mips of the same image at the
2509  * same time.
2510  *
2511  * Insert some flushes to avoid this.
2512  */
2513 static void
2514 radv_emit_fb_mip_change_flush(struct radv_cmd_buffer *cmd_buffer)
2515 {
2516    struct vk_framebuffer *framebuffer = cmd_buffer->state.framebuffer;
2517    const struct radv_subpass *subpass = cmd_buffer->state.subpass;
2518    bool color_mip_changed = false;
2519
2520    /* This workaround is not applicable before GFX9. */
2521    if (cmd_buffer->device->physical_device->rad_info.gfx_level < GFX9)
2522       return;
2523
2524    if (!framebuffer)
2525       return;
2526
2527    for (int i = 0; i < subpass->color_count; ++i) {
2528       int idx = subpass->color_attachments[i].attachment;
2529       if (idx == VK_ATTACHMENT_UNUSED)
2530          continue;
2531
2532       struct radv_image_view *iview = cmd_buffer->state.attachments[idx].iview;
2533
2534       if ((radv_image_has_CB_metadata(iview->image) ||
2535            radv_dcc_enabled(iview->image, iview->vk.base_mip_level) ||
2536            radv_dcc_enabled(iview->image, cmd_buffer->state.cb_mip[i])) &&
2537           cmd_buffer->state.cb_mip[i] != iview->vk.base_mip_level)
2538          color_mip_changed = true;
2539
2540       cmd_buffer->state.cb_mip[i] = iview->vk.base_mip_level;
2541    }
2542
2543    if (color_mip_changed) {
2544       cmd_buffer->state.flush_bits |=
2545          RADV_CMD_FLAG_FLUSH_AND_INV_CB | RADV_CMD_FLAG_FLUSH_AND_INV_CB_META;
2546    }
2547 }
2548
2549 /* This function emits the flushes for mip changes if any render target level is
2550  * not zero. This way we can assume at the start of the next cmd_buffer that
2551  * rendering to mip 0 doesn't need any flushes. Since mip 0 is the most common
2552  * case, this saves some flushes. */
2553 static void
2554 radv_emit_mip_change_flush_default(struct radv_cmd_buffer *cmd_buffer)
2555 {
2556    /* This workaround is not applicable before GFX9. */
2557    if (cmd_buffer->device->physical_device->rad_info.gfx_level < GFX9)
2558       return;
2559
2560    bool need_color_mip_flush = false;
2561    for (unsigned i = 0; i < 8; ++i) {
2562       if (cmd_buffer->state.cb_mip[i]) {
2563          need_color_mip_flush = true;
2564          break;
2565       }
2566    }
2567
2568    if (need_color_mip_flush) {
2569       cmd_buffer->state.flush_bits |=
2570          RADV_CMD_FLAG_FLUSH_AND_INV_CB | RADV_CMD_FLAG_FLUSH_AND_INV_CB_META;
2571    }
2572
2573    memset(cmd_buffer->state.cb_mip, 0, sizeof(cmd_buffer->state.cb_mip));
2574 }
2575
2576 static struct radv_image *
2577 radv_cmd_buffer_get_vrs_image(struct radv_cmd_buffer *cmd_buffer)
2578 {
2579    struct radv_device *device = cmd_buffer->device;
2580
2581    if (!device->vrs.image) {
2582       VkResult result;
2583
2584       /* The global VRS state is initialized on-demand to avoid wasting VRAM. */
2585       result = radv_device_init_vrs_state(device);
2586       if (result != VK_SUCCESS) {
2587          vk_command_buffer_set_error(&cmd_buffer->vk, result);
2588          return NULL;
2589       }
2590    }
2591
2592    return device->vrs.image;
2593 }
2594
2595 static void
2596 radv_emit_framebuffer_state(struct radv_cmd_buffer *cmd_buffer)
2597 {
2598    int i;
2599    struct vk_framebuffer *framebuffer = cmd_buffer->state.framebuffer;
2600    const struct radv_subpass *subpass = cmd_buffer->state.subpass;
2601    bool disable_constant_encode_ac01 = false;
2602    unsigned color_invalid = cmd_buffer->device->physical_device->rad_info.gfx_level >= GFX11
2603                             ? G_028C70_FORMAT_GFX11(V_028C70_COLOR_INVALID)
2604                             : G_028C70_FORMAT_GFX6(V_028C70_COLOR_INVALID);
2605
2606    for (i = 0; i < subpass->color_count; ++i) {
2607       if (subpass->color_attachments[i].attachment == VK_ATTACHMENT_UNUSED) {
2608          radeon_set_context_reg(cmd_buffer->cs, R_028C70_CB_COLOR0_INFO + i * 0x3C, color_invalid);
2609          continue;
2610       }
2611
2612       int idx = subpass->color_attachments[i].attachment;
2613       struct radv_image_view *iview = cmd_buffer->state.attachments[idx].iview;
2614       VkImageLayout layout = subpass->color_attachments[i].layout;
2615
2616       radv_cs_add_buffer(cmd_buffer->device->ws, cmd_buffer->cs, iview->image->bindings[0].bo);
2617
2618       assert(iview->vk.aspects & (VK_IMAGE_ASPECT_COLOR_BIT | VK_IMAGE_ASPECT_PLANE_0_BIT |
2619                                    VK_IMAGE_ASPECT_PLANE_1_BIT | VK_IMAGE_ASPECT_PLANE_2_BIT));
2620
2621       if (iview->image->disjoint && iview->vk.aspects == VK_IMAGE_ASPECT_COLOR_BIT) {
2622          for (uint32_t plane_id = 0; plane_id < iview->image->plane_count; plane_id++) {
2623             radv_cs_add_buffer(cmd_buffer->device->ws, cmd_buffer->cs,
2624                   iview->image->bindings[plane_id].bo);
2625          }
2626       } else {
2627          uint32_t plane_id = iview->image->disjoint ? iview->plane_id : 0;
2628          radv_cs_add_buffer(cmd_buffer->device->ws, cmd_buffer->cs,
2629                iview->image->bindings[plane_id].bo);
2630       }
2631
2632       radv_emit_fb_color_state(cmd_buffer, i, &cmd_buffer->state.attachments[idx].cb, iview, layout);
2633
2634       radv_load_color_clear_metadata(cmd_buffer, iview, i);
2635
2636       if (cmd_buffer->device->physical_device->rad_info.gfx_level >= GFX9 &&
2637           iview->image->dcc_sign_reinterpret) {
2638          /* Disable constant encoding with the clear value of "1" when the DCC signedness
2639           * differs, because the hardware would fill in "1" instead of the clear value.
2640           */
2641          disable_constant_encode_ac01 = true;
2642       }
2643    }
2644    for (; i < cmd_buffer->state.last_subpass_color_count; i++) {
2645       radeon_set_context_reg(cmd_buffer->cs, R_028C70_CB_COLOR0_INFO + i * 0x3C, color_invalid);
2646    }
2647    cmd_buffer->state.last_subpass_color_count = subpass->color_count;
2648
2649    if (subpass->depth_stencil_attachment) {
2650       int idx = subpass->depth_stencil_attachment->attachment;
2651       VkImageLayout layout = subpass->depth_stencil_attachment->layout;
2652       struct radv_image_view *iview = cmd_buffer->state.attachments[idx].iview;
2653       radv_cs_add_buffer(cmd_buffer->device->ws, cmd_buffer->cs,
2654                          cmd_buffer->state.attachments[idx].iview->image->bindings[0].bo);
2655
2656       radv_emit_fb_ds_state(cmd_buffer, &cmd_buffer->state.attachments[idx].ds, iview, layout);
2657
2658       if (radv_layout_is_htile_compressed(
2659              cmd_buffer->device, iview->image, layout,
2660              radv_image_queue_family_mask(iview->image, cmd_buffer->qf,
2661                                           cmd_buffer->qf))) {
2662          /* Only load the depth/stencil fast clear values when
2663           * compressed rendering is enabled.
2664           */
2665          radv_load_ds_clear_metadata(cmd_buffer, iview);
2666       }
2667    } else if (subpass->vrs_attachment && radv_cmd_buffer_get_vrs_image(cmd_buffer)) {
2668       /* When a subpass uses a VRS attachment without binding a depth/stencil attachment, we have to
2669        * bind our internal depth buffer that contains the VRS data as part of HTILE.
2670        */
2671       VkImageLayout layout = VK_IMAGE_LAYOUT_DEPTH_STENCIL_ATTACHMENT_OPTIMAL;
2672       struct radv_buffer *htile_buffer = cmd_buffer->device->vrs.buffer;
2673       struct radv_image *image = cmd_buffer->device->vrs.image;
2674       struct radv_ds_buffer_info ds;
2675       struct radv_image_view iview;
2676
2677       radv_image_view_init(&iview, cmd_buffer->device,
2678                            &(VkImageViewCreateInfo){
2679                               .sType = VK_STRUCTURE_TYPE_IMAGE_VIEW_CREATE_INFO,
2680                               .image = radv_image_to_handle(image),
2681                               .viewType = radv_meta_get_view_type(image),
2682                               .format = image->vk.format,
2683                               .subresourceRange =
2684                                  {
2685                                     .aspectMask = VK_IMAGE_ASPECT_DEPTH_BIT,
2686                                     .baseMipLevel = 0,
2687                                     .levelCount = 1,
2688                                     .baseArrayLayer = 0,
2689                                     .layerCount = 1,
2690                                  },
2691                            },
2692                            0, NULL);
2693
2694       radv_initialise_vrs_surface(image, htile_buffer, &ds);
2695
2696       radv_cs_add_buffer(cmd_buffer->device->ws, cmd_buffer->cs, htile_buffer->bo);
2697
2698       radv_emit_fb_ds_state(cmd_buffer, &ds, &iview, layout);
2699
2700       radv_image_view_finish(&iview);
2701    } else {
2702       unsigned num_samples = 0;
2703
2704       /* On GFX11, DB_Z_INFO.NUM_SAMPLES should always match the framebuffer samples. It affects
2705        * VRS and occlusion queries if depth and stencil are not bound.
2706        */
2707       if (cmd_buffer->device->physical_device->rad_info.gfx_level == GFX11)
2708          num_samples = util_logbase2(subpass->max_sample_count);
2709
2710       if (cmd_buffer->device->physical_device->rad_info.gfx_level == GFX9)
2711          radeon_set_context_reg_seq(cmd_buffer->cs, R_028038_DB_Z_INFO, 2);
2712       else
2713          radeon_set_context_reg_seq(cmd_buffer->cs, R_028040_DB_Z_INFO, 2);
2714
2715       radeon_emit(cmd_buffer->cs, S_028040_FORMAT(V_028040_Z_INVALID) |       /* DB_Z_INFO */
2716                                   S_028040_NUM_SAMPLES(num_samples));
2717       radeon_emit(cmd_buffer->cs, S_028044_FORMAT(V_028044_STENCIL_INVALID)); /* DB_STENCIL_INFO */
2718    }
2719    radeon_set_context_reg(cmd_buffer->cs, R_028208_PA_SC_WINDOW_SCISSOR_BR,
2720                           S_028208_BR_X(framebuffer->width) | S_028208_BR_Y(framebuffer->height));
2721
2722    if (cmd_buffer->device->physical_device->rad_info.gfx_level >= GFX8) {
2723       bool disable_constant_encode =
2724          cmd_buffer->device->physical_device->rad_info.has_dcc_constant_encode;
2725       enum amd_gfx_level gfx_level = cmd_buffer->device->physical_device->rad_info.gfx_level;
2726       uint8_t watermark = gfx_level >= GFX10 ? 6 : 4;
2727
2728       if (cmd_buffer->device->physical_device->rad_info.gfx_level >= GFX11) {
2729          radeon_set_context_reg(cmd_buffer->cs, R_028424_CB_FDCC_CONTROL,
2730                                 S_028424_SAMPLE_MASK_TRACKER_WATERMARK(watermark));
2731       } else {
2732          radeon_set_context_reg(cmd_buffer->cs, R_028424_CB_DCC_CONTROL,
2733                                 S_028424_OVERWRITE_COMBINER_MRT_SHARING_DISABLE(gfx_level <= GFX9) |
2734                                 S_028424_OVERWRITE_COMBINER_WATERMARK(watermark) |
2735                                 S_028424_DISABLE_CONSTANT_ENCODE_AC01(disable_constant_encode_ac01) |
2736                                 S_028424_DISABLE_CONSTANT_ENCODE_REG(disable_constant_encode));
2737       }
2738    }
2739
2740    cmd_buffer->state.dirty &= ~RADV_CMD_DIRTY_FRAMEBUFFER;
2741 }
2742
2743 static void
2744 radv_emit_guardband_state(struct radv_cmd_buffer *cmd_buffer)
2745 {
2746    struct radv_graphics_pipeline *pipeline = cmd_buffer->state.graphics_pipeline;
2747    const struct radv_dynamic_state *d = &cmd_buffer->state.dynamic;
2748    unsigned rast_prim;
2749
2750    if (!(pipeline->dynamic_states & RADV_DYNAMIC_PRIMITIVE_TOPOLOGY) ||
2751        (pipeline->active_stages & (VK_SHADER_STAGE_TESSELLATION_CONTROL_BIT |
2752                                    VK_SHADER_STAGE_TESSELLATION_EVALUATION_BIT |
2753                                    VK_SHADER_STAGE_GEOMETRY_BIT |
2754                                    VK_SHADER_STAGE_MESH_BIT_NV))) {
2755       /* Ignore dynamic primitive topology for TES/GS/MS stages. */
2756       rast_prim = pipeline->rast_prim;
2757    } else {
2758       rast_prim = si_conv_prim_to_gs_out(d->primitive_topology);
2759    }
2760
2761    si_write_guardband(cmd_buffer->cs, d->viewport.count, d->viewport.viewports, rast_prim,
2762                       d->line_width);
2763
2764    cmd_buffer->state.dirty &= ~RADV_CMD_DIRTY_GUARDBAND;
2765 }
2766
2767 static void
2768 radv_emit_index_buffer(struct radv_cmd_buffer *cmd_buffer, bool indirect)
2769 {
2770    struct radeon_cmdbuf *cs = cmd_buffer->cs;
2771    struct radv_cmd_state *state = &cmd_buffer->state;
2772
2773    /* With indirectly generated commands, the index buffer bind may be part of the
2774     * indirect command buffer, in which case the app may not have bound one yet. */
2775    if (state->index_type < 0)
2776       return;
2777
2778    /* For direct indexed draws we use DRAW_INDEX_2, which already includes
2779     * the index_va and max_index_count. */
2780    if (!indirect)
2781       return;
2782
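   /* Hedged note: on chips with the zero-index-buffer bug, skip the bind when
    * max_index_count is 0; an empty index buffer presumably triggers the bug
    * and the packets aren't needed in that case anyway.
    */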
2783    if (state->max_index_count ||
2784        !cmd_buffer->device->physical_device->rad_info.has_zero_index_buffer_bug) {
2785       radeon_emit(cs, PKT3(PKT3_INDEX_BASE, 1, 0));
2786       radeon_emit(cs, state->index_va);
2787       radeon_emit(cs, state->index_va >> 32);
2788
2789       radeon_emit(cs, PKT3(PKT3_INDEX_BUFFER_SIZE, 0, 0));
2790       radeon_emit(cs, state->max_index_count);
2791    }
2792
2793    cmd_buffer->state.dirty &= ~RADV_CMD_DIRTY_INDEX_BUFFER;
2794 }
2795
2796 void
2797 radv_set_db_count_control(struct radv_cmd_buffer *cmd_buffer, bool enable_occlusion_queries)
2798 {
2799    bool has_perfect_queries = cmd_buffer->state.perfect_occlusion_queries_enabled;
2800    struct radv_graphics_pipeline *pipeline = cmd_buffer->state.graphics_pipeline;
2801    uint32_t pa_sc_mode_cntl_1 = pipeline ? pipeline->ms.pa_sc_mode_cntl_1 : 0;
2802    uint32_t db_count_control;
2803
2804    if (!enable_occlusion_queries) {
2805       if (cmd_buffer->device->physical_device->rad_info.gfx_level >= GFX7) {
2806          if (G_028A4C_OUT_OF_ORDER_PRIMITIVE_ENABLE(pa_sc_mode_cntl_1) &&
2807              pipeline->disable_out_of_order_rast_for_occlusion && has_perfect_queries) {
2808             /* Re-enable out-of-order rasterization if the
2809              * bound pipeline supports it and if it has
2810              * been disabled before starting any perfect
2811              * occlusion queries.
2812              */
2813             radeon_set_context_reg(cmd_buffer->cs, R_028A4C_PA_SC_MODE_CNTL_1, pa_sc_mode_cntl_1);
2814          }
2815       }
2816       db_count_control = S_028004_ZPASS_INCREMENT_DISABLE(1);
2817    } else {
2818       const struct radv_subpass *subpass = cmd_buffer->state.subpass;
2819       uint32_t sample_rate = subpass ? util_logbase2(subpass->max_sample_count) : 0;
2820       bool gfx10_perfect =
2821          cmd_buffer->device->physical_device->rad_info.gfx_level >= GFX10 && has_perfect_queries;
2822
2823       if (cmd_buffer->device->physical_device->rad_info.gfx_level >= GFX7) {
2824          /* Always enable PERFECT_ZPASS_COUNTS due to issues with partially
2825           * covered tiles, discards, and early depth testing. For more details,
2826           * see https://gitlab.freedesktop.org/mesa/mesa/-/issues/3218 */
2827          db_count_control = S_028004_PERFECT_ZPASS_COUNTS(1) |
2828                             S_028004_DISABLE_CONSERVATIVE_ZPASS_COUNTS(gfx10_perfect) |
2829                             S_028004_SAMPLE_RATE(sample_rate) | S_028004_ZPASS_ENABLE(1) |
2830                             S_028004_SLICE_EVEN_ENABLE(1) | S_028004_SLICE_ODD_ENABLE(1);
2831
2832          if (G_028A4C_OUT_OF_ORDER_PRIMITIVE_ENABLE(pa_sc_mode_cntl_1) &&
2833              pipeline->disable_out_of_order_rast_for_occlusion && has_perfect_queries) {
2834             /* If the bound pipeline has enabled
2835              * out-of-order rasterization, we should
2836              * disable it before starting any perfect
2837              * occlusion queries.
2838              */
2839             pa_sc_mode_cntl_1 &= C_028A4C_OUT_OF_ORDER_PRIMITIVE_ENABLE;
2840
2841             radeon_set_context_reg(cmd_buffer->cs, R_028A4C_PA_SC_MODE_CNTL_1, pa_sc_mode_cntl_1);
2842          }
2843       } else {
2844          db_count_control = S_028004_PERFECT_ZPASS_COUNTS(1) | S_028004_SAMPLE_RATE(sample_rate);
2845       }
2846    }
2847
2848    radeon_set_context_reg(cmd_buffer->cs, R_028004_DB_COUNT_CONTROL, db_count_control);
2849
2850    cmd_buffer->state.context_roll_without_scissor_emitted = true;
2851 }
2852
2853 unsigned
2854 radv_instance_rate_prolog_index(unsigned num_attributes, uint32_t instance_rate_inputs)
2855 {
2856    /* instance_rate_vs_prologs is a flattened array of arrays of different sizes; effectively a
2857     * single array sorted in ascending order by:
2858     * - total number of attributes
2859     * - number of instanced attributes
2860     * - index of first instanced attribute
2861     */
2862
2863    /* From total number of attributes to offset. */
2864    static const uint16_t total_to_offset[16] = {0,   1,   4,   10,  20,  35,  56,  84,
2865                                                 120, 165, 220, 286, 364, 455, 560, 680};
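   /* A sketch of where these offsets come from (derived from the layout
    * described above, not from a spec): a total of k attributes allows
    * k * (k + 1) / 2 (count, first index) pairs, so the entry for n total
    * attributes is the running sum 0, 1, 1+3, 1+3+6, ... of those counts.
    */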
2866    unsigned start_index = total_to_offset[num_attributes - 1];
2867
2868    /* From number of instanced attributes to offset. This would require a different LUT depending on
2869     * the total number of attributes, but we can exploit a pattern to use just the LUT for 16 total
2870     * attributes.
2871     */
2872    static const uint8_t count_to_offset_total16[16] = {0,   16,  31,  45,  58,  70,  81,  91,
2873                                                        100, 108, 115, 121, 126, 130, 133, 135};
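   /* The exploited pattern, sketched under the same layout assumption: with N
    * total attributes, the offset of instanced-attribute count c is
    * N + (N-1) + ... + (N-c+2), which equals the 16-attribute offset minus
    * (16 - N) * (c - 1), so the single 16-entry LUT covers every total.
    */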
2874    unsigned count = util_bitcount(instance_rate_inputs);
2875    unsigned offset_from_start_index =
2876       count_to_offset_total16[count - 1] - ((16 - num_attributes) * (count - 1));
2877
2878    unsigned first = ffs(instance_rate_inputs) - 1;
2879    return start_index + offset_from_start_index + first;
2880 }
2881
2882 union vs_prolog_key_header {
2883    struct {
2884       uint32_t key_size : 8;
2885       uint32_t num_attributes : 6;
2886       uint32_t as_ls : 1;
2887       uint32_t is_ngg : 1;
2888       uint32_t wave32 : 1;
2889       uint32_t next_stage : 3;
2890       uint32_t instance_rate_inputs : 1;
2891       uint32_t alpha_adjust_lo : 1;
2892       uint32_t alpha_adjust_hi : 1;
2893       uint32_t misaligned_mask : 1;
2894       uint32_t post_shuffle : 1;
2895       uint32_t nontrivial_divisors : 1;
2896       uint32_t zero_divisors : 1;
2897       /* We need this to ensure the padding is zero; it matters even though the field is otherwise unused. */
2898       uint32_t padding0 : 5;
2899    };
2900    uint32_t v;
2901 };
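/* The packed prolog key is key_words[0] = this header followed by the optional
 * mask words; header.key_size is the total size in bytes, which is what
 * radv_hash_vs_prolog() and radv_cmp_vs_prolog() below rely on.
 */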
2902
2903 uint32_t
2904 radv_hash_vs_prolog(const void *key_)
2905 {
2906    const uint32_t *key = key_;
2907    union vs_prolog_key_header header;
2908    header.v = key[0];
2909    return _mesa_hash_data(key, header.key_size);
2910 }
2911
2912 bool
2913 radv_cmp_vs_prolog(const void *a_, const void *b_)
2914 {
2915    const uint32_t *a = a_;
2916    const uint32_t *b = b_;
2917    if (a[0] != b[0])
2918       return false;
2919
2920    union vs_prolog_key_header header;
2921    header.v = a[0];
2922    return memcmp(a, b, header.key_size) == 0;
2923 }
2924
2925 static struct radv_shader_part *
2926 lookup_vs_prolog(struct radv_cmd_buffer *cmd_buffer, struct radv_shader *vs_shader,
2927                  uint32_t *nontrivial_divisors)
2928 {
2929    STATIC_ASSERT(sizeof(union vs_prolog_key_header) == 4);
2930    assert(vs_shader->info.vs.dynamic_inputs);
2931
2932    const struct radv_vs_input_state *state = &cmd_buffer->state.dynamic_vs_input;
2933    struct radv_graphics_pipeline *pipeline = cmd_buffer->state.graphics_pipeline;
2934    struct radv_device *device = cmd_buffer->device;
2935
2936    unsigned num_attributes = pipeline->last_vertex_attrib_bit;
2937    uint32_t attribute_mask = BITFIELD_MASK(num_attributes);
2938
2939    uint32_t instance_rate_inputs = state->instance_rate_inputs & attribute_mask;
2940    uint32_t zero_divisors = state->zero_divisors & attribute_mask;
2941    *nontrivial_divisors = state->nontrivial_divisors & attribute_mask;
2942    uint32_t misaligned_mask = cmd_buffer->state.vbo_misaligned_mask;
2943    if (cmd_buffer->state.vbo_misaligned_mask_invalid) {
2944       assert(device->physical_device->rad_info.gfx_level == GFX6 ||
2945              device->physical_device->rad_info.gfx_level >= GFX10);
2946
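      /* Lazily recompute misalignment for the invalidated attributes: an
       * attribute counts as misaligned when its start offset (binding offset
       * plus attribute offset) or its binding stride violates the format's
       * alignment requirement (format_align_req_minus_1 is the alignment
       * minus one, used as a mask).
       */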
2947       u_foreach_bit (index, cmd_buffer->state.vbo_misaligned_mask_invalid & attribute_mask) {
2948          uint8_t binding = state->bindings[index];
2949          if (!(cmd_buffer->state.vbo_bound_mask & BITFIELD_BIT(binding)))
2950             continue;
2951          uint8_t req = state->format_align_req_minus_1[index];
2952          struct radv_vertex_binding *vb = &cmd_buffer->vertex_bindings[binding];
2953          VkDeviceSize offset = vb->offset + state->offsets[index];
2954          if ((offset & req) || (vb->stride & req))
2955             misaligned_mask |= BITFIELD_BIT(index);
2956       }
2957       cmd_buffer->state.vbo_misaligned_mask = misaligned_mask;
2958       cmd_buffer->state.vbo_misaligned_mask_invalid &= ~attribute_mask;
2959    }
2960    misaligned_mask |= state->nontrivial_formats;
2961
2962    /* try to use a pre-compiled prolog first */
2963    struct radv_shader_part *prolog = NULL;
2964    if (pipeline->can_use_simple_input &&
2965        (!vs_shader->info.vs.as_ls || !instance_rate_inputs) &&
2966        !misaligned_mask && !state->alpha_adjust_lo && !state->alpha_adjust_hi) {
2967       if (!instance_rate_inputs) {
2968          prolog = device->simple_vs_prologs[num_attributes - 1];
2969       } else if (num_attributes <= 16 && !*nontrivial_divisors && !zero_divisors &&
2970                  util_bitcount(instance_rate_inputs) ==
2971                     (util_last_bit(instance_rate_inputs) - ffs(instance_rate_inputs) + 1)) {
2972          unsigned index = radv_instance_rate_prolog_index(num_attributes, instance_rate_inputs);
2973          prolog = device->instance_rate_vs_prologs[index];
2974       }
2975    }
2976    if (prolog)
2977       return prolog;
2978
2979    /* if we couldn't use a pre-compiled prolog, find one in the cache or create one */
2980    uint32_t key_words[17];
2981    unsigned key_size = 1;
2982
2983    struct radv_vs_prolog_key key;
2984    key.state = state;
2985    key.num_attributes = num_attributes;
2986    key.misaligned_mask = misaligned_mask;
2987    /* The instance ID input VGPR is placed differently when as_ls=true. */
2988    key.as_ls = vs_shader->info.vs.as_ls && instance_rate_inputs;
2989    key.is_ngg = vs_shader->info.is_ngg;
2990    key.wave32 = vs_shader->info.wave_size == 32;
2991    key.next_stage = pipeline->next_vertex_stage;
2992
2993    union vs_prolog_key_header header;
2994    header.v = 0;
2995    header.num_attributes = num_attributes;
2996    header.as_ls = key.as_ls;
2997    header.is_ngg = key.is_ngg;
2998    header.wave32 = key.wave32;
2999    header.next_stage = key.next_stage;
3000
3001    if (instance_rate_inputs & ~*nontrivial_divisors) {
3002       header.instance_rate_inputs = true;
3003       key_words[key_size++] = instance_rate_inputs;
3004    }
3005    if (*nontrivial_divisors) {
3006       header.nontrivial_divisors = true;
3007       key_words[key_size++] = *nontrivial_divisors;
3008    }
3009    if (zero_divisors) {
3010       header.zero_divisors = true;
3011       key_words[key_size++] = zero_divisors;
3012    }
3013    if (misaligned_mask) {
3014       header.misaligned_mask = true;
3015       key_words[key_size++] = misaligned_mask;
3016
3017       uint8_t *formats = (uint8_t *)&key_words[key_size];
3018       unsigned num_formats = 0;
3019       u_foreach_bit(index, misaligned_mask) formats[num_formats++] = state->formats[index];
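      /* Pad the per-attribute format bytes to a multiple of four so the key
       * stays an array of whole dwords (key_size counts dwords here).
       */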
3020       while (num_formats & 0x3)
3021          formats[num_formats++] = 0;
3022       key_size += num_formats / 4u;
3023
3024       if (state->post_shuffle & attribute_mask) {
3025          header.post_shuffle = true;
3026          key_words[key_size++] = state->post_shuffle & attribute_mask;
3027       }
3028    }
3029    if (state->alpha_adjust_lo & attribute_mask) {
3030       header.alpha_adjust_lo = true;
3031       key_words[key_size++] = state->alpha_adjust_lo & attribute_mask;
3032    }
3033    if (state->alpha_adjust_hi & attribute_mask) {
3034       header.alpha_adjust_hi = true;
3035       key_words[key_size++] = state->alpha_adjust_hi & attribute_mask;
3036    }
3037
3038    header.key_size = key_size * sizeof(key_words[0]);
3039    key_words[0] = header.v;
3040
3041    uint32_t hash = radv_hash_vs_prolog(key_words);
3042
3043    if (cmd_buffer->state.emitted_vs_prolog &&
3044        cmd_buffer->state.emitted_vs_prolog_key_hash == hash &&
3045        radv_cmp_vs_prolog(key_words, cmd_buffer->state.emitted_vs_prolog_key))
3046       return cmd_buffer->state.emitted_vs_prolog;
3047
3048    u_rwlock_rdlock(&device->vs_prologs_lock);
3049    struct hash_entry *prolog_entry =
3050       _mesa_hash_table_search_pre_hashed(device->vs_prologs, hash, key_words);
3051    u_rwlock_rdunlock(&device->vs_prologs_lock);
3052
3053    if (!prolog_entry) {
3054       u_rwlock_wrlock(&device->vs_prologs_lock);
3055       prolog_entry = _mesa_hash_table_search_pre_hashed(device->vs_prologs, hash, key_words);
3056       if (prolog_entry) {
3057          u_rwlock_wrunlock(&device->vs_prologs_lock);
3058          return prolog_entry->data;
3059       }
3060
3061       prolog = radv_create_vs_prolog(device, &key);
3062       uint32_t *key2 = malloc(key_size * 4);
3063       if (!prolog || !key2) {
3064          radv_shader_part_unref(device, prolog);
3065          free(key2);
3066          u_rwlock_wrunlock(&device->vs_prologs_lock);
3067          return NULL;
3068       }
3069       memcpy(key2, key_words, key_size * 4);
3070       _mesa_hash_table_insert_pre_hashed(device->vs_prologs, hash, key2, prolog);
3071
3072       u_rwlock_wrunlock(&device->vs_prologs_lock);
3073       return prolog;
3074    }
3075
3076    return prolog_entry->data;
3077 }
3078
3079 static void
3080 emit_prolog_regs(struct radv_cmd_buffer *cmd_buffer, struct radv_shader *vs_shader,
3081                  struct radv_shader_part *prolog, bool pipeline_is_dirty)
3082 {
3083    /* no need to re-emit anything in this case */
3084    if (cmd_buffer->state.emitted_vs_prolog == prolog && !pipeline_is_dirty)
3085       return;
3086
3087    enum amd_gfx_level chip = cmd_buffer->device->physical_device->rad_info.gfx_level;
3088    struct radv_graphics_pipeline *pipeline = cmd_buffer->state.graphics_pipeline;
3089    uint64_t prolog_va = radv_buffer_get_va(prolog->bo) + prolog->alloc->offset;
3090
3091    assert(cmd_buffer->state.emitted_graphics_pipeline == cmd_buffer->state.graphics_pipeline);
3092
3093    uint32_t rsrc1 = vs_shader->config.rsrc1;
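   /* Hedged note: before GFX10 the SGPR count is programmed through RSRC1, so
    * if the prolog needs more SGPRs than the main shader, use the prolog's
    * SGPR field; on GFX10+ this presumably isn't necessary, hence the assert
    * in the else branch below.
    */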
3094    if (chip < GFX10 && G_00B228_SGPRS(prolog->rsrc1) > G_00B228_SGPRS(vs_shader->config.rsrc1))
3095       rsrc1 = (rsrc1 & C_00B228_SGPRS) | (prolog->rsrc1 & ~C_00B228_SGPRS);
3096
3097    /* The main shader must not use fewer VGPRs than the prolog, otherwise shared VGPRs might not
3098     * work.
3099     */
3100    assert(G_00B848_VGPRS(vs_shader->config.rsrc1) >= G_00B848_VGPRS(prolog->rsrc1));
3101
3102    unsigned pgm_lo_reg = R_00B120_SPI_SHADER_PGM_LO_VS;
3103    unsigned rsrc1_reg = R_00B128_SPI_SHADER_PGM_RSRC1_VS;
3104    if (vs_shader->info.is_ngg || pipeline->base.shaders[MESA_SHADER_GEOMETRY] == vs_shader) {
3105       pgm_lo_reg = chip >= GFX10 ? R_00B320_SPI_SHADER_PGM_LO_ES : R_00B210_SPI_SHADER_PGM_LO_ES;
3106       rsrc1_reg = R_00B228_SPI_SHADER_PGM_RSRC1_GS;
3107    } else if (pipeline->base.shaders[MESA_SHADER_TESS_CTRL] == vs_shader) {
3108       pgm_lo_reg = chip >= GFX10 ? R_00B520_SPI_SHADER_PGM_LO_LS : R_00B410_SPI_SHADER_PGM_LO_LS;
3109       rsrc1_reg = R_00B428_SPI_SHADER_PGM_RSRC1_HS;
3110    } else if (vs_shader->info.vs.as_ls) {
3111       pgm_lo_reg = R_00B520_SPI_SHADER_PGM_LO_LS;
3112       rsrc1_reg = R_00B528_SPI_SHADER_PGM_RSRC1_LS;
3113    } else if (vs_shader->info.vs.as_es) {
3114       pgm_lo_reg = R_00B320_SPI_SHADER_PGM_LO_ES;
3115       rsrc1_reg = R_00B328_SPI_SHADER_PGM_RSRC1_ES;
3116    }
3117
3118    radeon_set_sh_reg(cmd_buffer->cs, pgm_lo_reg, prolog_va >> 8);
3119
3120    if (chip < GFX10)
3121       radeon_set_sh_reg(cmd_buffer->cs, rsrc1_reg, rsrc1);
3122    else
3123       assert(rsrc1 == vs_shader->config.rsrc1);
3124
3125    radv_cs_add_buffer(cmd_buffer->device->ws, cmd_buffer->cs, prolog->bo);
3126 }
3127
3128 static void
3129 emit_prolog_inputs(struct radv_cmd_buffer *cmd_buffer, struct radv_shader *vs_shader,
3130                    uint32_t nontrivial_divisors, bool pipeline_is_dirty)
3131 {
3132    /* no need to re-emit anything in this case */
3133    if (!nontrivial_divisors && !pipeline_is_dirty && cmd_buffer->state.emitted_vs_prolog &&
3134        !cmd_buffer->state.emitted_vs_prolog->nontrivial_divisors)
3135       return;
3136
3137    const struct radv_vs_input_state *state = &cmd_buffer->state.dynamic_vs_input;
3138    uint64_t input_va = radv_shader_get_va(vs_shader);
3139
3140    if (nontrivial_divisors) {
3141       unsigned inputs_offset;
3142       uint32_t *inputs;
3143       unsigned size = 8 + util_bitcount(nontrivial_divisors) * 8;
3144       if (!radv_cmd_buffer_upload_alloc(cmd_buffer, size, &inputs_offset, (void **)&inputs))
3145          return;
3146
3147       *(inputs++) = input_va;
3148       *(inputs++) = input_va >> 32;
3149
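      /* Two dwords are uploaded per nontrivial divisor: a packed word holding
       * pre_shift, increment and post_shift, plus the multiplier, i.e. the
       * usual constants for multiply-and-shift division by a constant. A
       * power-of-two divisor is encoded as its log2 with the increment bit
       * set, and divisor 0 gets multiplier 1 so the high half of the product
       * (presumably what the prolog keeps) is always 0.
       */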
3150       u_foreach_bit(index, nontrivial_divisors)
3151       {
3152          uint32_t div = state->divisors[index];
3153          if (div == 0) {
3154             *(inputs++) = 0;
3155             *(inputs++) = 1;
3156          } else if (util_is_power_of_two_or_zero(div)) {
3157             *(inputs++) = util_logbase2(div) | (1 << 8);
3158             *(inputs++) = 0xffffffffu;
3159          } else {
3160             struct util_fast_udiv_info info = util_compute_fast_udiv_info(div, 32, 32);
3161             *(inputs++) = info.pre_shift | (info.increment << 8) | (info.post_shift << 16);
3162             *(inputs++) = info.multiplier;
3163          }
3164       }
3165
3166       input_va = radv_buffer_get_va(cmd_buffer->upload.upload_bo) + inputs_offset;
3167    }
3168
3169    struct radv_userdata_info *loc =
3170       &vs_shader->info.user_sgprs_locs.shader_data[AC_UD_VS_PROLOG_INPUTS];
3171    uint32_t base_reg = cmd_buffer->state.graphics_pipeline->base.user_data_0[MESA_SHADER_VERTEX];
3172    assert(loc->sgpr_idx != -1);
3173    assert(loc->num_sgprs == 2);
3174    radv_emit_shader_pointer(cmd_buffer->device, cmd_buffer->cs, base_reg + loc->sgpr_idx * 4,
3175                             input_va, true);
3176 }
3177
3178 static void
3179 radv_emit_vertex_input(struct radv_cmd_buffer *cmd_buffer, bool pipeline_is_dirty)
3180 {
3181    struct radv_graphics_pipeline *pipeline = cmd_buffer->state.graphics_pipeline;
3182    struct radv_shader *vs_shader = radv_get_shader(&pipeline->base, MESA_SHADER_VERTEX);
3183
3184    assert(!cmd_buffer->state.mesh_shading);
3185
3186    if (!vs_shader->info.vs.has_prolog)
3187       return;
3188
3189    uint32_t nontrivial_divisors;
3190    struct radv_shader_part *prolog =
3191       lookup_vs_prolog(cmd_buffer, vs_shader, &nontrivial_divisors);
3192    if (!prolog) {
3193       vk_command_buffer_set_error(&cmd_buffer->vk, VK_ERROR_OUT_OF_HOST_MEMORY);
3194       return;
3195    }
3196    emit_prolog_regs(cmd_buffer, vs_shader, prolog, pipeline_is_dirty);
3197    emit_prolog_inputs(cmd_buffer, vs_shader, nontrivial_divisors, pipeline_is_dirty);
3198
3199    cmd_buffer->state.emitted_vs_prolog = prolog;
3200
3201    if (unlikely(cmd_buffer->device->trace_bo))
3202       radv_save_vs_prolog(cmd_buffer, prolog);
3203 }
3204
3205 static void
3206 radv_cmd_buffer_flush_dynamic_state(struct radv_cmd_buffer *cmd_buffer, bool pipeline_is_dirty)
3207 {
3208    uint64_t states =
3209       cmd_buffer->state.dirty & cmd_buffer->state.emitted_graphics_pipeline->needed_dynamic_state;
3210
3211    if (states & (RADV_CMD_DIRTY_DYNAMIC_VIEWPORT))
3212       radv_emit_viewport(cmd_buffer);
3213
3214    if (states & (RADV_CMD_DIRTY_DYNAMIC_SCISSOR | RADV_CMD_DIRTY_DYNAMIC_VIEWPORT) &&
3215        !cmd_buffer->device->physical_device->rad_info.has_gfx9_scissor_bug)
3216       radv_emit_scissor(cmd_buffer);
3217
3218    if (states & RADV_CMD_DIRTY_DYNAMIC_LINE_WIDTH)
3219       radv_emit_line_width(cmd_buffer);
3220
3221    if (states & RADV_CMD_DIRTY_DYNAMIC_BLEND_CONSTANTS)
3222       radv_emit_blend_constants(cmd_buffer);
3223
3224    if (states &
3225        (RADV_CMD_DIRTY_DYNAMIC_STENCIL_REFERENCE | RADV_CMD_DIRTY_DYNAMIC_STENCIL_WRITE_MASK |
3226         RADV_CMD_DIRTY_DYNAMIC_STENCIL_COMPARE_MASK))
3227       radv_emit_stencil(cmd_buffer);
3228
3229    if (states & RADV_CMD_DIRTY_DYNAMIC_DEPTH_BOUNDS)
3230       radv_emit_depth_bounds(cmd_buffer);
3231
3232    if (states & RADV_CMD_DIRTY_DYNAMIC_DEPTH_BIAS)
3233       radv_emit_depth_bias(cmd_buffer);
3234
3235    if (states & RADV_CMD_DIRTY_DYNAMIC_DISCARD_RECTANGLE)
3236       radv_emit_discard_rectangle(cmd_buffer);
3237
3238    if (states & RADV_CMD_DIRTY_DYNAMIC_SAMPLE_LOCATIONS)
3239       radv_emit_sample_locations(cmd_buffer);
3240
3241    if (states & (RADV_CMD_DIRTY_DYNAMIC_LINE_STIPPLE))
3242       radv_emit_line_stipple(cmd_buffer);
3243
3244    if (states & (RADV_CMD_DIRTY_DYNAMIC_CULL_MODE | RADV_CMD_DIRTY_DYNAMIC_FRONT_FACE |
3245                  RADV_CMD_DIRTY_DYNAMIC_DEPTH_BIAS_ENABLE))
3246       radv_emit_culling(cmd_buffer);
3247
3248    if (states & RADV_CMD_DIRTY_DYNAMIC_PRIMITIVE_TOPOLOGY)
3249       radv_emit_primitive_topology(cmd_buffer);
3250
3251    if (states &
3252        (RADV_CMD_DIRTY_DYNAMIC_DEPTH_TEST_ENABLE | RADV_CMD_DIRTY_DYNAMIC_DEPTH_WRITE_ENABLE |
3253         RADV_CMD_DIRTY_DYNAMIC_DEPTH_COMPARE_OP | RADV_CMD_DIRTY_DYNAMIC_DEPTH_BOUNDS_TEST_ENABLE |
3254         RADV_CMD_DIRTY_DYNAMIC_STENCIL_TEST_ENABLE | RADV_CMD_DIRTY_DYNAMIC_STENCIL_OP))
3255       radv_emit_depth_control(cmd_buffer);
3256
3257    if (states & RADV_CMD_DIRTY_DYNAMIC_STENCIL_OP)
3258       radv_emit_stencil_control(cmd_buffer);
3259
3260    if (states & RADV_CMD_DIRTY_DYNAMIC_FRAGMENT_SHADING_RATE)
3261       radv_emit_fragment_shading_rate(cmd_buffer);
3262
3263    if (states & RADV_CMD_DIRTY_DYNAMIC_PRIMITIVE_RESTART_ENABLE)
3264       radv_emit_primitive_restart_enable(cmd_buffer);
3265
3266    if (states & RADV_CMD_DIRTY_DYNAMIC_RASTERIZER_DISCARD_ENABLE)
3267       radv_emit_rasterizer_discard_enable(cmd_buffer);
3268
3269    if (states & RADV_CMD_DIRTY_DYNAMIC_LOGIC_OP)
3270       radv_emit_logic_op(cmd_buffer);
3271
3272    if (states & RADV_CMD_DIRTY_DYNAMIC_COLOR_WRITE_ENABLE)
3273       radv_emit_color_write_enable(cmd_buffer);
3274
3275    if (states & RADV_CMD_DIRTY_DYNAMIC_VERTEX_INPUT)
3276       radv_emit_vertex_input(cmd_buffer, pipeline_is_dirty);
3277
3278    cmd_buffer->state.dirty &= ~states;
3279 }
3280
3281 static void
3282 radv_flush_push_descriptors(struct radv_cmd_buffer *cmd_buffer, VkPipelineBindPoint bind_point)
3283 {
3284    struct radv_descriptor_state *descriptors_state =
3285       radv_get_descriptors_state(cmd_buffer, bind_point);
3286    struct radv_descriptor_set *set = (struct radv_descriptor_set *)&descriptors_state->push_set.set;
3287    unsigned bo_offset;
3288
3289    if (!radv_cmd_buffer_upload_data(cmd_buffer, set->header.size, set->header.mapped_ptr,
3290                                     &bo_offset))
3291       return;
3292
3293    set->header.va = radv_buffer_get_va(cmd_buffer->upload.upload_bo);
3294    set->header.va += bo_offset;
3295 }
3296
3297 static void
3298 radv_flush_indirect_descriptor_sets(struct radv_cmd_buffer *cmd_buffer,
3299                                     struct radv_pipeline *pipeline, VkPipelineBindPoint bind_point)
3300 {
3301    struct radv_descriptor_state *descriptors_state =
3302       radv_get_descriptors_state(cmd_buffer, bind_point);
3303    uint32_t size = MAX_SETS * 4;
3304    uint32_t offset;
3305    void *ptr;
3306
3307    if (!radv_cmd_buffer_upload_alloc(cmd_buffer, size, &offset, &ptr))
3308       return;
3309
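   /* One dword per set: only the low 32 bits of each descriptor set VA are
    * written, presumably because descriptor memory lives in a 32-bit
    * addressable range whose upper half the shaders already know.
    */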
3310    for (unsigned i = 0; i < MAX_SETS; i++) {
3311       uint32_t *uptr = ((uint32_t *)ptr) + i;
3312       uint64_t set_va = 0;
3313       struct radv_descriptor_set *set = descriptors_state->sets[i];
3314       if (descriptors_state->valid & (1u << i))
3315          set_va = set->header.va;
3316       uptr[0] = set_va & 0xffffffff;
3317    }
3318
3319    struct radeon_cmdbuf *cs = cmd_buffer->cs;
3320    struct radv_device *device = cmd_buffer->device;
3321    uint64_t va = radv_buffer_get_va(cmd_buffer->upload.upload_bo);
3322    va += offset;
3323
3324    if (bind_point == VK_PIPELINE_BIND_POINT_GRAPHICS) {
3325       struct radv_graphics_pipeline *graphics_pipeline = radv_pipeline_to_graphics(pipeline);
3326
3327       if (pipeline->shaders[MESA_SHADER_VERTEX])
3328          radv_emit_userdata_address(device, cs, pipeline, MESA_SHADER_VERTEX,
3329                                     AC_UD_INDIRECT_DESCRIPTOR_SETS, va);
3330
3331       if (pipeline->shaders[MESA_SHADER_FRAGMENT])
3332          radv_emit_userdata_address(device, cs, pipeline, MESA_SHADER_FRAGMENT,
3333                                     AC_UD_INDIRECT_DESCRIPTOR_SETS, va);
3334
3335       if (radv_pipeline_has_stage(graphics_pipeline, MESA_SHADER_MESH))
3336          radv_emit_userdata_address(device, cs, pipeline, MESA_SHADER_MESH,
3337                                     AC_UD_INDIRECT_DESCRIPTOR_SETS, va);
3338
3339       if (radv_pipeline_has_stage(graphics_pipeline, MESA_SHADER_TASK))
3340          radv_emit_userdata_address(device, cmd_buffer->ace_internal.cs, pipeline, MESA_SHADER_TASK,
3341                                     AC_UD_INDIRECT_DESCRIPTOR_SETS, va);
3342
3343       if (radv_pipeline_has_stage(graphics_pipeline, MESA_SHADER_GEOMETRY))
3344          radv_emit_userdata_address(device, cs, pipeline, MESA_SHADER_GEOMETRY,
3345                                     AC_UD_INDIRECT_DESCRIPTOR_SETS, va);
3346
3347       if (radv_pipeline_has_stage(graphics_pipeline, MESA_SHADER_TESS_CTRL))
3348          radv_emit_userdata_address(device, cs, pipeline, MESA_SHADER_TESS_CTRL,
3349                                     AC_UD_INDIRECT_DESCRIPTOR_SETS, va);
3350
3351       if (radv_pipeline_has_stage(graphics_pipeline, MESA_SHADER_TESS_CTRL))
3352          radv_emit_userdata_address(device, cs, pipeline, MESA_SHADER_TESS_EVAL,
3353                                     AC_UD_INDIRECT_DESCRIPTOR_SETS, va);
3354    } else {
3355       radv_emit_userdata_address(device, cs, pipeline, MESA_SHADER_COMPUTE,
3356                                  AC_UD_INDIRECT_DESCRIPTOR_SETS, va);
3357    }
3358 }
3359
3360 static void
3361 radv_flush_descriptors(struct radv_cmd_buffer *cmd_buffer, VkShaderStageFlags stages,
3362                        struct radv_pipeline *pipeline, VkPipelineBindPoint bind_point)
3363 {
3364    struct radv_descriptor_state *descriptors_state =
3365       radv_get_descriptors_state(cmd_buffer, bind_point);
3366    struct radv_device *device = cmd_buffer->device;
3367    struct radeon_cmdbuf *cs = cmd_buffer->cs;
3368    bool flush_indirect_descriptors;
3369
3370    if (!descriptors_state->dirty)
3371       return;
3372
3373    if (descriptors_state->push_dirty)
3374       radv_flush_push_descriptors(cmd_buffer, bind_point);
3375
3376    flush_indirect_descriptors = pipeline->need_indirect_descriptor_sets;
3377
3378    if (flush_indirect_descriptors)
3379       radv_flush_indirect_descriptor_sets(cmd_buffer, pipeline, bind_point);
3380
3381    ASSERTED unsigned cdw_max =
3382       radeon_check_space(device->ws, cs, MAX_SETS * MESA_VULKAN_SHADER_STAGES * 4);
3383
3384    if (stages & VK_SHADER_STAGE_COMPUTE_BIT) {
3385       radv_emit_descriptor_pointers(device, cs, pipeline, descriptors_state, MESA_SHADER_COMPUTE);
3386    } else {
3387       radv_foreach_stage(stage, stages & ~VK_SHADER_STAGE_TASK_BIT_NV)
3388       {
3389          if (!cmd_buffer->state.graphics_pipeline->base.shaders[stage])
3390             continue;
3391
3392          radv_emit_descriptor_pointers(device, cs, pipeline, descriptors_state, stage);
3393       }
3394
3395       if (stages & VK_SHADER_STAGE_TASK_BIT_NV) {
3396          radv_emit_descriptor_pointers(device, cmd_buffer->ace_internal.cs, pipeline,
3397                                        descriptors_state, MESA_SHADER_TASK);
3398       }
3399    }
3400
3401    descriptors_state->dirty = 0;
3402    descriptors_state->push_dirty = false;
3403
3404    assert(cmd_buffer->cs->cdw <= cdw_max);
3405
3406    if (unlikely(cmd_buffer->device->trace_bo))
3407       radv_save_descriptors(cmd_buffer, bind_point);
3408 }
3409
3410 static bool
3411 radv_shader_loads_push_constants(struct radv_pipeline *pipeline, gl_shader_stage stage)
3412 {
3413    struct radv_userdata_info *loc =
3414       radv_lookup_user_sgpr(pipeline, stage, AC_UD_PUSH_CONSTANTS);
3415    return loc->sgpr_idx != -1;
3416 }
3417
3418 static void
3419 radv_emit_all_inline_push_consts(struct radv_device *device, struct radeon_cmdbuf *cs,
3420                                  struct radv_pipeline *pipeline, gl_shader_stage stage,
3421                                  uint32_t *values, bool *need_push_constants)
3422 {
3423    const struct radv_shader *shader = radv_get_shader(pipeline, stage);
3424    if (!shader)
3425       return;
3426
3427    *need_push_constants |= radv_shader_loads_push_constants(pipeline, stage);
3428
3429    const uint64_t mask = shader->info.inline_push_constant_mask;
3430    if (!mask)
3431       return;
3432
3433    const uint8_t base = ffs(mask) - 1;
3434    if (mask == u_bit_consecutive64(base, util_last_bit64(mask) - base)) {
3435       /* consecutive inline push constants */
3436       radv_emit_inline_push_consts(device, cs, pipeline, stage, AC_UD_INLINE_PUSH_CONSTANTS,
3437                                    values + base);
3438    } else {
3439       /* sparse inline push constants */
3440       uint32_t consts[AC_MAX_INLINE_PUSH_CONSTS];
3441       unsigned num_consts = 0;
3442       u_foreach_bit64 (idx, mask)
3443          consts[num_consts++] = values[idx];
3444       radv_emit_inline_push_consts(device, cs, pipeline, stage, AC_UD_INLINE_PUSH_CONSTANTS,
3445                                    consts);
3446    }
3447 }
3448
3449 static void
3450 radv_flush_constants(struct radv_cmd_buffer *cmd_buffer, VkShaderStageFlags stages,
3451                      struct radv_pipeline *pipeline, VkPipelineBindPoint bind_point)
3452 {
3453    struct radv_device *device = cmd_buffer->device;
3454    struct radeon_cmdbuf *cs = cmd_buffer->cs;
3455    struct radv_descriptor_state *descriptors_state =
3456       radv_get_descriptors_state(cmd_buffer, bind_point);
3457    struct radv_shader *shader, *prev_shader;
3458    bool need_push_constants = false;
3459    unsigned offset;
3460    void *ptr;
3461    uint64_t va;
3462    uint32_t internal_stages;
3463    uint32_t dirty_stages = 0;
3464
3465    stages &= cmd_buffer->push_constant_stages;
3466    if (!stages || (!pipeline->push_constant_size && !pipeline->dynamic_offset_count))
3467       return;
3468
3469    internal_stages = stages;
3470    switch (bind_point) {
3471    case VK_PIPELINE_BIND_POINT_GRAPHICS:
3472       break;
3473    case VK_PIPELINE_BIND_POINT_COMPUTE:
3474       dirty_stages = RADV_RT_STAGE_BITS;
3475       break;
3476    case VK_PIPELINE_BIND_POINT_RAY_TRACING_KHR:
3477       internal_stages = VK_SHADER_STAGE_COMPUTE_BIT;
3478       dirty_stages = VK_SHADER_STAGE_COMPUTE_BIT;
3479       break;
3480    default:
3481       unreachable("Unhandled bind point");
3482    }
3483
3484    radv_foreach_stage(stage, internal_stages & ~VK_SHADER_STAGE_TASK_BIT_NV)
3485    {
3486       radv_emit_all_inline_push_consts(
3487          device, cs, pipeline, stage, (uint32_t *)cmd_buffer->push_constants, &need_push_constants);
3488    }
3489
3490    if (internal_stages & VK_SHADER_STAGE_TASK_BIT_NV) {
3491       radv_emit_all_inline_push_consts(device, cmd_buffer->ace_internal.cs, pipeline,
3492                                        MESA_SHADER_TASK, (uint32_t *)cmd_buffer->push_constants,
3493                                        &need_push_constants);
3494    }
3495
3496    if (need_push_constants) {
3497       if (!radv_cmd_buffer_upload_alloc(
3498              cmd_buffer, pipeline->push_constant_size + 16 * pipeline->dynamic_offset_count, &offset,
3499              &ptr))
3500          return;
3501
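      /* Upload layout: the raw push-constant block first, followed by one
       * 16-byte buffer descriptor per dynamic buffer offset.
       */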
3502       memcpy(ptr, cmd_buffer->push_constants, pipeline->push_constant_size);
3503       memcpy((char *)ptr + pipeline->push_constant_size, descriptors_state->dynamic_buffers,
3504              16 * pipeline->dynamic_offset_count);
3505
3506       va = radv_buffer_get_va(cmd_buffer->upload.upload_bo);
3507       va += offset;
3508
3509       ASSERTED unsigned cdw_max =
3510          radeon_check_space(cmd_buffer->device->ws, cmd_buffer->cs, MESA_VULKAN_SHADER_STAGES * 4);
3511
3512       prev_shader = NULL;
3513       radv_foreach_stage(stage, internal_stages & ~VK_SHADER_STAGE_TASK_BIT_NV)
3514       {
3515          shader = radv_get_shader(pipeline, stage);
3516
3517          /* Avoid redundantly emitting the address for merged stages. */
3518          if (shader && shader != prev_shader) {
3519             radv_emit_userdata_address(device, cs, pipeline, stage, AC_UD_PUSH_CONSTANTS, va);
3520
3521             prev_shader = shader;
3522          }
3523       }
3524
3525       if (internal_stages & VK_SHADER_STAGE_TASK_BIT_NV) {
3526          radv_emit_userdata_address(device, cmd_buffer->ace_internal.cs, pipeline, MESA_SHADER_TASK,
3527                                     AC_UD_PUSH_CONSTANTS, va);
3528       }
3529
3530       assert(cmd_buffer->cs->cdw <= cdw_max);
3531    }
3532
3533    cmd_buffer->push_constant_stages &= ~stages;
3534    cmd_buffer->push_constant_stages |= dirty_stages;
3535 }
3536
3537 void
3538 radv_write_vertex_descriptors(const struct radv_cmd_buffer *cmd_buffer,
3539                               const struct radv_graphics_pipeline *pipeline,
3540                               bool full_null_descriptors, void *vb_ptr)
3541 {
3542    struct radv_shader *vs_shader = radv_get_shader(&pipeline->base, MESA_SHADER_VERTEX);
3543    enum amd_gfx_level chip = cmd_buffer->device->physical_device->rad_info.gfx_level;
3544    enum radeon_family family = cmd_buffer->device->physical_device->rad_info.family;
3545    unsigned desc_index = 0;
3546    uint32_t mask = pipeline->vb_desc_usage_mask;
3547    uint64_t va;
3548    const struct radv_vs_input_state *vs_state =
3549       vs_shader->info.vs.dynamic_inputs ? &cmd_buffer->state.dynamic_vs_input : NULL;
3550    assert(!vs_state || pipeline->use_per_attribute_vb_descs);
3551
3552    const struct ac_vtx_format_info *vtx_info_table =
3553       vs_state ? ac_get_vtx_format_info_table(chip, family) : NULL;
3554
3555    while (mask) {
3556       unsigned i = u_bit_scan(&mask);
3557       uint32_t *desc = &((uint32_t *)vb_ptr)[desc_index++ * 4];
3558       uint32_t offset, rsrc_word3;
3559       unsigned binding =
3560          vs_state ? cmd_buffer->state.dynamic_vs_input.bindings[i]
3561                   : (pipeline->use_per_attribute_vb_descs ? pipeline->attrib_bindings[i] : i);
3562       struct radv_buffer *buffer = cmd_buffer->vertex_binding_buffers[binding];
3563       unsigned num_records;
3564       unsigned stride;
3565
3566       if (vs_state && !(vs_state->nontrivial_formats & BITFIELD_BIT(i))) {
3567          const struct ac_vtx_format_info *vtx_info = &vtx_info_table[vs_state->formats[i]];
3568          unsigned hw_format = vtx_info->hw_format[vtx_info->num_channels - 1];
3569
3570          if (chip >= GFX10) {
3571             rsrc_word3 = vtx_info->dst_sel | S_008F0C_FORMAT(hw_format);
3572          } else {
3573             rsrc_word3 = vtx_info->dst_sel | S_008F0C_NUM_FORMAT((hw_format >> 4) & 0x7) |
3574                          S_008F0C_DATA_FORMAT(hw_format & 0xf);
3575          }
3576       } else {
3577          rsrc_word3 = S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) |
3578                       S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) |
3579                       S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) | S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_W);
3580          if (chip >= GFX10)
3581             rsrc_word3 |= S_008F0C_FORMAT(V_008F0C_GFX10_FORMAT_32_UINT);
3582          else
3583             rsrc_word3 |= S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_UINT) |
3584                           S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32);
3585       }
3586
3587       if (pipeline->dynamic_states & (RADV_DYNAMIC_VERTEX_INPUT_BINDING_STRIDE |
3588                                       RADV_DYNAMIC_VERTEX_INPUT)) {
3589          stride = cmd_buffer->vertex_bindings[binding].stride;
3590       } else {
3591          stride = pipeline->binding_stride[binding];
3592       }
3593
3594       if (!buffer) {
3595          if (full_null_descriptors) {
3596             /* Put all the info in for the DGC generation shader in case the VBO gets overridden. */
3597             desc[0] = 0;
3598             desc[1] = S_008F04_STRIDE(stride);
3599             desc[2] = 0;
3600             desc[3] = rsrc_word3;
3601          } else if (vs_state) {
3602             /* Stride needs to be non-zero on GFX9, or else bounds checking is disabled. We need
3603              * to include the format/word3 so that the alpha channel is 1 for formats without an
3604              * alpha channel.
3605              */
3606             desc[0] = 0;
3607             desc[1] = S_008F04_STRIDE(16);
3608             desc[2] = 0;
3609             desc[3] = rsrc_word3;
3610          } else {
3611             memset(desc, 0, 4 * 4);
3612          }
3613
3614          continue;
3615       }
3616
3617       va = radv_buffer_get_va(buffer->bo);
3618
3619       offset = cmd_buffer->vertex_bindings[binding].offset;
3620       va += offset + buffer->offset;
3621       if (vs_state)
3622          va += vs_state->offsets[i];
3623
3624       if (cmd_buffer->vertex_bindings[binding].size) {
3625          num_records = cmd_buffer->vertex_bindings[binding].size;
3626       } else {
3627          num_records = vk_buffer_range(&buffer->vk, offset, VK_WHOLE_SIZE);
3628       }
3629
3630       if (pipeline->use_per_attribute_vb_descs) {
3631          uint32_t attrib_end =
3632             vs_state ? vs_state->offsets[i] + vs_state->format_sizes[i] : pipeline->attrib_ends[i];
3633
3634          if (num_records < attrib_end) {
3635             num_records = 0; /* not enough space for one vertex */
3636          } else if (stride == 0) {
3637             num_records = 1; /* only one vertex */
3638          } else {
3639             num_records = (num_records - attrib_end) / stride + 1;
3640             /* If attrib_offset>stride, then the compiler will increase the vertex index by
3641              * attrib_offset/stride and decrease the offset by attrib_offset%stride. This is
3642              * only allowed with static strides.
3643              */
3644             num_records += pipeline->attrib_index_offset[i];
3645          }
3646
3647          /* GFX10 uses OOB_SELECT_RAW if stride==0, so convert num_records from elements
3648           * into bytes in that case. GFX8 always uses bytes.
3649           */
3650          if (num_records && (chip == GFX8 || (chip != GFX9 && !stride))) {
3651             num_records = (num_records - 1) * stride + attrib_end;
3652          } else if (!num_records) {
3653             /* On GFX9, it seems bounds checking is disabled if both
3654              * num_records and stride are zero. This doesn't seem necessary on GFX8, GFX10 and
3655              * GFX10.3 but it doesn't hurt.
3656              */
3657             if (full_null_descriptors) {
3658                /* Put all the info in for the DGC generation shader in case the VBO gets overridden.
3659                 */
3660                desc[0] = 0;
3661                desc[1] = S_008F04_STRIDE(stride);
3662                desc[2] = 0;
3663                desc[3] = rsrc_word3;
3664             } else if (vs_state) {
3665                desc[0] = 0;
3666                desc[1] = S_008F04_STRIDE(16);
3667                desc[2] = 0;
3668                desc[3] = rsrc_word3;
3669             } else {
3670                memset(desc, 0, 16);
3671             }
3672
3673             continue;
3674          }
3675       } else {
3676          if (chip != GFX8 && stride)
3677             num_records = DIV_ROUND_UP(num_records, stride);
3678       }
3679
3680       if (chip >= GFX10) {
3681          /* OOB_SELECT chooses the out-of-bounds check:
3682           * - 1: index >= NUM_RECORDS (Structured)
3683           * - 3: offset >= NUM_RECORDS (Raw)
3684           */
3685          int oob_select = stride ? V_008F0C_OOB_SELECT_STRUCTURED : V_008F0C_OOB_SELECT_RAW;
3686          rsrc_word3 |= S_008F0C_OOB_SELECT(oob_select) | S_008F0C_RESOURCE_LEVEL(chip < GFX11);
3687       }
3688
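      /* Final 4-dword vertex buffer descriptor: base address (low bits, then
       * high bits plus stride), the record count, and the format/swizzle word
       * built above.
       */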
3689       desc[0] = va;
3690       desc[1] = S_008F04_BASE_ADDRESS_HI(va >> 32) | S_008F04_STRIDE(stride);
3691       desc[2] = num_records;
3692       desc[3] = rsrc_word3;
3693    }
3694 }
3695
3696 static void
3697 radv_flush_vertex_descriptors(struct radv_cmd_buffer *cmd_buffer, bool pipeline_is_dirty)
3698 {
3699    if ((pipeline_is_dirty || (cmd_buffer->state.dirty & RADV_CMD_DIRTY_VERTEX_BUFFER)) &&
3700        cmd_buffer->state.graphics_pipeline->vb_desc_usage_mask) {
3701       /* Mesh shaders don't have vertex descriptors. */
3702       assert(!cmd_buffer->state.mesh_shading);
3703
3704       struct radv_graphics_pipeline *pipeline = cmd_buffer->state.graphics_pipeline;
3705       unsigned vb_offset;
3706       void *vb_ptr;
3707       uint64_t va;
3708
3709       /* allocate some descriptor state for vertex buffers */
3710       if (!radv_cmd_buffer_upload_alloc(cmd_buffer, pipeline->vb_desc_alloc_size, &vb_offset,
3711                                         &vb_ptr))
3712          return;
3713
3714       radv_write_vertex_descriptors(cmd_buffer, pipeline, false, vb_ptr);
3715
3716       va = radv_buffer_get_va(cmd_buffer->upload.upload_bo);
3717       va += vb_offset;
3718
3719       radv_emit_userdata_address(cmd_buffer->device, cmd_buffer->cs, &pipeline->base,
3720                                  MESA_SHADER_VERTEX, AC_UD_VS_VERTEX_BUFFERS, va);
3721
3722       cmd_buffer->state.vb_va = va;
3723       cmd_buffer->state.prefetch_L2_mask |= RADV_PREFETCH_VBO_DESCRIPTORS;
3724
3725       if (unlikely(cmd_buffer->device->trace_bo))
3726          radv_save_vertex_descriptors(cmd_buffer, (uintptr_t)vb_ptr);
3727    }
3728    cmd_buffer->state.dirty &= ~RADV_CMD_DIRTY_VERTEX_BUFFER;
3729 }
3730
3731 static void
3732 radv_emit_streamout_buffers(struct radv_cmd_buffer *cmd_buffer, uint64_t va)
3733 {
3734    struct radv_graphics_pipeline *pipeline = cmd_buffer->state.graphics_pipeline;
3735    struct radv_userdata_info *loc;
3736    uint32_t base_reg;
3737
3738    for (unsigned stage = 0; stage < MESA_VULKAN_SHADER_STAGES; ++stage) {
3739       if (!radv_get_shader(&pipeline->base, stage))
3740          continue;
3741
3742       loc = radv_lookup_user_sgpr(&pipeline->base, stage, AC_UD_STREAMOUT_BUFFERS);
3743       if (loc->sgpr_idx == -1)
3744          continue;
3745
3746       base_reg = pipeline->base.user_data_0[stage];
3747
3748       radv_emit_shader_pointer(cmd_buffer->device, cmd_buffer->cs, base_reg + loc->sgpr_idx * 4, va,
3749                                false);
3750    }
3751
3752    if (radv_pipeline_has_gs_copy_shader(&pipeline->base)) {
3753       loc = &pipeline->base.gs_copy_shader->info.user_sgprs_locs.shader_data[AC_UD_STREAMOUT_BUFFERS];
3754       if (loc->sgpr_idx != -1) {
3755          base_reg = R_00B130_SPI_SHADER_USER_DATA_VS_0;
3756
3757          radv_emit_shader_pointer(cmd_buffer->device, cmd_buffer->cs, base_reg + loc->sgpr_idx * 4,
3758                                   va, false);
3759       }
3760    }
3761 }
3762
3763 static void
3764 radv_flush_streamout_descriptors(struct radv_cmd_buffer *cmd_buffer)
3765 {
3766    if (cmd_buffer->state.dirty & RADV_CMD_DIRTY_STREAMOUT_BUFFER) {
3767       struct radv_streamout_binding *sb = cmd_buffer->streamout_bindings;
3768       struct radv_streamout_state *so = &cmd_buffer->state.streamout;
3769       unsigned so_offset;
3770       void *so_ptr;
3771       uint64_t va;
3772
3773       /* Allocate some descriptor state for streamout buffers. */
3774       if (!radv_cmd_buffer_upload_alloc(cmd_buffer, MAX_SO_BUFFERS * 16, &so_offset, &so_ptr))
3775          return;
3776
3777       for (uint32_t i = 0; i < MAX_SO_BUFFERS; i++) {
3778          struct radv_buffer *buffer = sb[i].buffer;
3779          uint32_t *desc = &((uint32_t *)so_ptr)[i * 4];
3780
3781          if (!(so->enabled_mask & (1 << i)))
3782             continue;
3783
3784          va = radv_buffer_get_va(buffer->bo) + buffer->offset;
3785
3786          va += sb[i].offset;
3787
3788          /* Set the descriptor.
3789           *
3790           * On GFX8, the format must be non-INVALID, otherwise
3791           * the buffer will be considered not bound and store
3792           * instructions will be no-ops.
3793           */
3794          uint32_t size = 0xffffffff;
3795
3796          /* Compute the correct buffer size for NGG streamout
3797           * because it's used to determine the max emit per
3798           * buffer.
3799           */
3800          if (cmd_buffer->device->physical_device->use_ngg_streamout)
3801             size = buffer->vk.size - sb[i].offset;
3802
3803          uint32_t rsrc_word3 =
3804             S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) | S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) |
3805             S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) | S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_W);
3806
3807          if (cmd_buffer->device->physical_device->rad_info.gfx_level >= GFX11) {
3808             rsrc_word3 |= S_008F0C_FORMAT(V_008F0C_GFX11_FORMAT_32_FLOAT) |
3809                           S_008F0C_OOB_SELECT(V_008F0C_OOB_SELECT_RAW);
3810          } else if (cmd_buffer->device->physical_device->rad_info.gfx_level >= GFX10) {
3811             rsrc_word3 |= S_008F0C_FORMAT(V_008F0C_GFX10_FORMAT_32_FLOAT) |
3812                           S_008F0C_OOB_SELECT(V_008F0C_OOB_SELECT_RAW) | S_008F0C_RESOURCE_LEVEL(1);
3813          } else {
3814             rsrc_word3 |= S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32);
3815          }
3816
3817          desc[0] = va;
3818          desc[1] = S_008F04_BASE_ADDRESS_HI(va >> 32);
3819          desc[2] = size;
3820          desc[3] = rsrc_word3;
3821       }
3822
3823       va = radv_buffer_get_va(cmd_buffer->upload.upload_bo);
3824       va += so_offset;
3825
3826       radv_emit_streamout_buffers(cmd_buffer, va);
3827    }
3828
3829    cmd_buffer->state.dirty &= ~RADV_CMD_DIRTY_STREAMOUT_BUFFER;
3830 }
3831
3832 static void
3833 radv_flush_ngg_query_state(struct radv_cmd_buffer *cmd_buffer)
3834 {
3835    struct radv_graphics_pipeline *pipeline = cmd_buffer->state.graphics_pipeline;
3836    const unsigned stage = pipeline->last_vgt_api_stage;
3837    struct radv_userdata_info *loc;
3838    uint32_t ngg_query_state = 0;
3839    uint32_t base_reg;
3840
3841    loc = radv_lookup_user_sgpr(&pipeline->base, stage, AC_UD_NGG_QUERY_STATE);
3842    if (loc->sgpr_idx == -1)
3843       return;
3844
3845    assert(pipeline->is_ngg);
3846
3847    /* By default NGG queries are disabled but they are enabled if the command buffer has active GDS
3848     * queries or if it's a secondary command buffer that inherits the number of generated
3849     * primitives.
3850     */
3851    if (cmd_buffer->state.active_pipeline_gds_queries ||
3852        (cmd_buffer->state.inherited_pipeline_statistics &
3853         VK_QUERY_PIPELINE_STATISTIC_GEOMETRY_SHADER_PRIMITIVES_BIT))
3854       ngg_query_state = 1;
3855
3856    base_reg = pipeline->base.user_data_0[stage];
3857    assert(loc->sgpr_idx != -1);
3858
3859    radeon_set_sh_reg(cmd_buffer->cs, base_reg + loc->sgpr_idx * 4, ngg_query_state);
3860 }
3861
3862 static void
3863 radv_flush_force_vrs_state(struct radv_cmd_buffer *cmd_buffer)
3864 {
3865    struct radv_graphics_pipeline *pipeline = cmd_buffer->state.graphics_pipeline;
3866    enum amd_gfx_level gfx_level = pipeline->base.device->physical_device->rad_info.gfx_level;
3867    const unsigned stage = pipeline->last_vgt_api_stage;
3868    struct radv_userdata_info *loc;
3869    uint32_t vrs_rates = 0;
3870    uint32_t base_reg;
3871
3872    if (!pipeline->force_vrs_per_vertex) {
3873       /* Un-set the SGPR index so we know to re-emit it later. */
3874       cmd_buffer->state.last_vrs_rates_sgpr_idx = -1;
3875       return;
3876    }
3877
3878    loc = radv_lookup_user_sgpr(&pipeline->base, stage, AC_UD_FORCE_VRS_RATES);
3879    assert(loc->sgpr_idx != -1);
3880
3881    base_reg = pipeline->base.user_data_0[stage];
3882
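   /* Note: the encoding of the forced rate differs by generation: GFX11 uses
    * the dedicated VRS_SHADING_RATE_* enum values while older GPUs pack the
    * X/Y rates into separate bit fields (see the shifts below). */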
3883    switch (cmd_buffer->device->force_vrs) {
3884    case RADV_FORCE_VRS_2x2:
3885       vrs_rates = gfx_level >= GFX11 ? V_0283D0_VRS_SHADING_RATE_2X2 : (1u << 2) | (1u << 4);
3886       break;
3887    case RADV_FORCE_VRS_2x1:
3888       vrs_rates = gfx_level >= GFX11 ? V_0283D0_VRS_SHADING_RATE_2X1 : (1u << 2) | (0u << 4);
3889       break;
3890    case RADV_FORCE_VRS_1x2:
3891       vrs_rates = gfx_level >= GFX11 ? V_0283D0_VRS_SHADING_RATE_1X2 : (0u << 2) | (1u << 4);
3892       break;
3893    default:
3894       break;
3895    }
3896
3897    if (cmd_buffer->state.last_vrs_rates != vrs_rates ||
3898        cmd_buffer->state.last_vrs_rates_sgpr_idx != loc->sgpr_idx) {
3899       radeon_set_sh_reg(cmd_buffer->cs, base_reg + loc->sgpr_idx * 4, vrs_rates);
3900    }
3901
3902    cmd_buffer->state.last_vrs_rates = vrs_rates;
3903    cmd_buffer->state.last_vrs_rates_sgpr_idx = loc->sgpr_idx;
3904 }
3905
3906 static void
3907 radv_upload_graphics_shader_descriptors(struct radv_cmd_buffer *cmd_buffer, bool pipeline_is_dirty)
3908 {
3909    struct radv_graphics_pipeline *pipeline = cmd_buffer->state.graphics_pipeline;
3910
3911    radv_flush_vertex_descriptors(cmd_buffer, pipeline_is_dirty);
3912    radv_flush_streamout_descriptors(cmd_buffer);
3913
3914    VkShaderStageFlags stages = VK_SHADER_STAGE_ALL_GRAPHICS | VK_SHADER_STAGE_MESH_BIT_NV;
3915    radv_flush_descriptors(cmd_buffer, stages, &pipeline->base, VK_PIPELINE_BIND_POINT_GRAPHICS);
3916    radv_flush_constants(cmd_buffer, stages, &pipeline->base, VK_PIPELINE_BIND_POINT_GRAPHICS);
3917    radv_flush_ngg_query_state(cmd_buffer);
3918    radv_flush_force_vrs_state(cmd_buffer);
3919 }
3920
3921 struct radv_draw_info {
3922    /**
3923     * Number of vertices.
3924     */
3925    uint32_t count;
3926
3927    /**
3928     * First instance id.
3929     */
3930    uint32_t first_instance;
3931
3932    /**
3933     * Number of instances.
3934     */
3935    uint32_t instance_count;
3936
3937    /**
3938     * Whether it's an indexed draw.
3939     */
3940    bool indexed;
3941
3942    /**
3943     * Indirect draw parameters resource.
3944     */
3945    struct radv_buffer *indirect;
3946    uint64_t indirect_offset;
3947    uint32_t stride;
3948
3949    /**
3950     * Draw count parameters resource.
3951     */
3952    struct radv_buffer *count_buffer;
3953    uint64_t count_buffer_offset;
3954
3955    /**
3956     * Stream output parameters resource.
3957     */
3958    struct radv_buffer *strmout_buffer;
3959    uint64_t strmout_buffer_offset;
3960 };
3961
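/* Return the primitive restart index for the currently bound index type,
 * i.e. the all-ones value for that index size. */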
3962 static uint32_t
3963 radv_get_primitive_reset_index(struct radv_cmd_buffer *cmd_buffer)
3964 {
3965    uint32_t index_type = G_028A7C_INDEX_TYPE(cmd_buffer->state.index_type);
3966    switch (index_type) {
3967    case V_028A7C_VGT_INDEX_8:
3968       return 0xffu;
3969    case V_028A7C_VGT_INDEX_16:
3970       return 0xffffu;
3971    case V_028A7C_VGT_INDEX_32:
3972       return 0xffffffffu;
3973    default:
3974       unreachable("invalid index type");
3975    }
3976 }
3977
3978 static void
3979 si_emit_ia_multi_vgt_param(struct radv_cmd_buffer *cmd_buffer, bool instanced_draw,
3980                            bool indirect_draw, bool count_from_stream_output,
3981                            uint32_t draw_vertex_count)
3982 {
3983    struct radeon_info *info = &cmd_buffer->device->physical_device->rad_info;
3984    struct radv_cmd_state *state = &cmd_buffer->state;
3985    unsigned topology = state->dynamic.primitive_topology;
3986    bool prim_restart_enable = state->dynamic.primitive_restart_enable;
3987    struct radeon_cmdbuf *cs = cmd_buffer->cs;
3988    unsigned ia_multi_vgt_param;
3989
3990    ia_multi_vgt_param =
3991       si_get_ia_multi_vgt_param(cmd_buffer, instanced_draw, indirect_draw, count_from_stream_output,
3992                                 draw_vertex_count, topology, prim_restart_enable);
3993
3994    if (state->last_ia_multi_vgt_param != ia_multi_vgt_param) {
3995       if (info->gfx_level == GFX9) {
3996          radeon_set_uconfig_reg_idx(cmd_buffer->device->physical_device, cs,
3997                                     R_030960_IA_MULTI_VGT_PARAM, 4, ia_multi_vgt_param);
3998       } else if (info->gfx_level >= GFX7) {
3999          radeon_set_context_reg_idx(cs, R_028AA8_IA_MULTI_VGT_PARAM, 1, ia_multi_vgt_param);
4000       } else {
4001          radeon_set_context_reg(cs, R_028AA8_IA_MULTI_VGT_PARAM, ia_multi_vgt_param);
4002       }
4003       state->last_ia_multi_vgt_param = ia_multi_vgt_param;
4004    }
4005 }
4006
4007 static void
4008 radv_emit_draw_registers(struct radv_cmd_buffer *cmd_buffer, const struct radv_draw_info *draw_info)
4009 {
4010    struct radeon_info *info = &cmd_buffer->device->physical_device->rad_info;
4011    struct radv_cmd_state *state = &cmd_buffer->state;
4012    struct radeon_cmdbuf *cs = cmd_buffer->cs;
4013    uint32_t topology = state->dynamic.primitive_topology;
4014    bool disable_instance_packing = false;
4015
4016    /* Draw state. */
4017    if (info->gfx_level < GFX10) {
4018       si_emit_ia_multi_vgt_param(cmd_buffer, draw_info->instance_count > 1, draw_info->indirect,
4019                                  !!draw_info->strmout_buffer,
4020                                  draw_info->indirect ? 0 : draw_info->count);
4021    }
4022
4023    if (state->dynamic.primitive_restart_enable) {
4024       uint32_t primitive_reset_index = radv_get_primitive_reset_index(cmd_buffer);
4025
4026       if (primitive_reset_index != state->last_primitive_reset_index) {
4027          radeon_set_context_reg(cs, R_02840C_VGT_MULTI_PRIM_IB_RESET_INDX, primitive_reset_index);
4028          state->last_primitive_reset_index = primitive_reset_index;
4029       }
4030    }
4031
4032    if (draw_info->strmout_buffer) {
4033       uint64_t va = radv_buffer_get_va(draw_info->strmout_buffer->bo);
4034
4035       va += draw_info->strmout_buffer->offset + draw_info->strmout_buffer_offset;
4036
4037       radeon_set_context_reg(cs, R_028B30_VGT_STRMOUT_DRAW_OPAQUE_VERTEX_STRIDE, draw_info->stride);
4038
4039       if (info->gfx_level >= GFX10) {
4040          /* Emitting a COPY_DATA packet should be enough because RADV doesn't support preemption
4041           * (shadow memory), but for unknown reasons it can lead to GPU hangs on GFX10+.
4042           */
4043          radeon_emit(cs, PKT3(PKT3_PFP_SYNC_ME, 0, 0));
4044          radeon_emit(cs, 0);
4045
4046          radeon_emit(cs, PKT3(PKT3_LOAD_CONTEXT_REG_INDEX, 3, 0));
4047          radeon_emit(cs, va);
4048          radeon_emit(cs, va >> 32);
4049          radeon_emit(cs, (R_028B2C_VGT_STRMOUT_DRAW_OPAQUE_BUFFER_FILLED_SIZE - SI_CONTEXT_REG_OFFSET) >> 2);
4050          radeon_emit(cs, 1); /* 1 DWORD */
4051       } else {
4052          radeon_emit(cs, PKT3(PKT3_COPY_DATA, 4, 0));
4053          radeon_emit(cs, COPY_DATA_SRC_SEL(COPY_DATA_SRC_MEM) | COPY_DATA_DST_SEL(COPY_DATA_REG) |
4054                          COPY_DATA_WR_CONFIRM);
4055          radeon_emit(cs, va);
4056          radeon_emit(cs, va >> 32);
4057          radeon_emit(cs, R_028B2C_VGT_STRMOUT_DRAW_OPAQUE_BUFFER_FILLED_SIZE >> 2);
4058          radeon_emit(cs, 0); /* unused */
4059       }
4060
4061       radv_cs_add_buffer(cmd_buffer->device->ws, cs, draw_info->strmout_buffer->bo);
4062    }
4063
4064    /* RDNA2 is affected by a hardware bug: when instance packing is enabled for adjacent primitive
4065     * topologies and instance_count > 1, the pipeline stats generated by the GE are incorrect. The
4066     * workaround needs to be applied for both indexed and non-indexed draws.
4067     */
4068    if (info->gfx_level == GFX10_3 && state->active_pipeline_queries > 0 &&
4069        (draw_info->instance_count > 1 || draw_info->indirect) &&
4070        (topology == V_008958_DI_PT_LINELIST_ADJ || topology == V_008958_DI_PT_LINESTRIP_ADJ ||
4071         topology == V_008958_DI_PT_TRILIST_ADJ || topology == V_008958_DI_PT_TRISTRIP_ADJ)) {
4072       disable_instance_packing = true;
4073    }
4074
4075    if ((draw_info->indexed && state->index_type != state->last_index_type) ||
4076        (info->gfx_level == GFX10_3 &&
4077         (state->last_index_type == -1 ||
4078          disable_instance_packing != G_028A7C_DISABLE_INSTANCE_PACKING(state->last_index_type)))) {
4079       uint32_t index_type = state->index_type | S_028A7C_DISABLE_INSTANCE_PACKING(disable_instance_packing);
4080
4081       if (cmd_buffer->device->physical_device->rad_info.gfx_level >= GFX9) {
4082          radeon_set_uconfig_reg_idx(cmd_buffer->device->physical_device, cs,
4083                                     R_03090C_VGT_INDEX_TYPE, 2, index_type);
4084       } else {
4085          radeon_emit(cs, PKT3(PKT3_INDEX_TYPE, 0, 0));
4086          radeon_emit(cs, index_type);
4087       }
4088
4089       state->last_index_type = index_type;
4090    }
4091 }
4092
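/* Translate the source stage mask of a barrier into the partial flushes
 * (VS/PS/CS) required to wait for the corresponding work to finish. */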
4093 static void
4094 radv_stage_flush(struct radv_cmd_buffer *cmd_buffer, VkPipelineStageFlags2 src_stage_mask)
4095 {
4096    /* For simplicity, if the barrier wants to wait for the task shader,
4097     * just make it wait for the mesh shader too.
4098     */
4099    if (src_stage_mask & VK_PIPELINE_STAGE_2_TASK_SHADER_BIT_NV)
4100       src_stage_mask |= VK_PIPELINE_STAGE_2_MESH_SHADER_BIT_NV;
4101
4102    if (src_stage_mask & (VK_PIPELINE_STAGE_2_COPY_BIT |
4103                          VK_PIPELINE_STAGE_2_RESOLVE_BIT |
4104                          VK_PIPELINE_STAGE_2_BLIT_BIT |
4105                          VK_PIPELINE_STAGE_2_CLEAR_BIT)) {
4106       /* Be conservative for now. */
4107       src_stage_mask |= VK_PIPELINE_STAGE_2_ALL_TRANSFER_BIT;
4108    }
4109
4110    if (src_stage_mask &
4111        (VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT | VK_PIPELINE_STAGE_2_ALL_TRANSFER_BIT |
4112         VK_PIPELINE_STAGE_2_ACCELERATION_STRUCTURE_BUILD_BIT_KHR |
4113         VK_PIPELINE_STAGE_2_ACCELERATION_STRUCTURE_COPY_BIT_KHR |
4114         VK_PIPELINE_STAGE_2_RAY_TRACING_SHADER_BIT_KHR | VK_PIPELINE_STAGE_2_BOTTOM_OF_PIPE_BIT |
4115         VK_PIPELINE_STAGE_2_ALL_COMMANDS_BIT)) {
4116       cmd_buffer->state.flush_bits |= RADV_CMD_FLAG_CS_PARTIAL_FLUSH;
4117    }
4118
4119    if (src_stage_mask &
4120        (VK_PIPELINE_STAGE_2_FRAGMENT_SHADER_BIT | VK_PIPELINE_STAGE_2_EARLY_FRAGMENT_TESTS_BIT |
4121         VK_PIPELINE_STAGE_2_LATE_FRAGMENT_TESTS_BIT | VK_PIPELINE_STAGE_2_COLOR_ATTACHMENT_OUTPUT_BIT |
4122         VK_PIPELINE_STAGE_2_ALL_TRANSFER_BIT | VK_PIPELINE_STAGE_2_BOTTOM_OF_PIPE_BIT |
4123         VK_PIPELINE_STAGE_2_ALL_GRAPHICS_BIT | VK_PIPELINE_STAGE_2_ALL_COMMANDS_BIT)) {
4124       cmd_buffer->state.flush_bits |= RADV_CMD_FLAG_PS_PARTIAL_FLUSH;
4125    } else if (src_stage_mask &
4126               (VK_PIPELINE_STAGE_2_DRAW_INDIRECT_BIT | VK_PIPELINE_STAGE_2_VERTEX_INPUT_BIT |
4127                VK_PIPELINE_STAGE_2_VERTEX_SHADER_BIT |
4128                VK_PIPELINE_STAGE_2_TESSELLATION_CONTROL_SHADER_BIT |
4129                VK_PIPELINE_STAGE_2_TESSELLATION_EVALUATION_SHADER_BIT |
4130                VK_PIPELINE_STAGE_2_GEOMETRY_SHADER_BIT |
4131                VK_PIPELINE_STAGE_2_MESH_SHADER_BIT_NV |
4132                VK_PIPELINE_STAGE_2_TRANSFORM_FEEDBACK_BIT_EXT |
4133                VK_PIPELINE_STAGE_2_PRE_RASTERIZATION_SHADERS_BIT)) {
4134       cmd_buffer->state.flush_bits |= RADV_CMD_FLAG_VS_PARTIAL_FLUSH;
4135    }
4136 }
4137
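/* Whether the RBs are coherent with L2 (GFX9, or GFX10+ parts where the TCC
 * is not configured as RB-non-coherent), in which case L2 flushes for buffers
 * can be skipped. */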
4138 static bool
4139 can_skip_buffer_l2_flushes(struct radv_device *device)
4140 {
4141    return device->physical_device->rad_info.gfx_level == GFX9 ||
4142           (device->physical_device->rad_info.gfx_level >= GFX10 &&
4143            !device->physical_device->rad_info.tcc_rb_non_coherent);
4144 }
4145
4146 /*
4147  * In Vulkan, barriers have two kinds of operations:
4148  *
4149  * - visibility (implemented with radv_src_access_flush)
4150  * - availability (implemented with radv_dst_access_flush)
4151  *
4152  * For a memory operation to observe the result of a previous memory operation,
4153  * one needs to do a visibility operation on the source memory and then an
4154  * availability operation on the target memory.
4155  *
4156  * The complication is that the availability and visibility operations do not
4157  * need to be in the same barrier.
4158  *
4159  * The cleanest way to implement this is to define the visibility operation as
4160  * bringing the caches to a "state of rest" in which none of the caches below
4161  * that level are dirty.
4162  *
4163  * For GFX8 and earlier this would be VRAM/GTT with none of the caches dirty.
4164  *
4165  * For GFX9+ we can define the state at rest to be L2 instead of VRAM for all
4166  * buffers and for images marked as coherent, and VRAM/GTT for non-coherent
4167  * images. However, given the existence of memory barriers which do not specify
4168  * the image/buffer, it often devolves to just VRAM/GTT anyway.
4169  *
4170  * To help reduce invalidations on GPUs that have L2 coherency between the
4171  * RB and the shader caches, we always invalidate L2 on the src side, as we can
4172  * use our knowledge of past usage to optimize flushes away.
4173  */
4174
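/* Rough illustration of the two helpers below: a barrier with
 * srcAccessMask = COLOR_ATTACHMENT_WRITE and dstAccessMask = SHADER_READ
 * resolves to FLUSH_AND_INV_CB (plus CB_META when CB metadata is present) on
 * the src side, and to VCACHE (plus SCACHE and/or L2 depending on the
 * compiler backend and image coherency) invalidations on the dst side. */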
4175 enum radv_cmd_flush_bits
4176 radv_src_access_flush(struct radv_cmd_buffer *cmd_buffer, VkAccessFlags2 src_flags,
4177                       const struct radv_image *image)
4178 {
4179    bool has_CB_meta = true, has_DB_meta = true;
4180    bool image_is_coherent = image ? image->l2_coherent : false;
4181    enum radv_cmd_flush_bits flush_bits = 0;
4182
4183    if (image) {
4184       if (!radv_image_has_CB_metadata(image))
4185          has_CB_meta = false;
4186       if (!radv_image_has_htile(image))
4187          has_DB_meta = false;
4188    }
4189
4190    u_foreach_bit64(b, src_flags)
4191    {
4192       switch ((VkAccessFlags2)(1ull << b)) {
4193       case VK_ACCESS_2_SHADER_WRITE_BIT:
4194       case VK_ACCESS_2_SHADER_STORAGE_WRITE_BIT:
4195          /* Since the STORAGE bit isn't set, we know that this is a meta operation.
4196           * On the dst flush side we skip CB/DB flushes without the STORAGE bit, so
4197           * set them here. */
4198          if (image && !(image->vk.usage & VK_IMAGE_USAGE_STORAGE_BIT)) {
4199             if (vk_format_is_depth_or_stencil(image->vk.format)) {
4200                flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_DB;
4201             } else {
4202                flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_CB;
4203             }
4204          }
4205
4206          /* This is valid even for the rb_noncoherent_dirty case, because with how we account for
4207           * dirtyness, if it isn't dirty it doesn't contain the data at all and hence doesn't need
4208           * dirtiness, if it isn't dirty it doesn't contain the data at all and hence doesn't need
4209          if (!image_is_coherent)
4210             flush_bits |= RADV_CMD_FLAG_WB_L2;
4211          break;
4212       case VK_ACCESS_2_ACCELERATION_STRUCTURE_WRITE_BIT_KHR:
4213       case VK_ACCESS_2_TRANSFORM_FEEDBACK_WRITE_BIT_EXT:
4214       case VK_ACCESS_2_TRANSFORM_FEEDBACK_COUNTER_WRITE_BIT_EXT:
4215          if (!image_is_coherent)
4216             flush_bits |= RADV_CMD_FLAG_WB_L2;
4217          break;
4218       case VK_ACCESS_2_COLOR_ATTACHMENT_WRITE_BIT:
4219          flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_CB;
4220          if (has_CB_meta)
4221             flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_CB_META;
4222          break;
4223       case VK_ACCESS_2_DEPTH_STENCIL_ATTACHMENT_WRITE_BIT:
4224          flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_DB;
4225          if (has_DB_meta)
4226             flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_DB_META;
4227          break;
4228       case VK_ACCESS_2_TRANSFER_WRITE_BIT:
4229          flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_CB | RADV_CMD_FLAG_FLUSH_AND_INV_DB;
4230
4231          if (!image_is_coherent)
4232             flush_bits |= RADV_CMD_FLAG_INV_L2;
4233          if (has_CB_meta)
4234             flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_CB_META;
4235          if (has_DB_meta)
4236             flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_DB_META;
4237          break;
4238       case VK_ACCESS_2_MEMORY_WRITE_BIT:
4239          flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_CB | RADV_CMD_FLAG_FLUSH_AND_INV_DB;
4240
4241          if (!image_is_coherent)
4242             flush_bits |= RADV_CMD_FLAG_INV_L2;
4243          if (has_CB_meta)
4244             flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_CB_META;
4245          if (has_DB_meta)
4246             flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_DB_META;
4247          break;
4248       default:
4249          break;
4250       }
4251    }
4252    return flush_bits;
4253 }
4254
4255 enum radv_cmd_flush_bits
4256 radv_dst_access_flush(struct radv_cmd_buffer *cmd_buffer, VkAccessFlags2 dst_flags,
4257                       const struct radv_image *image)
4258 {
4259    bool has_CB_meta = true, has_DB_meta = true;
4260    enum radv_cmd_flush_bits flush_bits = 0;
4261    bool flush_CB = true, flush_DB = true;
4262    bool image_is_coherent = image ? image->l2_coherent : false;
4263
4264    if (image) {
4265       if (!(image->vk.usage & VK_IMAGE_USAGE_STORAGE_BIT)) {
4266          flush_CB = false;
4267          flush_DB = false;
4268       }
4269
4270       if (!radv_image_has_CB_metadata(image))
4271          has_CB_meta = false;
4272       if (!radv_image_has_htile(image))
4273          has_DB_meta = false;
4274    }
4275
4276    /* None of the L2 invalidations below affect the CB/DB caches. So if there are no incoherent
4277     * images in the L2 cache in CB/DB mode, they are already usable from all the other L2 clients. */
4278    image_is_coherent |=
4279       can_skip_buffer_l2_flushes(cmd_buffer->device) && !cmd_buffer->state.rb_noncoherent_dirty;
4280
4281    u_foreach_bit64(b, dst_flags)
4282    {
4283       switch ((VkAccessFlags2)(1ull << b)) {
4284       case VK_ACCESS_2_INDIRECT_COMMAND_READ_BIT:
4285          /* SMEM loads are used to read compute dispatch size in shaders */
4286          if (!cmd_buffer->device->load_grid_size_from_user_sgpr)
4287             flush_bits |= RADV_CMD_FLAG_INV_SCACHE;
4288
4289          /* Ensure the DGC meta shader can read the commands. */
4290          if (cmd_buffer->device->uses_device_generated_commands) {
4291             flush_bits |= RADV_CMD_FLAG_INV_SCACHE | RADV_CMD_FLAG_INV_VCACHE;
4292
4293             if (cmd_buffer->device->physical_device->rad_info.gfx_level < GFX9)
4294                flush_bits |= RADV_CMD_FLAG_INV_L2;
4295          }
4296
4297          break;
4298       case VK_ACCESS_2_INDEX_READ_BIT:
4299       case VK_ACCESS_2_TRANSFORM_FEEDBACK_COUNTER_WRITE_BIT_EXT:
4300          break;
4301       case VK_ACCESS_2_UNIFORM_READ_BIT:
4302          flush_bits |= RADV_CMD_FLAG_INV_VCACHE | RADV_CMD_FLAG_INV_SCACHE;
4303          break;
4304       case VK_ACCESS_2_VERTEX_ATTRIBUTE_READ_BIT:
4305       case VK_ACCESS_2_INPUT_ATTACHMENT_READ_BIT:
4306       case VK_ACCESS_2_TRANSFER_READ_BIT:
4307       case VK_ACCESS_2_TRANSFER_WRITE_BIT:
4308          flush_bits |= RADV_CMD_FLAG_INV_VCACHE;
4309
4310          if (has_CB_meta || has_DB_meta)
4311             flush_bits |= RADV_CMD_FLAG_INV_L2_METADATA;
4312          if (!image_is_coherent)
4313             flush_bits |= RADV_CMD_FLAG_INV_L2;
4314          break;
4315       case VK_ACCESS_2_SHADER_BINDING_TABLE_READ_BIT_KHR:
4316       case VK_ACCESS_2_SHADER_READ_BIT:
4317       case VK_ACCESS_2_SHADER_STORAGE_READ_BIT:
4318          flush_bits |= RADV_CMD_FLAG_INV_VCACHE;
4319          /* Unlike LLVM, ACO uses SMEM for SSBOs and we have to
4320           * invalidate the scalar cache. */
4321          if (!cmd_buffer->device->physical_device->use_llvm && !image)
4322             flush_bits |= RADV_CMD_FLAG_INV_SCACHE;
4323
4324          if (has_CB_meta || has_DB_meta)
4325             flush_bits |= RADV_CMD_FLAG_INV_L2_METADATA;
4326          if (!image_is_coherent)
4327             flush_bits |= RADV_CMD_FLAG_INV_L2;
4328          break;
4329       case VK_ACCESS_2_ACCELERATION_STRUCTURE_READ_BIT_KHR:
4330          flush_bits |= RADV_CMD_FLAG_INV_VCACHE;
4331          if (cmd_buffer->device->physical_device->rad_info.gfx_level < GFX9)
4332             flush_bits |= RADV_CMD_FLAG_INV_L2;
4333          break;
4334       case VK_ACCESS_2_SHADER_WRITE_BIT:
4335       case VK_ACCESS_2_SHADER_STORAGE_WRITE_BIT:
4336       case VK_ACCESS_2_ACCELERATION_STRUCTURE_WRITE_BIT_KHR:
4337          break;
4338       case VK_ACCESS_2_COLOR_ATTACHMENT_READ_BIT:
4339       case VK_ACCESS_2_COLOR_ATTACHMENT_WRITE_BIT:
4340          if (flush_CB)
4341             flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_CB;
4342          if (has_CB_meta)
4343             flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_CB_META;
4344          break;
4345       case VK_ACCESS_2_DEPTH_STENCIL_ATTACHMENT_READ_BIT:
4346       case VK_ACCESS_2_DEPTH_STENCIL_ATTACHMENT_WRITE_BIT:
4347          if (flush_DB)
4348             flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_DB;
4349          if (has_DB_meta)
4350             flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_DB_META;
4351          break;
4352       case VK_ACCESS_2_MEMORY_READ_BIT:
4353       case VK_ACCESS_2_MEMORY_WRITE_BIT:
4354          flush_bits |= RADV_CMD_FLAG_INV_VCACHE | RADV_CMD_FLAG_INV_SCACHE;
4355          if (!image_is_coherent)
4356             flush_bits |= RADV_CMD_FLAG_INV_L2;
4357          if (flush_CB)
4358             flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_CB;
4359          if (has_CB_meta)
4360             flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_CB_META;
4361          if (flush_DB)
4362             flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_DB;
4363          if (has_DB_meta)
4364             flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_DB_META;
4365          break;
4366       default:
4367          break;
4368       }
4369    }
4370    return flush_bits;
4371 }
4372
4373 void
4374 radv_emit_subpass_barrier(struct radv_cmd_buffer *cmd_buffer,
4375                           const struct radv_subpass_barrier *barrier)
4376 {
4377    struct radv_render_pass *pass = cmd_buffer->state.pass;
4378
4379    for (uint32_t i = 0; i < pass->attachment_count; i++) {
4380       struct radv_image_view *iview = cmd_buffer->state.attachments[i].iview;
4381
4382       cmd_buffer->state.flush_bits |=
4383          radv_src_access_flush(cmd_buffer, barrier->src_access_mask, iview->image);
4384    }
4385
4386    radv_stage_flush(cmd_buffer, barrier->src_stage_mask);
4387
4388    for (uint32_t i = 0; i < pass->attachment_count; i++) {
4389       struct radv_image_view *iview = cmd_buffer->state.attachments[i].iview;
4390
4391       cmd_buffer->state.flush_bits |=
4392          radv_dst_access_flush(cmd_buffer, barrier->dst_access_mask, iview->image);
4393    }
4394
4395    radv_ace_internal_barrier(cmd_buffer, barrier->src_stage_mask, barrier->dst_stage_mask);
4396 }
4397
4398 uint32_t
4399 radv_get_subpass_id(struct radv_cmd_buffer *cmd_buffer)
4400 {
4401    struct radv_cmd_state *state = &cmd_buffer->state;
4402    uint32_t subpass_id = state->subpass - state->pass->subpasses;
4403
4404    /* The id of this subpass shouldn't exceed the number of subpasses in
4405     * this render pass minus 1.
4406     */
4407    assert(subpass_id < state->pass->subpass_count);
4408    return subpass_id;
4409 }
4410
4411 static struct radv_sample_locations_state *
4412 radv_get_attachment_sample_locations(struct radv_cmd_buffer *cmd_buffer, uint32_t att_idx,
4413                                      bool begin_subpass)
4414 {
4415    struct radv_cmd_state *state = &cmd_buffer->state;
4416    uint32_t subpass_id = radv_get_subpass_id(cmd_buffer);
4417    struct radv_image_view *view = state->attachments[att_idx].iview;
4418
4419    if (view->image->info.samples == 1)
4420       return NULL;
4421
4422    if (state->pass->attachments[att_idx].first_subpass_idx == subpass_id) {
4423       /* Return the initial sample locations if this is the initial
4424        * layout transition of the given subpass attachment.
4425        */
4426       if (state->attachments[att_idx].sample_location.count > 0)
4427          return &state->attachments[att_idx].sample_location;
4428    } else {
4429       /* Otherwise return the subpass sample locations if defined. */
4430       if (state->subpass_sample_locs) {
4431          /* Because the driver sets the current subpass before
4432           * initial layout transitions, we should use the sample
4433           * locations from the previous subpass to avoid an
4434           * off-by-one problem. Otherwise, use the sample
4435           * locations for the current subpass for final layout
4436           * transitions.
4437           */
4438          if (begin_subpass)
4439             subpass_id--;
4440
4441          for (uint32_t i = 0; i < state->num_subpass_sample_locs; i++) {
4442             if (state->subpass_sample_locs[i].subpass_idx == subpass_id)
4443                return &state->subpass_sample_locs[i].sample_location;
4444          }
4445       }
4446    }
4447
4448    return NULL;
4449 }
4450
4451 static void
4452 radv_handle_image_transition_separate(struct radv_cmd_buffer *cmd_buffer, struct radv_image *image,
4453                                       VkImageLayout src_layout, VkImageLayout dst_layout,
4454                                       VkImageLayout src_stencil_layout,
4455                                       VkImageLayout dst_stencil_layout,
4456                                       uint32_t src_family_index, uint32_t dst_family_index,
4457                                       const VkImageSubresourceRange *range,
4458                                       struct radv_sample_locations_state *sample_locs)
4459 {
4460    /* If we have a stencil layout that's different from depth, we need to
4461     * perform the stencil transition separately.
4462     */
4463    if ((range->aspectMask & VK_IMAGE_ASPECT_STENCIL_BIT) &&
4464        (src_layout != src_stencil_layout || dst_layout != dst_stencil_layout)) {
4465       VkImageSubresourceRange aspect_range = *range;
4466       /* Depth-only transitions. */
4467       if (range->aspectMask & VK_IMAGE_ASPECT_DEPTH_BIT) {
4468          aspect_range.aspectMask = VK_IMAGE_ASPECT_DEPTH_BIT;
4469          radv_handle_image_transition(cmd_buffer, image, src_layout, dst_layout,
4470                                       src_family_index, dst_family_index, &aspect_range, sample_locs);
4471       }
4472
4473       /* Stencil-only transitions. */
4474       aspect_range.aspectMask = VK_IMAGE_ASPECT_STENCIL_BIT;
4475       radv_handle_image_transition(cmd_buffer, image, src_stencil_layout, dst_stencil_layout,
4476                                    src_family_index, dst_family_index, &aspect_range, sample_locs);
4477    } else {
4478       radv_handle_image_transition(cmd_buffer, image, src_layout, dst_layout,
4479                                    src_family_index, dst_family_index, range, sample_locs);
4480    }
4481 }
4482
4483 static void
4484 radv_handle_subpass_image_transition(struct radv_cmd_buffer *cmd_buffer,
4485                                      struct radv_subpass_attachment att, bool begin_subpass)
4486 {
4487    unsigned idx = att.attachment;
4488    struct radv_image_view *view = cmd_buffer->state.attachments[idx].iview;
4489    struct radv_sample_locations_state *sample_locs;
4490    VkImageSubresourceRange range;
4491    range.aspectMask = view->vk.aspects;
4492    range.baseMipLevel = view->vk.base_mip_level;
4493    range.levelCount = 1;
4494    range.baseArrayLayer = view->vk.base_array_layer;
4495    range.layerCount = cmd_buffer->state.framebuffer->layers;
4496
4497    if (cmd_buffer->state.subpass->view_mask) {
4498       /* If the current subpass uses multiview, the driver might have
4499        * performed a fast color/depth clear to the whole image
4500        * (including all layers). To make sure the driver will
4501        * decompress the image correctly (if needed), we have to
4502        * account for the "real" number of layers. If the view mask is
4503        * sparse, this will decompress more layers than needed.
4504        */
4505       range.layerCount = util_last_bit(cmd_buffer->state.subpass->view_mask);
4506    }
4507
4508    /* Get the subpass sample locations for the given attachment; if NULL
4509     * is returned, the driver will use the default HW locations.
4510     */
4511    sample_locs = radv_get_attachment_sample_locations(cmd_buffer, idx, begin_subpass);
4512
4513    radv_handle_image_transition_separate(cmd_buffer, view->image,
4514                                          cmd_buffer->state.attachments[idx].current_layout,
4515                                          att.layout,
4516                                          cmd_buffer->state.attachments[idx].current_stencil_layout,
4517                                          att.stencil_layout,
4518                                          0, 0, &range, sample_locs);
4519
4520    cmd_buffer->state.attachments[idx].current_layout = att.layout;
4521    cmd_buffer->state.attachments[idx].current_stencil_layout = att.stencil_layout;
4522 }
4523
4524 static VkResult
4525 radv_cmd_state_setup_sample_locations(struct radv_cmd_buffer *cmd_buffer,
4526                                       struct radv_render_pass *pass,
4527                                       const VkRenderPassBeginInfo *info)
4528 {
4529    const struct VkRenderPassSampleLocationsBeginInfoEXT *sample_locs =
4530       vk_find_struct_const(info->pNext, RENDER_PASS_SAMPLE_LOCATIONS_BEGIN_INFO_EXT);
4531    struct radv_cmd_state *state = &cmd_buffer->state;
4532
4533    if (!sample_locs) {
4534       state->subpass_sample_locs = NULL;
4535       return VK_SUCCESS;
4536    }
4537
4538    for (uint32_t i = 0; i < sample_locs->attachmentInitialSampleLocationsCount; i++) {
4539       const VkAttachmentSampleLocationsEXT *att_sample_locs =
4540          &sample_locs->pAttachmentInitialSampleLocations[i];
4541       uint32_t att_idx = att_sample_locs->attachmentIndex;
4542       struct radv_image *image = cmd_buffer->state.attachments[att_idx].iview->image;
4543
4544       assert(vk_format_is_depth_or_stencil(image->vk.format));
4545
4546       /* From the Vulkan spec 1.1.108:
4547        *
4548        * "If the image referenced by the framebuffer attachment at
4549        *  index attachmentIndex was not created with
4550        *  VK_IMAGE_CREATE_SAMPLE_LOCATIONS_COMPATIBLE_DEPTH_BIT_EXT
4551        *  then the values specified in sampleLocationsInfo are
4552        *  ignored."
4553        */
4554       if (!(image->vk.create_flags & VK_IMAGE_CREATE_SAMPLE_LOCATIONS_COMPATIBLE_DEPTH_BIT_EXT))
4555          continue;
4556
4557       const VkSampleLocationsInfoEXT *sample_locs_info = &att_sample_locs->sampleLocationsInfo;
4558
4559       state->attachments[att_idx].sample_location.per_pixel =
4560          sample_locs_info->sampleLocationsPerPixel;
4561       state->attachments[att_idx].sample_location.grid_size =
4562          sample_locs_info->sampleLocationGridSize;
4563       state->attachments[att_idx].sample_location.count = sample_locs_info->sampleLocationsCount;
4564       typed_memcpy(&state->attachments[att_idx].sample_location.locations[0],
4565                    sample_locs_info->pSampleLocations, sample_locs_info->sampleLocationsCount);
4566    }
4567
4568    state->subpass_sample_locs =
4569       vk_alloc(&cmd_buffer->pool->vk.alloc,
4570                sample_locs->postSubpassSampleLocationsCount * sizeof(state->subpass_sample_locs[0]),
4571                8, VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
4572    if (state->subpass_sample_locs == NULL) {
4573       return vk_command_buffer_set_error(&cmd_buffer->vk, VK_ERROR_OUT_OF_HOST_MEMORY);
4574    }
4575
4576    state->num_subpass_sample_locs = sample_locs->postSubpassSampleLocationsCount;
4577
4578    for (uint32_t i = 0; i < sample_locs->postSubpassSampleLocationsCount; i++) {
4579       const VkSubpassSampleLocationsEXT *subpass_sample_locs_info =
4580          &sample_locs->pPostSubpassSampleLocations[i];
4581       const VkSampleLocationsInfoEXT *sample_locs_info =
4582          &subpass_sample_locs_info->sampleLocationsInfo;
4583
4584       state->subpass_sample_locs[i].subpass_idx = subpass_sample_locs_info->subpassIndex;
4585       state->subpass_sample_locs[i].sample_location.per_pixel =
4586          sample_locs_info->sampleLocationsPerPixel;
4587       state->subpass_sample_locs[i].sample_location.grid_size =
4588          sample_locs_info->sampleLocationGridSize;
4589       state->subpass_sample_locs[i].sample_location.count = sample_locs_info->sampleLocationsCount;
4590       typed_memcpy(&state->subpass_sample_locs[i].sample_location.locations[0],
4591                    sample_locs_info->pSampleLocations, sample_locs_info->sampleLocationsCount);
4592    }
4593
4594    return VK_SUCCESS;
4595 }
4596
4597 static VkResult
4598 radv_cmd_state_setup_attachments(struct radv_cmd_buffer *cmd_buffer, struct radv_render_pass *pass,
4599                                  const VkRenderPassBeginInfo *info)
4600 {
4601    struct radv_cmd_state *state = &cmd_buffer->state;
4602    const struct VkRenderPassAttachmentBeginInfo *attachment_info = NULL;
4603
4604    if (info) {
4605       attachment_info = vk_find_struct_const(info->pNext, RENDER_PASS_ATTACHMENT_BEGIN_INFO);
4606    }
4607
4608    if (pass->attachment_count == 0) {
4609       state->attachments = NULL;
4610       return VK_SUCCESS;
4611    }
4612
4613    state->attachments =
4614       vk_alloc(&cmd_buffer->pool->vk.alloc, pass->attachment_count * sizeof(state->attachments[0]),
4615                8, VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
4616    if (state->attachments == NULL) {
4617       return vk_command_buffer_set_error(&cmd_buffer->vk, VK_ERROR_OUT_OF_HOST_MEMORY);
4618    }
4619
4620    for (uint32_t i = 0; i < pass->attachment_count; ++i) {
4621       struct radv_render_pass_attachment *att = &pass->attachments[i];
4622       VkImageAspectFlags att_aspects = vk_format_aspects(att->format);
4623       VkImageAspectFlags clear_aspects = 0;
4624
4625       if (att_aspects == VK_IMAGE_ASPECT_COLOR_BIT) {
4626          /* color attachment */
4627          if (att->load_op == VK_ATTACHMENT_LOAD_OP_CLEAR) {
4628             clear_aspects |= VK_IMAGE_ASPECT_COLOR_BIT;
4629          }
4630       } else {
4631          /* depthstencil attachment */
4632          if ((att_aspects & VK_IMAGE_ASPECT_DEPTH_BIT) &&
4633              att->load_op == VK_ATTACHMENT_LOAD_OP_CLEAR) {
4634             clear_aspects |= VK_IMAGE_ASPECT_DEPTH_BIT;
4635             if ((att_aspects & VK_IMAGE_ASPECT_STENCIL_BIT) &&
4636                 att->stencil_load_op == VK_ATTACHMENT_LOAD_OP_DONT_CARE)
4637                clear_aspects |= VK_IMAGE_ASPECT_STENCIL_BIT;
4638          }
4639          if ((att_aspects & VK_IMAGE_ASPECT_STENCIL_BIT) &&
4640              att->stencil_load_op == VK_ATTACHMENT_LOAD_OP_CLEAR) {
4641             clear_aspects |= VK_IMAGE_ASPECT_STENCIL_BIT;
4642          }
4643       }
4644
4645       state->attachments[i].pending_clear_aspects = clear_aspects;
4646       state->attachments[i].cleared_views = 0;
4647       if (clear_aspects && info) {
4648          assert(info->clearValueCount > i);
4649          state->attachments[i].clear_value = info->pClearValues[i];
4650       }
4651
4652       state->attachments[i].current_layout = att->initial_layout;
4653       state->attachments[i].current_stencil_layout = att->stencil_initial_layout;
4654       state->attachments[i].sample_location.count = 0;
4655
4656       struct radv_image_view *iview;
4657       if (attachment_info && attachment_info->attachmentCount > i) {
4658          iview = radv_image_view_from_handle(attachment_info->pAttachments[i]);
4659       } else {
4660          iview = radv_image_view_from_handle(state->framebuffer->attachments[i]);
4661       }
4662
4663       state->attachments[i].iview = iview;
4664       if (iview->vk.aspects & (VK_IMAGE_ASPECT_DEPTH_BIT | VK_IMAGE_ASPECT_STENCIL_BIT)) {
4665          radv_initialise_ds_surface(cmd_buffer->device, &state->attachments[i].ds, iview);
4666       } else {
4667          radv_initialise_color_surface(cmd_buffer->device, &state->attachments[i].cb, iview);
4668       }
4669    }
4670
4671    return VK_SUCCESS;
4672 }
4673
4674 VKAPI_ATTR VkResult VKAPI_CALL
4675 radv_AllocateCommandBuffers(VkDevice _device, const VkCommandBufferAllocateInfo *pAllocateInfo,
4676                             VkCommandBuffer *pCommandBuffers)
4677 {
4678    RADV_FROM_HANDLE(radv_device, device, _device);
4679    RADV_FROM_HANDLE(radv_cmd_pool, pool, pAllocateInfo->commandPool);
4680
4681    VkResult result = VK_SUCCESS;
4682    uint32_t i;
4683
4684    for (i = 0; i < pAllocateInfo->commandBufferCount; i++) {
4685
4686       if (!list_is_empty(&pool->free_cmd_buffers)) {
4687          struct radv_cmd_buffer *cmd_buffer =
4688             list_first_entry(&pool->free_cmd_buffers, struct radv_cmd_buffer, pool_link);
4689
4690          list_del(&cmd_buffer->pool_link);
4691          list_addtail(&cmd_buffer->pool_link, &pool->cmd_buffers);
4692
4693          result = radv_reset_cmd_buffer(cmd_buffer);
4694          vk_command_buffer_finish(&cmd_buffer->vk);
4695          VkResult init_result =
4696             vk_command_buffer_init(&cmd_buffer->vk, &pool->vk, pAllocateInfo->level);
4697          if (init_result != VK_SUCCESS)
4698             result = init_result;
4699
4700          pCommandBuffers[i] = radv_cmd_buffer_to_handle(cmd_buffer);
4701       } else {
4702          result = radv_create_cmd_buffer(device, pool, pAllocateInfo->level, &pCommandBuffers[i]);
4703       }
4704       if (result != VK_SUCCESS)
4705          break;
4706    }
4707
4708    if (result != VK_SUCCESS) {
4709       radv_FreeCommandBuffers(_device, pAllocateInfo->commandPool, i, pCommandBuffers);
4710
4711       /* From the Vulkan 1.0.66 spec:
4712        *
4713        * "vkAllocateCommandBuffers can be used to create multiple
4714        *  command buffers. If the creation of any of those command
4715        *  buffers fails, the implementation must destroy all
4716        *  successfully created command buffer objects from this
4717        *  command, set all entries of the pCommandBuffers array to
4718        *  NULL and return the error."
4719        */
4720       memset(pCommandBuffers, 0, sizeof(*pCommandBuffers) * pAllocateInfo->commandBufferCount);
4721    }
4722
4723    return result;
4724 }
4725
4726 VKAPI_ATTR void VKAPI_CALL
4727 radv_FreeCommandBuffers(VkDevice device, VkCommandPool commandPool, uint32_t commandBufferCount,
4728                         const VkCommandBuffer *pCommandBuffers)
4729 {
4730    RADV_FROM_HANDLE(radv_cmd_pool, pool, commandPool);
4731
4732    for (uint32_t i = 0; i < commandBufferCount; i++) {
4733       RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, pCommandBuffers[i]);
4734
4735       if (!cmd_buffer)
4736          continue;
4737       assert(cmd_buffer->pool == pool);
4738
4739       list_del(&cmd_buffer->pool_link);
4740       list_addtail(&cmd_buffer->pool_link, &pool->free_cmd_buffers);
4741    }
4742 }
4743
4744 VKAPI_ATTR VkResult VKAPI_CALL
4745 radv_ResetCommandBuffer(VkCommandBuffer commandBuffer, VkCommandBufferResetFlags flags)
4746 {
4747    RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
4748    return radv_reset_cmd_buffer(cmd_buffer);
4749 }
4750
4751 static void
4752 radv_inherit_dynamic_rendering(struct radv_cmd_buffer *cmd_buffer,
4753                                const VkCommandBufferInheritanceInfo *inherit_info,
4754                                const VkCommandBufferInheritanceRenderingInfo *dyn_info)
4755 {
4756    const VkAttachmentSampleCountInfoAMD *sample_info =
4757       vk_find_struct_const(inherit_info->pNext, ATTACHMENT_SAMPLE_COUNT_INFO_AMD);
4758    VkResult result;
4759    /* (normal + resolve) for color attachments, plus a depth/stencil attachment and a VRS attachment */
4760    VkAttachmentDescription2 att_desc[MAX_RTS * 2 + 3];
4761    VkAttachmentReference2 color_refs[MAX_RTS], ds_ref;
4762    unsigned att_count = 0;
4763
4764    VkSubpassDescription2 subpass = {
4765       .sType = VK_STRUCTURE_TYPE_SUBPASS_DESCRIPTION_2,
4766       .pipelineBindPoint = VK_PIPELINE_BIND_POINT_GRAPHICS,
4767       .viewMask = dyn_info->viewMask,
4768       .colorAttachmentCount = dyn_info->colorAttachmentCount,
4769       .pColorAttachments = color_refs,
4770    };
4771
4772    for (unsigned i = 0; i < dyn_info->colorAttachmentCount; ++i) {
4773       if (dyn_info->pColorAttachmentFormats[i] == VK_FORMAT_UNDEFINED) {
4774          color_refs[i] = (VkAttachmentReference2){
4775             .sType = VK_STRUCTURE_TYPE_ATTACHMENT_REFERENCE_2,
4776             .attachment = VK_ATTACHMENT_UNUSED,
4777          };
4778          continue;
4779       }
4780
4781       color_refs[i] = (VkAttachmentReference2){
4782          .sType = VK_STRUCTURE_TYPE_ATTACHMENT_REFERENCE_2,
4783          .attachment = att_count,
4784          .layout = VK_IMAGE_LAYOUT_GENERAL, /* Shouldn't be used */
4785          .aspectMask = 0,                   /* Shouldn't be used */
4786       };
4787
4788       VkAttachmentDescription2 *att = att_desc + att_count++;
4789       memset(att, 0, sizeof(*att));
4790       att->sType = VK_STRUCTURE_TYPE_ATTACHMENT_DESCRIPTION_2;
4791       att->format = dyn_info->pColorAttachmentFormats[i];
4792       att->samples =
4793          sample_info ? sample_info->pColorAttachmentSamples[i] : dyn_info->rasterizationSamples;
4794       att->loadOp = VK_ATTACHMENT_LOAD_OP_LOAD;
4795       att->storeOp = VK_ATTACHMENT_STORE_OP_STORE;
4796       att->initialLayout = VK_IMAGE_LAYOUT_GENERAL;
4797       att->finalLayout = VK_IMAGE_LAYOUT_GENERAL;
4798    }
4799
4800    if (dyn_info->depthAttachmentFormat != VK_FORMAT_UNDEFINED ||
4801        dyn_info->stencilAttachmentFormat != VK_FORMAT_UNDEFINED) {
4802       VkFormat fmt = dyn_info->depthAttachmentFormat != VK_FORMAT_UNDEFINED
4803                         ? dyn_info->depthAttachmentFormat
4804                         : dyn_info->stencilAttachmentFormat;
4805
4806       ds_ref = (VkAttachmentReference2){
4807          .sType = VK_STRUCTURE_TYPE_ATTACHMENT_REFERENCE_2,
4808          .attachment = att_count,
4809          .layout = VK_IMAGE_LAYOUT_GENERAL, /* Shouldn't be used */
4810          .aspectMask = 0,                   /* Shouldn't be used */
4811       };
4812       subpass.pDepthStencilAttachment = &ds_ref;
4813
4814       VkAttachmentDescription2 *att = att_desc + att_count++;
4815
4816       memset(att, 0, sizeof(*att));
4817       att->sType = VK_STRUCTURE_TYPE_ATTACHMENT_DESCRIPTION_2;
4818       att->format = fmt;
4819       att->samples =
4820          sample_info ? sample_info->depthStencilAttachmentSamples : dyn_info->rasterizationSamples;
4821       att->loadOp = VK_ATTACHMENT_LOAD_OP_LOAD;
4822       att->storeOp = VK_ATTACHMENT_STORE_OP_STORE;
4823       att->stencilLoadOp = VK_ATTACHMENT_LOAD_OP_LOAD;
4824       att->stencilStoreOp = VK_ATTACHMENT_STORE_OP_STORE;
4825    }
4826
4827    VkRenderPassCreateInfo2 rp_create_info = {
4828       .sType = VK_STRUCTURE_TYPE_RENDER_PASS_CREATE_INFO_2,
4829       .attachmentCount = att_count,
4830       .pAttachments = att_desc,
4831       .subpassCount = 1,
4832       .pSubpasses = &subpass,
4833    };
4834
4835    VkRenderPass rp;
4836    result =
4837       radv_CreateRenderPass2(radv_device_to_handle(cmd_buffer->device), &rp_create_info, NULL, &rp);
4838    if (result != VK_SUCCESS) {
4839       vk_command_buffer_set_error(&cmd_buffer->vk, result);
4840       return;
4841    }
4842
4843    cmd_buffer->state.pass = radv_render_pass_from_handle(rp);
4844    cmd_buffer->state.own_render_pass = true;
4845 }
4846
4847 VKAPI_ATTR VkResult VKAPI_CALL
4848 radv_BeginCommandBuffer(VkCommandBuffer commandBuffer, const VkCommandBufferBeginInfo *pBeginInfo)
4849 {
4850    RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
4851    VkResult result = VK_SUCCESS;
4852
4853    if (cmd_buffer->status != RADV_CMD_BUFFER_STATUS_INITIAL) {
4854       /* If the command buffer has already been reset with
4855        * vkResetCommandBuffer, no need to do it again.
4856        */
4857       result = radv_reset_cmd_buffer(cmd_buffer);
4858       if (result != VK_SUCCESS)
4859          return result;
4860    }
4861
4862    memset(&cmd_buffer->state, 0, sizeof(cmd_buffer->state));
4863    cmd_buffer->state.last_primitive_reset_en = -1;
4864    cmd_buffer->state.last_index_type = -1;
4865    cmd_buffer->state.last_num_instances = -1;
4866    cmd_buffer->state.last_vertex_offset = -1;
4867    cmd_buffer->state.last_first_instance = -1;
4868    cmd_buffer->state.last_drawid = -1;
4869    cmd_buffer->state.last_subpass_color_count = MAX_RTS;
4870    cmd_buffer->state.predication_type = -1;
4871    cmd_buffer->state.last_sx_ps_downconvert = -1;
4872    cmd_buffer->state.last_sx_blend_opt_epsilon = -1;
4873    cmd_buffer->state.last_sx_blend_opt_control = -1;
4874    cmd_buffer->state.last_nggc_settings = -1;
4875    cmd_buffer->state.last_nggc_settings_sgpr_idx = -1;
4876    cmd_buffer->state.mesh_shading = false;
4877    cmd_buffer->state.last_vrs_rates = -1;
4878    cmd_buffer->state.last_vrs_rates_sgpr_idx = -1;
4879    cmd_buffer->usage_flags = pBeginInfo->flags;
4880
4881    if (cmd_buffer->vk.level == VK_COMMAND_BUFFER_LEVEL_SECONDARY &&
4882        (pBeginInfo->flags & VK_COMMAND_BUFFER_USAGE_RENDER_PASS_CONTINUE_BIT)) {
4883       struct radv_subpass *subpass = NULL;
4884
4885       assert(pBeginInfo->pInheritanceInfo);
4886
4887       cmd_buffer->state.framebuffer =
4888          vk_framebuffer_from_handle(pBeginInfo->pInheritanceInfo->framebuffer);
4889
4890       if (pBeginInfo->pInheritanceInfo->renderPass) {
4891          cmd_buffer->state.pass =
4892             radv_render_pass_from_handle(pBeginInfo->pInheritanceInfo->renderPass);
4893          assert(pBeginInfo->pInheritanceInfo->subpass < cmd_buffer->state.pass->subpass_count);
4894          subpass = &cmd_buffer->state.pass->subpasses[pBeginInfo->pInheritanceInfo->subpass];
4895       } else {
4896          const VkCommandBufferInheritanceRenderingInfo *dyn_info =
4897             vk_find_struct_const(pBeginInfo->pInheritanceInfo->pNext,
4898                                  COMMAND_BUFFER_INHERITANCE_RENDERING_INFO);
4899          if (dyn_info) {
4900             radv_inherit_dynamic_rendering(cmd_buffer, pBeginInfo->pInheritanceInfo, dyn_info);
4901             subpass = &cmd_buffer->state.pass->subpasses[0];
4902          }
4903       }
4904
4905       if (cmd_buffer->state.framebuffer) {
4906          result = radv_cmd_state_setup_attachments(cmd_buffer, cmd_buffer->state.pass, NULL);
4907          if (result != VK_SUCCESS)
4908             return result;
4909       }
4910
4911       cmd_buffer->state.inherited_pipeline_statistics =
4912          pBeginInfo->pInheritanceInfo->pipelineStatistics;
4913
4914       if (cmd_buffer->state.pass) {
4915          cmd_buffer->state.subpass = subpass;
4916          if (cmd_buffer->state.framebuffer)
4917             cmd_buffer->state.dirty |= RADV_CMD_DIRTY_FRAMEBUFFER;
4918       }
4919    }
4920
4921    if (unlikely(cmd_buffer->device->trace_bo))
4922       radv_cmd_buffer_trace_emit(cmd_buffer);
4923
4924    radv_describe_begin_cmd_buffer(cmd_buffer);
4925
4926    cmd_buffer->status = RADV_CMD_BUFFER_STATUS_RECORDING;
4927
4928    return result;
4929 }
4930
4931 VKAPI_ATTR void VKAPI_CALL
4932 radv_CmdBindVertexBuffers2(VkCommandBuffer commandBuffer, uint32_t firstBinding,
4933                            uint32_t bindingCount, const VkBuffer *pBuffers,
4934                            const VkDeviceSize *pOffsets, const VkDeviceSize *pSizes,
4935                            const VkDeviceSize *pStrides)
4936 {
4937    RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
4938    struct radv_vertex_binding *vb = cmd_buffer->vertex_bindings;
4939    const struct radv_vs_input_state *state = &cmd_buffer->state.dynamic_vs_input;
4940
4941    /* We have to defer setting up vertex buffers since we need the buffer
4942     * stride from the pipeline. */
4943
4944    assert(firstBinding + bindingCount <= MAX_VBS);
4945    enum amd_gfx_level chip = cmd_buffer->device->physical_device->rad_info.gfx_level;
4946
4947    if (firstBinding + bindingCount > cmd_buffer->used_vertex_bindings)
4948       cmd_buffer->used_vertex_bindings = firstBinding + bindingCount;
4949
4950    uint32_t misaligned_mask_invalid = 0;
4951
4952    for (uint32_t i = 0; i < bindingCount; i++) {
4953       RADV_FROM_HANDLE(radv_buffer, buffer, pBuffers[i]);
4954       uint32_t idx = firstBinding + i;
4955       VkDeviceSize size = pSizes ? pSizes[i] : 0;
4956       /* If pStrides is NULL, it shouldn't overwrite the strides specified by CmdSetVertexInputEXT. */
4957       VkDeviceSize stride = pStrides ? pStrides[i] : vb[idx].stride;
4958
4959       if (!!cmd_buffer->vertex_binding_buffers[idx] != !!buffer ||
4960           (buffer && ((vb[idx].offset & 0x3) != (pOffsets[i] & 0x3) ||
4961                       (vb[idx].stride & 0x3) != (stride & 0x3)))) {
4962          misaligned_mask_invalid |= state->bindings_match_attrib ? BITFIELD_BIT(idx) : 0xffffffff;
4963       }
4964
4965       cmd_buffer->vertex_binding_buffers[idx] = buffer;
4966       vb[idx].offset = pOffsets[i];
4967       vb[idx].size = size;
4968       vb[idx].stride = stride;
4969
4970       uint32_t bit = BITFIELD_BIT(idx);
4971       if (buffer) {
4972          radv_cs_add_buffer(cmd_buffer->device->ws, cmd_buffer->cs, cmd_buffer->vertex_binding_buffers[idx]->bo);
4973          cmd_buffer->state.vbo_bound_mask |= bit;
4974       } else {
4975          cmd_buffer->state.vbo_bound_mask &= ~bit;
4976       }
4977    }
4978
4979    if ((chip == GFX6 || chip >= GFX10) && misaligned_mask_invalid) {
4980       cmd_buffer->state.vbo_misaligned_mask_invalid = misaligned_mask_invalid;
4981       cmd_buffer->state.vbo_misaligned_mask &= ~misaligned_mask_invalid;
4982    }
4983
4984    cmd_buffer->state.dirty |= RADV_CMD_DIRTY_VERTEX_BUFFER |
4985                               RADV_CMD_DIRTY_DYNAMIC_VERTEX_INPUT;
4986 }
4987
4988 static uint32_t
4989 vk_to_index_type(VkIndexType type)
4990 {
4991    switch (type) {
4992    case VK_INDEX_TYPE_UINT8_EXT:
4993       return V_028A7C_VGT_INDEX_8;
4994    case VK_INDEX_TYPE_UINT16:
4995       return V_028A7C_VGT_INDEX_16;
4996    case VK_INDEX_TYPE_UINT32:
4997       return V_028A7C_VGT_INDEX_32;
4998    default:
4999       unreachable("invalid index type");
5000    }
5001 }
5002
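/* Return the index size in bytes for a VGT index type value. */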
5003 uint32_t
5004 radv_get_vgt_index_size(uint32_t type)
5005 {
5006    uint32_t index_type = G_028A7C_INDEX_TYPE(type);
5007    switch (index_type) {
5008    case V_028A7C_VGT_INDEX_8:
5009       return 1;
5010    case V_028A7C_VGT_INDEX_16:
5011       return 2;
5012    case V_028A7C_VGT_INDEX_32:
5013       return 4;
5014    default:
5015       unreachable("invalid index type");
5016    }
5017 }
5018
5019 VKAPI_ATTR void VKAPI_CALL
5020 radv_CmdBindIndexBuffer(VkCommandBuffer commandBuffer, VkBuffer buffer, VkDeviceSize offset,
5021                         VkIndexType indexType)
5022 {
5023    RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
5024    RADV_FROM_HANDLE(radv_buffer, index_buffer, buffer);
5025
5026    cmd_buffer->state.index_buffer = index_buffer;
5027    cmd_buffer->state.index_offset = offset;
5028    cmd_buffer->state.index_type = vk_to_index_type(indexType);
5029    cmd_buffer->state.index_va = radv_buffer_get_va(index_buffer->bo);
5030    cmd_buffer->state.index_va += index_buffer->offset + offset;
5031
5032    int index_size = radv_get_vgt_index_size(vk_to_index_type(indexType));
5033    cmd_buffer->state.max_index_count =
5034       (vk_buffer_range(&index_buffer->vk, offset, VK_WHOLE_SIZE)) / index_size;
5035    cmd_buffer->state.dirty |= RADV_CMD_DIRTY_INDEX_BUFFER;
5036    radv_cs_add_buffer(cmd_buffer->device->ws, cmd_buffer->cs, index_buffer->bo);
5037 }
5038
5039 static void
5040 radv_bind_descriptor_set(struct radv_cmd_buffer *cmd_buffer, VkPipelineBindPoint bind_point,
5041                          struct radv_descriptor_set *set, unsigned idx)
5042 {
5043    struct radeon_winsys *ws = cmd_buffer->device->ws;
5044
5045    radv_set_descriptor_set(cmd_buffer, bind_point, set, idx);
5046
5047    assert(set);
5048    assert(!(set->header.layout->flags & VK_DESCRIPTOR_SET_LAYOUT_CREATE_PUSH_DESCRIPTOR_BIT_KHR));
5049
5050    if (!cmd_buffer->device->use_global_bo_list) {
5051       for (unsigned j = 0; j < set->header.buffer_count; ++j)
5052          if (set->descriptors[j])
5053             radv_cs_add_buffer(ws, cmd_buffer->cs, set->descriptors[j]);
5054    }
5055
5056    if (set->header.bo)
5057       radv_cs_add_buffer(ws, cmd_buffer->cs, set->header.bo);
5058 }
5059
5060 VKAPI_ATTR void VKAPI_CALL
5061 radv_CmdBindDescriptorSets(VkCommandBuffer commandBuffer, VkPipelineBindPoint pipelineBindPoint,
5062                            VkPipelineLayout _layout, uint32_t firstSet, uint32_t descriptorSetCount,
5063                            const VkDescriptorSet *pDescriptorSets, uint32_t dynamicOffsetCount,
5064                            const uint32_t *pDynamicOffsets)
5065 {
5066    RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
5067    RADV_FROM_HANDLE(radv_pipeline_layout, layout, _layout);
5068    unsigned dyn_idx = 0;
5069
5070    const bool no_dynamic_bounds =
5071       cmd_buffer->device->instance->debug_flags & RADV_DEBUG_NO_DYNAMIC_BOUNDS;
5072    struct radv_descriptor_state *descriptors_state =
5073       radv_get_descriptors_state(cmd_buffer, pipelineBindPoint);
5074
5075    for (unsigned i = 0; i < descriptorSetCount; ++i) {
5076       unsigned set_idx = i + firstSet;
5077       RADV_FROM_HANDLE(radv_descriptor_set, set, pDescriptorSets[i]);
5078
5079       if (!set)
5080          continue;
5081
5082       /* If the set is already bound we only need to update the
5083        * (potentially changed) dynamic offsets. */
5084       if (descriptors_state->sets[set_idx] != set ||
5085           !(descriptors_state->valid & (1u << set_idx))) {
5086          radv_bind_descriptor_set(cmd_buffer, pipelineBindPoint, set, set_idx);
5087       }
5088
5089       for (unsigned j = 0; j < set->header.layout->dynamic_offset_count; ++j, ++dyn_idx) {
5090          unsigned idx = j + layout->set[i + firstSet].dynamic_offset_start;
5091          uint32_t *dst = descriptors_state->dynamic_buffers + idx * 4;
5092          assert(dyn_idx < dynamicOffsetCount);
5093
5094          struct radv_descriptor_range *range = set->header.dynamic_descriptors + j;
5095
5096          if (!range->va) {
5097             memset(dst, 0, 4 * 4);
5098          } else {
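            /* Build a 4-dword buffer descriptor for the dynamic buffer,
             * applying the dynamic offset to the base address. */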
5099             uint64_t va = range->va + pDynamicOffsets[dyn_idx];
5100             dst[0] = va;
5101             dst[1] = S_008F04_BASE_ADDRESS_HI(va >> 32);
5102             dst[2] = no_dynamic_bounds ? 0xffffffffu : range->size;
5103             dst[3] = S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) | S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) |
5104                      S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) | S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_W);
5105
5106             if (cmd_buffer->device->physical_device->rad_info.gfx_level >= GFX11) {
5107                dst[3] |= S_008F0C_FORMAT(V_008F0C_GFX11_FORMAT_32_FLOAT) |
5108                          S_008F0C_OOB_SELECT(V_008F0C_OOB_SELECT_RAW);
5109             } else if (cmd_buffer->device->physical_device->rad_info.gfx_level >= GFX10) {
5110                dst[3] |= S_008F0C_FORMAT(V_008F0C_GFX10_FORMAT_32_FLOAT) |
5111                          S_008F0C_OOB_SELECT(V_008F0C_OOB_SELECT_RAW) | S_008F0C_RESOURCE_LEVEL(1);
5112             } else {
5113                dst[3] |= S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_FLOAT) |
5114                          S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32);
5115             }
5116          }
5117
5118          cmd_buffer->push_constant_stages |= set->header.layout->dynamic_shader_stages;
5119       }
5120    }
5121 }
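
/* Illustrative sketch (not driver code) of how one dynamic uniform/storage
 * buffer ends up encoded in the four descriptor dwords written above, using
 * made-up values: a range with va = 0x1_0000_1000 and size = 0x100, a dynamic
 * offset of 256, on a GFX10 part:
 *
 *    dst[0] = 0x00001100;                    // low 32 bits of va + offset
 *    dst[1] = S_008F04_BASE_ADDRESS_HI(0x1); // high bits of the address
 *    dst[2] = 0x100;                         // range size (~0u with RADV_DEBUG_NO_DYNAMIC_BOUNDS)
 *    dst[3] = XYZW identity swizzle | 32_FLOAT format | OOB_SELECT_RAW | RESOURCE_LEVEL(1);
 *
 * The exact format/OOB bits differ per generation, as the gfx_level checks in
 * radv_CmdBindDescriptorSets() show; the numbers here are only for clarity.
 */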
5122
5123 static bool
5124 radv_init_push_descriptor_set(struct radv_cmd_buffer *cmd_buffer, struct radv_descriptor_set *set,
5125                               struct radv_descriptor_set_layout *layout,
5126                               VkPipelineBindPoint bind_point)
5127 {
5128    struct radv_descriptor_state *descriptors_state =
5129       radv_get_descriptors_state(cmd_buffer, bind_point);
5130    set->header.size = layout->size;
5131
5132    if (set->header.layout != layout) {
5133       if (set->header.layout)
5134          vk_descriptor_set_layout_unref(&cmd_buffer->device->vk, &set->header.layout->vk);
5135       vk_descriptor_set_layout_ref(&layout->vk);
5136       set->header.layout = layout;
5137    }
5138
5139    if (descriptors_state->push_set.capacity < set->header.size) {
5140       size_t new_size = MAX2(set->header.size, 1024);
5141       new_size = MAX2(new_size, 2 * descriptors_state->push_set.capacity);
5142       new_size = MIN2(new_size, 96 * MAX_PUSH_DESCRIPTORS);
5143
5144       free(set->header.mapped_ptr);
5145       set->header.mapped_ptr = malloc(new_size);
5146
5147       if (!set->header.mapped_ptr) {
5148          descriptors_state->push_set.capacity = 0;
5149          vk_command_buffer_set_error(&cmd_buffer->vk, VK_ERROR_OUT_OF_HOST_MEMORY);
5150          return false;
5151       }
5152
5153       descriptors_state->push_set.capacity = new_size;
5154    }
5155
5156    return true;
5157 }
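
/* A rough sketch of the push-descriptor storage growth policy implemented in
 * radv_init_push_descriptor_set() above, with made-up sizes:
 *
 *    capacity = 0,    need 256  -> new_size = MAX2(MAX2(256, 1024), 0)     = 1024
 *    capacity = 1024, need 1536 -> new_size = MAX2(MAX2(1536, 1024), 2048) = 2048
 *
 * i.e. the host allocation never shrinks, at least doubles when it grows, and
 * is capped at 96 * MAX_PUSH_DESCRIPTORS bytes (96 presumably being an upper
 * bound on the size of a single descriptor).
 */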
5158
5159 void
5160 radv_meta_push_descriptor_set(struct radv_cmd_buffer *cmd_buffer,
5161                               VkPipelineBindPoint pipelineBindPoint, VkPipelineLayout _layout,
5162                               uint32_t set, uint32_t descriptorWriteCount,
5163                               const VkWriteDescriptorSet *pDescriptorWrites)
5164 {
5165    RADV_FROM_HANDLE(radv_pipeline_layout, layout, _layout);
5166    struct radv_descriptor_set *push_set =
5167       (struct radv_descriptor_set *)&cmd_buffer->meta_push_descriptors;
5168    unsigned bo_offset;
5169
5170    assert(set == 0);
5171    assert(layout->set[set].layout->flags & VK_DESCRIPTOR_SET_LAYOUT_CREATE_PUSH_DESCRIPTOR_BIT_KHR);
5172
5173    push_set->header.size = layout->set[set].layout->size;
5174    push_set->header.layout = layout->set[set].layout;
5175
5176    if (!radv_cmd_buffer_upload_alloc(cmd_buffer, push_set->header.size, &bo_offset,
5177                                      (void **)&push_set->header.mapped_ptr))
5178       return;
5179
5180    push_set->header.va = radv_buffer_get_va(cmd_buffer->upload.upload_bo);
5181    push_set->header.va += bo_offset;
5182
5183    radv_cmd_update_descriptor_sets(cmd_buffer->device, cmd_buffer,
5184                                    radv_descriptor_set_to_handle(push_set), descriptorWriteCount,
5185                                    pDescriptorWrites, 0, NULL);
5186
5187    radv_set_descriptor_set(cmd_buffer, pipelineBindPoint, push_set, set);
5188 }
5189
5190 VKAPI_ATTR void VKAPI_CALL
5191 radv_CmdPushDescriptorSetKHR(VkCommandBuffer commandBuffer, VkPipelineBindPoint pipelineBindPoint,
5192                              VkPipelineLayout _layout, uint32_t set, uint32_t descriptorWriteCount,
5193                              const VkWriteDescriptorSet *pDescriptorWrites)
5194 {
5195    RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
5196    RADV_FROM_HANDLE(radv_pipeline_layout, layout, _layout);
5197    struct radv_descriptor_state *descriptors_state =
5198       radv_get_descriptors_state(cmd_buffer, pipelineBindPoint);
5199    struct radv_descriptor_set *push_set =
5200       (struct radv_descriptor_set *)&descriptors_state->push_set.set;
5201
5202    assert(layout->set[set].layout->flags & VK_DESCRIPTOR_SET_LAYOUT_CREATE_PUSH_DESCRIPTOR_BIT_KHR);
5203
5204    if (!radv_init_push_descriptor_set(cmd_buffer, push_set, layout->set[set].layout,
5205                                       pipelineBindPoint))
5206       return;
5207
5208    /* Check that there are no inline uniform block updates when calling vkCmdPushDescriptorSetKHR()
5209     * because it is invalid, according to the Vulkan spec.
5210     */
5211    for (int i = 0; i < descriptorWriteCount; i++) {
5212       ASSERTED const VkWriteDescriptorSet *writeset = &pDescriptorWrites[i];
5213       assert(writeset->descriptorType != VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK);
5214    }
5215
5216    radv_cmd_update_descriptor_sets(cmd_buffer->device, cmd_buffer,
5217                                    radv_descriptor_set_to_handle(push_set), descriptorWriteCount,
5218                                    pDescriptorWrites, 0, NULL);
5219
5220    radv_set_descriptor_set(cmd_buffer, pipelineBindPoint, push_set, set);
5221    descriptors_state->push_dirty = true;
5222 }
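
/* Illustrative application-side usage of the entry point above (not driver
 * code; cmd, layout, my_uniform_buffer and struct my_constants are
 * hypothetical). It assumes set 0 of the pipeline layout was created with
 * VK_DESCRIPTOR_SET_LAYOUT_CREATE_PUSH_DESCRIPTOR_BIT_KHR.
 */
#if 0
   VkDescriptorBufferInfo buffer_info = {
      .buffer = my_uniform_buffer,
      .offset = 0,
      .range = sizeof(struct my_constants),
   };
   VkWriteDescriptorSet write = {
      .sType = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET,
      .dstBinding = 0,          /* dstSet is ignored for push descriptors */
      .descriptorCount = 1,
      .descriptorType = VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER,
      .pBufferInfo = &buffer_info,
   };
   /* No descriptor set allocation: the driver copies the write into the
    * per-command-buffer push set and uploads it when the state is flushed.
    */
   vkCmdPushDescriptorSetKHR(cmd, VK_PIPELINE_BIND_POINT_GRAPHICS, layout,
                             0 /* set */, 1, &write);
#endif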
5223
5224 VKAPI_ATTR void VKAPI_CALL
5225 radv_CmdPushDescriptorSetWithTemplateKHR(VkCommandBuffer commandBuffer,
5226                                          VkDescriptorUpdateTemplate descriptorUpdateTemplate,
5227                                          VkPipelineLayout _layout, uint32_t set, const void *pData)
5228 {
5229    RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
5230    RADV_FROM_HANDLE(radv_pipeline_layout, layout, _layout);
5231    RADV_FROM_HANDLE(radv_descriptor_update_template, templ, descriptorUpdateTemplate);
5232    struct radv_descriptor_state *descriptors_state =
5233       radv_get_descriptors_state(cmd_buffer, templ->bind_point);
5234    struct radv_descriptor_set *push_set =
5235       (struct radv_descriptor_set *)&descriptors_state->push_set.set;
5236
5237    assert(layout->set[set].layout->flags & VK_DESCRIPTOR_SET_LAYOUT_CREATE_PUSH_DESCRIPTOR_BIT_KHR);
5238
5239    if (!radv_init_push_descriptor_set(cmd_buffer, push_set, layout->set[set].layout,
5240                                       templ->bind_point))
5241       return;
5242
5243    radv_cmd_update_descriptor_set_with_template(cmd_buffer->device, cmd_buffer, push_set,
5244                                                 descriptorUpdateTemplate, pData);
5245
5246    radv_set_descriptor_set(cmd_buffer, templ->bind_point, push_set, set);
5247    descriptors_state->push_dirty = true;
5248 }
5249
5250 VKAPI_ATTR void VKAPI_CALL
5251 radv_CmdPushConstants(VkCommandBuffer commandBuffer, VkPipelineLayout layout,
5252                       VkShaderStageFlags stageFlags, uint32_t offset, uint32_t size,
5253                       const void *pValues)
5254 {
5255    RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
5256    memcpy(cmd_buffer->push_constants + offset, pValues, size);
5257    cmd_buffer->push_constant_stages |= stageFlags;
5258 }
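
/* radv_CmdPushConstants() only stages the bytes CPU-side and records which
 * shader stages need them; the actual upload happens later when draw/dispatch
 * state is emitted. A minimal application-side sketch (cmd and layout are
 * hypothetical names):
 */
#if 0
   float scale[4] = {1.0f, 1.0f, 0.5f, 1.0f};
   vkCmdPushConstants(cmd, layout, VK_SHADER_STAGE_VERTEX_BIT,
                      0 /* offset */, sizeof(scale), scale);
#endif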
5259
5260 VKAPI_ATTR VkResult VKAPI_CALL
5261 radv_EndCommandBuffer(VkCommandBuffer commandBuffer)
5262 {
5263    RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
5264
5265    radv_emit_mip_change_flush_default(cmd_buffer);
5266
5267    if (cmd_buffer->qf != RADV_QUEUE_TRANSFER) {
5268       if (cmd_buffer->device->physical_device->rad_info.gfx_level == GFX6)
5269          cmd_buffer->state.flush_bits |=
5270             RADV_CMD_FLAG_CS_PARTIAL_FLUSH | RADV_CMD_FLAG_PS_PARTIAL_FLUSH | RADV_CMD_FLAG_WB_L2;
5271
5272       /* Make sure to sync all pending active queries at the end of
5273        * the command buffer.
5274        */
5275       cmd_buffer->state.flush_bits |= cmd_buffer->active_query_flush_bits;
5276
5277       /* Flush noncoherent images on GFX9+ so we can assume they're clean at the start of a
5278        * command buffer.
5279        */
5280       if (cmd_buffer->state.rb_noncoherent_dirty && can_skip_buffer_l2_flushes(cmd_buffer->device))
5281          cmd_buffer->state.flush_bits |= radv_src_access_flush(
5282             cmd_buffer,
5283             VK_ACCESS_2_COLOR_ATTACHMENT_WRITE_BIT |
5284             VK_ACCESS_2_DEPTH_STENCIL_ATTACHMENT_WRITE_BIT,
5285             NULL);
5286
5287       /* Since NGG streamout uses GDS, we need to make GDS idle when
5288        * we leave the IB, otherwise another process might overwrite
5289        * it while our shaders are busy.
5290        */
5291       if (cmd_buffer->gds_needed)
5292          cmd_buffer->state.flush_bits |= RADV_CMD_FLAG_PS_PARTIAL_FLUSH;
5293
5294       /* Finalize the internal compute command stream, if it exists. */
5295       if (cmd_buffer->ace_internal.cs) {
5296          VkResult result = radv_ace_internal_finalize(cmd_buffer);
5297          if (result != VK_SUCCESS)
5298             return vk_error(cmd_buffer, result);
5299       }
5300
5301       si_emit_cache_flush(cmd_buffer);
5302    }
5303
5304    /* Make sure CP DMA is idle at the end of IBs because the kernel
5305     * doesn't wait for it.
5306     */
5307    si_cp_dma_wait_for_idle(cmd_buffer);
5308
5309    radv_describe_end_cmd_buffer(cmd_buffer);
5310
5311    vk_free(&cmd_buffer->pool->vk.alloc, cmd_buffer->state.attachments);
5312    vk_free(&cmd_buffer->pool->vk.alloc, cmd_buffer->state.subpass_sample_locs);
5313
5314    VkResult result = cmd_buffer->device->ws->cs_finalize(cmd_buffer->cs);
5315    if (result != VK_SUCCESS)
5316       return vk_error(cmd_buffer, result);
5317
5318    cmd_buffer->status = RADV_CMD_BUFFER_STATUS_EXECUTABLE;
5319
5320    return vk_command_buffer_get_record_result(&cmd_buffer->vk);
5321 }
5322
5323 static void
5324 radv_emit_compute_pipeline(struct radv_cmd_buffer *cmd_buffer,
5325                            struct radv_compute_pipeline *pipeline)
5326 {
5327    if (pipeline == cmd_buffer->state.emitted_compute_pipeline)
5328       return;
5329
5330    assert(!pipeline->base.ctx_cs.cdw);
5331
5332    cmd_buffer->state.emitted_compute_pipeline = pipeline;
5333
5334    radeon_check_space(cmd_buffer->device->ws, cmd_buffer->cs, pipeline->base.cs.cdw);
5335    radeon_emit_array(cmd_buffer->cs, pipeline->base.cs.buf, pipeline->base.cs.cdw);
5336
5337    cmd_buffer->compute_scratch_size_per_wave_needed =
5338       MAX2(cmd_buffer->compute_scratch_size_per_wave_needed, pipeline->base.scratch_bytes_per_wave);
5339    cmd_buffer->compute_scratch_waves_wanted =
5340       MAX2(cmd_buffer->compute_scratch_waves_wanted, pipeline->base.max_waves);
5341
5342    radv_cs_add_buffer(cmd_buffer->device->ws, cmd_buffer->cs, pipeline->base.slab_bo);
5343
5344    if (unlikely(cmd_buffer->device->trace_bo))
5345       radv_save_pipeline(cmd_buffer, &pipeline->base);
5346 }
5347
5348 static void
5349 radv_mark_descriptor_sets_dirty(struct radv_cmd_buffer *cmd_buffer, VkPipelineBindPoint bind_point)
5350 {
5351    struct radv_descriptor_state *descriptors_state =
5352       radv_get_descriptors_state(cmd_buffer, bind_point);
5353
5354    descriptors_state->dirty |= descriptors_state->valid;
5355 }
5356
5357 VKAPI_ATTR void VKAPI_CALL
5358 radv_CmdBindPipeline(VkCommandBuffer commandBuffer, VkPipelineBindPoint pipelineBindPoint,
5359                      VkPipeline _pipeline)
5360 {
5361    RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
5362    RADV_FROM_HANDLE(radv_pipeline, pipeline, _pipeline);
5363
5364    switch (pipelineBindPoint) {
5365    case VK_PIPELINE_BIND_POINT_COMPUTE: {
5366       struct radv_compute_pipeline *compute_pipeline = radv_pipeline_to_compute(pipeline);
5367
5368       if (cmd_buffer->state.compute_pipeline == compute_pipeline)
5369          return;
5370       radv_mark_descriptor_sets_dirty(cmd_buffer, pipelineBindPoint);
5371
5372       cmd_buffer->state.compute_pipeline = compute_pipeline;
5373       cmd_buffer->push_constant_stages |= VK_SHADER_STAGE_COMPUTE_BIT;
5374       break;
5375    }
5376    case VK_PIPELINE_BIND_POINT_RAY_TRACING_KHR: {
5377       struct radv_compute_pipeline *compute_pipeline = radv_pipeline_to_compute(pipeline);
5378
5379       if (cmd_buffer->state.rt_pipeline == compute_pipeline)
5380          return;
5381       radv_mark_descriptor_sets_dirty(cmd_buffer, pipelineBindPoint);
5382
5383       cmd_buffer->state.rt_pipeline = compute_pipeline;
5384       cmd_buffer->push_constant_stages |= RADV_RT_STAGE_BITS;
5385       if (compute_pipeline->dynamic_stack_size)
5386          radv_set_rt_stack_size(cmd_buffer, cmd_buffer->state.rt_stack_size);
5387       break;
5388    }
5389    case VK_PIPELINE_BIND_POINT_GRAPHICS: {
5390       struct radv_graphics_pipeline *graphics_pipeline =
5391          pipeline ? radv_pipeline_to_graphics(pipeline) : NULL;
5392
5393       if (cmd_buffer->state.graphics_pipeline == graphics_pipeline)
5394          return;
5395       radv_mark_descriptor_sets_dirty(cmd_buffer, pipelineBindPoint);
5396
5397       bool vtx_emit_count_changed =
5398          !pipeline || !cmd_buffer->state.graphics_pipeline ||
5399          cmd_buffer->state.graphics_pipeline->vtx_emit_num != graphics_pipeline->vtx_emit_num ||
5400          cmd_buffer->state.graphics_pipeline->vtx_base_sgpr != graphics_pipeline->vtx_base_sgpr;
5401       cmd_buffer->state.graphics_pipeline = graphics_pipeline;
5402       if (!pipeline)
5403          break;
5404
5405       bool mesh_shading = radv_pipeline_has_stage(graphics_pipeline, MESA_SHADER_MESH);
5406       if (mesh_shading != cmd_buffer->state.mesh_shading) {
5407          /* Re-emit VRS state because the combiner is different (vertex vs primitive).
5408           * Re-emit primitive topology because the mesh shading pipeline clobbered it.
5409           */
5410          cmd_buffer->state.dirty |= RADV_CMD_DIRTY_DYNAMIC_FRAGMENT_SHADING_RATE |
5411                                     RADV_CMD_DIRTY_DYNAMIC_PRIMITIVE_TOPOLOGY;
5412       }
5413
5414       cmd_buffer->state.mesh_shading = mesh_shading;
5415       cmd_buffer->state.dirty |= RADV_CMD_DIRTY_PIPELINE | RADV_CMD_DIRTY_DYNAMIC_VERTEX_INPUT;
5416       cmd_buffer->push_constant_stages |= graphics_pipeline->active_stages;
5417
5418       /* the new vertex shader might not have the same user regs */
5419       if (vtx_emit_count_changed) {
5420          cmd_buffer->state.last_first_instance = -1;
5421          cmd_buffer->state.last_vertex_offset = -1;
5422          cmd_buffer->state.last_drawid = -1;
5423       }
5424
5425       /* Prefetch all pipeline shaders at first draw time. */
5426       cmd_buffer->state.prefetch_L2_mask |= RADV_PREFETCH_SHADERS;
5427
5428       if (cmd_buffer->device->physical_device->rad_info.has_vgt_flush_ngg_legacy_bug &&
5429           cmd_buffer->state.emitted_graphics_pipeline &&
5430           cmd_buffer->state.emitted_graphics_pipeline->is_ngg &&
5431           !cmd_buffer->state.graphics_pipeline->is_ngg) {
5432          /* Transitioning from NGG to legacy GS requires
5433           * VGT_FLUSH on GFX10 and Navi21. VGT_FLUSH
5434           * is also emitted at the beginning of IBs when legacy
5435           * GS ring pointers are set.
5436           */
5437          cmd_buffer->state.flush_bits |= RADV_CMD_FLAG_VGT_FLUSH;
5438       }
5439
5440       radv_bind_dynamic_state(cmd_buffer, &graphics_pipeline->dynamic_state);
5441
5442       if (graphics_pipeline->esgs_ring_size > cmd_buffer->esgs_ring_size_needed)
5443          cmd_buffer->esgs_ring_size_needed = graphics_pipeline->esgs_ring_size;
5444       if (graphics_pipeline->gsvs_ring_size > cmd_buffer->gsvs_ring_size_needed)
5445          cmd_buffer->gsvs_ring_size_needed = graphics_pipeline->gsvs_ring_size;
5446
5447       if (radv_pipeline_has_stage(graphics_pipeline, MESA_SHADER_TESS_CTRL))
5448          cmd_buffer->tess_rings_needed = true;
5449       if (mesh_shading)
5450          cmd_buffer->mesh_scratch_ring_needed |=
5451             pipeline->shaders[MESA_SHADER_MESH]->info.ms.needs_ms_scratch_ring;
5452
5453       if (radv_pipeline_has_stage(graphics_pipeline, MESA_SHADER_TASK)) {
5454          if (!cmd_buffer->ace_internal.cs) {
5455             cmd_buffer->ace_internal.cs = radv_ace_internal_create(cmd_buffer);
5456             if (!cmd_buffer->ace_internal.cs)
5457                return;
5458          }
5459
5460          cmd_buffer->task_rings_needed = true;
5461       }
5462       break;
5463    }
5464    default:
5465       assert(!"invalid bind point");
5466       break;
5467    }
5468 }
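
/* Example of the tracked-state invalidation above: if the previously bound
 * graphics pipeline passed its draw parameters through a different base SGPR
 * (vtx_base_sgpr) or emitted a different number of values (vtx_emit_num), the
 * cached last_first_instance/last_vertex_offset/last_drawid values no longer
 * describe what the hardware registers contain, so they are reset to -1 to
 * force the next draw to re-emit them.
 */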
5469
5470 VKAPI_ATTR void VKAPI_CALL
5471 radv_CmdSetViewport(VkCommandBuffer commandBuffer, uint32_t firstViewport, uint32_t viewportCount,
5472                     const VkViewport *pViewports)
5473 {
5474    RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
5475    struct radv_cmd_state *state = &cmd_buffer->state;
5476    ASSERTED const uint32_t total_count = firstViewport + viewportCount;
5477
5478    assert(firstViewport < MAX_VIEWPORTS);
5479    assert(total_count >= 1 && total_count <= MAX_VIEWPORTS);
5480
5481    if (state->dynamic.viewport.count < total_count)
5482       state->dynamic.viewport.count = total_count;
5483
5484    memcpy(state->dynamic.viewport.viewports + firstViewport, pViewports,
5485           viewportCount * sizeof(*pViewports));
5486    for (unsigned i = 0; i < viewportCount; i++) {
5487       radv_get_viewport_xform(&pViewports[i],
5488                               state->dynamic.viewport.xform[i + firstViewport].scale,
5489                               state->dynamic.viewport.xform[i + firstViewport].translate);
5490    }
5491
5492    state->dirty |= RADV_CMD_DIRTY_DYNAMIC_VIEWPORT | RADV_CMD_DIRTY_GUARDBAND;
5493 }
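
/* For reference, the standard Vulkan viewport transform that the cached
 * scale/translate pair above corresponds to (the exact split between the two
 * lives in radv_get_viewport_xform()):
 *
 *    x_fb = (width  / 2) * x_ndc + (x + width  / 2)
 *    y_fb = (height / 2) * y_ndc + (y + height / 2)
 *    z_fb = (maxDepth - minDepth) * z_ndc + minDepth
 *
 * so scale is roughly {width/2, height/2, maxDepth - minDepth} with translate
 * holding the corresponding offsets. The guardband depends on these values,
 * hence RADV_CMD_DIRTY_GUARDBAND being set alongside the viewport flag.
 */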
5494
5495 VKAPI_ATTR void VKAPI_CALL
5496 radv_CmdSetScissor(VkCommandBuffer commandBuffer, uint32_t firstScissor, uint32_t scissorCount,
5497                    const VkRect2D *pScissors)
5498 {
5499    RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
5500    struct radv_cmd_state *state = &cmd_buffer->state;
5501    ASSERTED const uint32_t total_count = firstScissor + scissorCount;
5502
5503    assert(firstScissor < MAX_SCISSORS);
5504    assert(total_count >= 1 && total_count <= MAX_SCISSORS);
5505
5506    if (state->dynamic.scissor.count < total_count)
5507       state->dynamic.scissor.count = total_count;
5508
5509    memcpy(state->dynamic.scissor.scissors + firstScissor, pScissors,
5510           scissorCount * sizeof(*pScissors));
5511
5512    state->dirty |= RADV_CMD_DIRTY_DYNAMIC_SCISSOR;
5513 }
5514
5515 VKAPI_ATTR void VKAPI_CALL
5516 radv_CmdSetLineWidth(VkCommandBuffer commandBuffer, float lineWidth)
5517 {
5518    RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
5519
5520    cmd_buffer->state.dynamic.line_width = lineWidth;
5521    cmd_buffer->state.dirty |= RADV_CMD_DIRTY_DYNAMIC_LINE_WIDTH | RADV_CMD_DIRTY_GUARDBAND;
5522 }
5523
5524 VKAPI_ATTR void VKAPI_CALL
5525 radv_CmdSetDepthBias(VkCommandBuffer commandBuffer, float depthBiasConstantFactor,
5526                      float depthBiasClamp, float depthBiasSlopeFactor)
5527 {
5528    RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
5529    struct radv_cmd_state *state = &cmd_buffer->state;
5530
5531    state->dynamic.depth_bias.bias = depthBiasConstantFactor;
5532    state->dynamic.depth_bias.clamp = depthBiasClamp;
5533    state->dynamic.depth_bias.slope = depthBiasSlopeFactor;
5534
5535    state->dirty |= RADV_CMD_DIRTY_DYNAMIC_DEPTH_BIAS;
5536 }
5537
5538 VKAPI_ATTR void VKAPI_CALL
5539 radv_CmdSetBlendConstants(VkCommandBuffer commandBuffer, const float blendConstants[4])
5540 {
5541    RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
5542    struct radv_cmd_state *state = &cmd_buffer->state;
5543
5544    memcpy(state->dynamic.blend_constants, blendConstants, sizeof(float) * 4);
5545
5546    state->dirty |= RADV_CMD_DIRTY_DYNAMIC_BLEND_CONSTANTS;
5547 }
5548
5549 VKAPI_ATTR void VKAPI_CALL
5550 radv_CmdSetDepthBounds(VkCommandBuffer commandBuffer, float minDepthBounds, float maxDepthBounds)
5551 {
5552    RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
5553    struct radv_cmd_state *state = &cmd_buffer->state;
5554
5555    state->dynamic.depth_bounds.min = minDepthBounds;
5556    state->dynamic.depth_bounds.max = maxDepthBounds;
5557
5558    state->dirty |= RADV_CMD_DIRTY_DYNAMIC_DEPTH_BOUNDS;
5559 }
5560
5561 VKAPI_ATTR void VKAPI_CALL
5562 radv_CmdSetStencilCompareMask(VkCommandBuffer commandBuffer, VkStencilFaceFlags faceMask,
5563                               uint32_t compareMask)
5564 {
5565    RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
5566    struct radv_cmd_state *state = &cmd_buffer->state;
5567
5568    if (faceMask & VK_STENCIL_FACE_FRONT_BIT)
5569       state->dynamic.stencil_compare_mask.front = compareMask;
5570    if (faceMask & VK_STENCIL_FACE_BACK_BIT)
5571       state->dynamic.stencil_compare_mask.back = compareMask;
5572
5573    state->dirty |= RADV_CMD_DIRTY_DYNAMIC_STENCIL_COMPARE_MASK;
5574 }
5575
5576 VKAPI_ATTR void VKAPI_CALL
5577 radv_CmdSetStencilWriteMask(VkCommandBuffer commandBuffer, VkStencilFaceFlags faceMask,
5578                             uint32_t writeMask)
5579 {
5580    RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
5581    struct radv_cmd_state *state = &cmd_buffer->state;
5582
5583    if (faceMask & VK_STENCIL_FACE_FRONT_BIT)
5584       state->dynamic.stencil_write_mask.front = writeMask;
5585    if (faceMask & VK_STENCIL_FACE_BACK_BIT)
5586       state->dynamic.stencil_write_mask.back = writeMask;
5587
5588    state->dirty |= RADV_CMD_DIRTY_DYNAMIC_STENCIL_WRITE_MASK;
5589 }
5590
5591 VKAPI_ATTR void VKAPI_CALL
5592 radv_CmdSetStencilReference(VkCommandBuffer commandBuffer, VkStencilFaceFlags faceMask,
5593                             uint32_t reference)
5594 {
5595    RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
5596
5597    if (faceMask & VK_STENCIL_FACE_FRONT_BIT)
5598       cmd_buffer->state.dynamic.stencil_reference.front = reference;
5599    if (faceMask & VK_STENCIL_FACE_BACK_BIT)
5600       cmd_buffer->state.dynamic.stencil_reference.back = reference;
5601
5602    cmd_buffer->state.dirty |= RADV_CMD_DIRTY_DYNAMIC_STENCIL_REFERENCE;
5603 }
5604
5605 VKAPI_ATTR void VKAPI_CALL
5606 radv_CmdSetDiscardRectangleEXT(VkCommandBuffer commandBuffer, uint32_t firstDiscardRectangle,
5607                                uint32_t discardRectangleCount, const VkRect2D *pDiscardRectangles)
5608 {
5609    RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
5610    struct radv_cmd_state *state = &cmd_buffer->state;
5611    ASSERTED const uint32_t total_count = firstDiscardRectangle + discardRectangleCount;
5612
5613    assert(firstDiscardRectangle < MAX_DISCARD_RECTANGLES);
5614    assert(total_count >= 1 && total_count <= MAX_DISCARD_RECTANGLES);
5615
5616    typed_memcpy(&state->dynamic.discard_rectangle.rectangles[firstDiscardRectangle],
5617                 pDiscardRectangles, discardRectangleCount);
5618
5619    state->dirty |= RADV_CMD_DIRTY_DYNAMIC_DISCARD_RECTANGLE;
5620 }
5621
5622 VKAPI_ATTR void VKAPI_CALL
5623 radv_CmdSetSampleLocationsEXT(VkCommandBuffer commandBuffer,
5624                               const VkSampleLocationsInfoEXT *pSampleLocationsInfo)
5625 {
5626    RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
5627    struct radv_cmd_state *state = &cmd_buffer->state;
5628
5629    assert(pSampleLocationsInfo->sampleLocationsCount <= MAX_SAMPLE_LOCATIONS);
5630
5631    state->dynamic.sample_location.per_pixel = pSampleLocationsInfo->sampleLocationsPerPixel;
5632    state->dynamic.sample_location.grid_size = pSampleLocationsInfo->sampleLocationGridSize;
5633    state->dynamic.sample_location.count = pSampleLocationsInfo->sampleLocationsCount;
5634    typed_memcpy(&state->dynamic.sample_location.locations[0],
5635                 pSampleLocationsInfo->pSampleLocations, pSampleLocationsInfo->sampleLocationsCount);
5636
5637    state->dirty |= RADV_CMD_DIRTY_DYNAMIC_SAMPLE_LOCATIONS;
5638 }
5639
5640 VKAPI_ATTR void VKAPI_CALL
5641 radv_CmdSetLineStippleEXT(VkCommandBuffer commandBuffer, uint32_t lineStippleFactor,
5642                           uint16_t lineStipplePattern)
5643 {
5644    RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
5645    struct radv_cmd_state *state = &cmd_buffer->state;
5646
5647    state->dynamic.line_stipple.factor = lineStippleFactor;
5648    state->dynamic.line_stipple.pattern = lineStipplePattern;
5649
5650    state->dirty |= RADV_CMD_DIRTY_DYNAMIC_LINE_STIPPLE;
5651 }
5652
5653 VKAPI_ATTR void VKAPI_CALL
5654 radv_CmdSetCullMode(VkCommandBuffer commandBuffer, VkCullModeFlags cullMode)
5655 {
5656    RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
5657    struct radv_cmd_state *state = &cmd_buffer->state;
5658
5659    state->dynamic.cull_mode = cullMode;
5660
5661    state->dirty |= RADV_CMD_DIRTY_DYNAMIC_CULL_MODE;
5662 }
5663
5664 VKAPI_ATTR void VKAPI_CALL
5665 radv_CmdSetFrontFace(VkCommandBuffer commandBuffer, VkFrontFace frontFace)
5666 {
5667    RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
5668    struct radv_cmd_state *state = &cmd_buffer->state;
5669
5670    state->dynamic.front_face = frontFace;
5671
5672    state->dirty |= RADV_CMD_DIRTY_DYNAMIC_FRONT_FACE;
5673 }
5674
5675 VKAPI_ATTR void VKAPI_CALL
5676 radv_CmdSetPrimitiveTopology(VkCommandBuffer commandBuffer, VkPrimitiveTopology primitiveTopology)
5677 {
5678    RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
5679    struct radv_cmd_state *state = &cmd_buffer->state;
5680    unsigned primitive_topology = si_translate_prim(primitiveTopology);
5681
5682    if ((state->dynamic.primitive_topology == V_008958_DI_PT_LINESTRIP) !=
5683        (primitive_topology == V_008958_DI_PT_LINESTRIP))
5684       state->dirty |= RADV_CMD_DIRTY_DYNAMIC_LINE_STIPPLE;
5685
5686    if (radv_prim_is_points_or_lines(state->dynamic.primitive_topology) !=
5687        radv_prim_is_points_or_lines(primitive_topology))
5688       state->dirty |= RADV_CMD_DIRTY_GUARDBAND;
5689
5690    state->dynamic.primitive_topology = primitive_topology;
5691
5692    state->dirty |= RADV_CMD_DIRTY_DYNAMIC_PRIMITIVE_TOPOLOGY;
5693 }
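
/* Worked example of the extra dirtying above: switching from TRIANGLE_LIST to
 * LINE_STRIP flips the "is line strip" condition, so line stipple state has to
 * be re-emitted; switching from TRIANGLE_LIST to POINT_LIST changes the
 * points/lines classification, and since the guardband computation treats
 * points and lines differently from triangles, the guardband is recomputed as
 * well.
 */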
5694
5695 VKAPI_ATTR void VKAPI_CALL
5696 radv_CmdSetViewportWithCount(VkCommandBuffer commandBuffer, uint32_t viewportCount,
5697                              const VkViewport *pViewports)
5698 {
5699    radv_CmdSetViewport(commandBuffer, 0, viewportCount, pViewports);
5700 }
5701
5702 VKAPI_ATTR void VKAPI_CALL
5703 radv_CmdSetScissorWithCount(VkCommandBuffer commandBuffer, uint32_t scissorCount,
5704                             const VkRect2D *pScissors)
5705 {
5706    radv_CmdSetScissor(commandBuffer, 0, scissorCount, pScissors);
5707 }
5708
5709 VKAPI_ATTR void VKAPI_CALL
5710 radv_CmdSetDepthTestEnable(VkCommandBuffer commandBuffer, VkBool32 depthTestEnable)
5711
5712 {
5713    RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
5714    struct radv_cmd_state *state = &cmd_buffer->state;
5715
5716    state->dynamic.depth_test_enable = depthTestEnable;
5717
5718    state->dirty |= RADV_CMD_DIRTY_DYNAMIC_DEPTH_TEST_ENABLE;
5719 }
5720
5721 VKAPI_ATTR void VKAPI_CALL
5722 radv_CmdSetDepthWriteEnable(VkCommandBuffer commandBuffer, VkBool32 depthWriteEnable)
5723 {
5724    RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
5725    struct radv_cmd_state *state = &cmd_buffer->state;
5726
5727    state->dynamic.depth_write_enable = depthWriteEnable;
5728
5729    state->dirty |= RADV_CMD_DIRTY_DYNAMIC_DEPTH_WRITE_ENABLE;
5730 }
5731
5732 VKAPI_ATTR void VKAPI_CALL
5733 radv_CmdSetDepthCompareOp(VkCommandBuffer commandBuffer, VkCompareOp depthCompareOp)
5734 {
5735    RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
5736    struct radv_cmd_state *state = &cmd_buffer->state;
5737
5738    state->dynamic.depth_compare_op = depthCompareOp;
5739
5740    state->dirty |= RADV_CMD_DIRTY_DYNAMIC_DEPTH_COMPARE_OP;
5741 }
5742
5743 VKAPI_ATTR void VKAPI_CALL
5744 radv_CmdSetDepthBoundsTestEnable(VkCommandBuffer commandBuffer, VkBool32 depthBoundsTestEnable)
5745 {
5746    RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
5747    struct radv_cmd_state *state = &cmd_buffer->state;
5748
5749    state->dynamic.depth_bounds_test_enable = depthBoundsTestEnable;
5750
5751    state->dirty |= RADV_CMD_DIRTY_DYNAMIC_DEPTH_BOUNDS_TEST_ENABLE;
5752 }
5753
5754 VKAPI_ATTR void VKAPI_CALL
5755 radv_CmdSetStencilTestEnable(VkCommandBuffer commandBuffer, VkBool32 stencilTestEnable)
5756 {
5757    RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
5758    struct radv_cmd_state *state = &cmd_buffer->state;
5759
5760    state->dynamic.stencil_test_enable = stencilTestEnable;
5761
5762    state->dirty |= RADV_CMD_DIRTY_DYNAMIC_STENCIL_TEST_ENABLE;
5763 }
5764
5765 VKAPI_ATTR void VKAPI_CALL
5766 radv_CmdSetStencilOp(VkCommandBuffer commandBuffer, VkStencilFaceFlags faceMask,
5767                      VkStencilOp failOp, VkStencilOp passOp, VkStencilOp depthFailOp,
5768                      VkCompareOp compareOp)
5769 {
5770    RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
5771    struct radv_cmd_state *state = &cmd_buffer->state;
5772
5773    if (faceMask & VK_STENCIL_FACE_FRONT_BIT) {
5774       state->dynamic.stencil_op.front.fail_op = failOp;
5775       state->dynamic.stencil_op.front.pass_op = passOp;
5776       state->dynamic.stencil_op.front.depth_fail_op = depthFailOp;
5777       state->dynamic.stencil_op.front.compare_op = compareOp;
5778    }
5779
5780    if (faceMask & VK_STENCIL_FACE_BACK_BIT) {
5781       state->dynamic.stencil_op.back.fail_op = failOp;
5782       state->dynamic.stencil_op.back.pass_op = passOp;
5783       state->dynamic.stencil_op.back.depth_fail_op = depthFailOp;
5784       state->dynamic.stencil_op.back.compare_op = compareOp;
5785    }
5786
5787    state->dirty |= RADV_CMD_DIRTY_DYNAMIC_STENCIL_OP;
5788 }
5789
5790 VKAPI_ATTR void VKAPI_CALL
5791 radv_CmdSetFragmentShadingRateKHR(VkCommandBuffer commandBuffer, const VkExtent2D *pFragmentSize,
5792                                   const VkFragmentShadingRateCombinerOpKHR combinerOps[2])
5793 {
5794    RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
5795    struct radv_cmd_state *state = &cmd_buffer->state;
5796
5797    state->dynamic.fragment_shading_rate.size = *pFragmentSize;
5798    for (unsigned i = 0; i < 2; i++)
5799       state->dynamic.fragment_shading_rate.combiner_ops[i] = combinerOps[i];
5800
5801    state->dirty |= RADV_CMD_DIRTY_DYNAMIC_FRAGMENT_SHADING_RATE;
5802 }
5803
5804 VKAPI_ATTR void VKAPI_CALL
5805 radv_CmdSetDepthBiasEnable(VkCommandBuffer commandBuffer, VkBool32 depthBiasEnable)
5806 {
5807    RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
5808    struct radv_cmd_state *state = &cmd_buffer->state;
5809
5810    state->dynamic.depth_bias_enable = depthBiasEnable;
5811
5812    state->dirty |= RADV_CMD_DIRTY_DYNAMIC_DEPTH_BIAS_ENABLE;
5813 }
5814
5815 VKAPI_ATTR void VKAPI_CALL
5816 radv_CmdSetPrimitiveRestartEnable(VkCommandBuffer commandBuffer, VkBool32 primitiveRestartEnable)
5817 {
5818    RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
5819    struct radv_cmd_state *state = &cmd_buffer->state;
5820
5821    state->dynamic.primitive_restart_enable = primitiveRestartEnable;
5822
5823    state->dirty |= RADV_CMD_DIRTY_DYNAMIC_PRIMITIVE_RESTART_ENABLE;
5824 }
5825
5826 VKAPI_ATTR void VKAPI_CALL
5827 radv_CmdSetRasterizerDiscardEnable(VkCommandBuffer commandBuffer, VkBool32 rasterizerDiscardEnable)
5828 {
5829    RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
5830    struct radv_cmd_state *state = &cmd_buffer->state;
5831
5832    state->dynamic.rasterizer_discard_enable = rasterizerDiscardEnable;
5833
5834    state->dirty |= RADV_CMD_DIRTY_DYNAMIC_RASTERIZER_DISCARD_ENABLE;
5835 }
5836
5837 VKAPI_ATTR void VKAPI_CALL
5838 radv_CmdSetPatchControlPointsEXT(VkCommandBuffer commandBuffer, uint32_t patchControlPoints)
5839 {
5840    /* not implemented */
5841 }
5842
5843 VKAPI_ATTR void VKAPI_CALL
5844 radv_CmdSetLogicOpEXT(VkCommandBuffer commandBuffer, VkLogicOp logicOp)
5845 {
5846    RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
5847    struct radv_cmd_state *state = &cmd_buffer->state;
5848    unsigned logic_op = si_translate_blend_logic_op(logicOp);
5849
5850    state->dynamic.logic_op = logic_op;
5851
5852    state->dirty |= RADV_CMD_DIRTY_DYNAMIC_LOGIC_OP;
5853 }
5854
5855 VKAPI_ATTR void VKAPI_CALL
5856 radv_CmdSetColorWriteEnableEXT(VkCommandBuffer commandBuffer, uint32_t attachmentCount,
5857                                const VkBool32 *pColorWriteEnables)
5858 {
5859    RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
5860    struct radv_cmd_state *state = &cmd_buffer->state;
5861    uint32_t color_write_enable = 0;
5862
5863    assert(attachmentCount <= MAX_RTS);
5864
5865    for (uint32_t i = 0; i < attachmentCount; i++) {
5866       color_write_enable |= pColorWriteEnables[i] ? (0xfu << (i * 4)) : 0;
5867    }
5868
5869    state->dynamic.color_write_enable = color_write_enable;
5870
5871    state->dirty |= RADV_CMD_DIRTY_DYNAMIC_COLOR_WRITE_ENABLE;
5872 }
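
/* Worked example of the packing above: with attachmentCount == 3 and
 * pColorWriteEnables = { VK_TRUE, VK_FALSE, VK_TRUE }, the accumulated mask is
 *
 *    color_write_enable = (0xf << 0) | (0x0 << 4) | (0xf << 8) = 0x00000f0f
 *
 * i.e. each enabled attachment contributes a full 4-bit RGBA write mask, which
 * is then combined with the pipeline's per-attachment color write masks when
 * the CB state is emitted.
 */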
5873
5874 VKAPI_ATTR void VKAPI_CALL
5875 radv_CmdSetVertexInputEXT(VkCommandBuffer commandBuffer, uint32_t vertexBindingDescriptionCount,
5876                           const VkVertexInputBindingDescription2EXT *pVertexBindingDescriptions,
5877                           uint32_t vertexAttributeDescriptionCount,
5878                           const VkVertexInputAttributeDescription2EXT *pVertexAttributeDescriptions)
5879 {
5880    RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
5881    struct radv_vs_input_state *state = &cmd_buffer->state.dynamic_vs_input;
5882
5883    const VkVertexInputBindingDescription2EXT *bindings[MAX_VBS];
5884    for (unsigned i = 0; i < vertexBindingDescriptionCount; i++)
5885       bindings[pVertexBindingDescriptions[i].binding] = &pVertexBindingDescriptions[i];
5886
5887    cmd_buffer->state.vbo_misaligned_mask = 0;
5888    cmd_buffer->state.vbo_misaligned_mask_invalid = 0;
5889
5890    memset(state, 0, sizeof(*state));
5891    state->bindings_match_attrib = true;
5892
5893    enum amd_gfx_level chip = cmd_buffer->device->physical_device->rad_info.gfx_level;
5894    enum radeon_family family = cmd_buffer->device->physical_device->rad_info.family;
5895    const struct ac_vtx_format_info *vtx_info_table = ac_get_vtx_format_info_table(chip, family);
5896
5897    for (unsigned i = 0; i < vertexAttributeDescriptionCount; i++) {
5898       const VkVertexInputAttributeDescription2EXT *attrib = &pVertexAttributeDescriptions[i];
5899       const VkVertexInputBindingDescription2EXT *binding = bindings[attrib->binding];
5900       unsigned loc = attrib->location;
5901
5902       state->attribute_mask |= 1u << loc;
5903       state->bindings[loc] = attrib->binding;
5904       if (attrib->binding != loc)
5905          state->bindings_match_attrib = false;
5906       if (binding->inputRate == VK_VERTEX_INPUT_RATE_INSTANCE) {
5907          state->instance_rate_inputs |= 1u << loc;
5908          state->divisors[loc] = binding->divisor;
5909          if (binding->divisor == 0) {
5910             state->zero_divisors |= 1u << loc;
5911          } else if (binding->divisor > 1) {
5912             state->nontrivial_divisors |= 1u << loc;
5913          }
5914       }
5915       cmd_buffer->vertex_bindings[attrib->binding].stride = binding->stride;
5916       state->offsets[loc] = attrib->offset;
5917
5918       enum pipe_format format = vk_format_to_pipe_format(attrib->format);
5919       const struct ac_vtx_format_info *vtx_info = &vtx_info_table[format];
5920
5921       state->formats[loc] = format;
5922       uint8_t align_req_minus_1 = vtx_info->chan_byte_size >= 4 ? 3 : (vtx_info->element_size - 1);
5923       state->format_align_req_minus_1[loc] = align_req_minus_1;
5924       state->format_sizes[loc] = vtx_info->element_size;
5925       state->alpha_adjust_lo |= (vtx_info->alpha_adjust & 0x1) << loc;
5926       state->alpha_adjust_hi |= (vtx_info->alpha_adjust >> 1) << loc;
5927       if (G_008F0C_DST_SEL_X(vtx_info->dst_sel) == V_008F0C_SQ_SEL_Z)
5928          state->post_shuffle |= BITFIELD_BIT(loc);
5929
5930       if (!(vtx_info->has_hw_format & BITFIELD_BIT(vtx_info->num_channels - 1)))
5931          state->nontrivial_formats |= BITFIELD_BIT(loc);
5932
5933       if ((chip == GFX6 || chip >= GFX10) &&
5934           cmd_buffer->state.vbo_bound_mask & BITFIELD_BIT(attrib->binding)) {
5935          if (binding->stride & align_req_minus_1) {
5936             cmd_buffer->state.vbo_misaligned_mask |= BITFIELD_BIT(loc);
5937          } else if ((cmd_buffer->vertex_bindings[attrib->binding].offset + state->offsets[loc]) &
5938                     align_req_minus_1) {
5939             cmd_buffer->state.vbo_misaligned_mask |= BITFIELD_BIT(loc);
5940          }
5941       }
5942    }
5943
5944    cmd_buffer->state.dirty |= RADV_CMD_DIRTY_VERTEX_BUFFER |
5945                               RADV_CMD_DIRTY_DYNAMIC_VERTEX_INPUT;
5946 }
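
/* Illustrative example of the misalignment tracking above (values hedged; the
 * real per-format data comes from the ac_vtx_format_info table): for a
 * 32-bit-per-channel attribute the required alignment is 4 bytes, so
 * align_req_minus_1 == 3. With a bound vertex buffer at offset 8, an attribute
 * offset of 4 and a stride of 20, both (8 + 4) & 3 and 20 & 3 are zero, so the
 * attribute stays out of vbo_misaligned_mask; a stride of 18 would set its bit
 * and typically forces a less efficient vertex fetch path at draw time.
 */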
5947
5948 VKAPI_ATTR void VKAPI_CALL
5949 radv_CmdExecuteCommands(VkCommandBuffer commandBuffer, uint32_t commandBufferCount,
5950                         const VkCommandBuffer *pCmdBuffers)
5951 {
5952    RADV_FROM_HANDLE(radv_cmd_buffer, primary, commandBuffer);
5953
5954    assert(commandBufferCount > 0);
5955
5956    radv_emit_mip_change_flush_default(primary);
5957
5958    /* Emit pending flushes on primary prior to executing secondary */
5959    si_emit_cache_flush(primary);
5960
5961    /* Make sure CP DMA is idle on primary prior to executing secondary. */
5962    si_cp_dma_wait_for_idle(primary);
5963
5964    for (uint32_t i = 0; i < commandBufferCount; i++) {
5965       RADV_FROM_HANDLE(radv_cmd_buffer, secondary, pCmdBuffers[i]);
5966       bool allow_ib2 = true;
5967
5968       if (secondary->device->physical_device->rad_info.gfx_level == GFX7 &&
5969           secondary->state.uses_draw_indirect_multi) {
5970          /* Do not launch an IB2 for secondary command buffers that contain
5971           * DRAW_{INDEX}_INDIRECT_MULTI on GFX7 because it's illegal and hangs the GPU.
5972           */
5973          allow_ib2 = false;
5974       }
5975
5976       if (secondary->qf == RADV_QUEUE_COMPUTE) {
5977          /* IB2 packets are not supported on compute queues according to PAL. */
5978          allow_ib2 = false;
5979       }
5980
5981       primary->scratch_size_per_wave_needed =
5982          MAX2(primary->scratch_size_per_wave_needed, secondary->scratch_size_per_wave_needed);
5983       primary->scratch_waves_wanted =
5984          MAX2(primary->scratch_waves_wanted, secondary->scratch_waves_wanted);
5985       primary->compute_scratch_size_per_wave_needed =
5986          MAX2(primary->compute_scratch_size_per_wave_needed,
5987               secondary->compute_scratch_size_per_wave_needed);
5988       primary->compute_scratch_waves_wanted =
5989          MAX2(primary->compute_scratch_waves_wanted, secondary->compute_scratch_waves_wanted);
5990
5991       if (secondary->esgs_ring_size_needed > primary->esgs_ring_size_needed)
5992          primary->esgs_ring_size_needed = secondary->esgs_ring_size_needed;
5993       if (secondary->gsvs_ring_size_needed > primary->gsvs_ring_size_needed)
5994          primary->gsvs_ring_size_needed = secondary->gsvs_ring_size_needed;
5995       if (secondary->tess_rings_needed)
5996          primary->tess_rings_needed = true;
5997       if (secondary->task_rings_needed)
5998          primary->task_rings_needed = true;
5999       if (secondary->mesh_scratch_ring_needed)
6000          primary->mesh_scratch_ring_needed = true;
6001       if (secondary->sample_positions_needed)
6002          primary->sample_positions_needed = true;
6003       if (secondary->gds_needed)
6004          primary->gds_needed = true;
6005
6006       if (!secondary->state.framebuffer && primary->state.pass && (primary->state.dirty & RADV_CMD_DIRTY_FRAMEBUFFER)) {
6007          /* Emit the framebuffer state from primary if secondary
6008           * has been recorded without a framebuffer, otherwise
6009           * fast color/depth clears can't work.
6010           */
6011          radv_emit_fb_mip_change_flush(primary);
6012          radv_emit_framebuffer_state(primary);
6013       }
6014
6015       if (secondary->ace_internal.cs) {
6016          if (!primary->ace_internal.cs) {
6017             primary->ace_internal.cs = radv_ace_internal_create(primary);
6018             if (!primary->ace_internal.cs)
6019                return;
6020          }
6021
6022          struct radeon_cmdbuf *ace_primary = primary->ace_internal.cs;
6023          struct radeon_cmdbuf *ace_secondary = secondary->ace_internal.cs;
6024
6025          /* Emit pending flushes on primary prior to executing secondary. */
6026          radv_ace_internal_cache_flush(primary);
6027
6028          /* Wait for primary GFX->ACE semaphore, if necessary. */
6029          if (radv_flush_gfx2ace_semaphore(primary))
6030             radv_wait_gfx2ace_semaphore(primary);
6031
6032          /* Execute the secondary compute cmdbuf.
6033           * Don't use IB2 packets because they are not supported on compute queues.
6034           */
6035          primary->device->ws->cs_execute_secondary(ace_primary, ace_secondary, false);
6036       }
6037
6038       /* Update pending ACE internal flush bits from the secondary cmdbuf */
6039       primary->ace_internal.flush_bits |= secondary->ace_internal.flush_bits;
6040
6041       /* Increment primary semaphore if secondary was dirty.
6042        * This happens when the secondary cmdbuf has a barrier which
6043        * isn't consumed by a draw call.
6044        */
6045       if (radv_ace_internal_sem_dirty(secondary))
6046          primary->ace_internal.sem.gfx2ace_value++;
6047
6048       primary->device->ws->cs_execute_secondary(primary->cs, secondary->cs, allow_ib2);
6049
6050       /* When the secondary command buffer is compute only we don't
6051        * need to re-emit the current graphics pipeline.
6052        */
6053       if (secondary->state.emitted_graphics_pipeline) {
6054          primary->state.emitted_graphics_pipeline = secondary->state.emitted_graphics_pipeline;
6055       }
6056
6057       /* When the secondary command buffer is graphics only we don't
6058        * need to re-emit the current compute pipeline.
6059        */
6060       if (secondary->state.emitted_compute_pipeline) {
6061          primary->state.emitted_compute_pipeline = secondary->state.emitted_compute_pipeline;
6062       }
6063
6064       /* Only re-emit the draw packets when needed. */
6065       if (secondary->state.last_primitive_reset_en != -1) {
6066          primary->state.last_primitive_reset_en = secondary->state.last_primitive_reset_en;
6067       }
6068
6069       if (secondary->state.last_primitive_reset_index) {
6070          primary->state.last_primitive_reset_index = secondary->state.last_primitive_reset_index;
6071       }
6072
6073       if (secondary->state.last_ia_multi_vgt_param) {
6074          primary->state.last_ia_multi_vgt_param = secondary->state.last_ia_multi_vgt_param;
6075       }
6076
6077       primary->state.last_first_instance = secondary->state.last_first_instance;
6078       primary->state.last_num_instances = secondary->state.last_num_instances;
6079       primary->state.last_drawid = secondary->state.last_drawid;
6080       primary->state.last_subpass_color_count = secondary->state.last_subpass_color_count;
6081       primary->state.last_vertex_offset = secondary->state.last_vertex_offset;
6082       primary->state.last_sx_ps_downconvert = secondary->state.last_sx_ps_downconvert;
6083       primary->state.last_sx_blend_opt_epsilon = secondary->state.last_sx_blend_opt_epsilon;
6084       primary->state.last_sx_blend_opt_control = secondary->state.last_sx_blend_opt_control;
6085
6086       if (secondary->state.last_index_type != -1) {
6087          primary->state.last_index_type = secondary->state.last_index_type;
6088       }
6089
6090       primary->state.last_nggc_settings = secondary->state.last_nggc_settings;
6091       primary->state.last_nggc_settings_sgpr_idx = secondary->state.last_nggc_settings_sgpr_idx;
6092       primary->state.last_nggc_skip = secondary->state.last_nggc_skip;
6093
6094       primary->state.last_vrs_rates = secondary->state.last_vrs_rates;
6095       primary->state.last_vrs_rates_sgpr_idx = secondary->state.last_vrs_rates_sgpr_idx;
6096    }
6097
6098    /* After executing commands from secondary buffers we have to mark
6099     * some state as dirty again.
6100     */
6101    primary->state.dirty |=
6102       RADV_CMD_DIRTY_PIPELINE | RADV_CMD_DIRTY_INDEX_BUFFER | RADV_CMD_DIRTY_GUARDBAND |
6103       RADV_CMD_DIRTY_DYNAMIC_ALL;
6104    radv_mark_descriptor_sets_dirty(primary, VK_PIPELINE_BIND_POINT_GRAPHICS);
6105    radv_mark_descriptor_sets_dirty(primary, VK_PIPELINE_BIND_POINT_COMPUTE);
6106 }
6107
6108 VKAPI_ATTR VkResult VKAPI_CALL
6109 radv_CreateCommandPool(VkDevice _device, const VkCommandPoolCreateInfo *pCreateInfo,
6110                        const VkAllocationCallbacks *pAllocator, VkCommandPool *pCmdPool)
6111 {
6112    RADV_FROM_HANDLE(radv_device, device, _device);
6113    struct radv_cmd_pool *pool;
6114
6115    pool =
6116       vk_alloc2(&device->vk.alloc, pAllocator, sizeof(*pool), 8, VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
6117    if (pool == NULL)
6118       return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
6119
6120    VkResult result = vk_command_pool_init(&pool->vk, &device->vk, pCreateInfo, pAllocator);
6121    if (result != VK_SUCCESS) {
6122       vk_free2(&device->vk.alloc, pAllocator, pool);
6123       return result;
6124    }
6125
6126    list_inithead(&pool->cmd_buffers);
6127    list_inithead(&pool->free_cmd_buffers);
6128
6129    *pCmdPool = radv_cmd_pool_to_handle(pool);
6130
6131    return VK_SUCCESS;
6132 }
6133
6134 VKAPI_ATTR void VKAPI_CALL
6135 radv_DestroyCommandPool(VkDevice _device, VkCommandPool commandPool,
6136                         const VkAllocationCallbacks *pAllocator)
6137 {
6138    RADV_FROM_HANDLE(radv_device, device, _device);
6139    RADV_FROM_HANDLE(radv_cmd_pool, pool, commandPool);
6140
6141    if (!pool)
6142       return;
6143
6144    list_for_each_entry_safe(struct radv_cmd_buffer, cmd_buffer, &pool->cmd_buffers, pool_link)
6145    {
6146       radv_destroy_cmd_buffer(cmd_buffer);
6147    }
6148
6149    list_for_each_entry_safe(struct radv_cmd_buffer, cmd_buffer, &pool->free_cmd_buffers, pool_link)
6150    {
6151       radv_destroy_cmd_buffer(cmd_buffer);
6152    }
6153
6154    vk_command_pool_finish(&pool->vk);
6155    vk_free2(&device->vk.alloc, pAllocator, pool);
6156 }
6157
6158 VKAPI_ATTR VkResult VKAPI_CALL
6159 radv_ResetCommandPool(VkDevice device, VkCommandPool commandPool, VkCommandPoolResetFlags flags)
6160 {
6161    RADV_FROM_HANDLE(radv_cmd_pool, pool, commandPool);
6162    VkResult result;
6163
6164    list_for_each_entry(struct radv_cmd_buffer, cmd_buffer, &pool->cmd_buffers, pool_link)
6165    {
6166       result = radv_reset_cmd_buffer(cmd_buffer);
6167       if (result != VK_SUCCESS)
6168          return result;
6169    }
6170
6171    return VK_SUCCESS;
6172 }
6173
6174 VKAPI_ATTR void VKAPI_CALL
6175 radv_TrimCommandPool(VkDevice device, VkCommandPool commandPool, VkCommandPoolTrimFlags flags)
6176 {
6177    RADV_FROM_HANDLE(radv_cmd_pool, pool, commandPool);
6178
6179    list_for_each_entry_safe(struct radv_cmd_buffer, cmd_buffer, &pool->free_cmd_buffers, pool_link)
6180    {
6181       radv_destroy_cmd_buffer(cmd_buffer);
6182    }
6183 }
6184
6185 static void
6186 radv_cmd_buffer_begin_subpass(struct radv_cmd_buffer *cmd_buffer, uint32_t subpass_id)
6187 {
6188    struct radv_cmd_state *state = &cmd_buffer->state;
6189    struct radv_subpass *subpass = &state->pass->subpasses[subpass_id];
6190
6191    ASSERTED unsigned cdw_max = radeon_check_space(cmd_buffer->device->ws, cmd_buffer->cs, 4096);
6192
6193    radv_emit_subpass_barrier(cmd_buffer, &subpass->start_barrier);
6194
6195    cmd_buffer->state.subpass = subpass;
6196    cmd_buffer->state.dirty |= RADV_CMD_DIRTY_FRAMEBUFFER;
6197
6198    radv_describe_barrier_start(cmd_buffer, RGP_BARRIER_EXTERNAL_RENDER_PASS_SYNC);
6199
6200    for (uint32_t i = 0; i < subpass->attachment_count; ++i) {
6201       const uint32_t a = subpass->attachments[i].attachment;
6202       if (a == VK_ATTACHMENT_UNUSED)
6203          continue;
6204
6205       radv_handle_subpass_image_transition(cmd_buffer, subpass->attachments[i], true);
6206    }
6207
6208    radv_ace_internal_barrier(cmd_buffer, 0, 0);
6209    radv_describe_barrier_end(cmd_buffer);
6210
6211    radv_cmd_buffer_clear_subpass(cmd_buffer);
6212
6213    if (subpass->vrs_attachment) {
6214       int idx = subpass->vrs_attachment->attachment;
6215       struct radv_image_view *vrs_iview = cmd_buffer->state.attachments[idx].iview;
6216
6217       if (subpass->depth_stencil_attachment) {
6218          /* When a subpass uses a VRS attachment and a depth/stencil attachment, we just need to
6219           * copy the VRS rates to the HTILE buffer of the attachment.
6220           */
6221          int ds_idx = subpass->depth_stencil_attachment->attachment;
6222          struct radv_image_view *ds_iview = cmd_buffer->state.attachments[ds_idx].iview;
6223          struct radv_image *ds_image = ds_iview->image;
6224          uint32_t level = ds_iview->vk.base_mip_level;
6225
6226          VkExtent2D extent = {
6227             .width = radv_minify(ds_image->info.width, level),
6228             .height = radv_minify(ds_image->info.height, level),
6229          };
6230
6231          /* HTILE buffer */
6232          uint64_t htile_offset = ds_image->bindings[0].offset + ds_image->planes[0].surface.meta_offset +
6233                                  ds_image->planes[0].surface.u.gfx9.meta_levels[level].offset;
6234          uint64_t htile_size = ds_image->planes[0].surface.u.gfx9.meta_levels[level].size;
6235          struct radv_buffer htile_buffer;
6236
6237          radv_buffer_init(&htile_buffer, cmd_buffer->device, ds_image->bindings[0].bo, htile_size, htile_offset);
6238
6239          /* Copy the VRS rates to the HTILE buffer. */
6240          radv_copy_vrs_htile(cmd_buffer, vrs_iview->image, &extent, ds_image, &htile_buffer, true);
6241
6242          radv_buffer_finish(&htile_buffer);
6243       } else {
6244          /* When a subpass uses a VRS attachment without binding a depth/stencil attachment, we have
6245           * to copy the VRS rates to our internal HTILE buffer.
6246           */
6247          struct vk_framebuffer *fb = cmd_buffer->state.framebuffer;
6248          struct radv_image *ds_image = radv_cmd_buffer_get_vrs_image(cmd_buffer);
6249
6250          if (ds_image) {
6251             /* HTILE buffer */
6252             struct radv_buffer *htile_buffer = cmd_buffer->device->vrs.buffer;
6253
6254             VkExtent2D extent = {
6255                .width = MIN2(fb->width, ds_image->info.width),
6256                .height = MIN2(fb->height, ds_image->info.height),
6257             };
6258
6259             /* Copy the VRS rates to the HTILE buffer. */
6260             radv_copy_vrs_htile(cmd_buffer, vrs_iview->image, &extent, ds_image, htile_buffer, false);
6261          }
6262       }
6263    }
6264
6265    assert(cmd_buffer->cs->cdw <= cdw_max);
6266 }
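
/* The HTILE address arithmetic used for the VRS copy above, spelled out with
 * hypothetical numbers: if the depth image's BO binding starts at offset
 * 0x10000, the surface's meta_offset is 0x2000 and meta_levels[level].offset
 * is 0x400, the temporary radv_buffer wraps
 *
 *    htile_offset = 0x10000 + 0x2000 + 0x400 = 0x12400
 *
 * within the same BO, covering meta_levels[level].size bytes of HTILE for that
 * mip level. radv_copy_vrs_htile() then encodes the VRS rates into that range.
 */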
6267
6268 static void
6269 radv_mark_noncoherent_rb(struct radv_cmd_buffer *cmd_buffer)
6270 {
6271    const struct radv_subpass *subpass = cmd_buffer->state.subpass;
6272
6273    /* Have to be conservative in cmdbuffers with inherited attachments. */
6274    if (!cmd_buffer->state.attachments) {
6275       cmd_buffer->state.rb_noncoherent_dirty = true;
6276       return;
6277    }
6278
6279    for (uint32_t i = 0; i < subpass->color_count; ++i) {
6280       const uint32_t a = subpass->color_attachments[i].attachment;
6281       if (a == VK_ATTACHMENT_UNUSED)
6282          continue;
6283       if (!cmd_buffer->state.attachments[a].iview->image->l2_coherent) {
6284          cmd_buffer->state.rb_noncoherent_dirty = true;
6285          return;
6286       }
6287    }
6288    if (subpass->depth_stencil_attachment &&
6289        !cmd_buffer->state.attachments[subpass->depth_stencil_attachment->attachment]
6290            .iview->image->l2_coherent)
6291       cmd_buffer->state.rb_noncoherent_dirty = true;
6292 }
6293
6294 static void
6295 radv_cmd_buffer_end_subpass(struct radv_cmd_buffer *cmd_buffer)
6296 {
6297    struct radv_cmd_state *state = &cmd_buffer->state;
6298    const struct radv_subpass *subpass = state->subpass;
6299    uint32_t subpass_id = radv_get_subpass_id(cmd_buffer);
6300
6301    radv_cmd_buffer_resolve_subpass(cmd_buffer);
6302
6303    radv_describe_barrier_start(cmd_buffer, RGP_BARRIER_EXTERNAL_RENDER_PASS_SYNC);
6304
6305    for (uint32_t i = 0; i < subpass->attachment_count; ++i) {
6306       const uint32_t a = subpass->attachments[i].attachment;
6307       if (a == VK_ATTACHMENT_UNUSED)
6308          continue;
6309
6310       if (state->pass->attachments[a].last_subpass_idx != subpass_id)
6311          continue;
6312
6313       VkImageLayout layout = state->pass->attachments[a].final_layout;
6314       VkImageLayout stencil_layout = state->pass->attachments[a].stencil_final_layout;
6315       struct radv_subpass_attachment att = {a, layout, stencil_layout};
6316       radv_handle_subpass_image_transition(cmd_buffer, att, false);
6317    }
6318
6319    radv_ace_internal_barrier(cmd_buffer, 0, 0);
6320    radv_describe_barrier_end(cmd_buffer);
6321 }
6322
6323 VKAPI_ATTR void VKAPI_CALL
6324 radv_CmdBeginRenderPass2(VkCommandBuffer commandBuffer,
6325                          const VkRenderPassBeginInfo *pRenderPassBeginInfo,
6326                          const VkSubpassBeginInfo *pSubpassBeginInfo)
6327 {
6328    RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
6329    RADV_FROM_HANDLE(radv_render_pass, pass, pRenderPassBeginInfo->renderPass);
6330    RADV_FROM_HANDLE(vk_framebuffer, framebuffer, pRenderPassBeginInfo->framebuffer);
6331    VkResult result;
6332
6333    cmd_buffer->state.framebuffer = framebuffer;
6334    cmd_buffer->state.pass = pass;
6335    cmd_buffer->state.render_area = pRenderPassBeginInfo->renderArea;
6336
6337    result = radv_cmd_state_setup_attachments(cmd_buffer, pass, pRenderPassBeginInfo);
6338    if (result != VK_SUCCESS)
6339       return;
6340
6341    result = radv_cmd_state_setup_sample_locations(cmd_buffer, pass, pRenderPassBeginInfo);
6342    if (result != VK_SUCCESS)
6343       return;
6344
6345    radv_cmd_buffer_begin_subpass(cmd_buffer, 0);
6346 }
6347
6348 VKAPI_ATTR void VKAPI_CALL
6349 radv_CmdNextSubpass2(VkCommandBuffer commandBuffer, const VkSubpassBeginInfo *pSubpassBeginInfo,
6350                      const VkSubpassEndInfo *pSubpassEndInfo)
6351 {
6352    RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
6353
6354    radv_mark_noncoherent_rb(cmd_buffer);
6355
6356    uint32_t prev_subpass = radv_get_subpass_id(cmd_buffer);
6357    radv_cmd_buffer_end_subpass(cmd_buffer);
6358    radv_cmd_buffer_begin_subpass(cmd_buffer, prev_subpass + 1);
6359 }
6360
6361 static void
6362 radv_emit_view_index_per_stage(struct radeon_cmdbuf *cs, struct radv_graphics_pipeline *pipeline,
6363                                unsigned stage, unsigned index)
6364 {
6365    struct radv_userdata_info *loc = radv_lookup_user_sgpr(&pipeline->base, stage, AC_UD_VIEW_INDEX);
6366    if (loc->sgpr_idx == -1)
6367       return;
6368    uint32_t base_reg = pipeline->base.user_data_0[stage];
6369    radeon_set_sh_reg(cs, base_reg + loc->sgpr_idx * 4, index);
6370 }
6371
6372 static void
6373 radv_emit_view_index(struct radv_cmd_buffer *cmd_buffer, unsigned index)
6374 {
6375    struct radv_graphics_pipeline *pipeline = cmd_buffer->state.graphics_pipeline;
6376
6377    radv_foreach_stage(stage, pipeline->active_stages & ~VK_SHADER_STAGE_TASK_BIT_NV) {
6378       radv_emit_view_index_per_stage(cmd_buffer->cs, pipeline, stage, index);
6379    }
6380    if (radv_pipeline_has_gs_copy_shader(&pipeline->base)) {
6381       struct radv_userdata_info *loc =
6382          &pipeline->base.gs_copy_shader->info.user_sgprs_locs.shader_data[AC_UD_VIEW_INDEX];
6383       if (loc->sgpr_idx != -1) {
6384          uint32_t base_reg = R_00B130_SPI_SHADER_USER_DATA_VS_0;
6385          radeon_set_sh_reg(cmd_buffer->cs, base_reg + loc->sgpr_idx * 4, index);
6386       }
6387    }
6388    if (pipeline->active_stages & VK_SHADER_STAGE_TASK_BIT_NV) {
6389       radv_emit_view_index_per_stage(cmd_buffer->ace_internal.cs, pipeline, MESA_SHADER_TASK,
6390                                      index);
6391    }
6392 }
6393
6394 /**
6395  * Emulates predication for MEC using COND_EXEC.
6396  * When the current command buffer is predicating, emit a COND_EXEC packet
6397  * so that the MEC skips the next few dwords worth of packets.
6398  *
6399  * To make it work with inverted conditional rendering, we allocate
6400  * space in the upload BO and emit some packets to invert the condition.
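      *
      * A sketch of the inverted path, mirroring the packets emitted below:
      *   COPY_DATA  write 1 to inv_va   (assume the inverted condition holds)
      *   COND_EXEC  on the API VA       (skip the next 6 dwords when its value is 0)
      *   COPY_DATA  write 0 to inv_va   (reached only when the API value != 0)
      * Afterwards inv_va holds the inverted API condition and the final COND_EXEC
      * predicates the caller's packets on inv_va instead.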
6401  */
6402 static void
6403 radv_cs_emit_compute_predication(struct radv_cmd_state *state, struct radeon_cmdbuf *cs,
6404                                  uint64_t inv_va, bool *inv_emitted, unsigned dwords)
6405 {
6406    if (!state->predicating)
6407       return;
6408
6409    uint64_t va = state->predication_va;
6410
6411    if (!state->predication_type) {
6412       /* Invert the condition the first time it is needed. */
6413       if (!*inv_emitted) {
6414          *inv_emitted = true;
6415
6416          /* Write 1 to the inverted predication VA. */
6417          radeon_emit(cs, PKT3(PKT3_COPY_DATA, 4, 0));
6418          radeon_emit(cs, COPY_DATA_SRC_SEL(COPY_DATA_IMM) | COPY_DATA_DST_SEL(COPY_DATA_DST_MEM) |
6419                             COPY_DATA_WR_CONFIRM);
6420          radeon_emit(cs, 1);
6421          radeon_emit(cs, 0);
6422          radeon_emit(cs, inv_va);
6423          radeon_emit(cs, inv_va >> 32);
6424
6425          /* Skip the next packet (the COPY_DATA below) when the value at the API predication VA is 0. */
6426          radeon_emit(cs, PKT3(PKT3_COND_EXEC, 3, 0));
6427          radeon_emit(cs, va);
6428          radeon_emit(cs, va >> 32);
6429          radeon_emit(cs, 0);
6430          radeon_emit(cs, 6); /* 1x COPY_DATA size */
6431
6432          /* Write 0 to the inverted predication VA (when the API condition != 0). */
6433          radeon_emit(cs, PKT3(PKT3_COPY_DATA, 4, 0));
6434          radeon_emit(cs, COPY_DATA_SRC_SEL(COPY_DATA_IMM) | COPY_DATA_DST_SEL(COPY_DATA_DST_MEM) |
6435                             COPY_DATA_WR_CONFIRM);
6436          radeon_emit(cs, 0);
6437          radeon_emit(cs, 0);
6438          radeon_emit(cs, inv_va);
6439          radeon_emit(cs, inv_va >> 32);
6440       }
6441
6442       va = inv_va;
6443    }
6444
6445    radeon_emit(cs, PKT3(PKT3_COND_EXEC, 3, 0));
6446    radeon_emit(cs, va);
6447    radeon_emit(cs, va >> 32);
6448    radeon_emit(cs, 0); /* Cache policy */
6449    radeon_emit(cs, dwords); /* Size of the predicated packet(s) in DWORDs. */
6450 }
6451
6452 static void
6453 radv_cs_emit_draw_packet(struct radv_cmd_buffer *cmd_buffer, uint32_t vertex_count,
6454                          uint32_t use_opaque)
6455 {
6456    radeon_emit(cmd_buffer->cs, PKT3(PKT3_DRAW_INDEX_AUTO, 1, cmd_buffer->state.predicating));
6457    radeon_emit(cmd_buffer->cs, vertex_count);
6458    radeon_emit(cmd_buffer->cs, V_0287F0_DI_SRC_SEL_AUTO_INDEX | use_opaque);
6459 }
6460
6461 /**
6462  * Emit a PKT3_DRAW_INDEX_2 packet to render "index_count" vertices.
6463  *
6464  * The starting address "index_va" may point anywhere within the index buffer. The number of
6465  * indexes allocated in the index buffer *past that point* is specified by "max_index_count".
6466  * Hardware uses this information to return 0 for out-of-bounds reads.
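      *
      * For illustration, the indexed draw loops in this file pass
      * index_va = bound index buffer VA + firstIndex * index_size and
      * max_index_count = MAX2(total bound index count, firstIndex) - firstIndex.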
6467  */
6468 static void
6469 radv_cs_emit_draw_indexed_packet(struct radv_cmd_buffer *cmd_buffer, uint64_t index_va,
6470                                  uint32_t max_index_count, uint32_t index_count, bool not_eop)
6471 {
6472    radeon_emit(cmd_buffer->cs, PKT3(PKT3_DRAW_INDEX_2, 4, cmd_buffer->state.predicating));
6473    radeon_emit(cmd_buffer->cs, max_index_count);
6474    radeon_emit(cmd_buffer->cs, index_va);
6475    radeon_emit(cmd_buffer->cs, index_va >> 32);
6476    radeon_emit(cmd_buffer->cs, index_count);
6477    /* NOT_EOP allows merging multiple draws into 1 wave, but only user VGPRs
6478     * can be changed between draws and GS fast launch must be disabled.
6479     * NOT_EOP doesn't work on gfx9 and older.
6480     */
6481    radeon_emit(cmd_buffer->cs, V_0287F0_DI_SRC_SEL_DMA | S_0287F0_NOT_EOP(not_eop));
6482 }
6483
6484 /* MUST inline this function to avoid massive perf loss in drawoverhead */
6485 ALWAYS_INLINE static void
6486 radv_cs_emit_indirect_draw_packet(struct radv_cmd_buffer *cmd_buffer, bool indexed,
6487                                   uint32_t draw_count, uint64_t count_va, uint32_t stride)
6488 {
6489    struct radeon_cmdbuf *cs = cmd_buffer->cs;
6490    const unsigned di_src_sel = indexed ? V_0287F0_DI_SRC_SEL_DMA : V_0287F0_DI_SRC_SEL_AUTO_INDEX;
6491    bool draw_id_enable = cmd_buffer->state.graphics_pipeline->uses_drawid;
6492    uint32_t base_reg = cmd_buffer->state.graphics_pipeline->vtx_base_sgpr;
6493    uint32_t vertex_offset_reg, start_instance_reg = 0, draw_id_reg = 0;
6494    bool predicating = cmd_buffer->state.predicating;
6495    bool mesh = cmd_buffer->state.mesh_shading;
6496    assert(base_reg);
6497
6498    /* just reset draw state for vertex data */
6499    cmd_buffer->state.last_first_instance = -1;
6500    cmd_buffer->state.last_num_instances = -1;
6501    cmd_buffer->state.last_drawid = -1;
6502    cmd_buffer->state.last_vertex_offset = -1;
6503
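        /* Illustrative user SGPR layout relative to vtx_base_sgpr, derived from the
         * computations below: +0 vertex offset; +4 draw id when it is used (+16 with
         * mesh shading); start instance at +8 when there is a draw id, else at +4.
         */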
6504    vertex_offset_reg = (base_reg - SI_SH_REG_OFFSET) >> 2;
6505    if (cmd_buffer->state.graphics_pipeline->uses_baseinstance)
6506       start_instance_reg = ((base_reg + (draw_id_enable ? 8 : 4)) - SI_SH_REG_OFFSET) >> 2;
6507    if (draw_id_enable)
6508       draw_id_reg = ((base_reg + mesh * 12 + 4) - SI_SH_REG_OFFSET) >> 2;
6509
6510    if (draw_count == 1 && !count_va && !draw_id_enable) {
6511       radeon_emit(cs,
6512                   PKT3(indexed ? PKT3_DRAW_INDEX_INDIRECT : PKT3_DRAW_INDIRECT, 3, predicating));
6513       radeon_emit(cs, 0);
6514       radeon_emit(cs, vertex_offset_reg);
6515       radeon_emit(cs, start_instance_reg);
6516       radeon_emit(cs, di_src_sel);
6517    } else {
6518       radeon_emit(cs, PKT3(indexed ? PKT3_DRAW_INDEX_INDIRECT_MULTI : PKT3_DRAW_INDIRECT_MULTI, 8,
6519                            predicating));
6520       radeon_emit(cs, 0);
6521       radeon_emit(cs, vertex_offset_reg);
6522       radeon_emit(cs, start_instance_reg);
6523       radeon_emit(cs, draw_id_reg | S_2C3_DRAW_INDEX_ENABLE(draw_id_enable) |
6524                          S_2C3_COUNT_INDIRECT_ENABLE(!!count_va));
6525       radeon_emit(cs, draw_count); /* count */
6526       radeon_emit(cs, count_va);   /* count_addr */
6527       radeon_emit(cs, count_va >> 32);
6528       radeon_emit(cs, stride); /* stride */
6529       radeon_emit(cs, di_src_sel);
6530
6531       cmd_buffer->state.uses_draw_indirect_multi = true;
6532    }
6533 }
6534
6535 ALWAYS_INLINE static void
6536 radv_cs_emit_dispatch_taskmesh_direct_ace_packet(struct radv_cmd_buffer *cmd_buffer,
6537                                                  const uint32_t x, const uint32_t y,
6538                                                  const uint32_t z)
6539 {
6540    struct radv_pipeline *pipeline = &cmd_buffer->state.graphics_pipeline->base;
6541    struct radv_shader *compute_shader = radv_get_shader(pipeline, MESA_SHADER_TASK);
6542    struct radeon_cmdbuf *cs = cmd_buffer->ace_internal.cs;
6543    const bool predicating = cmd_buffer->state.predicating;
6544    const uint32_t dispatch_initiator = cmd_buffer->device->dispatch_initiator_task |
6545                                        S_00B800_CS_W32_EN(compute_shader->info.wave_size == 32);
6546
6547    struct radv_userdata_info *ring_entry_loc =
6548       radv_lookup_user_sgpr(pipeline, MESA_SHADER_TASK, AC_UD_TASK_RING_ENTRY);
6549    assert(ring_entry_loc && ring_entry_loc->sgpr_idx != -1 && ring_entry_loc->num_sgprs == 1);
6550
6551    uint32_t ring_entry_reg =
6552       (R_00B900_COMPUTE_USER_DATA_0 + ring_entry_loc->sgpr_idx * 4 - SI_SH_REG_OFFSET) >> 2;
6553
6554    radeon_emit(cs, PKT3(PKT3_DISPATCH_TASKMESH_DIRECT_ACE, 4, predicating) | PKT3_SHADER_TYPE_S(1));
6555    radeon_emit(cs, x);
6556    radeon_emit(cs, y);
6557    radeon_emit(cs, z);
6558    radeon_emit(cs, dispatch_initiator);
6559    radeon_emit(cs, ring_entry_reg & 0xFFFF);
6560 }
6561
6562 ALWAYS_INLINE static void
6563 radv_cs_emit_dispatch_taskmesh_indirect_multi_ace_packet(struct radv_cmd_buffer *cmd_buffer,
6564                                                          uint64_t data_va, uint32_t draw_count,
6565                                                          uint64_t count_va, uint32_t stride)
6566 {
6567    assert((data_va & 0x03) == 0);
6568    assert((count_va & 0x03) == 0);
6569
6570    struct radv_pipeline *pipeline = &cmd_buffer->state.graphics_pipeline->base;
6571    struct radv_shader *compute_shader = radv_get_shader(pipeline, MESA_SHADER_TASK);
6572    struct radeon_cmdbuf *cs = cmd_buffer->ace_internal.cs;
6573
6574    const uint32_t count_indirect_enable = !!count_va;
6575    const uint32_t xyz_dim_enable = compute_shader->info.cs.uses_grid_size;
6576    const uint32_t draw_id_enable = compute_shader->info.vs.needs_draw_id;
6577    const uint32_t dispatch_initiator = cmd_buffer->device->dispatch_initiator_task |
6578                                        S_00B800_CS_W32_EN(compute_shader->info.wave_size == 32);
6579
6580    const struct radv_userdata_info *ring_entry_loc =
6581       radv_lookup_user_sgpr(pipeline, MESA_SHADER_TASK, AC_UD_TASK_RING_ENTRY);
6582    const struct radv_userdata_info *xyz_dim_loc =
6583       radv_lookup_user_sgpr(pipeline, MESA_SHADER_TASK, AC_UD_CS_GRID_SIZE);
6584    const struct radv_userdata_info *draw_id_loc =
6585       radv_lookup_user_sgpr(pipeline, MESA_SHADER_TASK, AC_UD_CS_TASK_DRAW_ID);
6586
6587    assert(ring_entry_loc->sgpr_idx != -1 && ring_entry_loc->num_sgprs == 1);
6588    assert(!xyz_dim_enable || (xyz_dim_loc->sgpr_idx != -1 && xyz_dim_loc->num_sgprs == 3));
6589    assert(!draw_id_enable || (draw_id_loc->sgpr_idx != -1 && draw_id_loc->num_sgprs == 1));
6590
6591    const uint32_t ring_entry_reg =
6592       (R_00B900_COMPUTE_USER_DATA_0 + ring_entry_loc->sgpr_idx * 4 - SI_SH_REG_OFFSET) >> 2;
6593    const uint32_t xyz_dim_reg =
6594       !xyz_dim_enable
6595          ? 0
6596          : (R_00B900_COMPUTE_USER_DATA_0 + xyz_dim_loc->sgpr_idx * 4 - SI_SH_REG_OFFSET) >> 2;
6597    const uint32_t draw_id_reg =
6598       !draw_id_enable
6599          ? 0
6600          : (R_00B900_COMPUTE_USER_DATA_0 + draw_id_loc->sgpr_idx * 4 - SI_SH_REG_OFFSET) >> 2;
6601
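        /* The control dword emitted below packs bit 1 = count_indirect_enable,
         * bit 2 = draw_id_enable, bit 3 = xyz_dim_enable and draw_id_reg in its
         * upper 16 bits, mirroring the shifts used a few lines down.
         */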
6602    radeon_emit(cs, PKT3(PKT3_DISPATCH_TASKMESH_INDIRECT_MULTI_ACE, 9, 0) | PKT3_SHADER_TYPE_S(1));
6603    radeon_emit(cs, data_va);
6604    radeon_emit(cs, data_va >> 32);
6605    radeon_emit(cs, ring_entry_reg & 0xFFFF);
6606    radeon_emit(cs, (count_indirect_enable << 1) | (draw_id_enable << 2) | (xyz_dim_enable << 3) |
6607                       (draw_id_reg << 16));
6608    radeon_emit(cs, xyz_dim_reg & 0xFFFF);
6609    radeon_emit(cs, draw_count);
6610    radeon_emit(cs, count_va);
6611    radeon_emit(cs, count_va >> 32);
6612    radeon_emit(cs, stride);
6613    radeon_emit(cs, dispatch_initiator);
6614 }
6615
6616 ALWAYS_INLINE static void
6617 radv_cs_emit_dispatch_taskmesh_gfx_packet(struct radv_cmd_buffer *cmd_buffer)
6618 {
6619    struct radv_pipeline *pipeline = &cmd_buffer->state.graphics_pipeline->base;
6620    struct radeon_cmdbuf *cs = cmd_buffer->cs;
6621    bool predicating = cmd_buffer->state.predicating;
6622
6623    struct radv_userdata_info *ring_entry_loc =
6624       radv_lookup_user_sgpr(pipeline, MESA_SHADER_MESH, AC_UD_TASK_RING_ENTRY);
6625
6626    assert(ring_entry_loc && ring_entry_loc->sgpr_idx != -1);
6627
6628    uint32_t base_reg = cmd_buffer->state.graphics_pipeline->vtx_base_sgpr;
6629    uint32_t xyz_dim_reg = ((base_reg + 4) - SI_SH_REG_OFFSET) >> 2;
6630    uint32_t ring_entry_reg = ((base_reg + ring_entry_loc->sgpr_idx * 4) - SI_SH_REG_OFFSET) >> 2;
6631
6632    radeon_emit(cs, PKT3(PKT3_DISPATCH_TASKMESH_GFX, 2, predicating));
6633    radeon_emit(cs, (ring_entry_reg << 16) | (xyz_dim_reg & 0xFFFF));
6634    radeon_emit(cs, 0);
6635    radeon_emit(cs, V_0287F0_DI_SRC_SEL_AUTO_INDEX);
6636 }
6637
6638 ALWAYS_INLINE static void
6639 radv_emit_userdata_vertex_internal(struct radv_cmd_buffer *cmd_buffer,
6640                                    const struct radv_draw_info *info, const uint32_t vertex_offset)
6641 {
6642    struct radv_cmd_state *state = &cmd_buffer->state;
6643    struct radeon_cmdbuf *cs = cmd_buffer->cs;
6644    const bool uses_baseinstance = state->graphics_pipeline->uses_baseinstance;
6645    const bool uses_drawid = state->graphics_pipeline->uses_drawid;
6646
6647    radeon_set_sh_reg_seq(cs, state->graphics_pipeline->vtx_base_sgpr, state->graphics_pipeline->vtx_emit_num);
6648
6649    radeon_emit(cs, vertex_offset);
6650    state->last_vertex_offset = vertex_offset;
6651    if (uses_drawid) {
6652       radeon_emit(cs, 0);
6653       state->last_drawid = 0;
6654    }
6655    if (uses_baseinstance) {
6656       radeon_emit(cs, info->first_instance);
6657       state->last_first_instance = info->first_instance;
6658    }
6659 }
6660
6661 ALWAYS_INLINE static void
6662 radv_emit_userdata_vertex(struct radv_cmd_buffer *cmd_buffer, const struct radv_draw_info *info,
6663                           const uint32_t vertex_offset)
6664 {
6665    const struct radv_cmd_state *state = &cmd_buffer->state;
6666    const bool uses_baseinstance = state->graphics_pipeline->uses_baseinstance;
6667    const bool uses_drawid = state->graphics_pipeline->uses_drawid;
6668
6669    if (vertex_offset != state->last_vertex_offset ||
6670        (uses_drawid && 0 != state->last_drawid) ||
6671        (uses_baseinstance && info->first_instance != state->last_first_instance))
6672       radv_emit_userdata_vertex_internal(cmd_buffer, info, vertex_offset);
6673 }
6674
6675 ALWAYS_INLINE static void
6676 radv_emit_userdata_vertex_drawid(struct radv_cmd_buffer *cmd_buffer, uint32_t vertex_offset, uint32_t drawid)
6677 {
6678    struct radv_cmd_state *state = &cmd_buffer->state;
6679    struct radeon_cmdbuf *cs = cmd_buffer->cs;
6680    radeon_set_sh_reg_seq(cs, state->graphics_pipeline->vtx_base_sgpr, 1 + !!drawid);
6681    radeon_emit(cs, vertex_offset);
6682    state->last_vertex_offset = vertex_offset;
6683    if (drawid)
6684       radeon_emit(cs, drawid);
6685
6686 }
6687
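     /* Illustrative layout of the mesh draw user SGPRs written below, relative to
      * vtx_base_sgpr: +0 firstTask, +4 x, +8 y, +12 z, and +16 the draw id when the
      * pipeline uses it (matching the offsets used by the task/mesh dispatch packets above).
      */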
6688 ALWAYS_INLINE static void
6689 radv_emit_userdata_mesh(struct radv_cmd_buffer *cmd_buffer,
6690                         const uint32_t x, const uint32_t y, const uint32_t z,
6691                         const uint32_t first_task)
6692 {
6693    struct radv_cmd_state *state = &cmd_buffer->state;
6694    struct radeon_cmdbuf *cs = cmd_buffer->cs;
6695    const bool uses_drawid = state->graphics_pipeline->uses_drawid;
6696
6697    radeon_set_sh_reg_seq(cs, state->graphics_pipeline->vtx_base_sgpr, state->graphics_pipeline->vtx_emit_num);
6698    radeon_emit(cs, first_task);
6699    radeon_emit(cs, x);
6700    radeon_emit(cs, y);
6701    radeon_emit(cs, z);
6702
6703    if (uses_drawid) {
6704       radeon_emit(cs, 0);
6705       state->last_drawid = 0;
6706    }
6707 }
6708
6709 ALWAYS_INLINE static void
6710 radv_emit_userdata_mesh_first_task_0_draw_id_0(struct radv_cmd_buffer *cmd_buffer)
6711 {
6712    struct radv_cmd_state *state = &cmd_buffer->state;
6713    struct radeon_cmdbuf *cs = cmd_buffer->cs;
6714    struct radv_graphics_pipeline *pipeline = state->graphics_pipeline;
6715    const bool uses_drawid = pipeline->uses_drawid;
6716
6717    radeon_set_sh_reg_seq(cs, pipeline->vtx_base_sgpr, 1);
6718    radeon_emit(cs, 0);
6719
6720    if (uses_drawid) {
6721       radeon_set_sh_reg_seq(cs, pipeline->vtx_base_sgpr + (pipeline->vtx_emit_num - 1) * 4, 1);
6722       radeon_emit(cs, 0);
6723    }
6724 }
6725
6726 ALWAYS_INLINE static void
6727 radv_emit_userdata_task_ib_only(struct radv_cmd_buffer *cmd_buffer, uint64_t ib_va,
6728                                 uint32_t ib_stride)
6729 {
6730    struct radv_pipeline *pipeline = &cmd_buffer->state.graphics_pipeline->base;
6731    struct radeon_cmdbuf *cs = cmd_buffer->ace_internal.cs;
6732
6733    struct radv_userdata_info *task_ib_loc =
6734       radv_lookup_user_sgpr(pipeline, MESA_SHADER_TASK, AC_UD_CS_TASK_IB);
6735
6736    if (task_ib_loc->sgpr_idx != -1) {
6737       assert(task_ib_loc->num_sgprs == 3);
6738       unsigned task_ib_reg = R_00B900_COMPUTE_USER_DATA_0 + task_ib_loc->sgpr_idx * 4;
6739
6740       radeon_set_sh_reg_seq(cs, task_ib_reg, 3);
6741       radeon_emit(cs, ib_va);
6742       radeon_emit(cs, ib_va >> 32);
6743       radeon_emit(cs, ib_stride);
6744    }
6745 }
6746
6747 ALWAYS_INLINE static void
6748 radv_emit_userdata_task(struct radv_cmd_buffer *cmd_buffer, uint32_t x, uint32_t y, uint32_t z,
6749                         uint32_t draw_id, uint32_t first_task, uint64_t ib_va)
6750 {
6751    struct radv_pipeline *pipeline = &cmd_buffer->state.graphics_pipeline->base;
6752    struct radeon_cmdbuf *cs = cmd_buffer->ace_internal.cs;
6753
6754    struct radv_userdata_info *xyz_loc =
6755       radv_lookup_user_sgpr(pipeline, MESA_SHADER_TASK, AC_UD_CS_GRID_SIZE);
6756    struct radv_userdata_info *draw_id_loc =
6757       radv_lookup_user_sgpr(pipeline, MESA_SHADER_TASK, AC_UD_CS_TASK_DRAW_ID);
6758
6759    if (xyz_loc->sgpr_idx != -1) {
6760       assert(xyz_loc->num_sgprs == 3);
6761       unsigned xyz_reg = R_00B900_COMPUTE_USER_DATA_0 + xyz_loc->sgpr_idx * 4;
6762
6763       radeon_set_sh_reg_seq(cs, xyz_reg, 3);
6764       radeon_emit(cs, x);
6765       radeon_emit(cs, y);
6766       radeon_emit(cs, z);
6767    }
6768
6769    if (draw_id_loc->sgpr_idx != -1) {
6770       assert(draw_id_loc->num_sgprs == 1);
6771       unsigned draw_id_reg = R_00B900_COMPUTE_USER_DATA_0 + draw_id_loc->sgpr_idx * 4;
6772
6773       radeon_set_sh_reg_seq(cs, draw_id_reg, 1);
6774       radeon_emit(cs, draw_id);
6775    }
6776
6777    radv_emit_userdata_task_ib_only(cmd_buffer, ib_va, first_task ? 8 : 0);
6778 }
6779
6780 ALWAYS_INLINE static void
6781 radv_emit_draw_packets_indexed(struct radv_cmd_buffer *cmd_buffer,
6782                                const struct radv_draw_info *info,
6783                                uint32_t drawCount, const VkMultiDrawIndexedInfoEXT *minfo,
6784                                uint32_t stride,
6785                                const int32_t *vertexOffset)
6786
6787 {
6788    struct radv_cmd_state *state = &cmd_buffer->state;
6789    struct radeon_cmdbuf *cs = cmd_buffer->cs;
6790    const int index_size = radv_get_vgt_index_size(state->index_type);
6791    unsigned i = 0;
6792    const bool uses_drawid = state->graphics_pipeline->uses_drawid;
6793    const bool can_eop =
6794       !uses_drawid && cmd_buffer->device->physical_device->rad_info.gfx_level >= GFX10;
6795
6796    if (uses_drawid) {
6797       if (vertexOffset) {
6798          radv_emit_userdata_vertex(cmd_buffer, info, *vertexOffset);
6799          vk_foreach_multi_draw_indexed(draw, i, minfo, drawCount, stride) {
6800             const uint32_t remaining_indexes = MAX2(state->max_index_count, draw->firstIndex) - draw->firstIndex;
6801
6802             /* Skip draw calls with 0-sized index buffers if the GPU can't handle them */
6803             if (!remaining_indexes &&
6804                 cmd_buffer->device->physical_device->rad_info.has_zero_index_buffer_bug)
6805                continue;
6806
6807             if (i > 0)
6808                radeon_set_sh_reg(cs, state->graphics_pipeline->vtx_base_sgpr + sizeof(uint32_t), i);
6809
6810             const uint64_t index_va = state->index_va + draw->firstIndex * index_size;
6811
6812             if (!state->subpass->view_mask) {
6813                radv_cs_emit_draw_indexed_packet(cmd_buffer, index_va, remaining_indexes, draw->indexCount, false);
6814             } else {
6815                u_foreach_bit(view, state->subpass->view_mask) {
6816                   radv_emit_view_index(cmd_buffer, view);
6817
6818                   radv_cs_emit_draw_indexed_packet(cmd_buffer, index_va, remaining_indexes, draw->indexCount, false);
6819                }
6820             }
6821          }
6822       } else {
6823          vk_foreach_multi_draw_indexed(draw, i, minfo, drawCount, stride) {
6824             const uint32_t remaining_indexes = MAX2(state->max_index_count, draw->firstIndex) - draw->firstIndex;
6825
6826             /* Skip draw calls with 0-sized index buffers if the GPU can't handle them */
6827             if (!remaining_indexes &&
6828                 cmd_buffer->device->physical_device->rad_info.has_zero_index_buffer_bug)
6829                continue;
6830
6831             if (i > 0) {
6832                if (state->last_vertex_offset != draw->vertexOffset)
6833                   radv_emit_userdata_vertex_drawid(cmd_buffer, draw->vertexOffset, i);
6834                else
6835                   radeon_set_sh_reg(cs, state->graphics_pipeline->vtx_base_sgpr + sizeof(uint32_t), i);
6836             } else
6837                radv_emit_userdata_vertex(cmd_buffer, info, draw->vertexOffset);
6838
6839             const uint64_t index_va = state->index_va + draw->firstIndex * index_size;
6840
6841             if (!state->subpass->view_mask) {
6842                radv_cs_emit_draw_indexed_packet(cmd_buffer, index_va, remaining_indexes, draw->indexCount, false);
6843             } else {
6844                u_foreach_bit(view, state->subpass->view_mask) {
6845                   radv_emit_view_index(cmd_buffer, view);
6846
6847                   radv_cs_emit_draw_indexed_packet(cmd_buffer, index_va, remaining_indexes, draw->indexCount, false);
6848                }
6849             }
6850          }
6851       }
6852       if (drawCount > 1) {
6853          state->last_drawid = drawCount - 1;
6854       }
6855    } else {
6856       if (vertexOffset) {
6857          if (cmd_buffer->device->physical_device->rad_info.gfx_level == GFX10) {
6858             /* GFX10 has a bug where, in a chain of draw packets using NOT_EOP, the last
6859              * draw (the one without NOT_EOP) must not have count == 0.
6860              */
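                 /* For illustration: with per-draw index counts {6, 0, 0}, drawCount is
                  * trimmed to 1 here so the final draw, which is emitted without NOT_EOP,
                  * has a non-zero count. */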
6861             while (drawCount > 1) {
6862                const VkMultiDrawIndexedInfoEXT *last = (const VkMultiDrawIndexedInfoEXT*)(((const uint8_t*)minfo) + (drawCount - 1) * stride);
6863                if (last->indexCount)
6864                   break;
6865                drawCount--;
6866             }
6867          }
6868
6869          radv_emit_userdata_vertex(cmd_buffer, info, *vertexOffset);
6870          vk_foreach_multi_draw_indexed(draw, i, minfo, drawCount, stride) {
6871             const uint32_t remaining_indexes = MAX2(state->max_index_count, draw->firstIndex) - draw->firstIndex;
6872
6873             /* Skip draw calls with 0-sized index buffers if the GPU can't handle them */
6874             if (!remaining_indexes &&
6875                 cmd_buffer->device->physical_device->rad_info.has_zero_index_buffer_bug)
6876                continue;
6877
6878             const uint64_t index_va = state->index_va + draw->firstIndex * index_size;
6879
6880             if (!state->subpass->view_mask) {
6881                radv_cs_emit_draw_indexed_packet(cmd_buffer, index_va, remaining_indexes, draw->indexCount, can_eop && i < drawCount - 1);
6882             } else {
6883                u_foreach_bit(view, state->subpass->view_mask) {
6884                   radv_emit_view_index(cmd_buffer, view);
6885
6886                   radv_cs_emit_draw_indexed_packet(cmd_buffer, index_va, remaining_indexes, draw->indexCount, false);
6887                }
6888             }
6889          }
6890       } else {
6891          vk_foreach_multi_draw_indexed(draw, i, minfo, drawCount, stride) {
6892             const uint32_t remaining_indexes = MAX2(state->max_index_count, draw->firstIndex) - draw->firstIndex;
6893
6894             /* Skip draw calls with 0-sized index buffers if the GPU can't handle them */
6895             if (!remaining_indexes &&
6896                 cmd_buffer->device->physical_device->rad_info.has_zero_index_buffer_bug)
6897                continue;
6898
6899             const VkMultiDrawIndexedInfoEXT *next = (const VkMultiDrawIndexedInfoEXT*)(i < drawCount - 1 ? ((uint8_t*)draw + stride) : NULL);
6900             const bool offset_changes = next && next->vertexOffset != draw->vertexOffset;
6901             radv_emit_userdata_vertex(cmd_buffer, info, draw->vertexOffset);
6902
6903             const uint64_t index_va = state->index_va + draw->firstIndex * index_size;
6904
6905             if (!state->subpass->view_mask) {
6906                radv_cs_emit_draw_indexed_packet(cmd_buffer, index_va, remaining_indexes, draw->indexCount, can_eop && !offset_changes && i < drawCount - 1);
6907             } else {
6908                u_foreach_bit(view, state->subpass->view_mask) {
6909                   radv_emit_view_index(cmd_buffer, view);
6910
6911                   radv_cs_emit_draw_indexed_packet(cmd_buffer, index_va, remaining_indexes, draw->indexCount, false);
6912                }
6913             }
6914          }
6915       }
6916       if (drawCount > 1) {
6917          state->last_drawid = drawCount - 1;
6918       }
6919    }
6920 }
6921
6922 ALWAYS_INLINE static void
6923 radv_emit_direct_draw_packets(struct radv_cmd_buffer *cmd_buffer, const struct radv_draw_info *info,
6924                               uint32_t drawCount, const VkMultiDrawInfoEXT *minfo,
6925                               uint32_t use_opaque, uint32_t stride)
6926 {
6927    unsigned i = 0;
6928    const uint32_t view_mask = cmd_buffer->state.subpass->view_mask;
6929    const bool uses_drawid = cmd_buffer->state.graphics_pipeline->uses_drawid;
6930    uint32_t last_start = 0;
6931
6932    vk_foreach_multi_draw(draw, i, minfo, drawCount, stride) {
6933       if (!i)
6934          radv_emit_userdata_vertex(cmd_buffer, info, draw->firstVertex);
6935       else
6936          radv_emit_userdata_vertex_drawid(cmd_buffer, draw->firstVertex, uses_drawid ? i : 0);
6937
6938       if (!view_mask) {
6939          radv_cs_emit_draw_packet(cmd_buffer, draw->vertexCount, use_opaque);
6940       } else {
6941          u_foreach_bit(view, view_mask) {
6942             radv_emit_view_index(cmd_buffer, view);
6943             radv_cs_emit_draw_packet(cmd_buffer, draw->vertexCount, use_opaque);
6944          }
6945       }
6946       last_start = draw->firstVertex;
6947    }
6948    if (drawCount > 1) {
6949        struct radv_cmd_state *state = &cmd_buffer->state;
6950        state->last_vertex_offset = last_start;
6951        if (uses_drawid)
6952            state->last_drawid = drawCount - 1;
6953    }
6954 }
6955
6956 ALWAYS_INLINE static void
6957 radv_emit_direct_mesh_draw_packet(struct radv_cmd_buffer *cmd_buffer,
6958                                   uint32_t x, uint32_t y, uint32_t z,
6959                                   uint32_t first_task)
6960 {
6961    const uint32_t view_mask = cmd_buffer->state.subpass->view_mask;
6962    const uint32_t count = x * y * z;
6963
6964    radv_emit_userdata_mesh(cmd_buffer, x, y, z, first_task);
6965
6966    if (!view_mask) {
6967       radv_cs_emit_draw_packet(cmd_buffer, count, 0);
6968    } else {
6969       u_foreach_bit(view, view_mask) {
6970          radv_emit_view_index(cmd_buffer, view);
6971          radv_cs_emit_draw_packet(cmd_buffer, count, 0);
6972       }
6973    }
6974 }
6975
6976 ALWAYS_INLINE static void
6977 radv_emit_direct_taskmesh_draw_packets(struct radv_cmd_buffer *cmd_buffer, uint32_t x, uint32_t y,
6978                                        uint32_t z, uint32_t first_task)
6979 {
6980    uint64_t fake_ib_va = 0;
6981    const uint32_t view_mask = cmd_buffer->state.subpass->view_mask;
6982    const unsigned num_views = MAX2(1, util_bitcount(view_mask));
6983    unsigned ace_predication_size = num_views * 6; /* DISPATCH_TASKMESH_DIRECT_ACE size */
6984
6985    if (first_task) {
6986       /* Pass this as the IB to the shader for emulating firstTask in task shaders. */
6987       uint32_t fake_ib_dwords[2] = {x, first_task};
6988       unsigned fake_ib_offset;
6989       radv_cmd_buffer_upload_data(cmd_buffer, 8, fake_ib_dwords, &fake_ib_offset);
6990       fake_ib_va = radv_buffer_get_va(cmd_buffer->upload.upload_bo) + fake_ib_offset;
6991    }
6992
6993    radv_emit_userdata_task(cmd_buffer, x, y, z, 0, first_task, fake_ib_va);
6994    radv_emit_userdata_mesh_first_task_0_draw_id_0(cmd_buffer);
6995    radv_cs_emit_compute_predication(&cmd_buffer->state, cmd_buffer->ace_internal.cs,
6996                                     cmd_buffer->mec_inv_pred_va, &cmd_buffer->mec_inv_pred_emitted,
6997                                     ace_predication_size);
6998
6999    if (!view_mask) {
7000       radv_cs_emit_dispatch_taskmesh_direct_ace_packet(cmd_buffer, x, y, z);
7001       radv_cs_emit_dispatch_taskmesh_gfx_packet(cmd_buffer);
7002    } else {
7003       u_foreach_bit (view, view_mask) {
7004          radv_emit_view_index(cmd_buffer, view);
7005          radv_cs_emit_dispatch_taskmesh_direct_ace_packet(cmd_buffer, x, y, z);
7006          radv_cs_emit_dispatch_taskmesh_gfx_packet(cmd_buffer);
7007       }
7008    }
7009 }
7010
7011 static void
7012 radv_emit_indirect_taskmesh_draw_packets(struct radv_cmd_buffer *cmd_buffer,
7013                                          const struct radv_draw_info *info, uint64_t nv_ib_va,
7014                                          uint32_t nv_ib_stride)
7015 {
7016    const uint32_t view_mask = cmd_buffer->state.subpass->view_mask;
7017    struct radeon_winsys *ws = cmd_buffer->device->ws;
7018    const unsigned num_views = MAX2(1, util_bitcount(view_mask));
7019    unsigned ace_predication_size = num_views * 11; /* DISPATCH_TASKMESH_INDIRECT_MULTI_ACE size */
7020    struct radeon_cmdbuf *ace_cs = cmd_buffer->ace_internal.cs;
7021
7022    const uint64_t va =
7023       radv_buffer_get_va(info->indirect->bo) + info->indirect->offset + info->indirect_offset;
7024    const uint64_t count_va = !info->count_buffer
7025                                 ? 0
7026                                 : radv_buffer_get_va(info->count_buffer->bo) +
7027                                      info->count_buffer->offset + info->count_buffer_offset;
7028    uint64_t workaround_cond_va = 0;
7029
7030    if (count_va) {
7031       radv_cs_add_buffer(ws, cmd_buffer->ace_internal.cs, info->count_buffer->bo);
7032
7033       /* MEC firmware bug workaround.
7034        * When the count buffer contains zero, DISPATCH_TASKMESH_INDIRECT_MULTI_ACE hangs.
7035        * - We must ensure that DISPATCH_TASKMESH_INDIRECT_MULTI_ACE
7036        *   is only executed when the count buffer contains non-zero.
7037        * - Furthermore, we must also ensure that each DISPATCH_TASKMESH_GFX packet
7038        *   has a matching ACE packet.
7039        *
7040        * As a workaround:
7041        * - Reserve a dword in the upload buffer and initialize it to 1 for the workaround
7042        * - When count != 0, write 0 to the workaround BO and execute the indirect dispatch
7043        * - When workaround BO != 0 (count was 0), execute an empty direct dispatch
7044        */
7045
7046       uint32_t workaround_cond_init = 0;
7047       uint32_t workaround_cond_off;
7048       if (!radv_cmd_buffer_upload_data(cmd_buffer, 4, &workaround_cond_init, &workaround_cond_off))
7049          vk_command_buffer_set_error(&cmd_buffer->vk, VK_ERROR_OUT_OF_HOST_MEMORY);
7050
7051       workaround_cond_va = radv_buffer_get_va(cmd_buffer->upload.upload_bo) + workaround_cond_off;
7052
7053       radeon_emit(ace_cs, PKT3(PKT3_COPY_DATA, 4, 0));
7054       radeon_emit(ace_cs, COPY_DATA_SRC_SEL(COPY_DATA_IMM) | COPY_DATA_DST_SEL(COPY_DATA_DST_MEM) |
7055                              COPY_DATA_WR_CONFIRM);
7056       radeon_emit(ace_cs, 1);
7057       radeon_emit(ace_cs, 0);
7058       radeon_emit(ace_cs, workaround_cond_va);
7059       radeon_emit(ace_cs, workaround_cond_va >> 32);
7060
7061       /* 2x COND_EXEC + 1x COPY_DATA + Nx DISPATCH_TASKMESH_DIRECT_ACE */
7062       ace_predication_size += 2 * 5 + 6 + 6 * num_views;
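           /* Packet sizes used in this accounting, counted from the emits in this file
            * (header + payload dwords): COND_EXEC = 5, COPY_DATA = 6,
            * DISPATCH_TASKMESH_DIRECT_ACE = 6, DISPATCH_TASKMESH_INDIRECT_MULTI_ACE = 11.
            */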
7063    }
7064
7065    radv_cs_add_buffer(ws, cmd_buffer->ace_internal.cs, info->indirect->bo);
7066    radv_emit_userdata_task_ib_only(cmd_buffer, nv_ib_va, nv_ib_stride);
7067    radv_emit_userdata_mesh_first_task_0_draw_id_0(cmd_buffer);
7068    radv_cs_emit_compute_predication(&cmd_buffer->state, cmd_buffer->ace_internal.cs,
7069                                     cmd_buffer->mec_inv_pred_va, &cmd_buffer->mec_inv_pred_emitted,
7070                                     ace_predication_size);
7071
7072    if (workaround_cond_va) {
7073       radeon_emit(ace_cs, PKT3(PKT3_COND_EXEC, 3, 0));
7074       radeon_emit(ace_cs, count_va);
7075       radeon_emit(ace_cs, count_va >> 32);
7076       radeon_emit(ace_cs, 0);
7077       radeon_emit(ace_cs,
7078                   6 + 11 * num_views); /* 1x COPY_DATA + Nx DISPATCH_TASKMESH_INDIRECT_MULTI_ACE */
7079
7080       radeon_emit(ace_cs, PKT3(PKT3_COPY_DATA, 4, 0));
7081       radeon_emit(ace_cs, COPY_DATA_SRC_SEL(COPY_DATA_IMM) | COPY_DATA_DST_SEL(COPY_DATA_DST_MEM) |
7082                              COPY_DATA_WR_CONFIRM);
7083       radeon_emit(ace_cs, 0);
7084       radeon_emit(ace_cs, 0);
7085       radeon_emit(ace_cs, workaround_cond_va);
7086       radeon_emit(ace_cs, workaround_cond_va >> 32);
7087    }
7088
7089    if (!view_mask) {
7090       radv_cs_emit_dispatch_taskmesh_indirect_multi_ace_packet(cmd_buffer, va, info->count,
7091                                                                count_va, info->stride);
7092       radv_cs_emit_dispatch_taskmesh_gfx_packet(cmd_buffer);
7093    } else {
7094       u_foreach_bit (view, view_mask) {
7095          radv_emit_view_index(cmd_buffer, view);
7096          radv_cs_emit_dispatch_taskmesh_indirect_multi_ace_packet(cmd_buffer, va, info->count,
7097                                                                   count_va, info->stride);
7098          radv_cs_emit_dispatch_taskmesh_gfx_packet(cmd_buffer);
7099       }
7100    }
7101
7102    if (workaround_cond_va) {
7103       radeon_emit(ace_cs, PKT3(PKT3_COND_EXEC, 3, 0));
7104       radeon_emit(ace_cs, workaround_cond_va);
7105       radeon_emit(ace_cs, workaround_cond_va >> 32);
7106       radeon_emit(ace_cs, 0);
7107       radeon_emit(ace_cs, 6 * num_views); /* Nx DISPATCH_TASKMESH_DIRECT_ACE */
7108
7109       for (unsigned v = 0; v < num_views; ++v) {
7110          radv_cs_emit_dispatch_taskmesh_direct_ace_packet(cmd_buffer, 0, 0, 0);
7111       }
7112    }
7113 }
7114
7115 static void
7116 radv_emit_indirect_draw_packets(struct radv_cmd_buffer *cmd_buffer,
7117                                 const struct radv_draw_info *info)
7118 {
7119    const struct radv_cmd_state *state = &cmd_buffer->state;
7120    struct radeon_winsys *ws = cmd_buffer->device->ws;
7121    struct radeon_cmdbuf *cs = cmd_buffer->cs;
7122    const uint64_t va =
7123       radv_buffer_get_va(info->indirect->bo) + info->indirect->offset + info->indirect_offset;
7124    const uint64_t count_va = info->count_buffer
7125                                 ? radv_buffer_get_va(info->count_buffer->bo) +
7126                                      info->count_buffer->offset + info->count_buffer_offset
7127                                 : 0;
7128
7129    radv_cs_add_buffer(ws, cs, info->indirect->bo);
7130
7131    radeon_emit(cs, PKT3(PKT3_SET_BASE, 2, 0));
7132    radeon_emit(cs, 1);
7133    radeon_emit(cs, va);
7134    radeon_emit(cs, va >> 32);
7135
7136    if (info->count_buffer) {
7137       radv_cs_add_buffer(ws, cs, info->count_buffer->bo);
7138    }
7139
7140    if (!state->subpass->view_mask) {
7141       radv_cs_emit_indirect_draw_packet(cmd_buffer, info->indexed, info->count, count_va,
7142                                         info->stride);
7143    } else {
7144       u_foreach_bit(i, state->subpass->view_mask)
7145       {
7146          radv_emit_view_index(cmd_buffer, i);
7147
7148          radv_cs_emit_indirect_draw_packet(cmd_buffer, info->indexed, info->count, count_va,
7149                                            info->stride);
7150       }
7151    }
7152 }
7153
7154 /*
7155  * Vega and Raven have a bug which triggers if there are multiple context
7156  * register contexts active at the same time with different scissor values.
7157  *
7158  * There are two possible workarounds:
7159  * 1) Wait for PS_PARTIAL_FLUSH every time the scissor is changed. That way
7160  *    there is only ever 1 active set of scissor values at the same time.
7161  *
7162  * 2) Whenever the hardware switches contexts we have to set the scissor
7163  *    registers again even if it is a noop. That way the new context gets
7164  *    the correct scissor values.
7165  *
7166  * This implements option 2. radv_need_late_scissor_emission needs to
7167  * return true on affected HW if radv_emit_all_graphics_states sets
7168  * any context registers.
7169  */
7170 static bool
7171 radv_need_late_scissor_emission(struct radv_cmd_buffer *cmd_buffer,
7172                                 const struct radv_draw_info *info)
7173 {
7174    struct radv_cmd_state *state = &cmd_buffer->state;
7175
7176    if (!cmd_buffer->device->physical_device->rad_info.has_gfx9_scissor_bug)
7177       return false;
7178
7179    if (cmd_buffer->state.context_roll_without_scissor_emitted || info->strmout_buffer)
7180       return true;
7181
7182    uint64_t used_states =
7183       cmd_buffer->state.graphics_pipeline->needed_dynamic_state | ~RADV_CMD_DIRTY_DYNAMIC_ALL;
7184
7185    /* Index, vertex and streamout buffers don't change context regs, and
7186     * pipeline is already handled.
7187     */
7188    used_states &= ~(RADV_CMD_DIRTY_INDEX_BUFFER | RADV_CMD_DIRTY_VERTEX_BUFFER |
7189                     RADV_CMD_DIRTY_DYNAMIC_VERTEX_INPUT | RADV_CMD_DIRTY_STREAMOUT_BUFFER |
7190                     RADV_CMD_DIRTY_PIPELINE);
7191
7192    if (cmd_buffer->state.dirty & used_states)
7193       return true;
7194
7195    uint32_t primitive_reset_index = radv_get_primitive_reset_index(cmd_buffer);
7196
7197    if (info->indexed && state->dynamic.primitive_restart_enable &&
7198        primitive_reset_index != state->last_primitive_reset_index)
7199       return true;
7200
7201    return false;
7202 }
7203
7204 ALWAYS_INLINE static bool
7205 radv_skip_ngg_culling(bool has_tess, const unsigned vtx_cnt,
7206                       bool indirect)
7207 {
7208    /* If we have to draw only a few vertices, we get better latency if
7209     * we disable NGG culling.
7210     *
7211     * When tessellation is used, what matters is the number of tessellated
7212     * vertices, so let's always assume it's not a small draw.
7213     */
7214    return !has_tess && !indirect && vtx_cnt < 128;
7215 }
7216
7217 ALWAYS_INLINE static uint32_t
7218 radv_get_ngg_culling_settings(struct radv_cmd_buffer *cmd_buffer, bool vp_y_inverted)
7219 {
7220    const struct radv_graphics_pipeline *pipeline = cmd_buffer->state.graphics_pipeline;
7221    const struct radv_dynamic_state *d = &cmd_buffer->state.dynamic;
7222
7223    /* Cull every triangle when rasterizer discard is enabled. */
7224    if (d->rasterizer_discard_enable)
7225       return radv_nggc_front_face | radv_nggc_back_face;
7226
7227    uint32_t nggc_settings = radv_nggc_none;
7228
7229    /* The culling code needs to know whether face is CW or CCW. */
7230    bool ccw = d->front_face == VK_FRONT_FACE_COUNTER_CLOCKWISE;
7231
7232    /* Take inverted viewport into account. */
7233    ccw ^= vp_y_inverted;
7234
7235    if (ccw)
7236       nggc_settings |= radv_nggc_face_is_ccw;
7237
7238    /* Face culling settings. */
7239    if (d->cull_mode & VK_CULL_MODE_FRONT_BIT)
7240       nggc_settings |= radv_nggc_front_face;
7241    if (d->cull_mode & VK_CULL_MODE_BACK_BIT)
7242       nggc_settings |= radv_nggc_back_face;
7243
7244    /* Small primitive culling is only valid when conservative overestimation is not used. It's also
7245     * disabled for user sample locations because small primitive culling assumes a sample
7246     * position at (0.5, 0.5). */
7247    if (!pipeline->uses_conservative_overestimate && !pipeline->uses_user_sample_locations) {
7248       nggc_settings |= radv_nggc_small_primitives;
7249
7250       /* small_prim_precision = num_samples / 2^subpixel_bits
7251        * num_samples is also always a power of two, so the small prim precision can only be
7252        * a power of two between 2^-2 and 2^-6, therefore it's enough to remember the exponent.
7253        */
7254       unsigned subpixel_bits = 256;
7255       int32_t small_prim_precision_log2 = util_logbase2(pipeline->ms.num_samples) - util_logbase2(subpixel_bits);
7256       nggc_settings |= ((uint32_t) small_prim_precision_log2 << 24u);
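           /* Worked example: with 4x MSAA, util_logbase2(4) - util_logbase2(256) = 2 - 8 = -6,
            * i.e. a small prim precision of 2^-6, encoded in the top byte of the settings word.
            */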
7257    }
7258
7259    return nggc_settings;
7260 }
7261
7262 static void
7263 radv_emit_ngg_culling_state(struct radv_cmd_buffer *cmd_buffer, const struct radv_draw_info *draw_info)
7264 {
7265    struct radv_graphics_pipeline *pipeline = cmd_buffer->state.graphics_pipeline;
7266    const unsigned stage = pipeline->last_vgt_api_stage;
7267    const bool nggc_supported = pipeline->has_ngg_culling;
7268
7269    if (!nggc_supported && !cmd_buffer->state.last_nggc_settings) {
7270       /* Current shader doesn't support culling and culling was already disabled:
7271        * No further steps needed, just remember the SGPR's location is not set.
7272        */
7273       cmd_buffer->state.last_nggc_settings_sgpr_idx = -1;
7274       return;
7275    }
7276
7277    /* Check dirty flags:
7278     * - Dirty pipeline: SGPR index may have changed (we have to re-emit if changed).
7279     * - Dirty dynamic flags: culling settings may have changed.
7280     */
7281    const bool dirty =
7282       cmd_buffer->state.dirty &
7283       (RADV_CMD_DIRTY_PIPELINE |
7284        RADV_CMD_DIRTY_DYNAMIC_CULL_MODE | RADV_CMD_DIRTY_DYNAMIC_FRONT_FACE |
7285        RADV_CMD_DIRTY_DYNAMIC_RASTERIZER_DISCARD_ENABLE | RADV_CMD_DIRTY_DYNAMIC_VIEWPORT);
7286
7287    /* Check small draw status:
7288     * For small draw calls, we disable culling by setting the SGPR to 0.
7289     */
7290    const bool skip =
7291       radv_skip_ngg_culling(stage == MESA_SHADER_TESS_EVAL, draw_info->count, draw_info->indirect);
7292
7293    /* See if anything changed. */
7294    if (!dirty && skip == cmd_buffer->state.last_nggc_skip)
7295       return;
7296
7297    /* Remember small draw state. */
7298    cmd_buffer->state.last_nggc_skip = skip;
7299    const struct radv_shader *v = pipeline->base.shaders[stage];
7300    assert(v->info.has_ngg_culling == nggc_supported);
7301
7302    /* Find the user SGPR. */
7303    const uint32_t base_reg = pipeline->base.user_data_0[stage];
7304    const int8_t nggc_sgpr_idx = v->info.user_sgprs_locs.shader_data[AC_UD_NGG_CULLING_SETTINGS].sgpr_idx;
7305    assert(!nggc_supported || nggc_sgpr_idx != -1);
7306
7307    /* Get viewport transform. */
7308    float vp_scale[2], vp_translate[2];
7309    memcpy(vp_scale, cmd_buffer->state.dynamic.viewport.xform[0].scale, 2 * sizeof(float));
7310    memcpy(vp_translate, cmd_buffer->state.dynamic.viewport.xform[0].translate, 2 * sizeof(float));
7311    bool vp_y_inverted = (-vp_scale[1] + vp_translate[1]) > (vp_scale[1] + vp_translate[1]);
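        /* Equivalent to checking vp_scale[1] < 0: the two sides are the viewport-space Y
         * coordinates of NDC y = -1 and y = +1, so the condition is true exactly when the
         * Y scale is negative.
         */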
7312
7313    /* Get current culling settings. */
7314    uint32_t nggc_settings = nggc_supported && !skip
7315                             ? radv_get_ngg_culling_settings(cmd_buffer, vp_y_inverted)
7316                             : radv_nggc_none;
7317
7318    bool emit_viewport = nggc_settings &&
7319                         (cmd_buffer->state.dirty & RADV_CMD_DIRTY_DYNAMIC_VIEWPORT ||
7320                          cmd_buffer->state.last_nggc_settings_sgpr_idx != nggc_sgpr_idx ||
7321                          !cmd_buffer->state.last_nggc_settings);
7322
7323    if (emit_viewport) {
7324       /* Correction for inverted Y */
7325       if (vp_y_inverted) {
7326          vp_scale[1] = -vp_scale[1];
7327          vp_translate[1] = -vp_translate[1];
7328       }
7329
7330       /* Correction for number of samples per pixel. */
7331       for (unsigned i = 0; i < 2; ++i) {
7332          vp_scale[i] *= (float) pipeline->ms.num_samples;
7333          vp_translate[i] *= (float) pipeline->ms.num_samples;
7334       }
7335
7336       uint32_t vp_reg_values[4] = {fui(vp_scale[0]), fui(vp_scale[1]), fui(vp_translate[0]), fui(vp_translate[1])};
7337       const int8_t vp_sgpr_idx = v->info.user_sgprs_locs.shader_data[AC_UD_NGG_VIEWPORT].sgpr_idx;
7338       assert(vp_sgpr_idx != -1);
7339       radeon_set_sh_reg_seq(cmd_buffer->cs, base_reg + vp_sgpr_idx * 4, 4);
7340       radeon_emit_array(cmd_buffer->cs, vp_reg_values, 4);
7341    }
7342
7343    bool emit_settings = nggc_supported &&
7344                         (cmd_buffer->state.last_nggc_settings != nggc_settings ||
7345                          cmd_buffer->state.last_nggc_settings_sgpr_idx != nggc_sgpr_idx);
7346
7347    /* This needs to be emitted when culling is turned on
7348     * and when it's already on but some settings change.
7349     */
7350    if (emit_settings) {
7351       assert(nggc_sgpr_idx >= 0);
7352       radeon_set_sh_reg(cmd_buffer->cs, base_reg + nggc_sgpr_idx * 4, nggc_settings);
7353    }
7354
7355    /* These only need to be emitted when culling is turned on or off,
7356     * but not when it stays on and just some settings change.
7357     */
7358    if (!!cmd_buffer->state.last_nggc_settings != !!nggc_settings) {
7359       uint32_t rsrc2 = v->config.rsrc2;
7360
7361       if (!nggc_settings) {
7362          /* Allocate less LDS when culling is disabled. (But GS always needs it.) */
7363          if (stage != MESA_SHADER_GEOMETRY)
7364             rsrc2 = (rsrc2 & C_00B22C_LDS_SIZE) | S_00B22C_LDS_SIZE(v->info.num_lds_blocks_when_not_culling);
7365       }
7366
7367       /* When the pipeline is dirty and not yet emitted, don't write it here
7368        * because radv_emit_graphics_pipeline will overwrite this register.
7369        */
7370       if (!(cmd_buffer->state.dirty & RADV_CMD_DIRTY_PIPELINE) ||
7371           cmd_buffer->state.emitted_graphics_pipeline == pipeline) {
7372          radeon_set_sh_reg(cmd_buffer->cs, R_00B22C_SPI_SHADER_PGM_RSRC2_GS, rsrc2);
7373       }
7374    }
7375
7376    cmd_buffer->state.last_nggc_settings = nggc_settings;
7377    cmd_buffer->state.last_nggc_settings_sgpr_idx = nggc_sgpr_idx;
7378 }
7379
7380 static void
7381 radv_emit_all_graphics_states(struct radv_cmd_buffer *cmd_buffer, const struct radv_draw_info *info,
7382                               bool pipeline_is_dirty)
7383 {
7384    const struct radv_device *device = cmd_buffer->device;
7385    bool late_scissor_emission;
7386
7387    if ((cmd_buffer->state.dirty & RADV_CMD_DIRTY_FRAMEBUFFER) ||
7388        cmd_buffer->state.emitted_graphics_pipeline != cmd_buffer->state.graphics_pipeline)
7389       radv_emit_rbplus_state(cmd_buffer);
7390
7391    if (cmd_buffer->device->physical_device->use_ngg_culling &&
7392        cmd_buffer->state.graphics_pipeline->is_ngg)
7393       radv_emit_ngg_culling_state(cmd_buffer, info);
7394
7395    if (cmd_buffer->state.dirty & RADV_CMD_DIRTY_PIPELINE)
7396       radv_emit_graphics_pipeline(cmd_buffer);
7397
7398    /* This must run before cmd_buffer->state.dirty is cleared
7399     * (excluding RADV_CMD_DIRTY_PIPELINE) and after
7400     * cmd_buffer->state.context_roll_without_scissor_emitted is set. */
7401    late_scissor_emission = radv_need_late_scissor_emission(cmd_buffer, info);
7402
7403    if (cmd_buffer->state.dirty & RADV_CMD_DIRTY_FRAMEBUFFER)
7404       radv_emit_framebuffer_state(cmd_buffer);
7405
7406    if (cmd_buffer->state.dirty & RADV_CMD_DIRTY_GUARDBAND)
7407       radv_emit_guardband_state(cmd_buffer);
7408
7409    if (info->indexed) {
7410       if (cmd_buffer->state.dirty & RADV_CMD_DIRTY_INDEX_BUFFER)
7411          radv_emit_index_buffer(cmd_buffer, info->indirect);
7412    } else {
7413       /* On GFX7 and later, non-indexed draws overwrite VGT_INDEX_TYPE,
7414        * so the state must be re-emitted before the next indexed
7415        * draw.
7416        */
7417       if (cmd_buffer->device->physical_device->rad_info.gfx_level >= GFX7) {
7418          cmd_buffer->state.last_index_type = -1;
7419          cmd_buffer->state.dirty |= RADV_CMD_DIRTY_INDEX_BUFFER;
7420       }
7421    }
7422
7423    if (cmd_buffer->device->force_vrs != RADV_FORCE_VRS_1x1) {
7424       struct radv_dynamic_state *d = &cmd_buffer->state.dynamic;
7425       uint64_t dynamic_states =
7426          cmd_buffer->state.dirty & cmd_buffer->state.emitted_graphics_pipeline->needed_dynamic_state;
7427
7428       if ((dynamic_states & RADV_CMD_DIRTY_DYNAMIC_FRAGMENT_SHADING_RATE) &&
7429           d->fragment_shading_rate.size.width == 1 &&
7430           d->fragment_shading_rate.size.height == 1 &&
7431           d->fragment_shading_rate.combiner_ops[0] == VK_FRAGMENT_SHADING_RATE_COMBINER_OP_KEEP_KHR &&
7432           d->fragment_shading_rate.combiner_ops[1] == VK_FRAGMENT_SHADING_RATE_COMBINER_OP_KEEP_KHR) {
7433          /* When per-vertex VRS is forced and the dynamic fragment shading rate is a no-op, ignore
7434           * it. This is needed for vkd3d-proton because it always declares per-draw VRS as dynamic.
7435           */
7436          cmd_buffer->state.dirty &= ~RADV_CMD_DIRTY_DYNAMIC_FRAGMENT_SHADING_RATE;
7437       }
7438    }
7439
7440    if (device->pbb_allowed) {
7441       struct radv_binning_settings *settings = &device->physical_device->binning_settings;
7442
7443       if ((cmd_buffer->state.dirty & RADV_CMD_DIRTY_DYNAMIC_COLOR_WRITE_ENABLE) &&
7444           settings->context_states_per_bin > 1) {
7445          /* Break the batch on CB_TARGET_MASK changes. */
7446          radeon_emit(cmd_buffer->cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
7447          radeon_emit(cmd_buffer->cs, EVENT_TYPE(V_028A90_BREAK_BATCH) | EVENT_INDEX(0));
7448       }
7449    }
7450
7451    radv_cmd_buffer_flush_dynamic_state(cmd_buffer, pipeline_is_dirty);
7452
7453    radv_emit_draw_registers(cmd_buffer, info);
7454
7455    if (late_scissor_emission)
7456       radv_emit_scissor(cmd_buffer);
7457 }
7458
7459 /* MUST inline this function to avoid massive perf loss in drawoverhead */
7460 ALWAYS_INLINE static bool
7461 radv_before_draw(struct radv_cmd_buffer *cmd_buffer, const struct radv_draw_info *info, uint32_t drawCount)
7462 {
7463    const bool has_prefetch = cmd_buffer->device->physical_device->rad_info.gfx_level >= GFX7;
7464    const bool pipeline_is_dirty = (cmd_buffer->state.dirty & RADV_CMD_DIRTY_PIPELINE) &&
7465                                   cmd_buffer->state.graphics_pipeline != cmd_buffer->state.emitted_graphics_pipeline;
7466
7467    ASSERTED const unsigned cdw_max =
7468       radeon_check_space(cmd_buffer->device->ws, cmd_buffer->cs, 4096 + 128 * (drawCount - 1));
7469
7470    if (likely(!info->indirect)) {
7471       /* GFX6-GFX7 treat instance_count==0 as instance_count==1. There is
7472        * no workaround for indirect draws, but we can at least skip
7473        * direct draws.
7474        */
7475       if (unlikely(!info->instance_count))
7476          return false;
7477
7478       /* Handle count == 0. */
7479       if (unlikely(!info->count && !info->strmout_buffer))
7480          return false;
7481    }
7482
7483    /* Need to apply this workaround early as it can set flush flags. */
7484    if (cmd_buffer->state.dirty & RADV_CMD_DIRTY_FRAMEBUFFER)
7485       radv_emit_fb_mip_change_flush(cmd_buffer);
7486
7487    /* Use optimal packet order based on whether we need to sync the
7488     * pipeline.
7489     */
7490    if (cmd_buffer->state.flush_bits &
7491        (RADV_CMD_FLAG_FLUSH_AND_INV_CB | RADV_CMD_FLAG_FLUSH_AND_INV_DB |
7492         RADV_CMD_FLAG_PS_PARTIAL_FLUSH | RADV_CMD_FLAG_CS_PARTIAL_FLUSH)) {
7493       /* If we have to wait for idle, set all states first, so that
7494        * all SET packets are processed in parallel with previous draw
7495        * calls. Then upload descriptors, set shader pointers, and
7496        * draw, and prefetch at the end. This ensures that the time
7497        * the CUs are idle is very short. (there are only SET_SH
7498        * packets between the wait and the draw)
7499        */
7500       radv_emit_all_graphics_states(cmd_buffer, info, pipeline_is_dirty);
7501       si_emit_cache_flush(cmd_buffer);
7502       /* <-- CUs are idle here --> */
7503
7504       radv_upload_graphics_shader_descriptors(cmd_buffer, pipeline_is_dirty);
7505    } else {
7506       /* If we don't wait for idle, start prefetches first, then set
7507        * states, and draw at the end.
7508        */
7509       si_emit_cache_flush(cmd_buffer);
7510
7511       if (has_prefetch && cmd_buffer->state.prefetch_L2_mask) {
7512          /* Only prefetch the vertex shader and VBO descriptors
7513           * in order to start the draw as soon as possible.
7514           */
7515          radv_emit_prefetch_L2(cmd_buffer, cmd_buffer->state.graphics_pipeline, true);
7516       }
7517
7518       radv_upload_graphics_shader_descriptors(cmd_buffer, pipeline_is_dirty);
7519
7520       radv_emit_all_graphics_states(cmd_buffer, info, pipeline_is_dirty);
7521    }
7522
7523    radv_describe_draw(cmd_buffer);
7524    if (likely(!info->indirect)) {
7525       struct radv_cmd_state *state = &cmd_buffer->state;
7526       struct radeon_cmdbuf *cs = cmd_buffer->cs;
7527       assert(state->graphics_pipeline->vtx_base_sgpr);
7528       if (state->last_num_instances != info->instance_count) {
7529          radeon_emit(cs, PKT3(PKT3_NUM_INSTANCES, 0, false));
7530          radeon_emit(cs, info->instance_count);
7531          state->last_num_instances = info->instance_count;
7532       }
7533    }
7534    assert(cmd_buffer->cs->cdw <= cdw_max);
7535
7536    return true;
7537 }
7538
7539 ALWAYS_INLINE static bool
7540 radv_before_taskmesh_draw(struct radv_cmd_buffer *cmd_buffer, const struct radv_draw_info *info,
7541                           uint32_t drawCount)
7542 {
7543    struct radv_descriptor_state *descriptors_state =
7544       radv_get_descriptors_state(cmd_buffer, VK_PIPELINE_BIND_POINT_GRAPHICS);
7545    const bool pipeline_is_dirty =
7546       cmd_buffer->state.dirty & RADV_CMD_DIRTY_PIPELINE &&
7547       cmd_buffer->state.graphics_pipeline != cmd_buffer->state.emitted_graphics_pipeline;
7548    const bool push_dirty = descriptors_state->push_dirty;
7549    const uint32_t desc_dirty = descriptors_state->dirty;
7550
7551    const bool gfx_result = radv_before_draw(cmd_buffer, info, drawCount);
7552    struct radv_graphics_pipeline *pipeline = cmd_buffer->state.graphics_pipeline;
7553    struct radv_shader *task_shader = radv_get_shader(&pipeline->base, MESA_SHADER_TASK);
7554
7555    /* If there is no task shader, no need to do anything special. */
7556    if (!task_shader)
7557       return gfx_result;
7558
7559    /* Need to check the count even for indirect draws to work around
7560     * an issue with DISPATCH_TASKMESH_INDIRECT_MULTI_ACE.
7561     */
7562    if (!info->count || !gfx_result)
7563       return false;
7564
7565    const bool need_task_semaphore = radv_flush_gfx2ace_semaphore(cmd_buffer);
7566    struct radv_physical_device *pdevice = cmd_buffer->device->physical_device;
7567    struct radeon_cmdbuf *ace_cs = cmd_buffer->ace_internal.cs;
7568    struct radeon_winsys *ws = cmd_buffer->device->ws;
7569
7570    assert(ace_cs);
7571    ASSERTED const unsigned ace_cdw_max =
7572       radeon_check_space(ws, ace_cs, 4096 + 128 * (drawCount - 1));
7573
7574    if (need_task_semaphore)
7575       radv_wait_gfx2ace_semaphore(cmd_buffer);
7576
7577    if (pipeline_is_dirty) {
7578       radv_pipeline_emit_hw_cs(pdevice, ace_cs, task_shader);
7579       radv_pipeline_emit_compute_state(pdevice, ace_cs, task_shader);
7580    }
7581
7582    radv_ace_internal_cache_flush(cmd_buffer);
7583
7584    /* Restore the dirty state of the descriptors.
7585     * They were marked non-dirty in radv_before_draw,
7586     * but they need to be re-emitted to the ACE cmdbuf now.
7587     */
7588    descriptors_state->push_dirty = push_dirty;
7589    descriptors_state->dirty = desc_dirty;
7590
7591    /* Flush descriptors and push constants for task shaders. */
7592    radv_flush_descriptors(cmd_buffer, VK_SHADER_STAGE_TASK_BIT_NV, &pipeline->base,
7593                           VK_PIPELINE_BIND_POINT_GRAPHICS);
7594    radv_flush_constants(cmd_buffer, VK_SHADER_STAGE_TASK_BIT_NV, &pipeline->base,
7595                         VK_PIPELINE_BIND_POINT_GRAPHICS);
7596
7597    assert(ace_cs->cdw <= ace_cdw_max);
7598    return true;
7599 }
7600
7601 static void
7602 radv_after_draw(struct radv_cmd_buffer *cmd_buffer)
7603 {
7604    const struct radeon_info *rad_info = &cmd_buffer->device->physical_device->rad_info;
7605    bool has_prefetch = cmd_buffer->device->physical_device->rad_info.gfx_level >= GFX7;
7606    /* Start prefetches after the draw has been started. Both will
7607     * run in parallel, but starting the draw first is more
7608     * important.
7609     */
7610    if (has_prefetch && cmd_buffer->state.prefetch_L2_mask) {
7611       radv_emit_prefetch_L2(cmd_buffer, cmd_buffer->state.graphics_pipeline, false);
7612    }
7613
7614    /* Workaround for a VGT hang when streamout is enabled.
7615     * It must be done after drawing.
7616     */
7617    if (radv_is_streamout_enabled(cmd_buffer) &&
7618        (rad_info->family == CHIP_HAWAII || rad_info->family == CHIP_TONGA ||
7619         rad_info->family == CHIP_FIJI)) {
7620       cmd_buffer->state.flush_bits |= RADV_CMD_FLAG_VGT_STREAMOUT_SYNC;
7621    }
7622
7623    radv_cmd_buffer_after_draw(cmd_buffer, RADV_CMD_FLAG_PS_PARTIAL_FLUSH);
7624 }
7625
7626 static struct radv_buffer
7627 radv_nv_mesh_indirect_bo(struct radv_cmd_buffer *cmd_buffer,
7628                          struct radv_buffer *buffer, VkDeviceSize offset,
7629                          uint32_t draw_count, uint32_t stride)
7630 {
7631    /* Translates the indirect BO format used by the NV_mesh_shader API
7632     * to the BO format used by DRAW_INDIRECT / DRAW_INDIRECT_MULTI.
7633     */
7634
7635    struct radeon_cmdbuf *cs = cmd_buffer->cs;
7636    struct radeon_winsys *ws = cmd_buffer->device->ws;
7637
7638    const size_t src_stride = MAX2(stride, sizeof(VkDrawMeshTasksIndirectCommandNV));
7639    const size_t dst_stride = sizeof(VkDrawIndirectCommand);
7640    const size_t src_off_task_count = offsetof(VkDrawMeshTasksIndirectCommandNV, taskCount);
7641    const size_t src_off_first_task = offsetof(VkDrawMeshTasksIndirectCommandNV, firstTask);
7642    const size_t dst_off_vertex_count = offsetof(VkDrawIndirectCommand, vertexCount);
7643    const size_t dst_off_first_vertex = offsetof(VkDrawIndirectCommand, firstVertex);
7644
7645    /* Fill the buffer with all zeroes except instanceCount = 1.
7646     * This helps emit fewer copy packets below.
7647     */
7648    VkDrawIndirectCommand *fill_data = (VkDrawIndirectCommand *) alloca(dst_stride * draw_count);
7649    const VkDrawIndirectCommand filler = { .instanceCount = 1 };
7650    for (unsigned i = 0; i < draw_count; ++i)
7651       fill_data[i] = filler;
7652
7653    /* We'll have to copy data from the API BO. */
7654    uint64_t va = radv_buffer_get_va(buffer->bo) + buffer->offset + offset;
7655    radv_cs_add_buffer(ws, cs, buffer->bo);
7656
7657    /* Allocate some space in the upload BO. */
7658    unsigned out_offset;
7659    radv_cmd_buffer_upload_data(cmd_buffer, dst_stride * draw_count, fill_data, &out_offset);
7660    const uint64_t new_va = radv_buffer_get_va(cmd_buffer->upload.upload_bo) + out_offset;
7661
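        /* Budget: two 6-dword COPY_DATA packets per draw, plus 2 dwords for the PFP_SYNC_ME below. */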
7662    ASSERTED unsigned cdw_max = radeon_check_space(ws, cs, 12 * draw_count + 2);
7663
7664    /* Copy data from the API BO so that the format is suitable for the
7665     * indirect draw packet:
7666     * - vertexCount = taskCount (copied here)
7667     * - instanceCount = 1 (filled by CPU above)
7668     * - firstVertex = firstTask (copied here)
7669     * - firstInstance = 0 (filled by CPU above)
7670     */
7671    for (unsigned i = 0; i < draw_count; ++i) {
7672       const uint64_t src_task_count = va + i * src_stride + src_off_task_count;
7673       const uint64_t src_first_task = va + i * src_stride + src_off_first_task;
7674       const uint64_t dst_vertex_count = new_va + i * dst_stride + dst_off_vertex_count;
7675       const uint64_t dst_first_vertex = new_va + i * dst_stride + dst_off_first_vertex;
7676
7677       radeon_emit(cs, PKT3(PKT3_COPY_DATA, 4, cmd_buffer->state.predicating));
7678       radeon_emit(cs, COPY_DATA_SRC_SEL(COPY_DATA_SRC_MEM) | COPY_DATA_DST_SEL(COPY_DATA_DST_MEM) |
7679                       COPY_DATA_WR_CONFIRM);
7680       radeon_emit(cs, src_task_count);
7681       radeon_emit(cs, src_task_count >> 32);
7682       radeon_emit(cs, dst_vertex_count);
7683       radeon_emit(cs, dst_vertex_count >> 32);
7684
7685       radeon_emit(cs, PKT3(PKT3_COPY_DATA, 4, cmd_buffer->state.predicating));
7686       radeon_emit(cs, COPY_DATA_SRC_SEL(COPY_DATA_SRC_MEM) | COPY_DATA_DST_SEL(COPY_DATA_DST_MEM) |
7687                       COPY_DATA_WR_CONFIRM);
7688       radeon_emit(cs, src_first_task);
7689       radeon_emit(cs, src_first_task >> 32);
7690       radeon_emit(cs, dst_first_vertex);
7691       radeon_emit(cs, dst_first_vertex >> 32);
7692    }
7693
7694    /* Wait for the copies to finish. */
7695    radeon_emit(cs, PKT3(PKT3_PFP_SYNC_ME, 0, 0));
7696    radeon_emit(cs, 0);
7697
7698    /* The draw packet can now use this buffer: */
7699    struct radv_buffer buf = *buffer;
7700    buf.bo = cmd_buffer->upload.upload_bo;
7701    buf.offset = out_offset;
7702
7703    assert(cmd_buffer->cs->cdw <= cdw_max);
7704
7705    return buf;
7706 }
7707
7708 static struct radv_buffer
7709 radv_nv_task_indirect_bo(struct radv_cmd_buffer *cmd_buffer, struct radv_buffer *buffer,
7710                          VkDeviceSize offset, uint32_t draw_count, uint32_t stride)
7711 {
7712    /* Translates the indirect BO format used by the NV_mesh_shader API
7713     * to the BO format used by DISPATCH_TASKMESH_INDIRECT_MULTI_ACE.
7714     */
7715
7716    assert(draw_count);
7717    static_assert(sizeof(VkDispatchIndirectCommand) == 12, "Incorrect size of taskmesh command.");
7718
7719    struct radeon_cmdbuf *cs = cmd_buffer->ace_internal.cs;
7720    struct radeon_winsys *ws = cmd_buffer->device->ws;
7721
7722    const size_t src_stride = MAX2(stride, sizeof(VkDrawMeshTasksIndirectCommandNV));
7723    const size_t dst_stride = sizeof(VkDispatchIndirectCommand);
7724    const size_t src_off_task_count = offsetof(VkDrawMeshTasksIndirectCommandNV, taskCount);
7725    const size_t dst_off_x = offsetof(VkDispatchIndirectCommand, x);
7726
7727    const unsigned new_disp_size = dst_stride * draw_count;
7728
7729    const uint64_t va = radv_buffer_get_va(buffer->bo) + buffer->offset + offset;
7730    radv_cs_add_buffer(ws, cs, buffer->bo);
7731
7732    /* Fill the buffer with X=0, Y=1, Z=1. */
7733    VkDispatchIndirectCommand *fill_data = (VkDispatchIndirectCommand *)alloca(new_disp_size);
7734    for (unsigned i = 0; i < draw_count; ++i) {
7735       fill_data[i].x = 0;
7736       fill_data[i].y = 1;
7737       fill_data[i].z = 1;
7738    }
7739
7740    /* Allocate space in the upload BO. */
7741    unsigned out_offset;
7742    ASSERTED bool uploaded =
7743       radv_cmd_buffer_upload_data(cmd_buffer, new_disp_size, fill_data, &out_offset);
7744    const uint64_t new_va = radv_buffer_get_va(cmd_buffer->upload.upload_bo) + out_offset;
7745    assert(uploaded);
7746
7747    /* Clamp the draw count to fit the actual size of the buffer.
7748     * This avoids potential out-of-bounds copies (e.g. for draws with an indirect count buffer).
7749     * The remaining indirect draws stay filled with X=0, Y=1, Z=1, which is harmless.
7750     */
7751    draw_count = MIN2(draw_count, (buffer->vk.size - buffer->offset - offset) / src_stride);
7752
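        /* Budget: one 6-dword COPY_DATA packet per draw, plus a couple of dwords of margin. */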
7753    ASSERTED unsigned cdw_max = radeon_check_space(ws, cs, 6 * draw_count + 2);
7754
7755    /* Copy taskCount from the NV API BO to the X dispatch size of the compatible BO. */
7756    for (unsigned i = 0; i < draw_count; ++i) {
7757       const uint64_t src_task_count = va + i * src_stride + src_off_task_count;
7758       const uint64_t dst_x = new_va + i * dst_stride + dst_off_x;
7759
7760       radeon_emit(cs, PKT3(PKT3_COPY_DATA, 4, cmd_buffer->state.predicating));
7761       radeon_emit(cs, COPY_DATA_SRC_SEL(COPY_DATA_SRC_MEM) | COPY_DATA_DST_SEL(COPY_DATA_DST_MEM) |
7762                          COPY_DATA_WR_CONFIRM);
7763       radeon_emit(cs, src_task_count);
7764       radeon_emit(cs, src_task_count >> 32);
7765       radeon_emit(cs, dst_x);
7766       radeon_emit(cs, dst_x >> 32);
7767    }
7768
7769    assert(cs->cdw <= cdw_max);
7770
7771    /* The draw packet can now use this buffer: */
7772    struct radv_buffer buf = *buffer;
7773    buf.bo = cmd_buffer->upload.upload_bo;
7774    buf.offset = out_offset;
7775
7776    return buf;
7777 }
7778
7779 VKAPI_ATTR void VKAPI_CALL
7780 radv_CmdDraw(VkCommandBuffer commandBuffer, uint32_t vertexCount, uint32_t instanceCount,
7781              uint32_t firstVertex, uint32_t firstInstance)
7782 {
7783    RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
7784    struct radv_draw_info info;
7785
7786    info.count = vertexCount;
7787    info.instance_count = instanceCount;
7788    info.first_instance = firstInstance;
7789    info.strmout_buffer = NULL;
7790    info.indirect = NULL;
7791    info.indexed = false;
7792
7793    if (!radv_before_draw(cmd_buffer, &info, 1))
7794       return;
7795    const VkMultiDrawInfoEXT minfo = { firstVertex, vertexCount };
7796    radv_emit_direct_draw_packets(cmd_buffer, &info, 1, &minfo, 0, 0);
7797    radv_after_draw(cmd_buffer);
7798 }
7799
7800 VKAPI_ATTR void VKAPI_CALL
7801 radv_CmdDrawMultiEXT(VkCommandBuffer commandBuffer, uint32_t drawCount, const VkMultiDrawInfoEXT *pVertexInfo,
7802                           uint32_t instanceCount, uint32_t firstInstance, uint32_t stride)
7803 {
7804    RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
7805    struct radv_draw_info info;
7806
7807    if (!drawCount)
7808       return;
7809
7810    info.count = pVertexInfo->vertexCount;
7811    info.instance_count = instanceCount;
7812    info.first_instance = firstInstance;
7813    info.strmout_buffer = NULL;
7814    info.indirect = NULL;
7815    info.indexed = false;
7816
7817    if (!radv_before_draw(cmd_buffer, &info, drawCount))
7818       return;
7819    radv_emit_direct_draw_packets(cmd_buffer, &info, drawCount, pVertexInfo, 0, stride);
7820    radv_after_draw(cmd_buffer);
7821 }
7822
7823 VKAPI_ATTR void VKAPI_CALL
7824 radv_CmdDrawIndexed(VkCommandBuffer commandBuffer, uint32_t indexCount, uint32_t instanceCount,
7825                     uint32_t firstIndex, int32_t vertexOffset, uint32_t firstInstance)
7826 {
7827    RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
7828    struct radv_draw_info info;
7829
7830    info.indexed = true;
7831    info.count = indexCount;
7832    info.instance_count = instanceCount;
7833    info.first_instance = firstInstance;
7834    info.strmout_buffer = NULL;
7835    info.indirect = NULL;
7836
7837    if (!radv_before_draw(cmd_buffer, &info, 1))
7838       return;
7839    const VkMultiDrawIndexedInfoEXT minfo = { firstIndex, indexCount, vertexOffset };
7840    radv_emit_draw_packets_indexed(cmd_buffer, &info, 1, &minfo, 0, NULL);
7841    radv_after_draw(cmd_buffer);
7842 }
7843
7844 VKAPI_ATTR void VKAPI_CALL
7845 radv_CmdDrawMultiIndexedEXT(VkCommandBuffer commandBuffer, uint32_t drawCount, const VkMultiDrawIndexedInfoEXT *pIndexInfo,
7846                             uint32_t instanceCount, uint32_t firstInstance, uint32_t stride, const int32_t *pVertexOffset)
7847 {
7848    RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
7849    struct radv_draw_info info;
7850
7851    if (!drawCount)
7852       return;
7853
7854    const VkMultiDrawIndexedInfoEXT *minfo = pIndexInfo;
7855    info.indexed = true;
7856    info.count = minfo->indexCount;
7857    info.instance_count = instanceCount;
7858    info.first_instance = firstInstance;
7859    info.strmout_buffer = NULL;
7860    info.indirect = NULL;
7861
7862    if (!radv_before_draw(cmd_buffer, &info, drawCount))
7863       return;
7864    radv_emit_draw_packets_indexed(cmd_buffer, &info, drawCount, pIndexInfo, stride, pVertexOffset);
7865    radv_after_draw(cmd_buffer);
7866 }
7867
7868 VKAPI_ATTR void VKAPI_CALL
7869 radv_CmdDrawIndirect(VkCommandBuffer commandBuffer, VkBuffer _buffer, VkDeviceSize offset,
7870                      uint32_t drawCount, uint32_t stride)
7871 {
7872    RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
7873    RADV_FROM_HANDLE(radv_buffer, buffer, _buffer);
7874    struct radv_draw_info info;
7875
7876    info.count = drawCount;
7877    info.indirect = buffer;
7878    info.indirect_offset = offset;
7879    info.stride = stride;
7880    info.strmout_buffer = NULL;
7881    info.count_buffer = NULL;
7882    info.indexed = false;
7883    info.instance_count = 0;
7884
7885    if (!radv_before_draw(cmd_buffer, &info, 1))
7886       return;
7887    radv_emit_indirect_draw_packets(cmd_buffer, &info);
7888    radv_after_draw(cmd_buffer);
7889 }
7890
7891 VKAPI_ATTR void VKAPI_CALL
7892 radv_CmdDrawIndexedIndirect(VkCommandBuffer commandBuffer, VkBuffer _buffer, VkDeviceSize offset,
7893                             uint32_t drawCount, uint32_t stride)
7894 {
7895    RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
7896    RADV_FROM_HANDLE(radv_buffer, buffer, _buffer);
7897    struct radv_draw_info info;
7898
7899    info.indexed = true;
7900    info.count = drawCount;
7901    info.indirect = buffer;
7902    info.indirect_offset = offset;
7903    info.stride = stride;
7904    info.count_buffer = NULL;
7905    info.strmout_buffer = NULL;
7906    info.instance_count = 0;
7907
7908    if (!radv_before_draw(cmd_buffer, &info, 1))
7909       return;
7910    radv_emit_indirect_draw_packets(cmd_buffer, &info);
7911    radv_after_draw(cmd_buffer);
7912 }
7913
7914 VKAPI_ATTR void VKAPI_CALL
7915 radv_CmdDrawIndirectCount(VkCommandBuffer commandBuffer, VkBuffer _buffer, VkDeviceSize offset,
7916                           VkBuffer _countBuffer, VkDeviceSize countBufferOffset,
7917                           uint32_t maxDrawCount, uint32_t stride)
7918 {
7919    RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
7920    RADV_FROM_HANDLE(radv_buffer, buffer, _buffer);
7921    RADV_FROM_HANDLE(radv_buffer, count_buffer, _countBuffer);
7922    struct radv_draw_info info;
7923
7924    info.count = maxDrawCount;
7925    info.indirect = buffer;
7926    info.indirect_offset = offset;
7927    info.count_buffer = count_buffer;
7928    info.count_buffer_offset = countBufferOffset;
7929    info.stride = stride;
7930    info.strmout_buffer = NULL;
7931    info.indexed = false;
7932    info.instance_count = 0;
7933
7934    if (!radv_before_draw(cmd_buffer, &info, 1))
7935       return;
7936    radv_emit_indirect_draw_packets(cmd_buffer, &info);
7937    radv_after_draw(cmd_buffer);
7938 }
7939
7940 VKAPI_ATTR void VKAPI_CALL
7941 radv_CmdDrawIndexedIndirectCount(VkCommandBuffer commandBuffer, VkBuffer _buffer,
7942                                  VkDeviceSize offset, VkBuffer _countBuffer,
7943                                  VkDeviceSize countBufferOffset, uint32_t maxDrawCount,
7944                                  uint32_t stride)
7945 {
7946    RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
7947    RADV_FROM_HANDLE(radv_buffer, buffer, _buffer);
7948    RADV_FROM_HANDLE(radv_buffer, count_buffer, _countBuffer);
7949    struct radv_draw_info info;
7950
7951    info.indexed = true;
7952    info.count = maxDrawCount;
7953    info.indirect = buffer;
7954    info.indirect_offset = offset;
7955    info.count_buffer = count_buffer;
7956    info.count_buffer_offset = countBufferOffset;
7957    info.stride = stride;
7958    info.strmout_buffer = NULL;
7959    info.instance_count = 0;
7960
7961    if (!radv_before_draw(cmd_buffer, &info, 1))
7962       return;
7963    radv_emit_indirect_draw_packets(cmd_buffer, &info);
7964    radv_after_draw(cmd_buffer);
7965 }
7966
7967 VKAPI_ATTR void VKAPI_CALL
7968 radv_CmdDrawMeshTasksNV(VkCommandBuffer commandBuffer, uint32_t taskCount, uint32_t firstTask)
7969 {
7970    RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
7971    struct radv_graphics_pipeline *pipeline = cmd_buffer->state.graphics_pipeline;
7972    struct radv_draw_info info;
7973
7974    info.count = taskCount;
7975    info.instance_count = 1;
7976    info.first_instance = 0;
7977    info.stride = 0;
7978    info.indexed = false;
7979    info.strmout_buffer = NULL;
7980    info.count_buffer = NULL;
7981    info.indirect = NULL;
7982
7983    if (!radv_before_taskmesh_draw(cmd_buffer, &info, 1))
7984       return;
7985
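        /* With a task shader, dispatch through the combined task+mesh path (ACE + GFX);
         * otherwise emit a plain mesh-only draw on the GFX queue.
         */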
7986    if (radv_pipeline_has_stage(pipeline, MESA_SHADER_TASK)) {
7987       radv_emit_direct_taskmesh_draw_packets(cmd_buffer, taskCount, 1, 1, firstTask);
7988    } else {
7989       radv_emit_direct_mesh_draw_packet(cmd_buffer, taskCount, 1, 1, firstTask);
7990    }
7991
7992    radv_after_draw(cmd_buffer);
7993 }
7994
7995 VKAPI_ATTR void VKAPI_CALL
7996 radv_CmdDrawMeshTasksIndirectNV(VkCommandBuffer commandBuffer, VkBuffer _buffer,
7997                                 VkDeviceSize offset, uint32_t drawCount, uint32_t stride)
7998 {
7999    RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
8000    RADV_FROM_HANDLE(radv_buffer, buffer, _buffer);
8001
8002    struct radv_graphics_pipeline *pipeline = cmd_buffer->state.graphics_pipeline;
8003    struct radv_draw_info info;
8004
8005    info.indirect = buffer;
8006    info.indirect_offset = offset;
8007    info.stride = stride;
8008    info.count = drawCount;
8009    info.strmout_buffer = NULL;
8010    info.count_buffer = NULL;
8011    info.indexed = false;
8012    info.instance_count = 0;
8013
8014    if (!radv_before_taskmesh_draw(cmd_buffer, &info, drawCount))
8015       return;
8016
8017    /* Indirect draw with mesh shader only:
8018     * Use DRAW_INDIRECT / DRAW_INDIRECT_MULTI like normal indirect draws.
8019     * Needed because DISPATCH_MESH_INDIRECT_MULTI doesn't support firstTask.
8020     *
8021     * Indirect draw with task + mesh shaders:
8022     * Use DISPATCH_TASKMESH_INDIRECT_MULTI_ACE + DISPATCH_TASKMESH_GFX.
8023     * These packets don't support firstTask so we implement that by
8024     * reading the NV command's indirect buffer in the shader.
8025     *
8026     * The indirect BO layout from the NV_mesh_shader API is incompatible
8027     * with AMD HW. To make it work, we allocate some space
8028     * in the upload buffer and copy the data to it.
8029     */
8030
8031    if (radv_pipeline_has_stage(pipeline, MESA_SHADER_TASK)) {
8032       uint64_t nv_ib_va = radv_buffer_get_va(buffer->bo) + buffer->offset + offset;
8033       uint32_t nv_ib_stride = MAX2(stride, sizeof(VkDrawMeshTasksIndirectCommandNV));
8034       struct radv_buffer buf =
8035          radv_nv_task_indirect_bo(cmd_buffer, buffer, offset, drawCount, stride);
8036       info.indirect = &buf;
8037       info.indirect_offset = 0;
8038       info.stride = sizeof(VkDispatchIndirectCommand);
8039
8040       radv_emit_indirect_taskmesh_draw_packets(cmd_buffer, &info, nv_ib_va, nv_ib_stride);
8041    } else {
8042       struct radv_buffer buf =
8043          radv_nv_mesh_indirect_bo(cmd_buffer, buffer, offset, drawCount, stride);
8044       info.indirect = &buf;
8045       info.indirect_offset = 0;
8046       info.stride = sizeof(VkDrawIndirectCommand);
8047
8048       radv_emit_indirect_draw_packets(cmd_buffer, &info);
8049    }
8050
8051    radv_after_draw(cmd_buffer);
8052 }
8053
8054 VKAPI_ATTR void VKAPI_CALL
8055 radv_CmdDrawMeshTasksIndirectCountNV(VkCommandBuffer commandBuffer, VkBuffer _buffer,
8056                                      VkDeviceSize offset, VkBuffer _countBuffer,
8057                                      VkDeviceSize countBufferOffset, uint32_t maxDrawCount,
8058                                      uint32_t stride)
8059 {
8060    RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
8061    RADV_FROM_HANDLE(radv_buffer, buffer, _buffer);
8062    RADV_FROM_HANDLE(radv_buffer, count_buffer, _countBuffer);
8063
8064    struct radv_graphics_pipeline *pipeline = cmd_buffer->state.graphics_pipeline;
8065    struct radv_draw_info info;
8066
8067    info.indirect = buffer;
8068    info.indirect_offset = offset;
8069    info.stride = stride;
8070    info.count = maxDrawCount;
8071    info.strmout_buffer = NULL;
8072    info.count_buffer = count_buffer;
8073    info.count_buffer_offset = countBufferOffset;
8074    info.indexed = false;
8075    info.instance_count = 0;
8076
8077    if (!radv_before_taskmesh_draw(cmd_buffer, &info, maxDrawCount))
8078       return;
8079
8080    if (radv_pipeline_has_stage(pipeline, MESA_SHADER_TASK)) {
8081       uint64_t nv_ib_va = radv_buffer_get_va(buffer->bo) + buffer->offset + offset;
8082       uint32_t nv_ib_stride = MAX2(stride, sizeof(VkDrawMeshTasksIndirectCommandNV));
8083       struct radv_buffer buf =
8084          radv_nv_task_indirect_bo(cmd_buffer, buffer, offset, maxDrawCount, stride);
8085       info.indirect = &buf;
8086       info.indirect_offset = 0;
8087       info.stride = sizeof(VkDispatchIndirectCommand);
8088
8089       radv_emit_indirect_taskmesh_draw_packets(cmd_buffer, &info, nv_ib_va, nv_ib_stride);
8090    } else {
8091       struct radv_buffer buf =
8092          radv_nv_mesh_indirect_bo(cmd_buffer, buffer, offset, maxDrawCount, stride);
8093       info.indirect = &buf;
8094       info.indirect_offset = 0;
8095       info.stride = sizeof(VkDrawIndirectCommand);
8096
8097       radv_emit_indirect_draw_packets(cmd_buffer, &info);
8098    }
8099
8100    radv_after_draw(cmd_buffer);
8101 }
8102
8103 void
8104 radv_CmdExecuteGeneratedCommandsNV(VkCommandBuffer commandBuffer, VkBool32 isPreprocessed,
8105                                    const VkGeneratedCommandsInfoNV *pGeneratedCommandsInfo)
8106 {
8107    VK_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
8108    VK_FROM_HANDLE(radv_indirect_command_layout, layout,
8109                   pGeneratedCommandsInfo->indirectCommandsLayout);
8110    VK_FROM_HANDLE(radv_buffer, prep_buffer, pGeneratedCommandsInfo->preprocessBuffer);
8111
8112    /* The only actions that can be done are draws, so skip on other queues. */
8113    if (cmd_buffer->qf != RADV_QUEUE_GENERAL)
8114       return;
8115
8116    /* Secondary command buffers are needed for the full extension but can't use
8117     * PKT3_INDIRECT_BUFFER_CIK.
8118     */
8119    assert(cmd_buffer->vk.level == VK_COMMAND_BUFFER_LEVEL_PRIMARY);
8120
8121    radv_prepare_dgc(cmd_buffer, pGeneratedCommandsInfo);
8122
8123    struct radv_draw_info info;
8124
8125    info.count = pGeneratedCommandsInfo->sequencesCount;
8126    info.indirect = prep_buffer; /* We're not really going to use it this way, but it's a good
8127                                    signal that this is not a direct draw. */
8128    info.indirect_offset = 0;
8129    info.stride = 0;
8130    info.strmout_buffer = NULL;
8131    info.count_buffer = NULL;
8132    info.indexed = layout->indexed;
8133    info.instance_count = 0;
8134
8135    if (!radv_before_draw(cmd_buffer, &info, 1))
8136       return;
8137
8138    uint32_t cmdbuf_size = radv_get_indirect_cmdbuf_size(pGeneratedCommandsInfo);
8139    uint64_t va = radv_buffer_get_va(prep_buffer->bo) + prep_buffer->offset +
8140                  pGeneratedCommandsInfo->preprocessOffset;
8141    const uint32_t view_mask = cmd_buffer->state.subpass->view_mask;
8142
8143    radeon_emit(cmd_buffer->cs, PKT3(PKT3_PFP_SYNC_ME, 0, cmd_buffer->state.predicating));
8144    radeon_emit(cmd_buffer->cs, 0);
8145
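        /* Execute the prepared command buffer as a separate indirect buffer (IB2). With
         * multiview, replay it once per view with the view index updated in between.
         */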
8146    if (!view_mask) {
8147       radeon_emit(cmd_buffer->cs, PKT3(PKT3_INDIRECT_BUFFER_CIK, 2, 0));
8148       radeon_emit(cmd_buffer->cs, va);
8149       radeon_emit(cmd_buffer->cs, va >> 32);
8150       radeon_emit(cmd_buffer->cs, cmdbuf_size >> 2);
8151    } else {
8152       u_foreach_bit (view, view_mask) {
8153          radv_emit_view_index(cmd_buffer, view);
8154
8155          radeon_emit(cmd_buffer->cs, PKT3(PKT3_INDIRECT_BUFFER_CIK, 2, 0));
8156          radeon_emit(cmd_buffer->cs, va);
8157          radeon_emit(cmd_buffer->cs, va >> 32);
8158          radeon_emit(cmd_buffer->cs, cmdbuf_size >> 2);
8159       }
8160    }
8161
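        /* The generated commands may have changed the index buffer, vertex buffers, dynamic
         * state and draw parameters outside of the driver's tracking, so mark the corresponding
         * state dirty and invalidate the cached draw registers.
         */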
8162    if (layout->binds_index_buffer) {
8163       cmd_buffer->state.last_index_type = -1;
8164       cmd_buffer->state.dirty |= RADV_CMD_DIRTY_INDEX_BUFFER;
8165    }
8166
8167    if (layout->bind_vbo_mask)
8168       cmd_buffer->state.dirty |= RADV_CMD_DIRTY_VERTEX_BUFFER;
8169
8170    if (layout->binds_state)
8171       cmd_buffer->state.dirty |= RADV_CMD_DIRTY_DYNAMIC_FRONT_FACE;
8172
8173    cmd_buffer->push_constant_stages |= ~0;
8174
8175    cmd_buffer->state.last_index_type = -1;
8176    cmd_buffer->state.last_num_instances = -1;
8177    cmd_buffer->state.last_vertex_offset = -1;
8178    cmd_buffer->state.last_first_instance = -1;
8179    cmd_buffer->state.last_drawid = -1;
8180
8181    radv_after_draw(cmd_buffer);
8182 }
8183
8184 struct radv_dispatch_info {
8185    /**
8186     * Determine the layout of the grid (in block units) to be used.
8187     */
8188    uint32_t blocks[3];
8189
8190    /**
8191     * A starting offset for the grid. Even if unaligned is set, the offset
8192     * must still be aligned to the workgroup size.
8193     */
8194    uint32_t offsets[3];
8195    /**
8196     * Whether it's an unaligned compute dispatch.
8197     */
8198    bool unaligned;
8199
8200    /**
8201     * Indirect compute parameters resource.
8202     */
8203    struct radeon_winsys_bo *indirect;
8204    uint64_t va;
8205 };
8206
8207 static void
8208 radv_emit_dispatch_packets(struct radv_cmd_buffer *cmd_buffer,
8209                            struct radv_compute_pipeline *pipeline,
8210                            const struct radv_dispatch_info *info)
8211 {
8212    struct radv_shader *compute_shader = pipeline->base.shaders[MESA_SHADER_COMPUTE];
8213    unsigned dispatch_initiator = cmd_buffer->device->dispatch_initiator;
8214    struct radeon_winsys *ws = cmd_buffer->device->ws;
8215    bool predicating = cmd_buffer->state.predicating;
8216    struct radeon_cmdbuf *cs = cmd_buffer->cs;
8217    struct radv_userdata_info *loc;
8218
8219    radv_describe_dispatch(cmd_buffer, info->blocks[0], info->blocks[1], info->blocks[2]);
8220
8221    loc = radv_lookup_user_sgpr(&pipeline->base, MESA_SHADER_COMPUTE, AC_UD_CS_GRID_SIZE);
8222
8223    ASSERTED unsigned cdw_max = radeon_check_space(ws, cs, 30);
8224
8225    if (compute_shader->info.wave_size == 32) {
8226       assert(cmd_buffer->device->physical_device->rad_info.gfx_level >= GFX10);
8227       dispatch_initiator |= S_00B800_CS_W32_EN(1);
8228    }
8229
8230    if (info->va) {
8231       if (info->indirect)
8232          radv_cs_add_buffer(ws, cs, info->indirect);
8233
8234       if (info->unaligned) {
8235          radeon_set_sh_reg_seq(cs, R_00B81C_COMPUTE_NUM_THREAD_X, 3);
8236          radeon_emit(cs, S_00B81C_NUM_THREAD_FULL(compute_shader->info.cs.block_size[0]));
8237          radeon_emit(cs, S_00B81C_NUM_THREAD_FULL(compute_shader->info.cs.block_size[1]));
8238          radeon_emit(cs, S_00B81C_NUM_THREAD_FULL(compute_shader->info.cs.block_size[2]));
8239
8240          dispatch_initiator |= S_00B800_USE_THREAD_DIMENSIONS(1);
8241       }
8242
8243       if (loc->sgpr_idx != -1) {
8244          unsigned reg = R_00B900_COMPUTE_USER_DATA_0 + loc->sgpr_idx * 4;
8245
8246          if (cmd_buffer->device->load_grid_size_from_user_sgpr) {
8247             assert(cmd_buffer->device->physical_device->rad_info.gfx_level >= GFX10_3);
8248             radeon_emit(cs, PKT3(PKT3_LOAD_SH_REG_INDEX, 3, 0));
8249             radeon_emit(cs, info->va);
8250             radeon_emit(cs, info->va >> 32);
8251             radeon_emit(cs, (reg - SI_SH_REG_OFFSET) >> 2);
8252             radeon_emit(cs, 3);
8253          } else {
8254             radv_emit_shader_pointer(cmd_buffer->device, cmd_buffer->cs, reg, info->va, true);
8255          }
8256       }
8257
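           /* On compute rings (MEC), DISPATCH_INDIRECT takes the VA directly. On the GFX ring,
            * the packet only takes an offset, so program the base address with SET_BASE first
            * and dispatch with offset 0.
            */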
8258       if (radv_cmd_buffer_uses_mec(cmd_buffer)) {
8259          radv_cs_emit_compute_predication(&cmd_buffer->state, cs, cmd_buffer->mec_inv_pred_va,
8260                                           &cmd_buffer->mec_inv_pred_emitted,
8261                                           4 /* DISPATCH_INDIRECT size */);
8262          radeon_emit(cs, PKT3(PKT3_DISPATCH_INDIRECT, 2, 0) | PKT3_SHADER_TYPE_S(1));
8263          radeon_emit(cs, info->va);
8264          radeon_emit(cs, info->va >> 32);
8265          radeon_emit(cs, dispatch_initiator);
8266       } else {
8267          radeon_emit(cs, PKT3(PKT3_SET_BASE, 2, 0) | PKT3_SHADER_TYPE_S(1));
8268          radeon_emit(cs, 1);
8269          radeon_emit(cs, info->va);
8270          radeon_emit(cs, info->va >> 32);
8271
8272          radeon_emit(cs, PKT3(PKT3_DISPATCH_INDIRECT, 1, predicating) | PKT3_SHADER_TYPE_S(1));
8273          radeon_emit(cs, 0);
8274          radeon_emit(cs, dispatch_initiator);
8275       }
8276    } else {
8277       unsigned blocks[3] = {info->blocks[0], info->blocks[1], info->blocks[2]};
8278       unsigned offsets[3] = {info->offsets[0], info->offsets[1], info->offsets[2]};
8279
8280       if (info->unaligned) {
8281          unsigned *cs_block_size = compute_shader->info.cs.block_size;
8282          unsigned remainder[3];
8283
8284          /* If the block count is already aligned, the partial thread group
8285           * should be a full block, not 0.
8286           */
8287          remainder[0] = blocks[0] + cs_block_size[0] - align_u32_npot(blocks[0], cs_block_size[0]);
8288          remainder[1] = blocks[1] + cs_block_size[1] - align_u32_npot(blocks[1], cs_block_size[1]);
8289          remainder[2] = blocks[2] + cs_block_size[2] - align_u32_npot(blocks[2], cs_block_size[2]);
8290
8291          blocks[0] = round_up_u32(blocks[0], cs_block_size[0]);
8292          blocks[1] = round_up_u32(blocks[1], cs_block_size[1]);
8293          blocks[2] = round_up_u32(blocks[2], cs_block_size[2]);
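              /* Worked example (assuming round_up_u32 is a round-up division): 70 threads with
               * a 64-thread block give remainder = 70 + 64 - 128 = 6, i.e. the last thread group
               * runs only 6 threads, and blocks[] becomes the rounded-up group count (2 here).
               */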
8294
8295          for (unsigned i = 0; i < 3; ++i) {
8296             assert(offsets[i] % cs_block_size[i] == 0);
8297             offsets[i] /= cs_block_size[i];
8298          }
8299
8300          radeon_set_sh_reg_seq(cs, R_00B81C_COMPUTE_NUM_THREAD_X, 3);
8301          radeon_emit(cs, S_00B81C_NUM_THREAD_FULL(cs_block_size[0]) |
8302                             S_00B81C_NUM_THREAD_PARTIAL(remainder[0]));
8303          radeon_emit(cs, S_00B81C_NUM_THREAD_FULL(cs_block_size[1]) |
8304                             S_00B81C_NUM_THREAD_PARTIAL(remainder[1]));
8305          radeon_emit(cs, S_00B81C_NUM_THREAD_FULL(cs_block_size[2]) |
8306                             S_00B81C_NUM_THREAD_PARTIAL(remainder[2]));
8307
8308          dispatch_initiator |= S_00B800_PARTIAL_TG_EN(1);
8309       }
8310
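           /* Emit the grid size for the shader: either write the three dwords directly into the
            * user SGPRs, or upload them and pass a pointer to the shader instead.
            */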
8311       if (loc->sgpr_idx != -1) {
8312          if (cmd_buffer->device->load_grid_size_from_user_sgpr) {
8313             assert(loc->num_sgprs == 3);
8314
8315             radeon_set_sh_reg_seq(cs, R_00B900_COMPUTE_USER_DATA_0 + loc->sgpr_idx * 4, 3);
8316             radeon_emit(cs, blocks[0]);
8317             radeon_emit(cs, blocks[1]);
8318             radeon_emit(cs, blocks[2]);
8319          } else {
8320             uint32_t offset;
8321             if (!radv_cmd_buffer_upload_data(cmd_buffer, 12, blocks, &offset))
8322                return;
8323
8324             uint64_t va = radv_buffer_get_va(cmd_buffer->upload.upload_bo) + offset;
8325             radv_emit_shader_pointer(cmd_buffer->device, cmd_buffer->cs,
8326                                      R_00B900_COMPUTE_USER_DATA_0 + loc->sgpr_idx * 4, va, true);
8327          }
8328       }
8329
8330       if (offsets[0] || offsets[1] || offsets[2]) {
8331          radeon_set_sh_reg_seq(cs, R_00B810_COMPUTE_START_X, 3);
8332          radeon_emit(cs, offsets[0]);
8333          radeon_emit(cs, offsets[1]);
8334          radeon_emit(cs, offsets[2]);
8335
8336          /* The blocks in the packet are not counts but end values. */
8337          for (unsigned i = 0; i < 3; ++i)
8338             blocks[i] += offsets[i];
8339       } else {
8340          dispatch_initiator |= S_00B800_FORCE_START_AT_000(1);
8341       }
8342
8343       if (radv_cmd_buffer_uses_mec(cmd_buffer)) {
8344          radv_cs_emit_compute_predication(&cmd_buffer->state, cs, cmd_buffer->mec_inv_pred_va,
8345                                           &cmd_buffer->mec_inv_pred_emitted,
8346                                           5 /* DISPATCH_DIRECT size */);
8347          predicating = false;
8348       }
8349
8350       radeon_emit(cs, PKT3(PKT3_DISPATCH_DIRECT, 3, predicating) | PKT3_SHADER_TYPE_S(1));
8351       radeon_emit(cs, blocks[0]);
8352       radeon_emit(cs, blocks[1]);
8353       radeon_emit(cs, blocks[2]);
8354       radeon_emit(cs, dispatch_initiator);
8355    }
8356
8357    assert(cmd_buffer->cs->cdw <= cdw_max);
8358 }
8359
8360 static void
8361 radv_upload_compute_shader_descriptors(struct radv_cmd_buffer *cmd_buffer,
8362                                        struct radv_compute_pipeline *pipeline,
8363                                        VkPipelineBindPoint bind_point)
8364 {
8365    radv_flush_descriptors(cmd_buffer, VK_SHADER_STAGE_COMPUTE_BIT, &pipeline->base, bind_point);
8366    radv_flush_constants(cmd_buffer,
8367                         bind_point == VK_PIPELINE_BIND_POINT_RAY_TRACING_KHR
8368                            ? RADV_RT_STAGE_BITS
8369                            : VK_SHADER_STAGE_COMPUTE_BIT,
8370                         &pipeline->base, bind_point);
8371 }
8372
8373 static void
8374 radv_dispatch(struct radv_cmd_buffer *cmd_buffer, const struct radv_dispatch_info *info,
8375               struct radv_compute_pipeline *pipeline, VkPipelineBindPoint bind_point)
8376 {
8377    bool has_prefetch = cmd_buffer->device->physical_device->rad_info.gfx_level >= GFX7;
8378    bool pipeline_is_dirty = pipeline != cmd_buffer->state.emitted_compute_pipeline;
8379
8380    if (pipeline->cs_regalloc_hang_bug)
8381       cmd_buffer->state.flush_bits |= RADV_CMD_FLAG_PS_PARTIAL_FLUSH |
8382                                       RADV_CMD_FLAG_CS_PARTIAL_FLUSH;
8383
8384    if (cmd_buffer->state.flush_bits &
8385        (RADV_CMD_FLAG_FLUSH_AND_INV_CB | RADV_CMD_FLAG_FLUSH_AND_INV_DB |
8386         RADV_CMD_FLAG_PS_PARTIAL_FLUSH | RADV_CMD_FLAG_CS_PARTIAL_FLUSH)) {
8387       /* If we have to wait for idle, set all states first, so that
8388        * all SET packets are processed in parallel with previous draw
8389        * calls. Then upload descriptors, set shader pointers, dispatch,
8390        * and prefetch at the end. This ensures that the time the CUs
8391        * are idle is very short (there are only SET_SH packets between
8392        * the wait and the dispatch).
8393        */
8394       radv_emit_compute_pipeline(cmd_buffer, pipeline);
8395       si_emit_cache_flush(cmd_buffer);
8396       /* <-- CUs are idle here --> */
8397
8398       radv_upload_compute_shader_descriptors(cmd_buffer, pipeline, bind_point);
8399
8400       radv_emit_dispatch_packets(cmd_buffer, pipeline, info);
8401       /* <-- CUs are busy here --> */
8402
8403       /* Start prefetches after the dispatch has been started. Both
8404        * will run in parallel, but starting the dispatch first is
8405        * more important.
8406        */
8407       if (has_prefetch && pipeline_is_dirty) {
8408          radv_emit_shader_prefetch(cmd_buffer, pipeline->base.shaders[MESA_SHADER_COMPUTE]);
8409       }
8410    } else {
8411       /* If we don't wait for idle, start prefetches first, then set
8412        * states, and dispatch at the end.
8413        */
8414       si_emit_cache_flush(cmd_buffer);
8415
8416       if (has_prefetch && pipeline_is_dirty) {
8417          radv_emit_shader_prefetch(cmd_buffer, pipeline->base.shaders[MESA_SHADER_COMPUTE]);
8418       }
8419
8420       radv_upload_compute_shader_descriptors(cmd_buffer, pipeline, bind_point);
8421
8422       radv_emit_compute_pipeline(cmd_buffer, pipeline);
8423       radv_emit_dispatch_packets(cmd_buffer, pipeline, info);
8424    }
8425
8426    if (pipeline_is_dirty) {
8427       /* Raytracing uses compute shaders but has separate bind points and pipelines.
8428        * So if we set compute userdata & shader registers, we should dirty the raytracing
8429        * ones and vice versa.
8430        *
8431        * We only need to do this when the pipeline is dirty because when we switch between
8432        * the two we always need to switch pipelines.
8433        */
8434       radv_mark_descriptor_sets_dirty(cmd_buffer, bind_point == VK_PIPELINE_BIND_POINT_COMPUTE
8435                                                      ? VK_PIPELINE_BIND_POINT_RAY_TRACING_KHR
8436                                                      : VK_PIPELINE_BIND_POINT_COMPUTE);
8437    }
8438
8439    if (pipeline->cs_regalloc_hang_bug)
8440       cmd_buffer->state.flush_bits |= RADV_CMD_FLAG_CS_PARTIAL_FLUSH;
8441
8442    radv_cmd_buffer_after_draw(cmd_buffer, RADV_CMD_FLAG_CS_PARTIAL_FLUSH);
8443 }
8444
8445 static void
8446 radv_compute_dispatch(struct radv_cmd_buffer *cmd_buffer, const struct radv_dispatch_info *info)
8447 {
8448    radv_dispatch(cmd_buffer, info, cmd_buffer->state.compute_pipeline,
8449                  VK_PIPELINE_BIND_POINT_COMPUTE);
8450 }
8451
8452 VKAPI_ATTR void VKAPI_CALL
8453 radv_CmdDispatchBase(VkCommandBuffer commandBuffer, uint32_t base_x, uint32_t base_y,
8454                      uint32_t base_z, uint32_t x, uint32_t y, uint32_t z)
8455 {
8456    RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
8457    struct radv_dispatch_info info = {0};
8458
8459    info.blocks[0] = x;
8460    info.blocks[1] = y;
8461    info.blocks[2] = z;
8462
8463    info.offsets[0] = base_x;
8464    info.offsets[1] = base_y;
8465    info.offsets[2] = base_z;
8466    radv_compute_dispatch(cmd_buffer, &info);
8467 }
8468
8469 VKAPI_ATTR void VKAPI_CALL
8470 radv_CmdDispatch(VkCommandBuffer commandBuffer, uint32_t x, uint32_t y, uint32_t z)
8471 {
8472    radv_CmdDispatchBase(commandBuffer, 0, 0, 0, x, y, z);
8473 }
8474
8475 VKAPI_ATTR void VKAPI_CALL
8476 radv_CmdDispatchIndirect(VkCommandBuffer commandBuffer, VkBuffer _buffer, VkDeviceSize offset)
8477 {
8478    RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
8479    RADV_FROM_HANDLE(radv_buffer, buffer, _buffer);
8480    struct radv_dispatch_info info = {0};
8481
8482    info.indirect = buffer->bo;
8483    info.va = radv_buffer_get_va(buffer->bo) + buffer->offset + offset;
8484
8485    radv_compute_dispatch(cmd_buffer, &info);
8486 }
8487
8488 void
8489 radv_unaligned_dispatch(struct radv_cmd_buffer *cmd_buffer, uint32_t x, uint32_t y, uint32_t z)
8490 {
8491    struct radv_dispatch_info info = {0};
8492
8493    info.blocks[0] = x;
8494    info.blocks[1] = y;
8495    info.blocks[2] = z;
8496    info.unaligned = 1;
8497
8498    radv_compute_dispatch(cmd_buffer, &info);
8499 }
8500
8501 void
8502 radv_indirect_dispatch(struct radv_cmd_buffer *cmd_buffer, struct radeon_winsys_bo *bo, uint64_t va)
8503 {
8504    struct radv_dispatch_info info = {0};
8505
8506    info.indirect = bo;
8507    info.va = va;
8508
8509    radv_compute_dispatch(cmd_buffer, &info);
8510 }
8511
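     /* How the ray launch parameters are obtained:
      * - direct: SBT addresses and launch size come from the CPU (vkCmdTraceRaysKHR).
      * - indirect: SBT addresses come from the CPU, the launch size is read from a GPU address
      *   (vkCmdTraceRaysIndirectKHR).
      * - indirect2: both the SBT addresses and the launch size are read from a GPU address
      *   (vkCmdTraceRaysIndirect2KHR).
      */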
8512 enum radv_rt_mode {
8513    radv_rt_mode_direct,
8514    radv_rt_mode_indirect,
8515    radv_rt_mode_indirect2,
8516 };
8517
8518 static void
8519 radv_trace_rays(struct radv_cmd_buffer *cmd_buffer, const VkTraceRaysIndirectCommand2KHR *tables,
8520                 uint64_t indirect_va, enum radv_rt_mode mode)
8521 {
8522    struct radv_compute_pipeline *pipeline = cmd_buffer->state.rt_pipeline;
8523    uint32_t base_reg = pipeline->base.user_data_0[MESA_SHADER_COMPUTE];
8524
8525    struct radv_dispatch_info info = {0};
8526    info.unaligned = true;
8527
8528    uint64_t launch_size_va;
8529    uint64_t sbt_va;
8530
8531    if (mode != radv_rt_mode_indirect2) {
8532       uint32_t upload_size = mode == radv_rt_mode_direct
8533                                 ? sizeof(VkTraceRaysIndirectCommand2KHR)
8534                                 : offsetof(VkTraceRaysIndirectCommand2KHR, width);
8535
8536       uint32_t offset;
8537       if (!radv_cmd_buffer_upload_data(cmd_buffer, upload_size, tables, &offset))
8538          return;
8539
8540       uint64_t upload_va = radv_buffer_get_va(cmd_buffer->upload.upload_bo) + offset;
8541
8542       launch_size_va = (mode == radv_rt_mode_direct)
8543                           ? upload_va + offsetof(VkTraceRaysIndirectCommand2KHR, width)
8544                           : indirect_va;
8545       sbt_va = upload_va;
8546    } else {
8547       launch_size_va = indirect_va + offsetof(VkTraceRaysIndirectCommand2KHR, width);
8548       sbt_va = indirect_va;
8549    }
8550
8551    if (mode == radv_rt_mode_direct) {
8552       info.blocks[0] = tables->width;
8553       info.blocks[1] = tables->height;
8554       info.blocks[2] = tables->depth;
8555    } else
8556       info.va = launch_size_va;
8557
8558    struct radv_userdata_info *desc_loc =
8559       radv_lookup_user_sgpr(&pipeline->base, MESA_SHADER_COMPUTE, AC_UD_CS_SBT_DESCRIPTORS);
8560    if (desc_loc->sgpr_idx != -1) {
8561       radv_emit_shader_pointer(cmd_buffer->device, cmd_buffer->cs,
8562                                base_reg + desc_loc->sgpr_idx * 4, sbt_va, true);
8563    }
8564
8565    struct radv_userdata_info *size_loc =
8566       radv_lookup_user_sgpr(&pipeline->base, MESA_SHADER_COMPUTE, AC_UD_CS_RAY_LAUNCH_SIZE_ADDR);
8567    if (size_loc->sgpr_idx != -1) {
8568       radv_emit_shader_pointer(cmd_buffer->device, cmd_buffer->cs,
8569                                base_reg + size_loc->sgpr_idx * 4, launch_size_va, true);
8570    }
8571
8572    radv_dispatch(cmd_buffer, &info, pipeline, VK_PIPELINE_BIND_POINT_RAY_TRACING_KHR);
8573 }
8574
8575 VKAPI_ATTR void VKAPI_CALL
8576 radv_CmdTraceRaysKHR(VkCommandBuffer commandBuffer,
8577                      const VkStridedDeviceAddressRegionKHR *pRaygenShaderBindingTable,
8578                      const VkStridedDeviceAddressRegionKHR *pMissShaderBindingTable,
8579                      const VkStridedDeviceAddressRegionKHR *pHitShaderBindingTable,
8580                      const VkStridedDeviceAddressRegionKHR *pCallableShaderBindingTable,
8581                      uint32_t width, uint32_t height, uint32_t depth)
8582 {
8583    RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
8584
8585    VkTraceRaysIndirectCommand2KHR tables = {
8586       .raygenShaderRecordAddress = pRaygenShaderBindingTable->deviceAddress,
8587       .raygenShaderRecordSize = pRaygenShaderBindingTable->size,
8588       .missShaderBindingTableAddress = pMissShaderBindingTable->deviceAddress,
8589       .missShaderBindingTableSize = pMissShaderBindingTable->size,
8590       .missShaderBindingTableStride = pMissShaderBindingTable->stride,
8591       .hitShaderBindingTableAddress = pHitShaderBindingTable->deviceAddress,
8592       .hitShaderBindingTableSize = pHitShaderBindingTable->size,
8593       .hitShaderBindingTableStride = pHitShaderBindingTable->stride,
8594       .callableShaderBindingTableAddress = pCallableShaderBindingTable->deviceAddress,
8595       .callableShaderBindingTableSize = pCallableShaderBindingTable->size,
8596       .callableShaderBindingTableStride = pCallableShaderBindingTable->stride,
8597       .width = width,
8598       .height = height,
8599       .depth = depth,
8600    };
8601
8602    radv_trace_rays(cmd_buffer, &tables, 0, radv_rt_mode_direct);
8603 }
8604
8605 VKAPI_ATTR void VKAPI_CALL
8606 radv_CmdTraceRaysIndirectKHR(VkCommandBuffer commandBuffer,
8607                              const VkStridedDeviceAddressRegionKHR *pRaygenShaderBindingTable,
8608                              const VkStridedDeviceAddressRegionKHR *pMissShaderBindingTable,
8609                              const VkStridedDeviceAddressRegionKHR *pHitShaderBindingTable,
8610                              const VkStridedDeviceAddressRegionKHR *pCallableShaderBindingTable,
8611                              VkDeviceAddress indirectDeviceAddress)
8612 {
8613    RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
8614
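        /* The indirect parameters are referenced by device address only, with no VkBuffer to
         * track, so the BO must already be resident via the global BO list.
         */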
8615    assert(cmd_buffer->device->use_global_bo_list);
8616
8617    VkTraceRaysIndirectCommand2KHR tables = {
8618       .raygenShaderRecordAddress = pRaygenShaderBindingTable->deviceAddress,
8619       .raygenShaderRecordSize = pRaygenShaderBindingTable->size,
8620       .missShaderBindingTableAddress = pMissShaderBindingTable->deviceAddress,
8621       .missShaderBindingTableSize = pMissShaderBindingTable->size,
8622       .missShaderBindingTableStride = pMissShaderBindingTable->stride,
8623       .hitShaderBindingTableAddress = pHitShaderBindingTable->deviceAddress,
8624       .hitShaderBindingTableSize = pHitShaderBindingTable->size,
8625       .hitShaderBindingTableStride = pHitShaderBindingTable->stride,
8626       .callableShaderBindingTableAddress = pCallableShaderBindingTable->deviceAddress,
8627       .callableShaderBindingTableSize = pCallableShaderBindingTable->size,
8628       .callableShaderBindingTableStride = pCallableShaderBindingTable->stride,
8629    };
8630
8631    radv_trace_rays(cmd_buffer, &tables, indirectDeviceAddress, radv_rt_mode_indirect);
8632 }
8633
8634 VKAPI_ATTR void VKAPI_CALL
8635 radv_CmdTraceRaysIndirect2KHR(VkCommandBuffer commandBuffer, VkDeviceAddress indirectDeviceAddress)
8636 {
8637    RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
8638
8639    assert(cmd_buffer->device->use_global_bo_list);
8640
8641    radv_trace_rays(cmd_buffer, NULL, indirectDeviceAddress, radv_rt_mode_indirect2);
8642 }
8643
8644 static void
8645 radv_set_rt_stack_size(struct radv_cmd_buffer *cmd_buffer, uint32_t size)
8646 {
8647    unsigned wave_size = 0;
8648    unsigned scratch_bytes_per_wave = 0;
8649
8650    if (cmd_buffer->state.rt_pipeline) {
8651       scratch_bytes_per_wave = cmd_buffer->state.rt_pipeline->base.scratch_bytes_per_wave;
8652       wave_size = cmd_buffer->state.rt_pipeline->base.shaders[MESA_SHADER_COMPUTE]->info.wave_size;
8653    }
8654
8655    /* The hardware register is specified as a multiple of 256 DWORDs (1024 bytes); the stack size is per invocation, so it is multiplied by the wave size. */
8656    scratch_bytes_per_wave += align(size * wave_size, 1024);
8657
8658    cmd_buffer->compute_scratch_size_per_wave_needed =
8659       MAX2(cmd_buffer->compute_scratch_size_per_wave_needed, scratch_bytes_per_wave);
8660 }
8661
8662 VKAPI_ATTR void VKAPI_CALL
8663 radv_CmdSetRayTracingPipelineStackSizeKHR(VkCommandBuffer commandBuffer, uint32_t size)
8664 {
8665    RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
8666
8667    radv_set_rt_stack_size(cmd_buffer, size);
8668    cmd_buffer->state.rt_stack_size = size;
8669 }
8670
8671 VKAPI_ATTR void VKAPI_CALL
8672 radv_CmdEndRenderPass2(VkCommandBuffer commandBuffer, const VkSubpassEndInfo *pSubpassEndInfo)
8673 {
8674    RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
8675
8676    radv_mark_noncoherent_rb(cmd_buffer);
8677
8678    radv_emit_subpass_barrier(cmd_buffer, &cmd_buffer->state.pass->end_barrier);
8679
8680    radv_cmd_buffer_end_subpass(cmd_buffer);
8681
8682    vk_free(&cmd_buffer->pool->vk.alloc, cmd_buffer->state.attachments);
8683    vk_free(&cmd_buffer->pool->vk.alloc, cmd_buffer->state.subpass_sample_locs);
8684
8685    cmd_buffer->state.pass = NULL;
8686    cmd_buffer->state.subpass = NULL;
8687    cmd_buffer->state.attachments = NULL;
8688    cmd_buffer->state.framebuffer = NULL;
8689    cmd_buffer->state.subpass_sample_locs = NULL;
8690 }
8691
8692 VKAPI_ATTR void VKAPI_CALL
8693 radv_CmdBeginRendering(VkCommandBuffer commandBuffer, const VkRenderingInfo *pRenderingInfo)
8694 {
8695    RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
8696    const VkRenderingFragmentShadingRateAttachmentInfoKHR *vrs_info = vk_find_struct_const(
8697       pRenderingInfo->pNext, RENDERING_FRAGMENT_SHADING_RATE_ATTACHMENT_INFO_KHR);
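        /* Dynamic rendering is implemented by building an equivalent single-subpass render pass
         * from the VkRenderingInfo below and reusing the regular render-pass code paths.
         */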
8698    VkResult result;
8699    /* (normal + resolve) for the color and depth/stencil attachments, plus a VRS attachment */
8700    VkAttachmentDescription2 att_desc[MAX_RTS * 2 + 3];
8701    VkAttachmentDescriptionStencilLayout ds_stencil_att, ds_stencil_resolve_att;
8702    VkImageView iviews[MAX_RTS * 2 + 3];
8703    VkAttachmentReference2 color_refs[MAX_RTS], color_resolve_refs[MAX_RTS];
8704    VkAttachmentReference2 ds_ref, ds_resolve_ref, vrs_ref;
8705    VkAttachmentReferenceStencilLayout ds_stencil_ref, ds_stencil_resolve_ref;
8706    VkSubpassDescriptionDepthStencilResolve ds_resolve_info;
8707    VkFragmentShadingRateAttachmentInfoKHR vrs_subpass_info;
8708    VkClearValue clear_values[MAX_RTS * 2 + 3];
8709    unsigned att_count = 0;
8710
8711    VkSubpassDescription2 subpass = {
8712       .sType = VK_STRUCTURE_TYPE_SUBPASS_DESCRIPTION_2,
8713       .pipelineBindPoint = VK_PIPELINE_BIND_POINT_GRAPHICS,
8714       .viewMask = pRenderingInfo->viewMask,
8715       .colorAttachmentCount = pRenderingInfo->colorAttachmentCount,
8716       .pColorAttachments = color_refs,
8717       .pResolveAttachments = color_resolve_refs,
8718    };
8719
8720    for (unsigned i = 0; i < pRenderingInfo->colorAttachmentCount; ++i) {
8721       color_refs[i] = (VkAttachmentReference2){
8722          .sType = VK_STRUCTURE_TYPE_ATTACHMENT_REFERENCE_2,
8723          .attachment = VK_ATTACHMENT_UNUSED,
8724       };
8725       color_resolve_refs[i] = (VkAttachmentReference2){
8726          .sType = VK_STRUCTURE_TYPE_ATTACHMENT_REFERENCE_2,
8727          .attachment = VK_ATTACHMENT_UNUSED,
8728       };
8729
8730       if (pRenderingInfo->pColorAttachments[i].imageView == VK_NULL_HANDLE)
8731          continue;
8732
8733       const VkRenderingAttachmentInfo *info = &pRenderingInfo->pColorAttachments[i];
8734       RADV_FROM_HANDLE(radv_image_view, iview, info->imageView);
8735       color_refs[i] = (VkAttachmentReference2){.sType = VK_STRUCTURE_TYPE_ATTACHMENT_REFERENCE_2,
8736                                                .attachment = att_count,
8737                                                .layout = info->imageLayout,
8738                                                .aspectMask = iview->vk.aspects};
8739
8740       iviews[att_count] = info->imageView;
8741       clear_values[att_count] = info->clearValue;
8742       VkAttachmentDescription2 *att = att_desc + att_count++;
8743
8744       memset(att, 0, sizeof(*att));
8745       att->sType = VK_STRUCTURE_TYPE_ATTACHMENT_DESCRIPTION_2;
8746       att->format = iview->vk.format;
8747       att->samples = iview->image->info.samples;
8748       att->loadOp = info->loadOp;
8749       att->storeOp = info->storeOp;
8750       att->initialLayout = info->imageLayout;
8751       att->finalLayout = info->imageLayout;
8752
8753       if (pRenderingInfo->flags & VK_RENDERING_RESUMING_BIT)
8754          att->loadOp = VK_ATTACHMENT_LOAD_OP_LOAD;
8755
8756       if (pRenderingInfo->flags & VK_RENDERING_SUSPENDING_BIT)
8757          att->storeOp = VK_ATTACHMENT_STORE_OP_STORE;
8758
8759       if (info->resolveMode != VK_RESOLVE_MODE_NONE &&
8760           !(pRenderingInfo->flags & VK_RENDERING_SUSPENDING_BIT)) {
8761          RADV_FROM_HANDLE(radv_image_view, resolve_iview, info->resolveImageView);
8762          color_resolve_refs[i] =
8763             (VkAttachmentReference2){.sType = VK_STRUCTURE_TYPE_ATTACHMENT_REFERENCE_2,
8764                                      .attachment = att_count,
8765                                      .layout = info->resolveImageLayout,
8766                                      .aspectMask = resolve_iview->vk.aspects};
8767
8768          iviews[att_count] = info->resolveImageView;
8769          att = att_desc + att_count++;
8770
8771          memset(att, 0, sizeof(*att));
8772          att->sType = VK_STRUCTURE_TYPE_ATTACHMENT_DESCRIPTION_2;
8773          att->format = resolve_iview->vk.format;
8774          att->samples = resolve_iview->image->info.samples;
8775          att->loadOp = VK_ATTACHMENT_LOAD_OP_DONT_CARE;
8776          att->storeOp = VK_ATTACHMENT_STORE_OP_STORE;
8777          att->initialLayout = info->resolveImageLayout;
8778          att->finalLayout = info->resolveImageLayout;
8779       }
8780    }
8781
8782    if (pRenderingInfo->pDepthAttachment || pRenderingInfo->pStencilAttachment) {
8783       const VkRenderingAttachmentInfo *common_info = pRenderingInfo->pDepthAttachment
8784                                                            ? pRenderingInfo->pDepthAttachment
8785                                                            : pRenderingInfo->pStencilAttachment;
8786       RADV_FROM_HANDLE(radv_image_view, iview, common_info->imageView);
8787
8788       if (common_info->imageView != VK_NULL_HANDLE) {
8789          ds_ref = (VkAttachmentReference2){
8790             .sType = VK_STRUCTURE_TYPE_ATTACHMENT_REFERENCE_2,
8791             .attachment = att_count,
8792             .layout = common_info->imageLayout,
8793             .aspectMask = (pRenderingInfo->pDepthAttachment ? VK_IMAGE_ASPECT_DEPTH_BIT : 0) |
8794                           (pRenderingInfo->pStencilAttachment ? VK_IMAGE_ASPECT_STENCIL_BIT : 0)};
8795          subpass.pDepthStencilAttachment = &ds_ref;
8796
8797          iviews[att_count] = common_info->imageView;
8798          if (pRenderingInfo->pDepthAttachment)
8799             clear_values[att_count].depthStencil.depth =
8800                pRenderingInfo->pDepthAttachment->clearValue.depthStencil.depth;
8801          if (pRenderingInfo->pStencilAttachment)
8802             clear_values[att_count].depthStencil.stencil =
8803                pRenderingInfo->pStencilAttachment->clearValue.depthStencil.stencil;
8804          VkAttachmentDescription2 *att = att_desc + att_count++;
8805
8806          memset(att, 0, sizeof(*att));
8807          att->sType = VK_STRUCTURE_TYPE_ATTACHMENT_DESCRIPTION_2;
8808          att->format = iview->vk.format;
8809          att->samples = iview->image->info.samples;
8810
8811          if (pRenderingInfo->pDepthAttachment) {
8812             att->loadOp = pRenderingInfo->pDepthAttachment->loadOp;
8813             att->storeOp = pRenderingInfo->pDepthAttachment->storeOp;
8814          } else {
8815             att->loadOp = VK_ATTACHMENT_LOAD_OP_LOAD;
8816             att->storeOp = VK_ATTACHMENT_STORE_OP_STORE;
8817          }
8818
8819          if (pRenderingInfo->pStencilAttachment) {
8820             att->stencilLoadOp = pRenderingInfo->pStencilAttachment->loadOp;
8821             att->stencilStoreOp = pRenderingInfo->pStencilAttachment->storeOp;
8822          } else {
8823             att->stencilLoadOp = VK_ATTACHMENT_LOAD_OP_LOAD;
8824             att->stencilStoreOp = VK_ATTACHMENT_STORE_OP_STORE;
8825          }
8826
8827          if (pRenderingInfo->flags & VK_RENDERING_RESUMING_BIT) {
8828             att->loadOp = VK_ATTACHMENT_LOAD_OP_LOAD;
8829             att->stencilLoadOp = VK_ATTACHMENT_LOAD_OP_LOAD;
8830          }
8831
8832          if (pRenderingInfo->flags & VK_RENDERING_SUSPENDING_BIT) {
8833             att->storeOp = VK_ATTACHMENT_STORE_OP_STORE;
8834             att->stencilStoreOp = VK_ATTACHMENT_STORE_OP_STORE;
8835          }
8836
8837          att->initialLayout = common_info->imageLayout;
8838          att->finalLayout = common_info->imageLayout;
8839
8840          if (pRenderingInfo->pDepthAttachment && pRenderingInfo->pStencilAttachment) {
8841             ds_ref.pNext = &ds_stencil_ref;
8842             ds_stencil_ref = (VkAttachmentReferenceStencilLayout){
8843                .sType = VK_STRUCTURE_TYPE_ATTACHMENT_REFERENCE_STENCIL_LAYOUT,
8844                .stencilLayout = pRenderingInfo->pStencilAttachment->imageLayout};
8845
8846             att->pNext = &ds_stencil_att;
8847             ds_stencil_att = (VkAttachmentDescriptionStencilLayout){
8848                .sType = VK_STRUCTURE_TYPE_ATTACHMENT_DESCRIPTION_STENCIL_LAYOUT,
8849                .stencilInitialLayout = pRenderingInfo->pStencilAttachment->imageLayout,
8850                .stencilFinalLayout = pRenderingInfo->pStencilAttachment->imageLayout,
8851             };
8852          }
8853
8854          if (((pRenderingInfo->pDepthAttachment &&
8855               pRenderingInfo->pDepthAttachment->resolveMode != VK_RESOLVE_MODE_NONE) ||
8856              (pRenderingInfo->pStencilAttachment &&
8857               pRenderingInfo->pStencilAttachment->resolveMode != VK_RESOLVE_MODE_NONE)) &&
8858              !(pRenderingInfo->flags & VK_RENDERING_SUSPENDING_BIT)) {
8859             RADV_FROM_HANDLE(radv_image_view, resolve_iview, common_info->resolveImageView);
8860             ds_resolve_ref =
8861                (VkAttachmentReference2){.sType = VK_STRUCTURE_TYPE_ATTACHMENT_REFERENCE_2,
8862                                         .attachment = att_count,
8863                                         .layout = common_info->resolveImageLayout,
8864                                         .aspectMask = resolve_iview->vk.aspects};
8865
8866             iviews[att_count] = common_info->resolveImageView;
8867             att = att_desc + att_count++;
8868
8869             memset(att, 0, sizeof(*att));
8870             att->sType = VK_STRUCTURE_TYPE_ATTACHMENT_DESCRIPTION_2;
8871             att->format = resolve_iview->vk.format;
8872             att->samples = resolve_iview->image->info.samples;
8873             att->loadOp = VK_ATTACHMENT_LOAD_OP_DONT_CARE;
8874             att->storeOp = VK_ATTACHMENT_STORE_OP_STORE;
8875             att->initialLayout = common_info->resolveImageLayout;
8876             att->finalLayout = common_info->resolveImageLayout;
8877
8878             ds_resolve_info = (VkSubpassDescriptionDepthStencilResolve){
8879                .sType = VK_STRUCTURE_TYPE_SUBPASS_DESCRIPTION_DEPTH_STENCIL_RESOLVE,
8880                .pNext = subpass.pNext,
8881                .depthResolveMode =
8882                   (pRenderingInfo->pDepthAttachment &&
8883                    pRenderingInfo->pDepthAttachment->resolveMode != VK_RESOLVE_MODE_NONE)
8884                      ? pRenderingInfo->pDepthAttachment->resolveMode
8885                      : VK_RESOLVE_MODE_NONE,
8886                .stencilResolveMode =
8887                   (pRenderingInfo->pStencilAttachment &&
8888                    pRenderingInfo->pStencilAttachment->resolveMode != VK_RESOLVE_MODE_NONE)
8889                      ? pRenderingInfo->pStencilAttachment->resolveMode
8890                      : VK_RESOLVE_MODE_NONE,
8891                .pDepthStencilResolveAttachment = &ds_resolve_ref};
8892             subpass.pNext = &ds_resolve_info;
8893
8894             if (pRenderingInfo->pDepthAttachment && pRenderingInfo->pStencilAttachment &&
8895                 pRenderingInfo->pDepthAttachment->resolveMode != VK_RESOLVE_MODE_NONE &&
8896                 pRenderingInfo->pStencilAttachment->resolveMode != VK_RESOLVE_MODE_NONE) {
8897                ds_resolve_ref.pNext = &ds_stencil_resolve_ref;
8898                ds_stencil_resolve_ref = (VkAttachmentReferenceStencilLayout){
8899                   .sType = VK_STRUCTURE_TYPE_ATTACHMENT_REFERENCE_STENCIL_LAYOUT,
8900                   .stencilLayout = pRenderingInfo->pStencilAttachment->resolveImageLayout};
8901
8902                att->pNext = &ds_stencil_resolve_att;
8903                ds_stencil_resolve_att = (VkAttachmentDescriptionStencilLayout){
8904                   .sType = VK_STRUCTURE_TYPE_ATTACHMENT_DESCRIPTION_STENCIL_LAYOUT,
8905                   .stencilInitialLayout = pRenderingInfo->pStencilAttachment->resolveImageLayout,
8906                   .stencilFinalLayout = pRenderingInfo->pStencilAttachment->resolveImageLayout,
8907                };
8908             }
8909          }
8910       }
8911    }
8912
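   /* If a fragment shading rate attachment is provided, add it as an extra
    * attachment of the temporary render pass.
    */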
8913    if (vrs_info && vrs_info->imageView) {
8914       RADV_FROM_HANDLE(radv_image_view, iview, vrs_info->imageView);
8915       vrs_ref = (VkAttachmentReference2){.sType = VK_STRUCTURE_TYPE_ATTACHMENT_REFERENCE_2,
8916                                          .attachment = att_count,
8917                                          .layout = vrs_info->imageLayout,
8918                                          .aspectMask = iview->vk.aspects};
8919
8920       iviews[att_count] = vrs_info->imageView;
8921       VkAttachmentDescription2 *att = att_desc + att_count++;
8922
8923       memset(att, 0, sizeof(*att));
8924       att->sType = VK_STRUCTURE_TYPE_ATTACHMENT_DESCRIPTION_2;
8925       att->format = iview->vk.format;
8926       att->samples = iview->image->info.samples;
8927       att->loadOp = VK_ATTACHMENT_LOAD_OP_LOAD;
8928       att->storeOp = VK_ATTACHMENT_STORE_OP_DONT_CARE;
8929       att->initialLayout = vrs_info->imageLayout;
8930       att->finalLayout = vrs_info->imageLayout;
8931
8932       vrs_subpass_info = (VkFragmentShadingRateAttachmentInfoKHR){
8933          .sType = VK_STRUCTURE_TYPE_FRAGMENT_SHADING_RATE_ATTACHMENT_INFO_KHR,
8934          .pNext = subpass.pNext,
8935          .pFragmentShadingRateAttachment = &vrs_ref,
8936          .shadingRateAttachmentTexelSize = vrs_info->shadingRateAttachmentTexelSize,
8937       };
8938       subpass.pNext = &vrs_subpass_info;
8939    }
8940
8941    VkRenderPassCreateInfo2 rp_create_info = {
8942       .sType = VK_STRUCTURE_TYPE_RENDER_PASS_CREATE_INFO_2,
8943       .attachmentCount = att_count,
8944       .pAttachments = att_desc,
8945       .subpassCount = 1,
8946       .pSubpasses = &subpass,
8947    };
8948
8949    VkRenderPass rp;
8950    result =
8951       radv_CreateRenderPass2(radv_device_to_handle(cmd_buffer->device), &rp_create_info, NULL, &rp);
8952    if (result != VK_SUCCESS) {
8953       vk_command_buffer_set_error(&cmd_buffer->vk, result);
8954       return;
8955    }
8956
8957    unsigned w = pRenderingInfo->renderArea.offset.x + pRenderingInfo->renderArea.extent.width;
8958    unsigned h = pRenderingInfo->renderArea.offset.y + pRenderingInfo->renderArea.extent.height;
8959    for (unsigned i = 0; i < att_count; ++i) {
8960       RADV_FROM_HANDLE(radv_image_view, iview, iviews[i]);
8961
8962       if (vrs_info && vrs_info->imageView == iviews[i])
8963          continue;
8964
8965       w = MIN2(w, iview->extent.width);
8966       h = MIN2(h, iview->extent.height);
8967    }
8968    VkFramebufferCreateInfo fb_create_info = {
8969       .sType = VK_STRUCTURE_TYPE_FRAMEBUFFER_CREATE_INFO,
8970       .renderPass = rp,
8971       .attachmentCount = att_count,
8972       .pAttachments = iviews,
8973       .width = w,
8974       .height = h,
8975       .layers = pRenderingInfo->layerCount,
8976    };
8977
8978    VkFramebuffer fb;
8979    result =
8980       vk_common_CreateFramebuffer(radv_device_to_handle(cmd_buffer->device), &fb_create_info, NULL, &fb);
8981    if (result != VK_SUCCESS) {
8982       radv_DestroyRenderPass(radv_device_to_handle(cmd_buffer->device), rp, NULL);
8983       vk_command_buffer_set_error(&cmd_buffer->vk, result);
8984       return;
8985    }
8986
8987    VkRenderPassBeginInfo begin_info = {.sType = VK_STRUCTURE_TYPE_RENDER_PASS_BEGIN_INFO,
8988                                        .renderPass = rp,
8989                                        .framebuffer = fb,
8990                                        .renderArea = pRenderingInfo->renderArea,
8991                                        .clearValueCount = att_count,
8992                                        .pClearValues = clear_values};
8993
8994    const VkSubpassBeginInfo pass_begin_info = {
8995       .sType = VK_STRUCTURE_TYPE_SUBPASS_BEGIN_INFO,
8996       .contents = (pRenderingInfo->flags & VK_RENDERING_CONTENTS_SECONDARY_COMMAND_BUFFERS_BIT)
8997                      ? VK_SUBPASS_CONTENTS_SECONDARY_COMMAND_BUFFERS
8998                      : VK_SUBPASS_CONTENTS_INLINE,
8999    };
9000
9001    radv_CmdBeginRenderPass2(commandBuffer, &begin_info, &pass_begin_info);
9002 }
9003
9004 VKAPI_ATTR void VKAPI_CALL
9005 radv_CmdEndRendering(VkCommandBuffer commandBuffer)
9006 {
9007    RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
9008    struct radv_render_pass *pass = cmd_buffer->state.pass;
9009    struct vk_framebuffer *framebuffer = cmd_buffer->state.framebuffer;
9010
9011    radv_CmdEndRenderPass2(commandBuffer, NULL);
9012
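   /* Destroy the temporary render pass and framebuffer that were created by
    * radv_CmdBeginRendering for dynamic rendering.
    */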
9013    vk_common_DestroyFramebuffer(radv_device_to_handle(cmd_buffer->device),
9014                                 vk_framebuffer_to_handle(framebuffer), NULL);
9015    radv_DestroyRenderPass(radv_device_to_handle(cmd_buffer->device),
9016                           radv_render_pass_to_handle(pass), NULL);
9017 }
9018
9019 /*
9020  * For HTILE we have the following interesting clear words:
9021  *   0xfffff30f: Uncompressed, full depth range, for depth+stencil HTILE
9022  *   0xfffc000f: Uncompressed, full depth range, for depth only HTILE.
9023  *   0xfffffff0: Clear depth to 1.0
9024  *   0x00000000: Clear depth to 0.0
9025  */
9026 static void
9027 radv_initialize_htile(struct radv_cmd_buffer *cmd_buffer, struct radv_image *image,
9028                       const VkImageSubresourceRange *range)
9029 {
9030    struct radv_cmd_state *state = &cmd_buffer->state;
9031    uint32_t htile_value = radv_get_htile_initial_value(cmd_buffer->device, image);
9032    VkClearDepthStencilValue value = {0};
9033    struct radv_barrier_data barrier = {0};
9034
9035    barrier.layout_transitions.init_mask_ram = 1;
9036    radv_describe_layout_transition(cmd_buffer, &barrier);
9037
9038    /* When transitioning from the LAYOUT_UNDEFINED layout, not everyone is
9039     * consistent in considering previous rendering work for WAW hazards. */
9040    state->flush_bits |=
9041       radv_src_access_flush(cmd_buffer, VK_ACCESS_2_DEPTH_STENCIL_ATTACHMENT_WRITE_BIT, image);
9042
9043    if (image->planes[0].surface.has_stencil &&
9044        !(range->aspectMask == (VK_IMAGE_ASPECT_DEPTH_BIT | VK_IMAGE_ASPECT_STENCIL_BIT))) {
9045       /* Flush caches before performing a separate aspect initialization because it's a
9046        * read-modify-write operation.
9047        */
9048       state->flush_bits |= radv_dst_access_flush(cmd_buffer, VK_ACCESS_2_SHADER_READ_BIT, image);
9049    }
9050
9051    state->flush_bits |= radv_clear_htile(cmd_buffer, image, range, htile_value);
9052
9053    radv_set_ds_clear_metadata(cmd_buffer, image, range, value, range->aspectMask);
9054
9055    if (radv_image_is_tc_compat_htile(image) && (range->aspectMask & VK_IMAGE_ASPECT_DEPTH_BIT)) {
9056       /* Initialize the TC-compat metadata value to 0 because by
9057        * default DB_Z_INFO.RANGE_PRECISION is set to 1, and we only
9058        * have to conditionally update its value when performing
9059        * a fast depth clear.
9060        */
9061       radv_set_tc_compat_zrange_metadata(cmd_buffer, image, range, 0);
9062    }
9063 }
9064
9065 static void
9066 radv_handle_depth_image_transition(struct radv_cmd_buffer *cmd_buffer, struct radv_image *image,
9067                                    VkImageLayout src_layout, VkImageLayout dst_layout,
9068                                    unsigned src_queue_mask, unsigned dst_queue_mask,
9069                                    const VkImageSubresourceRange *range,
9070                                    struct radv_sample_locations_state *sample_locs)
9071 {
9072    struct radv_device *device = cmd_buffer->device;
9073
9074    if (!radv_htile_enabled(image, range->baseMipLevel))
9075       return;
9076
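   /* Initialize HTILE when the source layout is UNDEFINED or when the image
    * becomes HTILE-compressed; when it leaves a compressed layout, flush DB,
    * expand (decompress) the depth/stencil surface and flush DB again.
    */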
9077    if (src_layout == VK_IMAGE_LAYOUT_UNDEFINED) {
9078       radv_initialize_htile(cmd_buffer, image, range);
9079    } else if (!radv_layout_is_htile_compressed(device, image, src_layout, src_queue_mask) &&
9080               radv_layout_is_htile_compressed(device, image, dst_layout, dst_queue_mask)) {
9081       radv_initialize_htile(cmd_buffer, image, range);
9082    } else if (radv_layout_is_htile_compressed(device, image, src_layout, src_queue_mask) &&
9083               !radv_layout_is_htile_compressed(device, image, dst_layout, dst_queue_mask)) {
9084       cmd_buffer->state.flush_bits |=
9085          RADV_CMD_FLAG_FLUSH_AND_INV_DB | RADV_CMD_FLAG_FLUSH_AND_INV_DB_META;
9086
9087       radv_expand_depth_stencil(cmd_buffer, image, range, sample_locs);
9088
9089       cmd_buffer->state.flush_bits |=
9090          RADV_CMD_FLAG_FLUSH_AND_INV_DB | RADV_CMD_FLAG_FLUSH_AND_INV_DB_META;
9091    }
9092 }
9093
9094 static uint32_t
9095 radv_init_cmask(struct radv_cmd_buffer *cmd_buffer, struct radv_image *image,
9096                 const VkImageSubresourceRange *range, uint32_t value)
9097 {
9098    struct radv_barrier_data barrier = {0};
9099
9100    barrier.layout_transitions.init_mask_ram = 1;
9101    radv_describe_layout_transition(cmd_buffer, &barrier);
9102
9103    return radv_clear_cmask(cmd_buffer, image, range, value);
9104 }
9105
9106 uint32_t
9107 radv_init_fmask(struct radv_cmd_buffer *cmd_buffer, struct radv_image *image,
9108                 const VkImageSubresourceRange *range)
9109 {
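   /* Default FMASK values indexed by log2(samples); these appear to encode the
    * identity sample-to-fragment mapping, i.e. a fully expanded FMASK.
    */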
9110    static const uint32_t fmask_clear_values[4] = {0x00000000, 0x02020202, 0xE4E4E4E4, 0x76543210};
9111    uint32_t log2_samples = util_logbase2(image->info.samples);
9112    uint32_t value = fmask_clear_values[log2_samples];
9113    struct radv_barrier_data barrier = {0};
9114
9115    barrier.layout_transitions.init_mask_ram = 1;
9116    radv_describe_layout_transition(cmd_buffer, &barrier);
9117
9118    return radv_clear_fmask(cmd_buffer, image, range, value);
9119 }
9120
9121 uint32_t
9122 radv_init_dcc(struct radv_cmd_buffer *cmd_buffer, struct radv_image *image,
9123               const VkImageSubresourceRange *range, uint32_t value)
9124 {
9125    struct radv_barrier_data barrier = {0};
9126    uint32_t flush_bits = 0;
9127    unsigned size = 0;
9128
9129    barrier.layout_transitions.init_mask_ram = 1;
9130    radv_describe_layout_transition(cmd_buffer, &barrier);
9131
9132    flush_bits |= radv_clear_dcc(cmd_buffer, image, range, value);
9133
9134    if (cmd_buffer->device->physical_device->rad_info.gfx_level == GFX8) {
9135       /* When DCC is enabled with mipmaps, some levels might not
9136        * support fast clears and we have to initialize them as "fully
9137        * expanded".
9138        */
9139       /* Compute the size of all fast clearable DCC levels. */
9140       for (unsigned i = 0; i < image->planes[0].surface.num_meta_levels; i++) {
9141          struct legacy_surf_dcc_level *dcc_level = &image->planes[0].surface.u.legacy.color.dcc_level[i];
9142          unsigned dcc_fast_clear_size =
9143             dcc_level->dcc_slice_fast_clear_size * image->info.array_size;
9144
9145          if (!dcc_fast_clear_size)
9146             break;
9147
9148          size = dcc_level->dcc_offset + dcc_fast_clear_size;
9149       }
9150
9151       /* Initialize the mipmap levels without DCC. */
9152       if (size != image->planes[0].surface.meta_size) {
9153          flush_bits |= radv_fill_buffer(cmd_buffer, image, image->bindings[0].bo,
9154                                         radv_buffer_get_va(image->bindings[0].bo) +
9155                                            image->bindings[0].offset +
9156                                            image->planes[0].surface.meta_offset + size,
9157                                         image->planes[0].surface.meta_size - size, 0xffffffff);
9158       }
9159    }
9160
9161    return flush_bits;
9162 }
9163
9164 /**
9165  * Initialize DCC/FMASK/CMASK metadata for a color image.
9166  */
9167 static void
9168 radv_init_color_image_metadata(struct radv_cmd_buffer *cmd_buffer, struct radv_image *image,
9169                                VkImageLayout src_layout, VkImageLayout dst_layout,
9170                                unsigned src_queue_mask, unsigned dst_queue_mask,
9171                                const VkImageSubresourceRange *range)
9172 {
9173    uint32_t flush_bits = 0;
9174
9175    /* When transitioning from the LAYOUT_UNDEFINED layout, not everyone is
9176     * consistent in considering previous rendering work for WAW hazards.
9177     */
9178    cmd_buffer->state.flush_bits |=
9179       radv_src_access_flush(cmd_buffer, VK_ACCESS_2_COLOR_ATTACHMENT_WRITE_BIT, image);
9180
9181    if (radv_image_has_cmask(image)) {
9182       uint32_t value;
9183
9184       if (cmd_buffer->device->physical_device->rad_info.gfx_level == GFX9) {
9185          /* TODO: Fix clearing CMASK layers on GFX9. */
9186          if (radv_image_is_tc_compat_cmask(image) ||
9187              (radv_image_has_fmask(image) &&
9188               radv_layout_can_fast_clear(cmd_buffer->device, image, range->baseMipLevel, dst_layout,
9189                                          dst_queue_mask))) {
9190             value = 0xccccccccu;
9191          } else {
9192             value = 0xffffffffu;
9193          }
9194       } else {
9195          static const uint32_t cmask_clear_values[4] = {0xffffffff, 0xdddddddd, 0xeeeeeeee, 0xffffffff};
9196          uint32_t log2_samples = util_logbase2(image->info.samples);
9197
9198          value = cmask_clear_values[log2_samples];
9199       }
9200
9201       flush_bits |= radv_init_cmask(cmd_buffer, image, range, value);
9202    }
9203
9204    if (radv_image_has_fmask(image)) {
9205       flush_bits |= radv_init_fmask(cmd_buffer, image, range);
9206    }
9207
9208    if (radv_dcc_enabled(image, range->baseMipLevel)) {
9209       uint32_t value = 0xffffffffu; /* Fully expanded mode. */
9210
9211       if (radv_layout_dcc_compressed(cmd_buffer->device, image, range->baseMipLevel,
9212                                      dst_layout, dst_queue_mask)) {
9213          value = 0u;
9214       }
9215
9216       flush_bits |= radv_init_dcc(cmd_buffer, image, range, value);
9217    }
9218
9219    if (radv_image_has_cmask(image) || radv_dcc_enabled(image, range->baseMipLevel)) {
9220       radv_update_fce_metadata(cmd_buffer, image, range, false);
9221
9222       uint32_t color_values[2] = {0};
9223       radv_set_color_clear_metadata(cmd_buffer, image, range, color_values);
9224    }
9225
9226    cmd_buffer->state.flush_bits |= flush_bits;
9227 }
9228
9229 static void
9230 radv_retile_transition(struct radv_cmd_buffer *cmd_buffer, struct radv_image *image,
9231                        VkImageLayout src_layout, VkImageLayout dst_layout, unsigned dst_queue_mask)
9232 {
9233    /* If the image is read-only, we don't have to retile DCC because it can't change. */
9234    if (!(image->vk.usage & RADV_IMAGE_USAGE_WRITE_BITS))
9235       return;
9236
9237    if (src_layout != VK_IMAGE_LAYOUT_PRESENT_SRC_KHR &&
9238        (dst_layout == VK_IMAGE_LAYOUT_PRESENT_SRC_KHR ||
9239         (dst_queue_mask & (1u << RADV_QUEUE_FOREIGN))))
9240       radv_retile_dcc(cmd_buffer, image);
9241 }
9242
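/* DCC retiling is only needed when the image has a displayable DCC surface that
 * is separate from the main DCC metadata.
 */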
9243 static bool
9244 radv_image_need_retile(const struct radv_image *image)
9245 {
9246    return image->planes[0].surface.display_dcc_offset &&
9247           image->planes[0].surface.display_dcc_offset != image->planes[0].surface.meta_offset;
9248 }
9249
9250 /**
9251  * Handle color image transitions for DCC/FMASK/CMASK.
9252  */
9253 static void
9254 radv_handle_color_image_transition(struct radv_cmd_buffer *cmd_buffer, struct radv_image *image,
9255                                    VkImageLayout src_layout, VkImageLayout dst_layout,
9256                                    unsigned src_queue_mask, unsigned dst_queue_mask,
9257                                    const VkImageSubresourceRange *range)
9258 {
9259    bool dcc_decompressed = false, fast_clear_flushed = false;
9260
9261    if (!radv_image_has_cmask(image) && !radv_image_has_fmask(image) &&
9262        !radv_dcc_enabled(image, range->baseMipLevel))
9263       return;
9264
9265    if (src_layout == VK_IMAGE_LAYOUT_UNDEFINED) {
9266       radv_init_color_image_metadata(cmd_buffer, image, src_layout, dst_layout,
9267                                      src_queue_mask, dst_queue_mask, range);
9268
9269       if (radv_image_need_retile(image))
9270          radv_retile_transition(cmd_buffer, image, src_layout, dst_layout, dst_queue_mask);
9271       return;
9272    }
9273
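   /* For DCC images: reinitialize DCC from PREINITIALIZED, decompress DCC when
    * leaving a DCC-compressed layout, or eliminate fast clears when leaving a
    * fast-clearable layout. CMASK/FMASK-only images just need the fast clear
    * eliminate.
    */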
9274    if (radv_dcc_enabled(image, range->baseMipLevel)) {
9275       if (src_layout == VK_IMAGE_LAYOUT_PREINITIALIZED) {
9276          cmd_buffer->state.flush_bits |= radv_init_dcc(cmd_buffer, image, range, 0xffffffffu);
9277       } else if (radv_layout_dcc_compressed(cmd_buffer->device, image, range->baseMipLevel,
9278                                             src_layout, src_queue_mask) &&
9279                  !radv_layout_dcc_compressed(cmd_buffer->device, image, range->baseMipLevel,
9280                                              dst_layout, dst_queue_mask)) {
9281          radv_decompress_dcc(cmd_buffer, image, range);
9282          dcc_decompressed = true;
9283       } else if (radv_layout_can_fast_clear(cmd_buffer->device, image, range->baseMipLevel,
9284                                             src_layout, src_queue_mask) &&
9285                  !radv_layout_can_fast_clear(cmd_buffer->device, image, range->baseMipLevel,
9286                                              dst_layout, dst_queue_mask)) {
9287          radv_fast_clear_flush_image_inplace(cmd_buffer, image, range);
9288          fast_clear_flushed = true;
9289       }
9290
9291       if (radv_image_need_retile(image))
9292          radv_retile_transition(cmd_buffer, image, src_layout, dst_layout, dst_queue_mask);
9293    } else if (radv_image_has_cmask(image) || radv_image_has_fmask(image)) {
9294       if (radv_layout_can_fast_clear(cmd_buffer->device, image, range->baseMipLevel,
9295                                      src_layout, src_queue_mask) &&
9296           !radv_layout_can_fast_clear(cmd_buffer->device, image, range->baseMipLevel,
9297                                       dst_layout, dst_queue_mask)) {
9298          radv_fast_clear_flush_image_inplace(cmd_buffer, image, range);
9299          fast_clear_flushed = true;
9300       }
9301    }
9302
9303    /* MSAA color decompress. */
9304    if (radv_image_has_fmask(image) &&
9305        (image->vk.usage & (VK_IMAGE_USAGE_STORAGE_BIT | VK_IMAGE_USAGE_TRANSFER_DST_BIT)) &&
9306        radv_layout_fmask_compressed(cmd_buffer->device, image, src_layout, src_queue_mask) &&
9307        !radv_layout_fmask_compressed(cmd_buffer->device, image, dst_layout, dst_queue_mask)) {
9308       if (radv_dcc_enabled(image, range->baseMipLevel) &&
9309           !radv_image_use_dcc_image_stores(cmd_buffer->device, image) && !dcc_decompressed) {
9310          /* A DCC decompress is required before expanding FMASK
9311           * when DCC stores aren't supported to avoid being in
9312           * a state where DCC is compressed and the main
9313           * surface is uncompressed.
9314           */
9315          radv_decompress_dcc(cmd_buffer, image, range);
9316       } else if (!fast_clear_flushed) {
9317          /* A FMASK decompress is required before expanding
9318           * FMASK.
9319           */
9320          radv_fast_clear_flush_image_inplace(cmd_buffer, image, range);
9321       }
9322
9323       struct radv_barrier_data barrier = {0};
9324       barrier.layout_transitions.fmask_color_expand = 1;
9325       radv_describe_layout_transition(cmd_buffer, &barrier);
9326
9327       radv_expand_fmask_image_inplace(cmd_buffer, image, range);
9328    }
9329 }
9330
9331 static void
9332 radv_handle_image_transition(struct radv_cmd_buffer *cmd_buffer, struct radv_image *image,
9333                              VkImageLayout src_layout, VkImageLayout dst_layout,
9334                              uint32_t src_family_index, uint32_t dst_family_index,
9335                              const VkImageSubresourceRange *range,
9336                              struct radv_sample_locations_state *sample_locs)
9337 {
9338    enum radv_queue_family src_qf = vk_queue_to_radv(cmd_buffer->device->physical_device, src_family_index);
9339    enum radv_queue_family dst_qf = vk_queue_to_radv(cmd_buffer->device->physical_device, dst_family_index);
9340    if (image->exclusive && src_family_index != dst_family_index) {
9341       /* This is an acquire or a release operation and there will be
9342        * a corresponding release/acquire. Do the transition in the
9343        * most flexible queue. */
9344
9345       assert(src_qf == cmd_buffer->qf ||
9346              dst_qf == cmd_buffer->qf);
9347
9348       if (src_family_index == VK_QUEUE_FAMILY_EXTERNAL || src_family_index == VK_QUEUE_FAMILY_FOREIGN_EXT)
9349          return;
9350
9351       if (cmd_buffer->qf == RADV_QUEUE_TRANSFER)
9352          return;
9353
9354       if (cmd_buffer->qf == RADV_QUEUE_COMPUTE &&
9355           (src_qf == RADV_QUEUE_GENERAL || dst_qf == RADV_QUEUE_GENERAL))
9356          return;
9357    }
9358
9359    unsigned src_queue_mask =
9360       radv_image_queue_family_mask(image, src_qf, cmd_buffer->qf);
9361    unsigned dst_queue_mask =
9362       radv_image_queue_family_mask(image, dst_qf, cmd_buffer->qf);
9363
9364    if (src_layout == dst_layout && src_queue_mask == dst_queue_mask)
9365       return;
9366
9367    if (vk_format_has_depth(image->vk.format)) {
9368       radv_handle_depth_image_transition(cmd_buffer, image, src_layout, dst_layout,
9369                                          src_queue_mask, dst_queue_mask, range, sample_locs);
9370    } else {
9371       radv_handle_color_image_transition(cmd_buffer, image, src_layout, dst_layout,
9372                                          src_queue_mask, dst_queue_mask, range);
9373    }
9374 }
9375
9376 static void
9377 radv_cp_dma_wait_for_stages(struct radv_cmd_buffer *cmd_buffer, VkPipelineStageFlags2 stage_mask)
9378 {
9379    /* Make sure CP DMA is idle because the driver might have performed a DMA operation for copying a
9380     * buffer (or a MSAA image using FMASK). Note that updating a buffer is considered a clear
9381     * operation but it might also use a CP DMA copy in some rare situations. Other operations using
9382     * a CP DMA clear are implicitly synchronized (see CP_DMA_SYNC).
9383     */
9384    if (stage_mask & (VK_PIPELINE_STAGE_2_COPY_BIT | VK_PIPELINE_STAGE_2_CLEAR_BIT |
9385                      VK_PIPELINE_STAGE_2_ALL_TRANSFER_BIT | VK_PIPELINE_STAGE_2_BOTTOM_OF_PIPE_BIT |
9386                      VK_PIPELINE_STAGE_2_ALL_COMMANDS_BIT))
9387       si_cp_dma_wait_for_idle(cmd_buffer);
9388 }
9389
9390 static void
9391 radv_barrier(struct radv_cmd_buffer *cmd_buffer, const VkDependencyInfo *dep_info,
9392              enum rgp_barrier_reason reason)
9393 {
9394    enum radv_cmd_flush_bits src_flush_bits = 0;
9395    enum radv_cmd_flush_bits dst_flush_bits = 0;
9396    VkPipelineStageFlags2 src_stage_mask = 0;
9397    VkPipelineStageFlags2 dst_stage_mask = 0;
9398
9399    if (cmd_buffer->state.subpass)
9400       radv_mark_noncoherent_rb(cmd_buffer);
9401
9402    radv_describe_barrier_start(cmd_buffer, reason);
9403
9404    for (uint32_t i = 0; i < dep_info->memoryBarrierCount; i++) {
9405       src_stage_mask |= dep_info->pMemoryBarriers[i].srcStageMask;
9406       src_flush_bits |=
9407          radv_src_access_flush(cmd_buffer, dep_info->pMemoryBarriers[i].srcAccessMask, NULL);
9408       dst_stage_mask |= dep_info->pMemoryBarriers[i].dstStageMask;
9409       dst_flush_bits |=
9410          radv_dst_access_flush(cmd_buffer, dep_info->pMemoryBarriers[i].dstAccessMask, NULL);
9411    }
9412
9413    for (uint32_t i = 0; i < dep_info->bufferMemoryBarrierCount; i++) {
9414       src_stage_mask |= dep_info->pBufferMemoryBarriers[i].srcStageMask;
9415       src_flush_bits |=
9416          radv_src_access_flush(cmd_buffer, dep_info->pBufferMemoryBarriers[i].srcAccessMask, NULL);
9417       dst_stage_mask |= dep_info->pBufferMemoryBarriers[i].dstStageMask;
9418       dst_flush_bits |=
9419          radv_dst_access_flush(cmd_buffer, dep_info->pBufferMemoryBarriers[i].dstAccessMask, NULL);
9420    }
9421
9422    for (uint32_t i = 0; i < dep_info->imageMemoryBarrierCount; i++) {
9423       RADV_FROM_HANDLE(radv_image, image, dep_info->pImageMemoryBarriers[i].image);
9424
9425       src_stage_mask |= dep_info->pImageMemoryBarriers[i].srcStageMask;
9426       src_flush_bits |=
9427          radv_src_access_flush(cmd_buffer, dep_info->pImageMemoryBarriers[i].srcAccessMask, image);
9428       dst_stage_mask |= dep_info->pImageMemoryBarriers[i].dstStageMask;
9429       dst_flush_bits |=
9430          radv_dst_access_flush(cmd_buffer, dep_info->pImageMemoryBarriers[i].dstAccessMask, image);
9431    }
9432
9433    /* The Vulkan spec 1.1.98 says:
9434     *
9435     * "An execution dependency with only
9436     *  VK_PIPELINE_STAGE_2_BOTTOM_OF_PIPE_BIT in the destination stage mask
9437     *  will only prevent that stage from executing in subsequently
9438     *  submitted commands. As this stage does not perform any actual
9439     *  execution, this is not observable - in effect, it does not delay
9440     *  processing of subsequent commands. Similarly an execution dependency
9441     *  with only VK_PIPELINE_STAGE_2_TOP_OF_PIPE_BIT in the source stage mask
9442     *  will effectively not wait for any prior commands to complete."
9443     */
9444    if (dst_stage_mask != VK_PIPELINE_STAGE_2_BOTTOM_OF_PIPE_BIT)
9445       radv_stage_flush(cmd_buffer, src_stage_mask);
9446    cmd_buffer->state.flush_bits |= src_flush_bits;
9447
9448    radv_ace_internal_barrier(cmd_buffer, src_stage_mask, 0);
9449
9450    for (uint32_t i = 0; i < dep_info->imageMemoryBarrierCount; i++) {
9451       RADV_FROM_HANDLE(radv_image, image, dep_info->pImageMemoryBarriers[i].image);
9452
9453       const struct VkSampleLocationsInfoEXT *sample_locs_info =
9454          vk_find_struct_const(dep_info->pImageMemoryBarriers[i].pNext, SAMPLE_LOCATIONS_INFO_EXT);
9455       struct radv_sample_locations_state sample_locations;
9456
9457       if (sample_locs_info) {
9458          assert(image->vk.create_flags & VK_IMAGE_CREATE_SAMPLE_LOCATIONS_COMPATIBLE_DEPTH_BIT_EXT);
9459          sample_locations.per_pixel = sample_locs_info->sampleLocationsPerPixel;
9460          sample_locations.grid_size = sample_locs_info->sampleLocationGridSize;
9461          sample_locations.count = sample_locs_info->sampleLocationsCount;
9462          typed_memcpy(&sample_locations.locations[0], sample_locs_info->pSampleLocations,
9463                       sample_locs_info->sampleLocationsCount);
9464       }
9465
9466       radv_handle_image_transition(
9467          cmd_buffer, image, dep_info->pImageMemoryBarriers[i].oldLayout,
9468          dep_info->pImageMemoryBarriers[i].newLayout,
9469          dep_info->pImageMemoryBarriers[i].srcQueueFamilyIndex,
9470          dep_info->pImageMemoryBarriers[i].dstQueueFamilyIndex,
9471          &dep_info->pImageMemoryBarriers[i].subresourceRange, sample_locs_info ? &sample_locations : NULL);
9472    }
9473
9474    radv_ace_internal_barrier(cmd_buffer, 0, dst_stage_mask);
9475    radv_cp_dma_wait_for_stages(cmd_buffer, src_stage_mask);
9476
9477    cmd_buffer->state.flush_bits |= dst_flush_bits;
9478
9479    radv_describe_barrier_end(cmd_buffer);
9480 }
9481
9482 VKAPI_ATTR void VKAPI_CALL
9483 radv_CmdPipelineBarrier2(VkCommandBuffer commandBuffer,
9484                          const VkDependencyInfo *pDependencyInfo)
9485 {
9486    RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
9487
9488    radv_barrier(cmd_buffer, pDependencyInfo, RGP_BARRIER_EXTERNAL_CMD_PIPELINE_BARRIER);
9489 }
9490
9491 static void
9492 write_event(struct radv_cmd_buffer *cmd_buffer, struct radv_event *event,
9493             VkPipelineStageFlags2 stageMask, unsigned value)
9494 {
9495    struct radeon_cmdbuf *cs = cmd_buffer->cs;
9496    uint64_t va = radv_buffer_get_va(event->bo);
9497
9498    si_emit_cache_flush(cmd_buffer);
9499
9500    radv_cs_add_buffer(cmd_buffer->device->ws, cs, event->bo);
9501
9502    ASSERTED unsigned cdw_max = radeon_check_space(cmd_buffer->device->ws, cs, 28);
9503
9504    if (stageMask & (VK_PIPELINE_STAGE_2_COPY_BIT |
9505                     VK_PIPELINE_STAGE_2_RESOLVE_BIT |
9506                     VK_PIPELINE_STAGE_2_BLIT_BIT |
9507                     VK_PIPELINE_STAGE_2_CLEAR_BIT)) {
9508       /* Be conservative for now. */
9509       stageMask |= VK_PIPELINE_STAGE_2_ALL_TRANSFER_BIT;
9510    }
9511
9512    /* Flags that only require a top-of-pipe event. */
9513    VkPipelineStageFlags2 top_of_pipe_flags = VK_PIPELINE_STAGE_2_TOP_OF_PIPE_BIT;
9514
9515    /* Flags that only require a post-index-fetch event. */
9516    VkPipelineStageFlags2 post_index_fetch_flags =
9517       top_of_pipe_flags | VK_PIPELINE_STAGE_2_DRAW_INDIRECT_BIT | VK_PIPELINE_STAGE_2_VERTEX_INPUT_BIT;
9518
9519    /* Flags that only require signaling post PS. */
9520    VkPipelineStageFlags2 post_ps_flags =
9521       post_index_fetch_flags | VK_PIPELINE_STAGE_2_VERTEX_SHADER_BIT |
9522       VK_PIPELINE_STAGE_2_TESSELLATION_CONTROL_SHADER_BIT |
9523       VK_PIPELINE_STAGE_2_TESSELLATION_EVALUATION_SHADER_BIT | VK_PIPELINE_STAGE_2_GEOMETRY_SHADER_BIT |
9524       VK_PIPELINE_STAGE_2_MESH_SHADER_BIT_NV |
9525       VK_PIPELINE_STAGE_2_TRANSFORM_FEEDBACK_BIT_EXT |
9526       VK_PIPELINE_STAGE_2_PRE_RASTERIZATION_SHADERS_BIT |
9527       VK_PIPELINE_STAGE_2_FRAGMENT_SHADING_RATE_ATTACHMENT_BIT_KHR |
9528       VK_PIPELINE_STAGE_2_EARLY_FRAGMENT_TESTS_BIT | VK_PIPELINE_STAGE_2_FRAGMENT_SHADER_BIT;
9529
9530    /* Flags that only require signaling post CS. */
9531    VkPipelineStageFlags2 post_cs_flags = VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT;
9532
9533    radv_cp_dma_wait_for_stages(cmd_buffer, stageMask);
9534
9535    if (!(stageMask & ~top_of_pipe_flags)) {
9536       /* Just need to sync the PFP engine. */
9537       radeon_emit(cs, PKT3(PKT3_WRITE_DATA, 3, 0));
9538       radeon_emit(cs, S_370_DST_SEL(V_370_MEM) | S_370_WR_CONFIRM(1) | S_370_ENGINE_SEL(V_370_PFP));
9539       radeon_emit(cs, va);
9540       radeon_emit(cs, va >> 32);
9541       radeon_emit(cs, value);
9542    } else if (!(stageMask & ~post_index_fetch_flags)) {
9543       /* Sync ME because PFP reads index and indirect buffers. */
9544       radeon_emit(cs, PKT3(PKT3_WRITE_DATA, 3, 0));
9545       radeon_emit(cs, S_370_DST_SEL(V_370_MEM) | S_370_WR_CONFIRM(1) | S_370_ENGINE_SEL(V_370_ME));
9546       radeon_emit(cs, va);
9547       radeon_emit(cs, va >> 32);
9548       radeon_emit(cs, value);
9549    } else {
9550       unsigned event_type;
9551
9552       if (!(stageMask & ~post_ps_flags)) {
9553          /* Sync previous fragment shaders. */
9554          event_type = V_028A90_PS_DONE;
9555       } else if (!(stageMask & ~post_cs_flags)) {
9556          /* Sync previous compute shaders. */
9557          event_type = V_028A90_CS_DONE;
9558       } else {
9559          /* Otherwise, sync all prior GPU work. */
9560          event_type = V_028A90_BOTTOM_OF_PIPE_TS;
9561       }
9562
9563       si_cs_emit_write_event_eop(cs, cmd_buffer->device->physical_device->rad_info.gfx_level,
9564                                  radv_cmd_buffer_uses_mec(cmd_buffer), event_type, 0,
9565                                  EOP_DST_SEL_MEM, EOP_DATA_SEL_VALUE_32BIT, va, value,
9566                                  cmd_buffer->gfx9_eop_bug_va);
9567    }
9568
9569    assert(cmd_buffer->cs->cdw <= cdw_max);
9570 }
9571
9572 VKAPI_ATTR void VKAPI_CALL
9573 radv_CmdSetEvent2(VkCommandBuffer commandBuffer, VkEvent _event,
9574                   const VkDependencyInfo* pDependencyInfo)
9575 {
9576    RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
9577    RADV_FROM_HANDLE(radv_event, event, _event);
9578    VkPipelineStageFlags2 src_stage_mask = 0;
9579
9580    for (uint32_t i = 0; i < pDependencyInfo->memoryBarrierCount; i++)
9581       src_stage_mask |= pDependencyInfo->pMemoryBarriers[i].srcStageMask;
9582    for (uint32_t i = 0; i < pDependencyInfo->bufferMemoryBarrierCount; i++)
9583       src_stage_mask |= pDependencyInfo->pBufferMemoryBarriers[i].srcStageMask;
9584    for (uint32_t i = 0; i < pDependencyInfo->imageMemoryBarrierCount; i++)
9585       src_stage_mask |= pDependencyInfo->pImageMemoryBarriers[i].srcStageMask;
9586
9587    write_event(cmd_buffer, event, src_stage_mask, 1);
9588 }
9589
9590 VKAPI_ATTR void VKAPI_CALL
9591 radv_CmdResetEvent2(VkCommandBuffer commandBuffer, VkEvent _event,
9592                     VkPipelineStageFlags2 stageMask)
9593 {
9594    RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
9595    RADV_FROM_HANDLE(radv_event, event, _event);
9596
9597    write_event(cmd_buffer, event, stageMask, 0);
9598 }
9599
9600 VKAPI_ATTR void VKAPI_CALL
9601 radv_CmdWaitEvents2(VkCommandBuffer commandBuffer, uint32_t eventCount, const VkEvent *pEvents,
9602                     const VkDependencyInfo* pDependencyInfos)
9603 {
9604    RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
9605    struct radeon_cmdbuf *cs = cmd_buffer->cs;
9606
9607    for (unsigned i = 0; i < eventCount; ++i) {
9608       RADV_FROM_HANDLE(radv_event, event, pEvents[i]);
9609       uint64_t va = radv_buffer_get_va(event->bo);
9610
9611       radv_cs_add_buffer(cmd_buffer->device->ws, cs, event->bo);
9612
9613       ASSERTED unsigned cdw_max = radeon_check_space(cmd_buffer->device->ws, cs, 7);
9614
9615       radv_cp_wait_mem(cs, WAIT_REG_MEM_EQUAL, va, 1, 0xffffffff);
9616       assert(cmd_buffer->cs->cdw <= cdw_max);
9617    }
9618
9619    radv_barrier(cmd_buffer, pDependencyInfos, RGP_BARRIER_EXTERNAL_CMD_WAIT_EVENTS);
9620 }
9621
9622 VKAPI_ATTR void VKAPI_CALL
9623 radv_CmdSetDeviceMask(VkCommandBuffer commandBuffer, uint32_t deviceMask)
9624 {
9625    /* No-op */
9626 }
9627
9628 /* VK_EXT_conditional_rendering */
9629 VKAPI_ATTR void VKAPI_CALL
9630 radv_CmdBeginConditionalRenderingEXT(
9631    VkCommandBuffer commandBuffer,
9632    const VkConditionalRenderingBeginInfoEXT *pConditionalRenderingBegin)
9633 {
9634    RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
9635    RADV_FROM_HANDLE(radv_buffer, buffer, pConditionalRenderingBegin->buffer);
9636    struct radeon_cmdbuf *cs = cmd_buffer->cs;
9637    unsigned pred_op = PREDICATION_OP_BOOL32;
9638    bool draw_visible = true;
9639    uint64_t va;
9640
9641    va = radv_buffer_get_va(buffer->bo) + buffer->offset + pConditionalRenderingBegin->offset;
9642
9643    /* By default, if the 32-bit value at offset in buffer memory is zero,
9644     * then the rendering commands are discarded, otherwise they are
9645     * executed as normal. If the inverted flag is set, all commands are
9646     * discarded if the value is non-zero.
9647     */
9648    if (pConditionalRenderingBegin->flags & VK_CONDITIONAL_RENDERING_INVERTED_BIT_EXT) {
9649       draw_visible = false;
9650    }
9651
9652    si_emit_cache_flush(cmd_buffer);
9653
9654    if (cmd_buffer->qf == RADV_QUEUE_GENERAL &&
9655        !cmd_buffer->device->physical_device->rad_info.has_32bit_predication) {
9656       uint64_t pred_value = 0, pred_va;
9657       unsigned pred_offset;
9658
9659       /* From the Vulkan spec 1.1.107:
9660        *
9661        * "If the 32-bit value at offset in buffer memory is zero,
9662        *  then the rendering commands are discarded, otherwise they
9663        *  are executed as normal. If the value of the predicate in
9664        *  buffer memory changes while conditional rendering is
9665        *  active, the rendering commands may be discarded in an
9666        *  implementation-dependent way. Some implementations may
9667        *  latch the value of the predicate upon beginning conditional
9668        *  rendering while others may read it before every rendering
9669        *  command."
9670        *
9671        * But, the AMD hardware treats the predicate as a 64-bit
9672        * value which means we need a workaround in the driver.
9673        * Luckily, it's not required to support the case where the value
9674        * changes while predication is active.
9675        *
9676        * The workaround is as follows:
9677        * 1) allocate a 64-bit value in the upload BO and initialize it
9678        *    to 0
9679        * 2) copy the 32-bit predicate value to the upload BO
9680        * 3) use the new allocated VA address for predication
9681        *
9682        * Based on the conditionalrender demo, it's faster to do the
9683        * COPY_DATA in ME (+ sync PFP) instead of PFP.
9684        */
9685       radv_cmd_buffer_upload_data(cmd_buffer, 8, &pred_value, &pred_offset);
9686
9687       pred_va = radv_buffer_get_va(cmd_buffer->upload.upload_bo) + pred_offset;
9688
9689       radeon_emit(cs, PKT3(PKT3_COPY_DATA, 4, 0));
9690       radeon_emit(cs, COPY_DATA_SRC_SEL(COPY_DATA_SRC_MEM) | COPY_DATA_DST_SEL(COPY_DATA_DST_MEM) |
9691                          COPY_DATA_WR_CONFIRM);
9692       radeon_emit(cs, va);
9693       radeon_emit(cs, va >> 32);
9694       radeon_emit(cs, pred_va);
9695       radeon_emit(cs, pred_va >> 32);
9696
9697       radeon_emit(cs, PKT3(PKT3_PFP_SYNC_ME, 0, 0));
9698       radeon_emit(cs, 0);
9699
9700       va = pred_va;
9701       pred_op = PREDICATION_OP_BOOL64;
9702    }
9703
9704    /* MEC doesn't support predication, we emulate it elsewhere. */
9705    if (!radv_cmd_buffer_uses_mec(cmd_buffer)) {
9706       si_emit_set_predication_state(cmd_buffer, draw_visible, pred_op, va);
9707    }
9708
9709    /* Store conditional rendering user info. */
9710    cmd_buffer->state.predicating = true;
9711    cmd_buffer->state.predication_type = draw_visible;
9712    cmd_buffer->state.predication_op = pred_op;
9713    cmd_buffer->state.predication_va = va;
9714    cmd_buffer->mec_inv_pred_emitted = false;
9715 }
9716
9717 VKAPI_ATTR void VKAPI_CALL
9718 radv_CmdEndConditionalRenderingEXT(VkCommandBuffer commandBuffer)
9719 {
9720    RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
9721
9722    /* MEC doesn't support predication, no need to emit anything here. */
9723    if (!radv_cmd_buffer_uses_mec(cmd_buffer)) {
9724       si_emit_set_predication_state(cmd_buffer, false, 0, 0);
9725    }
9726
9727    /* Reset conditional rendering user info. */
9728    cmd_buffer->state.predicating = false;
9729    cmd_buffer->state.predication_type = -1;
9730    cmd_buffer->state.predication_op = 0;
9731    cmd_buffer->state.predication_va = 0;
9732    cmd_buffer->mec_inv_pred_emitted = false;
9733 }
9734
9735 /* VK_EXT_transform_feedback */
9736 VKAPI_ATTR void VKAPI_CALL
9737 radv_CmdBindTransformFeedbackBuffersEXT(VkCommandBuffer commandBuffer, uint32_t firstBinding,
9738                                         uint32_t bindingCount, const VkBuffer *pBuffers,
9739                                         const VkDeviceSize *pOffsets, const VkDeviceSize *pSizes)
9740 {
9741    RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
9742    struct radv_streamout_binding *sb = cmd_buffer->streamout_bindings;
9743    uint8_t enabled_mask = 0;
9744
9745    assert(firstBinding + bindingCount <= MAX_SO_BUFFERS);
9746    for (uint32_t i = 0; i < bindingCount; i++) {
9747       uint32_t idx = firstBinding + i;
9748
9749       sb[idx].buffer = radv_buffer_from_handle(pBuffers[i]);
9750       sb[idx].offset = pOffsets[i];
9751
9752       if (!pSizes || pSizes[i] == VK_WHOLE_SIZE) {
9753          sb[idx].size = sb[idx].buffer->vk.size - sb[idx].offset;
9754       } else {
9755          sb[idx].size = pSizes[i];
9756       }
9757
9758       radv_cs_add_buffer(cmd_buffer->device->ws, cmd_buffer->cs, sb[idx].buffer->bo);
9759
9760       enabled_mask |= 1 << idx;
9761    }
9762
9763    cmd_buffer->state.streamout.enabled_mask |= enabled_mask;
9764
9765    cmd_buffer->state.dirty |= RADV_CMD_DIRTY_STREAMOUT_BUFFER;
9766 }
9767
9768 bool
9769 radv_is_streamout_enabled(struct radv_cmd_buffer *cmd_buffer)
9770 {
9771    struct radv_streamout_state *so = &cmd_buffer->state.streamout;
9772
9773    /* Streamout must be enabled for the PRIMITIVES_GENERATED query to work. */
9774    return (so->streamout_enabled || cmd_buffer->state.prims_gen_query_enabled) &&
9775           !cmd_buffer->state.suspend_streamout;
9776 }
9777
9778 void
9779 radv_emit_streamout_enable(struct radv_cmd_buffer *cmd_buffer)
9780 {
9781    struct radv_streamout_state *so = &cmd_buffer->state.streamout;
9782    struct radv_graphics_pipeline *pipeline = cmd_buffer->state.graphics_pipeline;
9783    bool streamout_enabled = radv_is_streamout_enabled(cmd_buffer);
9784    struct radeon_cmdbuf *cs = cmd_buffer->cs;
9785    uint32_t enabled_stream_buffers_mask = 0;
9786
9787    if (pipeline && pipeline->streamout_shader) {
9788       enabled_stream_buffers_mask = pipeline->streamout_shader->info.so.enabled_stream_buffers_mask;
9789    }
9790
9791    radeon_set_context_reg_seq(cs, R_028B94_VGT_STRMOUT_CONFIG, 2);
9792    radeon_emit(cs, S_028B94_STREAMOUT_0_EN(streamout_enabled) | S_028B94_RAST_STREAM(0) |
9793                       S_028B94_STREAMOUT_1_EN(streamout_enabled) |
9794                       S_028B94_STREAMOUT_2_EN(streamout_enabled) |
9795                       S_028B94_STREAMOUT_3_EN(streamout_enabled));
9796    radeon_emit(cs, so->hw_enabled_mask & enabled_stream_buffers_mask);
9797
9798    cmd_buffer->state.context_roll_without_scissor_emitted = true;
9799 }
9800
9801 static void
9802 radv_set_streamout_enable(struct radv_cmd_buffer *cmd_buffer, bool enable)
9803 {
9804    struct radv_streamout_state *so = &cmd_buffer->state.streamout;
9805    bool old_streamout_enabled = radv_is_streamout_enabled(cmd_buffer);
9806    uint32_t old_hw_enabled_mask = so->hw_enabled_mask;
9807
9808    so->streamout_enabled = enable;
9809
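   /* hw_enabled_mask replicates the 4-bit buffer enable mask for each of the
    * four streams, matching the layout of VGT_STRMOUT_BUFFER_CONFIG.
    */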
9810    so->hw_enabled_mask = so->enabled_mask | (so->enabled_mask << 4) | (so->enabled_mask << 8) |
9811                          (so->enabled_mask << 12);
9812
9813    if (!cmd_buffer->device->physical_device->use_ngg_streamout &&
9814        ((old_streamout_enabled != radv_is_streamout_enabled(cmd_buffer)) ||
9815         (old_hw_enabled_mask != so->hw_enabled_mask)))
9816       radv_emit_streamout_enable(cmd_buffer);
9817
9818    if (cmd_buffer->device->physical_device->use_ngg_streamout) {
9819       cmd_buffer->gds_needed = true;
9820       cmd_buffer->gds_oa_needed = true;
9821    }
9822 }
9823
9824 static void
9825 radv_flush_vgt_streamout(struct radv_cmd_buffer *cmd_buffer)
9826 {
9827    struct radeon_cmdbuf *cs = cmd_buffer->cs;
9828    unsigned reg_strmout_cntl;
9829
9830    /* The register is at different places on different ASICs. */
9831    if (cmd_buffer->device->physical_device->rad_info.gfx_level >= GFX9) {
9832       reg_strmout_cntl = R_0300FC_CP_STRMOUT_CNTL;
9833       radeon_emit(cs, PKT3(PKT3_WRITE_DATA, 3, 0));
9834       radeon_emit(cs, S_370_DST_SEL(V_370_MEM_MAPPED_REGISTER) | S_370_ENGINE_SEL(V_370_ME));
9835       radeon_emit(cs, R_0300FC_CP_STRMOUT_CNTL >> 2);
9836       radeon_emit(cs, 0);
9837       radeon_emit(cs, 0);
9838    } else if (cmd_buffer->device->physical_device->rad_info.gfx_level >= GFX7) {
9839       reg_strmout_cntl = R_0300FC_CP_STRMOUT_CNTL;
9840       radeon_set_uconfig_reg(cs, reg_strmout_cntl, 0);
9841    } else {
9842       reg_strmout_cntl = R_0084FC_CP_STRMOUT_CNTL;
9843       radeon_set_config_reg(cs, reg_strmout_cntl, 0);
9844    }
9845
9846    radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
9847    radeon_emit(cs, EVENT_TYPE(EVENT_TYPE_SO_VGTSTREAMOUT_FLUSH) | EVENT_INDEX(0));
9848
9849    radeon_emit(cs, PKT3(PKT3_WAIT_REG_MEM, 5, 0));
9850    radeon_emit(cs,
9851                WAIT_REG_MEM_EQUAL); /* wait until the register is equal to the reference value */
9852    radeon_emit(cs, reg_strmout_cntl >> 2); /* register */
9853    radeon_emit(cs, 0);
9854    radeon_emit(cs, S_0084FC_OFFSET_UPDATE_DONE(1)); /* reference value */
9855    radeon_emit(cs, S_0084FC_OFFSET_UPDATE_DONE(1)); /* mask */
9856    radeon_emit(cs, 4);                              /* poll interval */
9857 }
9858
9859 static void
9860 radv_emit_streamout_begin(struct radv_cmd_buffer *cmd_buffer, uint32_t firstCounterBuffer,
9861                           uint32_t counterBufferCount, const VkBuffer *pCounterBuffers,
9862                           const VkDeviceSize *pCounterBufferOffsets)
9863
9864 {
9865    struct radv_streamout_binding *sb = cmd_buffer->streamout_bindings;
9866    struct radv_streamout_state *so = &cmd_buffer->state.streamout;
9867    struct radv_graphics_pipeline *pipeline = cmd_buffer->state.graphics_pipeline;
9868    struct radv_shader_info *info = &pipeline->streamout_shader->info;
9869    struct radeon_cmdbuf *cs = cmd_buffer->cs;
9870
9871    radv_flush_vgt_streamout(cmd_buffer);
9872
9873    assert(firstCounterBuffer + counterBufferCount <= MAX_SO_BUFFERS);
9874    u_foreach_bit(i, so->enabled_mask)
9875    {
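      /* Map this streamout target to its counter buffer; targets outside
       * [firstCounterBuffer, firstCounterBuffer + counterBufferCount) have no
       * counter buffer.
       */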
9876       int32_t counter_buffer_idx = i - firstCounterBuffer;
9877       if (counter_buffer_idx >= 0 && counter_buffer_idx >= counterBufferCount)
9878          counter_buffer_idx = -1;
9879
9880       /* AMD GCN binds streamout buffers as shader resources.
9881        * VGT only counts primitives and tells the shader through
9882        * SGPRs what to do.
9883        */
9884       radeon_set_context_reg_seq(cs, R_028AD0_VGT_STRMOUT_BUFFER_SIZE_0 + 16 * i, 2);
9885       radeon_emit(cs, sb[i].size >> 2);     /* BUFFER_SIZE (in DW) */
9886       radeon_emit(cs, info->so.strides[i]); /* VTX_STRIDE (in DW) */
9887
9888       cmd_buffer->state.context_roll_without_scissor_emitted = true;
9889
9890       if (counter_buffer_idx >= 0 && pCounterBuffers && pCounterBuffers[counter_buffer_idx]) {
9891          /* The array of counter buffers is optional. */
9892          RADV_FROM_HANDLE(radv_buffer, buffer, pCounterBuffers[counter_buffer_idx]);
9893          uint64_t va = radv_buffer_get_va(buffer->bo);
9894          uint64_t counter_buffer_offset = 0;
9895
9896          if (pCounterBufferOffsets)
9897             counter_buffer_offset = pCounterBufferOffsets[counter_buffer_idx];
9898
9899          va += buffer->offset + counter_buffer_offset;
9900
9901          /* Append */
9902          radeon_emit(cs, PKT3(PKT3_STRMOUT_BUFFER_UPDATE, 4, 0));
9903          radeon_emit(cs, STRMOUT_SELECT_BUFFER(i) | STRMOUT_DATA_TYPE(1) |   /* offset in bytes */
9904                             STRMOUT_OFFSET_SOURCE(STRMOUT_OFFSET_FROM_MEM)); /* control */
9905          radeon_emit(cs, 0);                                                 /* unused */
9906          radeon_emit(cs, 0);                                                 /* unused */
9907          radeon_emit(cs, va);                                                /* src address lo */
9908          radeon_emit(cs, va >> 32);                                          /* src address hi */
9909
9910          radv_cs_add_buffer(cmd_buffer->device->ws, cs, buffer->bo);
9911       } else {
9912          /* Start from the beginning. */
9913          radeon_emit(cs, PKT3(PKT3_STRMOUT_BUFFER_UPDATE, 4, 0));
9914          radeon_emit(cs, STRMOUT_SELECT_BUFFER(i) | STRMOUT_DATA_TYPE(1) | /* offset in bytes */
9915                             STRMOUT_OFFSET_SOURCE(STRMOUT_OFFSET_FROM_PACKET)); /* control */
9916          radeon_emit(cs, 0);                                                    /* unused */
9917          radeon_emit(cs, 0);                                                    /* unused */
9918          radeon_emit(cs, 0);                                                    /* unused */
9919          radeon_emit(cs, 0);                                                    /* unused */
9920       }
9921    }
9922
9923    radv_set_streamout_enable(cmd_buffer, true);
9924 }
9925
9926 static void
9927 gfx10_emit_streamout_begin(struct radv_cmd_buffer *cmd_buffer, uint32_t firstCounterBuffer,
9928                            uint32_t counterBufferCount, const VkBuffer *pCounterBuffers,
9929                            const VkDeviceSize *pCounterBufferOffsets)
9930 {
9931    struct radv_streamout_state *so = &cmd_buffer->state.streamout;
9932    unsigned last_target = util_last_bit(so->enabled_mask) - 1;
9933    struct radeon_cmdbuf *cs = cmd_buffer->cs;
9934
9935    assert(cmd_buffer->device->physical_device->rad_info.gfx_level >= GFX10);
9936    assert(firstCounterBuffer + counterBufferCount <= MAX_SO_BUFFERS);
9937
9938    /* Sync because the next streamout operation will overwrite GDS and we
9939     * have to make sure it's idle.
9940     * TODO: Improve by tracking if there is a streamout operation in
9941     * flight.
9942     */
9943    cmd_buffer->state.flush_bits |= RADV_CMD_FLAG_VS_PARTIAL_FLUSH;
9944    si_emit_cache_flush(cmd_buffer);
9945
9946    u_foreach_bit(i, so->enabled_mask)
9947    {
9948       int32_t counter_buffer_idx = i - firstCounterBuffer;
9949       if (counter_buffer_idx >= 0 && counter_buffer_idx >= counterBufferCount)
9950          counter_buffer_idx = -1;
9951
9952       bool append =
9953          counter_buffer_idx >= 0 && pCounterBuffers && pCounterBuffers[counter_buffer_idx];
9954       uint64_t va = 0;
9955
9956       if (append) {
9957          RADV_FROM_HANDLE(radv_buffer, buffer, pCounterBuffers[counter_buffer_idx]);
9958          uint64_t counter_buffer_offset = 0;
9959
9960          if (pCounterBufferOffsets)
9961             counter_buffer_offset = pCounterBufferOffsets[counter_buffer_idx];
9962
9963          va += radv_buffer_get_va(buffer->bo);
9964          va += buffer->offset + counter_buffer_offset;
9965
9966          radv_cs_add_buffer(cmd_buffer->device->ws, cs, buffer->bo);
9967       }
9968
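      /* Initialize the GDS counter for this target with CP DMA: copy the saved
       * buffer-filled-size from the counter buffer when appending, otherwise
       * write an immediate 0 (4 bytes at GDS offset 4 * i).
       */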
9969       radeon_emit(cs, PKT3(PKT3_DMA_DATA, 5, 0));
9970       radeon_emit(cs, S_411_SRC_SEL(append ? V_411_SRC_ADDR_TC_L2 : V_411_DATA) |
9971                          S_411_DST_SEL(V_411_GDS) | S_411_CP_SYNC(i == last_target));
9972       radeon_emit(cs, va);
9973       radeon_emit(cs, va >> 32);
9974       radeon_emit(cs, 4 * i); /* destination in GDS */
9975       radeon_emit(cs, 0);
9976       radeon_emit(cs, S_415_BYTE_COUNT_GFX9(4) | S_415_DISABLE_WR_CONFIRM_GFX9(i != last_target));
9977    }
9978
9979    radv_set_streamout_enable(cmd_buffer, true);
9980 }
9981
9982 VKAPI_ATTR void VKAPI_CALL
9983 radv_CmdBeginTransformFeedbackEXT(VkCommandBuffer commandBuffer, uint32_t firstCounterBuffer,
9984                                   uint32_t counterBufferCount, const VkBuffer *pCounterBuffers,
9985                                   const VkDeviceSize *pCounterBufferOffsets)
9986 {
9987    RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
9988
9989    if (cmd_buffer->device->physical_device->use_ngg_streamout) {
9990       gfx10_emit_streamout_begin(cmd_buffer, firstCounterBuffer, counterBufferCount,
9991                                  pCounterBuffers, pCounterBufferOffsets);
9992    } else {
9993       radv_emit_streamout_begin(cmd_buffer, firstCounterBuffer, counterBufferCount, pCounterBuffers,
9994                                 pCounterBufferOffsets);
9995    }
9996 }
9997
9998 static void
9999 radv_emit_streamout_end(struct radv_cmd_buffer *cmd_buffer, uint32_t firstCounterBuffer,
10000                         uint32_t counterBufferCount, const VkBuffer *pCounterBuffers,
10001                         const VkDeviceSize *pCounterBufferOffsets)
10002 {
10003    struct radv_streamout_state *so = &cmd_buffer->state.streamout;
10004    struct radeon_cmdbuf *cs = cmd_buffer->cs;
10005
10006    radv_flush_vgt_streamout(cmd_buffer);
10007
10008    assert(firstCounterBuffer + counterBufferCount <= MAX_SO_BUFFERS);
10009    u_foreach_bit(i, so->enabled_mask)
10010    {
10011       int32_t counter_buffer_idx = i - firstCounterBuffer;
10012       if (counter_buffer_idx >= 0 && counter_buffer_idx >= counterBufferCount)
10013          counter_buffer_idx = -1;
10014
10015       if (counter_buffer_idx >= 0 && pCounterBuffers && pCounterBuffers[counter_buffer_idx]) {
10016          /* The array of counter buffers is optional. */
10017          RADV_FROM_HANDLE(radv_buffer, buffer, pCounterBuffers[counter_buffer_idx]);
10018          uint64_t va = radv_buffer_get_va(buffer->bo);
10019          uint64_t counter_buffer_offset = 0;
10020
10021          if (pCounterBufferOffsets)
10022             counter_buffer_offset = pCounterBufferOffsets[counter_buffer_idx];
10023
10024          va += buffer->offset + counter_buffer_offset;
10025
10026          radeon_emit(cs, PKT3(PKT3_STRMOUT_BUFFER_UPDATE, 4, 0));
10027          radeon_emit(cs, STRMOUT_SELECT_BUFFER(i) | STRMOUT_DATA_TYPE(1) | /* offset in bytes */
10028                             STRMOUT_OFFSET_SOURCE(STRMOUT_OFFSET_NONE) |
10029                             STRMOUT_STORE_BUFFER_FILLED_SIZE); /* control */
10030          radeon_emit(cs, va);                                  /* dst address lo */
10031          radeon_emit(cs, va >> 32);                            /* dst address hi */
10032          radeon_emit(cs, 0);                                   /* unused */
10033          radeon_emit(cs, 0);                                   /* unused */
10034
10035          radv_cs_add_buffer(cmd_buffer->device->ws, cs, buffer->bo);
10036       }
10037
10038       /* Deactivate transform feedback by zeroing the buffer size.
10039        * The counters (primitives generated, primitives emitted) may
10040        * be enabled even if there is no buffer bound. This ensures
10041        * that the primitives-emitted query won't increment.
10042        */
10043       radeon_set_context_reg(cs, R_028AD0_VGT_STRMOUT_BUFFER_SIZE_0 + 16 * i, 0);
10044
10045       cmd_buffer->state.context_roll_without_scissor_emitted = true;
10046    }
10047
10048    radv_set_streamout_enable(cmd_buffer, false);
10049 }
10050
10051 static void
10052 gfx10_emit_streamout_end(struct radv_cmd_buffer *cmd_buffer, uint32_t firstCounterBuffer,
10053                          uint32_t counterBufferCount, const VkBuffer *pCounterBuffers,
10054                          const VkDeviceSize *pCounterBufferOffsets)
10055 {
10056    struct radv_streamout_state *so = &cmd_buffer->state.streamout;
10057    struct radeon_cmdbuf *cs = cmd_buffer->cs;
10058
10059    assert(cmd_buffer->device->physical_device->rad_info.gfx_level >= GFX10);
10060    assert(firstCounterBuffer + counterBufferCount <= MAX_SO_BUFFERS);
10061
10062    u_foreach_bit(i, so->enabled_mask)
10063    {
10064       int32_t counter_buffer_idx = i - firstCounterBuffer;
10065       if (counter_buffer_idx >= 0 && counter_buffer_idx >= counterBufferCount)
10066          counter_buffer_idx = -1;
10067
10068       if (counter_buffer_idx >= 0 && pCounterBuffers && pCounterBuffers[counter_buffer_idx]) {
10069          /* The array of counter buffers is optional. */
10070          RADV_FROM_HANDLE(radv_buffer, buffer, pCounterBuffers[counter_buffer_idx]);
10071          uint64_t va = radv_buffer_get_va(buffer->bo);
10072          uint64_t counter_buffer_offset = 0;
10073
10074          if (pCounterBufferOffsets)
10075             counter_buffer_offset = pCounterBufferOffsets[counter_buffer_idx];
10076
10077          va += buffer->offset + counter_buffer_offset;
10078
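         /* The buffer-filled size lives in GDS on NGG; copy the GDS counter
          * for this buffer into the counter buffer with a PS_DONE end-of-pipe
          * event so the value is written only after prior work has completed.
          */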
10079          si_cs_emit_write_event_eop(cs, cmd_buffer->device->physical_device->rad_info.gfx_level,
10080                                     radv_cmd_buffer_uses_mec(cmd_buffer), V_028A90_PS_DONE, 0,
10081                                     EOP_DST_SEL_TC_L2, EOP_DATA_SEL_GDS, va, EOP_DATA_GDS(i, 1), 0);
10082
10083          radv_cs_add_buffer(cmd_buffer->device->ws, cs, buffer->bo);
10084       }
10085    }
10086
10087    radv_set_streamout_enable(cmd_buffer, false);
10088 }
10089
10090 VKAPI_ATTR void VKAPI_CALL
10091 radv_CmdEndTransformFeedbackEXT(VkCommandBuffer commandBuffer, uint32_t firstCounterBuffer,
10092                                 uint32_t counterBufferCount, const VkBuffer *pCounterBuffers,
10093                                 const VkDeviceSize *pCounterBufferOffsets)
10094 {
10095    RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
10096
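   /* NGG streamout (GFX10+) keeps its counters in GDS and requires a
    * different end sequence than the legacy VGT streamout path.
    */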
10097    if (cmd_buffer->device->physical_device->use_ngg_streamout) {
10098       gfx10_emit_streamout_end(cmd_buffer, firstCounterBuffer, counterBufferCount, pCounterBuffers,
10099                                pCounterBufferOffsets);
10100    } else {
10101       radv_emit_streamout_end(cmd_buffer, firstCounterBuffer, counterBufferCount, pCounterBuffers,
10102                               pCounterBufferOffsets);
10103    }
10104 }
10105
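/* VK_EXT_transform_feedback: draw a vertex count that is only known on the
 * GPU. The count is never read back to the CPU; the draw is emitted with
 * USE_OPAQUE so the hardware derives it from the counter buffer's filled
 * size and the vertex stride.
 */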
10106 VKAPI_ATTR void VKAPI_CALL
10107 radv_CmdDrawIndirectByteCountEXT(VkCommandBuffer commandBuffer, uint32_t instanceCount,
10108                                  uint32_t firstInstance, VkBuffer _counterBuffer,
10109                                  VkDeviceSize counterBufferOffset, uint32_t counterOffset,
10110                                  uint32_t vertexStride)
10111 {
10112    RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
10113    RADV_FROM_HANDLE(radv_buffer, counterBuffer, _counterBuffer);
10114    struct radv_draw_info info;
10115
10116    info.count = 0;
10117    info.instance_count = instanceCount;
10118    info.first_instance = firstInstance;
10119    info.strmout_buffer = counterBuffer;
10120    info.strmout_buffer_offset = counterBufferOffset;
10121    info.stride = vertexStride;
10122    info.indexed = false;
10123    info.indirect = NULL;
10124
10125    if (!radv_before_draw(cmd_buffer, &info, 1))
10126       return;
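   /* Dummy multi-draw entry: its vertex count is ignored because USE_OPAQUE
    * makes the hardware take the count from the streamout counter buffer set
    * up via info.strmout_buffer.
    */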
10127    struct VkMultiDrawInfoEXT minfo = { 0, 0 };
10128    radv_emit_direct_draw_packets(cmd_buffer, &info, 1, &minfo, S_0287F0_USE_OPAQUE(1), 0);
10129    radv_after_draw(cmd_buffer);
10130 }
10131
10132 /* VK_AMD_buffer_marker */
10133 VKAPI_ATTR void VKAPI_CALL
10134 radv_CmdWriteBufferMarker2AMD(VkCommandBuffer commandBuffer, VkPipelineStageFlags2 stage,
10135                               VkBuffer dstBuffer, VkDeviceSize dstOffset, uint32_t marker)
10136 {
10137    RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
10138    RADV_FROM_HANDLE(radv_buffer, buffer, dstBuffer);
10139    struct radeon_cmdbuf *cs = cmd_buffer->cs;
10140    uint64_t va = radv_buffer_get_va(buffer->bo) + buffer->offset + dstOffset;
10141
10142    si_emit_cache_flush(cmd_buffer);
10143
10144    ASSERTED unsigned cdw_max = radeon_check_space(cmd_buffer->device->ws, cmd_buffer->cs, 12);
10145
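   /* For a top-of-pipe-only stage mask the marker can be written immediately
    * with a CP COPY_DATA packet; otherwise it is written from a bottom-of-pipe
    * timestamp event once all prior work has completed.
    */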
10146    if (!(stage & ~VK_PIPELINE_STAGE_2_TOP_OF_PIPE_BIT)) {
10147       radeon_emit(cs, PKT3(PKT3_COPY_DATA, 4, 0));
10148       radeon_emit(cs, COPY_DATA_SRC_SEL(COPY_DATA_IMM) | COPY_DATA_DST_SEL(COPY_DATA_DST_MEM) |
10149                          COPY_DATA_WR_CONFIRM);
10150       radeon_emit(cs, marker);
10151       radeon_emit(cs, 0);
10152       radeon_emit(cs, va);
10153       radeon_emit(cs, va >> 32);
10154    } else {
10155       si_cs_emit_write_event_eop(cs, cmd_buffer->device->physical_device->rad_info.gfx_level,
10156                                  radv_cmd_buffer_uses_mec(cmd_buffer), V_028A90_BOTTOM_OF_PIPE_TS,
10157                                  0, EOP_DST_SEL_MEM, EOP_DATA_SEL_VALUE_32BIT, va, marker,
10158                                  cmd_buffer->gfx9_eop_bug_va);
10159    }
10160
10161    assert(cmd_buffer->cs->cdw <= cdw_max);
10162 }
10163
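/* VK_NV_device_generated_commands: binding shader groups is unimplemented. */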
10164 VKAPI_ATTR void VKAPI_CALL
10165 radv_CmdBindPipelineShaderGroupNV(VkCommandBuffer commandBuffer,
10166                                   VkPipelineBindPoint pipelineBindPoint, VkPipeline pipeline,
10167                                   uint32_t groupIndex)
10168 {
10169    fprintf(stderr, "radv: unimplemented vkCmdBindPipelineShaderGroupNV\n");
10170    abort();
10171 }