From 5a1cbafd9d39f46da69dc091f966f5a205679531 Mon Sep 17 00:00:00 2001
From: =?utf8?q?Timur=20Krist=C3=B3f?=
Date: Thu, 12 May 2022 01:27:03 +0200
Subject: [PATCH] radv: Submit internal compute cmdbuf.
MIME-Version: 1.0
Content-Type: text/plain; charset=utf8
Content-Transfer-Encoding: 8bit

Use scheduled dependencies to create two submissions:
first we submit ACE, then GFX.

Signed-off-by: Timur Kristóf
Part-of:
---
 src/amd/vulkan/radv_device.c  | 153 ++++++++++++++++++++++++++++++++++++++++++
 src/amd/vulkan/radv_private.h |   1 +
 2 files changed, 154 insertions(+)

diff --git a/src/amd/vulkan/radv_device.c b/src/amd/vulkan/radv_device.c
index d5627be..e3f8cf3 100644
--- a/src/amd/vulkan/radv_device.c
+++ b/src/amd/vulkan/radv_device.c
@@ -2874,10 +2874,33 @@ radv_queue_state_finish(struct radv_queue_state *queue, struct radeon_winsys *ws
 static void
 radv_queue_finish(struct radv_queue *queue)
 {
+   if (queue->ace_internal_state) {
+      /* Prevent double free */
+      queue->ace_internal_state->task_rings_bo = NULL;
+
+      /* Clean up the internal ACE queue state. */
+      radv_queue_state_finish(queue->ace_internal_state, queue->device->ws);
+      free(queue->ace_internal_state);
+   }
+
    radv_queue_state_finish(&queue->state, queue->device->ws);
    vk_queue_finish(&queue->vk);
 }
 
+static bool
+radv_queue_init_ace_internal_state(struct radv_queue *queue)
+{
+   if (queue->ace_internal_state)
+      return true;
+
+   queue->ace_internal_state = calloc(1, sizeof(struct radv_queue_state));
+   if (!queue->ace_internal_state)
+      return false;
+
+   queue->ace_internal_state->qf = RADV_QUEUE_COMPUTE;
+   return true;
+}
+
 static VkResult
 radv_device_init_border_color(struct radv_device *device)
 {
@@ -4893,6 +4916,37 @@ radv_update_preambles(struct radv_queue_state *queue, struct radv_device *device
    return radv_update_preamble_cs(queue, device, &needs);
 }
 
+static VkResult
+radv_update_ace_preambles(struct radv_queue *queue)
+{
+   if (!radv_queue_init_ace_internal_state(queue))
+      return VK_ERROR_OUT_OF_HOST_MEMORY;
+
+   /* Copy the task rings state.
+    * Task shaders that are submitted on the ACE queue need to share
+    * their ring buffers with the mesh shaders on the GFX queue.
+    */
+   queue->ace_internal_state->ring_info.task_rings = queue->state.ring_info.task_rings;
+   queue->ace_internal_state->task_rings_bo = queue->state.task_rings_bo;
+
+   /* Copy some needed states from the parent queue state.
+    * These can only increase, so it's okay to copy them as-is without checking.
+    * Note that task shaders use the scratch size from their graphics pipeline.
+    */
+   struct radv_queue_ring_info needs = queue->ace_internal_state->ring_info;
+   needs.compute_scratch_size_per_wave = queue->state.ring_info.scratch_size_per_wave;
+   needs.compute_scratch_waves = queue->state.ring_info.scratch_waves;
+   needs.task_rings = queue->state.ring_info.task_rings;
+
+   return radv_update_preamble_cs(queue->ace_internal_state, queue->device, &needs);
+}
+
+static bool
+radv_cmd_buffer_needs_ace(const struct radv_cmd_buffer *cmd_buffer)
+{
+   return cmd_buffer->ace_internal.cs && cmd_buffer->task_rings_needed;
+}
+
 struct radv_deferred_queue_submission {
    struct radv_queue *queue;
    VkCommandBuffer *cmd_buffers;
@@ -4965,11 +5019,103 @@ radv_queue_submit_empty(struct radv_queue *queue, struct vk_queue_submis
 }
 
 static VkResult
+radv_queue_submit_with_ace(struct radv_queue *queue, struct vk_queue_submit *submission,
+                           struct radeon_cmdbuf **cs_array, unsigned cs_count, unsigned cs_offset,
+                           bool can_patch)
+{
+   /* Submits command buffers that may have an internal ACE cmdbuf
+    * using scheduled dependencies. This guarantees that the GFX cmdbuf
+    * is only scheduled after ACE.
+    *
+    * TODO: Unfortunately this is prone to a deadlock, so it is considered a
+    * temporary solution until gang submit is merged in the upstream kernel.
+    */
+   struct radeon_winsys_ctx *ctx = queue->hw_ctx;
+   const uint32_t max_cs_submission = queue->device->trace_bo ? 1 : RADV_MAX_IBS_PER_SUBMIT;
+   const bool need_wait = submission->wait_count > 0;
+   VkResult result = VK_SUCCESS;
+
+   struct radeon_cmdbuf **ace_cs_array = calloc(max_cs_submission, sizeof(struct radeon_cmdbuf *));
+   if (!ace_cs_array) {
+      result = VK_ERROR_OUT_OF_HOST_MEMORY;
+      goto finish;
+   }
+
+   result = radv_update_ace_preambles(queue);
+   if (result != VK_SUCCESS)
+      goto finish;
+
+   struct radv_winsys_submit_info submit[2] = {
+      {
+         .ip_type = AMD_IP_COMPUTE,
+         .cs_array = ace_cs_array,
+         .cs_count = 0,
+         .initial_preamble_cs = need_wait
+                                   ? queue->ace_internal_state->initial_full_flush_preamble_cs
+                                   : queue->ace_internal_state->initial_preamble_cs,
+      },
+      {
+         .ip_type = radv_queue_ring(queue),
+         .queue_index = queue->vk.index_in_family,
+         .cs_array = cs_array,
+         .cs_count = 0,
+         .initial_preamble_cs = need_wait ? queue->state.initial_full_flush_preamble_cs
+                                          : queue->state.initial_preamble_cs,
+      }};
+
+   for (uint32_t advance, j = 0; j < cs_count; j += advance) {
+      advance = MIN2(max_cs_submission, cs_count - j);
+      bool last_submit = j + advance == cs_count;
+
+      if (queue->device->trace_bo)
+         *queue->device->trace_id_ptr = 0;
+
+      for (unsigned c = 0; c < advance; ++c) {
+         const struct radv_cmd_buffer *cmd_buffer =
+            (struct radv_cmd_buffer *)submission->command_buffers[j + c + cs_offset];
+         if (!radv_cmd_buffer_needs_ace(cmd_buffer))
+            continue;
+
+         submit[0].cs_array[submit[0].cs_count++] = cmd_buffer->ace_internal.cs;
+      }
+
+      const uint32_t submit_count = 1 + !!submit[0].cs_count;
+      const struct radv_winsys_submit_info *submit_ptr = submit + !submit[0].cs_count;
+      submit[1].cs_count = advance;
+
+      result = queue->device->ws->cs_submit(
+         ctx, submit_count, submit_ptr, j == 0 ? submission->wait_count : 0, submission->waits,
+         last_submit ? submission->signal_count : 0, submission->signals, can_patch);
+
+      if (result != VK_SUCCESS)
+         goto finish;
+
+      if (queue->device->trace_bo) {
+         radv_check_gpu_hangs(queue, cs_array[j]);
+      }
+
+      if (queue->device->tma_bo) {
+         radv_check_trap_handler(queue);
+      }
+
+      submit[1].cs_array += submit[1].cs_count;
+      submit[1].initial_preamble_cs = queue->state.initial_preamble_cs;
+      submit[0].cs_count = 0;
+      submit[0].initial_preamble_cs = queue->ace_internal_state->initial_preamble_cs;
+   }
+
+finish:
+   free(ace_cs_array);
+   return result;
+}
+
+static VkResult
 radv_queue_submit_normal(struct radv_queue *queue, struct vk_queue_submit *submission)
 {
    struct radeon_winsys_ctx *ctx = queue->hw_ctx;
    uint32_t max_cs_submission = queue->device->trace_bo ? 1 : RADV_MAX_IBS_PER_SUBMIT;
    bool can_patch = true;
+   bool use_ace = false;
    uint32_t advance;
    VkResult result;
    bool uses_perf_counters = false;
@@ -4999,6 +5145,7 @@ radv_queue_submit_normal(struct radv_queue *queue, struct vk_queue_submi
          can_patch = false;
 
       cmd_buffer->status = RADV_CMD_BUFFER_STATUS_PENDING;
+      use_ace |= radv_cmd_buffer_needs_ace(cmd_buffer);
    }
 
    if (uses_perf_counters) {
@@ -5013,6 +5160,12 @@ radv_queue_submit_normal(struct radv_queue *queue, struct vk_queue_submi
       }
    }
 
+   if (use_ace) {
+      result = radv_queue_submit_with_ace(queue, submission, cs_array, cmd_buffer_count, cs_offset,
+                                          can_patch);
+      goto fail;
+   }
+
    /* For fences on the same queue/vm amdgpu doesn't wait till all processing is finished
     * before starting the next cmdbuffer, so we need to do it here. */
    bool need_wait = submission->wait_count > 0;
diff --git a/src/amd/vulkan/radv_private.h b/src/amd/vulkan/radv_private.h
index 1f01a0e..c36c867 100644
--- a/src/amd/vulkan/radv_private.h
+++ b/src/amd/vulkan/radv_private.h
@@ -765,6 +765,7 @@ struct radv_queue {
    struct radeon_winsys_ctx *hw_ctx;
    enum radeon_ctx_priority priority;
    struct radv_queue_state state;
+   struct radv_queue_state *ace_internal_state;
 };
 
 #define RADV_BORDER_COLOR_COUNT 4096
-- 
2.7.4