From 1a72ca5667f82b8144c0003bddd76a774221ac09 Mon Sep 17 00:00:00 2001 From: Bas Nieuwenhuizen Date: Sat, 2 Sep 2017 12:59:55 +0200 Subject: [PATCH] radv: Put semaphore waits in preamble cs. The separate flush cs gets in the way of batchchain. Reviewed-by: Dave Airlie --- src/amd/vulkan/radv_device.c | 94 +++++++++++++++++-------------------------- src/amd/vulkan/radv_private.h | 3 +- 2 files changed, 37 insertions(+), 60 deletions(-) diff --git a/src/amd/vulkan/radv_device.c b/src/amd/vulkan/radv_device.c index a589869..d4dd2db 100644 --- a/src/amd/vulkan/radv_device.c +++ b/src/amd/vulkan/radv_device.c @@ -1000,6 +1000,8 @@ radv_queue_finish(struct radv_queue *queue) if (queue->hw_ctx) queue->device->ws->ctx_destroy(queue->hw_ctx); + if (queue->initial_full_flush_preamble_cs) + queue->device->ws->cs_destroy(queue->initial_full_flush_preamble_cs); if (queue->initial_preamble_cs) queue->device->ws->cs_destroy(queue->initial_preamble_cs); if (queue->continue_preamble_cs) @@ -1178,41 +1180,6 @@ VkResult radv_CreateDevice( break; } device->ws->cs_finalize(device->empty_cs[family]); - - device->flush_cs[family] = device->ws->cs_create(device->ws, family); - switch (family) { - case RADV_QUEUE_GENERAL: - case RADV_QUEUE_COMPUTE: - si_cs_emit_cache_flush(device->flush_cs[family], - false, - device->physical_device->rad_info.chip_class, - NULL, 0, - family == RADV_QUEUE_COMPUTE && device->physical_device->rad_info.chip_class >= CIK, - RADV_CMD_FLAG_INV_ICACHE | - RADV_CMD_FLAG_INV_SMEM_L1 | - RADV_CMD_FLAG_INV_VMEM_L1 | - RADV_CMD_FLAG_INV_GLOBAL_L2); - break; - } - device->ws->cs_finalize(device->flush_cs[family]); - - device->flush_shader_cs[family] = device->ws->cs_create(device->ws, family); - switch (family) { - case RADV_QUEUE_GENERAL: - case RADV_QUEUE_COMPUTE: - si_cs_emit_cache_flush(device->flush_shader_cs[family], - false, - device->physical_device->rad_info.chip_class, - NULL, 0, - family == RADV_QUEUE_COMPUTE && device->physical_device->rad_info.chip_class >= CIK, - family == RADV_QUEUE_COMPUTE ? RADV_CMD_FLAG_CS_PARTIAL_FLUSH : (RADV_CMD_FLAG_CS_PARTIAL_FLUSH | RADV_CMD_FLAG_PS_PARTIAL_FLUSH) | - RADV_CMD_FLAG_INV_ICACHE | - RADV_CMD_FLAG_INV_SMEM_L1 | - RADV_CMD_FLAG_INV_VMEM_L1 | - RADV_CMD_FLAG_INV_GLOBAL_L2); - break; - } - device->ws->cs_finalize(device->flush_shader_cs[family]); } if (getenv("RADV_TRACE_FILE")) { @@ -1280,10 +1247,6 @@ void radv_DestroyDevice( vk_free(&device->alloc, device->queues[i]); if (device->empty_cs[i]) device->ws->cs_destroy(device->empty_cs[i]); - if (device->flush_cs[i]) - device->ws->cs_destroy(device->flush_cs[i]); - if (device->flush_shader_cs[i]) - device->ws->cs_destroy(device->flush_shader_cs[i]); } radv_device_finish_meta(device); @@ -1576,6 +1539,7 @@ radv_get_preamble_cs(struct radv_queue *queue, uint32_t gsvs_ring_size, bool needs_tess_rings, bool needs_sample_positions, + struct radeon_winsys_cs **initial_full_flush_preamble_cs, struct radeon_winsys_cs **initial_preamble_cs, struct radeon_winsys_cs **continue_preamble_cs) { @@ -1586,7 +1550,7 @@ radv_get_preamble_cs(struct radv_queue *queue, struct radeon_winsys_bo *gsvs_ring_bo = NULL; struct radeon_winsys_bo *tess_factor_ring_bo = NULL; struct radeon_winsys_bo *tess_offchip_ring_bo = NULL; - struct radeon_winsys_cs *dest_cs[2] = {0}; + struct radeon_winsys_cs *dest_cs[3] = {0}; bool add_tess_rings = false, add_sample_positions = false; unsigned tess_factor_ring_size = 0, tess_offchip_ring_size = 0; unsigned max_offchip_buffers; @@ -1611,6 +1575,7 @@ radv_get_preamble_cs(struct radv_queue *queue, gsvs_ring_size <= queue->gsvs_ring_size && !add_tess_rings && !add_sample_positions && queue->initial_preamble_cs) { + *initial_full_flush_preamble_cs = queue->initial_full_flush_preamble_cs; *initial_preamble_cs = queue->initial_preamble_cs; *continue_preamble_cs = queue->continue_preamble_cs; if (!scratch_size && !compute_scratch_size && !esgs_ring_size && !gsvs_ring_size) @@ -1712,7 +1677,7 @@ radv_get_preamble_cs(struct radv_queue *queue, } else descriptor_bo = queue->descriptor_bo; - for(int i = 0; i < 2; ++i) { + for(int i = 0; i < 3; ++i) { struct radeon_winsys_cs *cs = NULL; cs = queue->device->ws->cs_create(queue->device->ws, queue->queue_family_index ? RING_COMPUTE : RING_GFX); @@ -1831,7 +1796,19 @@ radv_get_preamble_cs(struct radv_queue *queue, radeon_emit(cs, rsrc1); } - if (!i) { + if (i == 0) { + si_cs_emit_cache_flush(cs, + false, + queue->device->physical_device->rad_info.chip_class, + NULL, 0, + queue->queue_family_index == RING_COMPUTE && + queue->device->physical_device->rad_info.chip_class >= CIK, + (queue->queue_family_index == RADV_QUEUE_COMPUTE ? RADV_CMD_FLAG_CS_PARTIAL_FLUSH : (RADV_CMD_FLAG_CS_PARTIAL_FLUSH | RADV_CMD_FLAG_PS_PARTIAL_FLUSH)) | + RADV_CMD_FLAG_INV_ICACHE | + RADV_CMD_FLAG_INV_SMEM_L1 | + RADV_CMD_FLAG_INV_VMEM_L1 | + RADV_CMD_FLAG_INV_GLOBAL_L2); + } else if (i == 1) { si_cs_emit_cache_flush(cs, false, queue->device->physical_device->rad_info.chip_class, @@ -1848,14 +1825,18 @@ radv_get_preamble_cs(struct radv_queue *queue, goto fail; } + if (queue->initial_full_flush_preamble_cs) + queue->device->ws->cs_destroy(queue->initial_full_flush_preamble_cs); + if (queue->initial_preamble_cs) queue->device->ws->cs_destroy(queue->initial_preamble_cs); if (queue->continue_preamble_cs) queue->device->ws->cs_destroy(queue->continue_preamble_cs); - queue->initial_preamble_cs = dest_cs[0]; - queue->continue_preamble_cs = dest_cs[1]; + queue->initial_full_flush_preamble_cs = dest_cs[0]; + queue->initial_preamble_cs = dest_cs[1]; + queue->continue_preamble_cs = dest_cs[2]; if (scratch_bo != queue->scratch_bo) { if (queue->scratch_bo) @@ -1904,6 +1885,7 @@ radv_get_preamble_cs(struct radv_queue *queue, if (add_sample_positions) queue->has_sample_positions = true; + *initial_full_flush_preamble_cs = queue->initial_full_flush_preamble_cs; *initial_preamble_cs = queue->initial_preamble_cs; *continue_preamble_cs = queue->continue_preamble_cs; if (!scratch_size && !compute_scratch_size && !esgs_ring_size && !gsvs_ring_size) @@ -2028,7 +2010,7 @@ VkResult radv_QueueSubmit( uint32_t scratch_size = 0; uint32_t compute_scratch_size = 0; uint32_t esgs_ring_size = 0, gsvs_ring_size = 0; - struct radeon_winsys_cs *initial_preamble_cs = NULL, *continue_preamble_cs = NULL; + struct radeon_winsys_cs *initial_preamble_cs = NULL, *initial_flush_preamble_cs = NULL, *continue_preamble_cs = NULL; VkResult result; bool fence_emitted = false; bool tess_rings_needed = false; @@ -2053,7 +2035,7 @@ VkResult radv_QueueSubmit( result = radv_get_preamble_cs(queue, scratch_size, compute_scratch_size, esgs_ring_size, gsvs_ring_size, tess_rings_needed, - sample_positions_needed, + sample_positions_needed, &initial_flush_preamble_cs, &initial_preamble_cs, &continue_preamble_cs); if (result != VK_SUCCESS) return result; @@ -2061,7 +2043,7 @@ VkResult radv_QueueSubmit( for (uint32_t i = 0; i < submitCount; i++) { struct radeon_winsys_cs **cs_array; bool do_flush = !i || pSubmits[i].pWaitDstStageMask; - bool can_patch = !do_flush; + bool can_patch = true; uint32_t advance; struct radv_winsys_sem_info sem_info; @@ -2091,35 +2073,31 @@ VkResult radv_QueueSubmit( } cs_array = malloc(sizeof(struct radeon_winsys_cs *) * - (pSubmits[i].commandBufferCount + do_flush)); - - if(do_flush) - cs_array[0] = pSubmits[i].waitSemaphoreCount ? - queue->device->flush_shader_cs[queue->queue_family_index] : - queue->device->flush_cs[queue->queue_family_index]; + (pSubmits[i].commandBufferCount)); for (uint32_t j = 0; j < pSubmits[i].commandBufferCount; j++) { RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, pSubmits[i].pCommandBuffers[j]); assert(cmd_buffer->level == VK_COMMAND_BUFFER_LEVEL_PRIMARY); - cs_array[j + do_flush] = cmd_buffer->cs; + cs_array[j] = cmd_buffer->cs; if ((cmd_buffer->usage_flags & VK_COMMAND_BUFFER_USAGE_SIMULTANEOUS_USE_BIT)) can_patch = false; } - for (uint32_t j = 0; j < pSubmits[i].commandBufferCount + do_flush; j += advance) { + for (uint32_t j = 0; j < pSubmits[i].commandBufferCount; j += advance) { + struct radeon_winsys_cs *initial_preamble = (do_flush && !j) ? initial_flush_preamble_cs : initial_preamble_cs; advance = MIN2(max_cs_submission, - pSubmits[i].commandBufferCount + do_flush - j); + pSubmits[i].commandBufferCount - j); if (queue->device->trace_bo) *queue->device->trace_id_ptr = 0; sem_info.cs_emit_wait = j == 0; - sem_info.cs_emit_signal = j + advance == pSubmits[i].commandBufferCount + do_flush; + sem_info.cs_emit_signal = j + advance == pSubmits[i].commandBufferCount; ret = queue->device->ws->cs_submit(ctx, queue->queue_idx, cs_array + j, - advance, initial_preamble_cs, continue_preamble_cs, + advance, initial_preamble, continue_preamble_cs, &sem_info, can_patch, base_fence); diff --git a/src/amd/vulkan/radv_private.h b/src/amd/vulkan/radv_private.h index 28e70e6..07e3a49 100644 --- a/src/amd/vulkan/radv_private.h +++ b/src/amd/vulkan/radv_private.h @@ -499,6 +499,7 @@ struct radv_queue { struct radeon_winsys_bo *tess_factor_ring_bo; struct radeon_winsys_bo *tess_offchip_ring_bo; struct radeon_winsys_cs *initial_preamble_cs; + struct radeon_winsys_cs *initial_full_flush_preamble_cs; struct radeon_winsys_cs *continue_preamble_cs; }; @@ -515,8 +516,6 @@ struct radv_device { struct radv_queue *queues[RADV_MAX_QUEUE_FAMILIES]; int queue_count[RADV_MAX_QUEUE_FAMILIES]; struct radeon_winsys_cs *empty_cs[RADV_MAX_QUEUE_FAMILIES]; - struct radeon_winsys_cs *flush_cs[RADV_MAX_QUEUE_FAMILIES]; - struct radeon_winsys_cs *flush_shader_cs[RADV_MAX_QUEUE_FAMILIES]; uint64_t debug_flags; bool llvm_supports_spill; -- 2.7.4