From 83fee30e85965c9c1caffd0ad829bacaef12ae7c Mon Sep 17 00:00:00 2001
From: Lionel Landwerlin
Date: Wed, 9 Dec 2020 13:22:45 +0200
Subject: [PATCH] anv: allow multiple command buffers in anv_queue_submit

v2: Fixup crash spotted by Mark about missing alloc vfuncs

v3: Fixup double iteration over device->memory_objects (that ought to be
    expensive...) (Ken)

v4: Add more asserts for non-softpin cases (Ken)

Signed-off-by: Lionel Landwerlin
Reviewed-by: Kenneth Graunke
Part-of:
---
 src/intel/vulkan/anv_batch_chain.c | 355 ++++++++++++++++++++++++-------------
 src/intel/vulkan/anv_private.h     |   5 +-
 src/intel/vulkan/anv_queue.c       |  28 ++-
 3 files changed, 265 insertions(+), 123 deletions(-)
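A note on the mechanics (commentary here, like the diffstat above, is ignored
by git-am): the heart of the change is anv_cmd_buffer_record_chain_submit(),
which rewrites the reserved tail of one command buffer's last batch BO into an
MI_BATCH_BUFFER_START that jumps to the first batch BO of the next command
buffer; only the last command buffer keeps its MI_BATCH_BUFFER_END. The
standalone C sketch below models just that idea -- struct fake_batch and the
OP_* opcodes are invented placeholders, not the real GEN encodings or anv
types:

  #include <inttypes.h>
  #include <stdint.h>
  #include <stdio.h>

  /* Placeholder opcodes -- NOT the real GEN command encodings. */
  #define OP_BATCH_BUFFER_START 0x1u
  #define OP_BATCH_BUFFER_END   0x2u

  struct fake_batch {
     uint64_t gpu_addr;  /* softpin address of this batch buffer */
     uint32_t tail[3];   /* space reserved at the end for one packet */
  };

  /* Chain the first N-1 batches together and terminate the last one,
   * mirroring chain_command_buffers() in the patch. */
  static void chain_batches(struct fake_batch *batches, uint32_t n)
  {
     for (uint32_t i = 0; i + 1 < n; i++) {
        /* anv_cmd_buffer_record_chain_submit() equivalent: overwrite the
         * tail of batch i with a jump to the start of batch i + 1. */
        batches[i].tail[0] = OP_BATCH_BUFFER_START;
        batches[i].tail[1] = (uint32_t)batches[i + 1].gpu_addr;
        batches[i].tail[2] = (uint32_t)(batches[i + 1].gpu_addr >> 32);
     }
     /* anv_cmd_buffer_record_end_submit() equivalent. */
     batches[n - 1].tail[0] = OP_BATCH_BUFFER_END;
  }

  int main(void)
  {
     struct fake_batch batches[3] = {
        { .gpu_addr = 0x100000 },
        { .gpu_addr = 0x200000 },
        { .gpu_addr = 0x300000 },
     };
     chain_batches(batches, 3);
     for (int i = 0; i < 3; i++) {
        uint64_t target = ((uint64_t)batches[i].tail[2] << 32) |
                          batches[i].tail[1];
        printf("batch %d: op=0x%" PRIx32 " target=0x%" PRIx64 "\n",
               i, batches[i].tail[0], target);
     }
     return 0;
  }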
diff --git a/src/intel/vulkan/anv_batch_chain.c b/src/intel/vulkan/anv_batch_chain.c
index 0116f9a..26144f5 100644
--- a/src/intel/vulkan/anv_batch_chain.c
+++ b/src/intel/vulkan/anv_batch_chain.c
@@ -615,6 +615,37 @@ cmd_buffer_chain_to_batch_bo(struct anv_cmd_buffer *cmd_buffer,
 }
 
 static void
+anv_cmd_buffer_record_chain_submit(struct anv_cmd_buffer *cmd_buffer_from,
+                                   struct anv_cmd_buffer *cmd_buffer_to)
+{
+   assert(cmd_buffer_from->device->physical->use_softpin);
+
+   uint32_t *bb_start = cmd_buffer_from->batch_end;
+
+   struct anv_batch_bo *last_bbo =
+      list_last_entry(&cmd_buffer_from->batch_bos, struct anv_batch_bo, link);
+   struct anv_batch_bo *first_bbo =
+      list_first_entry(&cmd_buffer_to->batch_bos, struct anv_batch_bo, link);
+
+   struct GEN8_MI_BATCH_BUFFER_START gen_bb_start = {
+      __anv_cmd_header(GEN8_MI_BATCH_BUFFER_START),
+      .SecondLevelBatchBuffer = Firstlevelbatch,
+      .AddressSpaceIndicator = ASI_PPGTT,
+      .BatchBufferStartAddress = (struct anv_address) { first_bbo->bo, 0 },
+   };
+   struct anv_batch local_batch = {
+      .start = last_bbo->bo->map,
+      .end = last_bbo->bo->map + last_bbo->bo->size,
+      .relocs = &last_bbo->relocs,
+      .alloc = &cmd_buffer_from->pool->alloc,
+   };
+
+   __anv_cmd_pack(GEN8_MI_BATCH_BUFFER_START)(&local_batch, bb_start, &gen_bb_start);
+
+   last_bbo->chained = true;
+}
+
+static void
 anv_cmd_buffer_record_end_submit(struct anv_cmd_buffer *cmd_buffer)
 {
    assert(cmd_buffer->device->physical->use_softpin);
@@ -1135,6 +1166,11 @@ struct anv_execbuf {
    /* Allocated length of the 'objects' and 'bos' arrays */
    uint32_t                                  array_length;
 
+   /* List of relocations for surface states, only used with platforms not
+    * using softpin.
+    */
+   void *                                    surface_states_relocs;
+
    /* Indicates whether any of the command buffers have relocations. This
     * doesn't not necessarily mean we'll need the kernel to process them. It
     * might be that a previous execbuf has already placed things in the VMA
@@ -1157,6 +1193,7 @@ anv_execbuf_init(struct anv_execbuf *exec)
 static void
 anv_execbuf_finish(struct anv_execbuf *exec)
 {
+   vk_free(exec->alloc, exec->surface_states_relocs);
    vk_free(exec->alloc, exec->objects);
    vk_free(exec->alloc, exec->bos);
 }
@@ -1434,8 +1471,7 @@ anv_reloc_list_apply(struct anv_device *device,
  * have to make a full copy of all the relocations lists.
  */
 static bool
-relocate_cmd_buffer(struct anv_cmd_buffer *cmd_buffer,
-                    struct anv_execbuf *exec)
+execbuf_can_skip_relocations(struct anv_execbuf *exec)
 {
    if (!exec->has_relocs)
       return true;
@@ -1459,6 +1495,13 @@ relocate_cmd_buffer(struct anv_cmd_buffer *cmd_buffer,
       return false;
    }
 
+   return true;
+}
+
+static void
+relocate_cmd_buffer(struct anv_cmd_buffer *cmd_buffer,
+                    struct anv_execbuf *exec)
+{
    /* Since surface states are shared between command buffers and we don't
    * know what order they will be submitted to the kernel, we don't know
    * what address is actually written in the surface state object at any
@@ -1482,16 +1525,27 @@ relocate_cmd_buffer(struct anv_cmd_buffer *cmd_buffer,
 
    for (uint32_t i = 0; i < exec->bo_count; i++)
       exec->objects[i].offset = exec->bos[i]->offset;
+}
 
-   return true;
+static void
+reset_cmd_buffer_surface_offsets(struct anv_cmd_buffer *cmd_buffer)
+{
+   /* In the case where we fall back to doing kernel relocations, we need to
+    * ensure that the relocation list is valid. All relocations on the batch
+    * buffers are already valid and kept up-to-date. Since surface states are
+    * shared between command buffers and we don't know what order they will be
+    * submitted to the kernel, we don't know what address is actually written
+    * in the surface state object at any given time. The only option is to set
+    * a bogus presumed offset and let the kernel relocate them.
+    */
+   for (size_t i = 0; i < cmd_buffer->surface_relocs.num_relocs; i++)
+      cmd_buffer->surface_relocs.relocs[i].presumed_offset = -1;
 }
 
 static VkResult
 setup_execbuf_for_cmd_buffer(struct anv_execbuf *execbuf,
-                             struct anv_queue *queue,
                              struct anv_cmd_buffer *cmd_buffer)
 {
-   struct anv_batch *batch = &cmd_buffer->batch;
    struct anv_state_pool *ss_pool =
       &cmd_buffer->device->surface_state_pool;
@@ -1499,58 +1553,10 @@ setup_execbuf_for_cmd_buffer(struct anv_execbuf *execbuf,
                          cmd_buffer->last_ss_pool_center);
    VkResult result;
    if (cmd_buffer->device->physical->use_softpin) {
-      anv_block_pool_foreach_bo(bo, &ss_pool->block_pool) {
-         result = anv_execbuf_add_bo(cmd_buffer->device, execbuf,
-                                     bo, NULL, 0);
-         if (result != VK_SUCCESS)
-            return result;
-      }
       /* Add surface dependencies (BOs) to the execbuf */
       anv_execbuf_add_bo_bitset(cmd_buffer->device, execbuf,
                                 cmd_buffer->surface_relocs.dep_words,
                                 cmd_buffer->surface_relocs.deps, 0);
-
-      /* Add the BOs for all memory objects */
-      list_for_each_entry(struct anv_device_memory, mem,
-                          &cmd_buffer->device->memory_objects, link) {
-         result = anv_execbuf_add_bo(cmd_buffer->device, execbuf,
-                                     mem->bo, NULL, 0);
-         if (result != VK_SUCCESS)
-            return result;
-      }
-
-      struct anv_block_pool *pool;
-      pool = &cmd_buffer->device->general_state_pool.block_pool;
-      anv_block_pool_foreach_bo(bo, pool) {
-         result = anv_execbuf_add_bo(cmd_buffer->device, execbuf,
-                                     bo, NULL, 0);
-         if (result != VK_SUCCESS)
-            return result;
-      }
-
-      pool = &cmd_buffer->device->dynamic_state_pool.block_pool;
-      anv_block_pool_foreach_bo(bo, pool) {
-         result = anv_execbuf_add_bo(cmd_buffer->device, execbuf,
-                                     bo, NULL, 0);
-         if (result != VK_SUCCESS)
-            return result;
-      }
-
-      pool = &cmd_buffer->device->instruction_state_pool.block_pool;
-      anv_block_pool_foreach_bo(bo, pool) {
-         result = anv_execbuf_add_bo(cmd_buffer->device, execbuf,
-                                     bo, NULL, 0);
-         if (result != VK_SUCCESS)
-            return result;
-      }
-
-      pool = &cmd_buffer->device->binding_table_pool.block_pool;
-      anv_block_pool_foreach_bo(bo, pool) {
-         result = anv_execbuf_add_bo(cmd_buffer->device, execbuf,
-                                     bo, NULL, 0);
-         if (result != VK_SUCCESS)
-            return result;
-      }
    } else {
       /* Since we aren't in the softpin case, all of our STATE_BASE_ADDRESS BOs
        * will get added automatically by processing relocations on the batch
@@ -1584,8 +1590,146 @@ setup_execbuf_for_cmd_buffer(struct anv_execbuf *execbuf,
     */
    cmd_buffer->last_ss_pool_center = ss_pool->block_pool.center_bo_offset;
 
+   return VK_SUCCESS;
+}
+
+static void
+chain_command_buffers(struct anv_cmd_buffer **cmd_buffers,
+                      uint32_t num_cmd_buffers)
+{
+   if (!anv_cmd_buffer_is_chainable(cmd_buffers[0])) {
+      assert(num_cmd_buffers == 1);
+      return;
+   }
+
+   /* Chain the N-1 first batch buffers */
+   for (uint32_t i = 0; i < (num_cmd_buffers - 1); i++)
+      anv_cmd_buffer_record_chain_submit(cmd_buffers[i], cmd_buffers[i + 1]);
+
+   /* Put an end to the last one */
+   anv_cmd_buffer_record_end_submit(cmd_buffers[num_cmd_buffers - 1]);
+}
+
+static VkResult
+setup_execbuf_for_cmd_buffers(struct anv_execbuf *execbuf,
+                              struct anv_queue *queue,
+                              struct anv_cmd_buffer **cmd_buffers,
+                              uint32_t num_cmd_buffers)
+{
+   struct anv_device *device = queue->device;
+   struct anv_state_pool *ss_pool = &device->surface_state_pool;
+   VkResult result;
+
+   /* Edit the tail of the command buffers to chain them all together if they
+    * can be.
+    */
+   chain_command_buffers(cmd_buffers, num_cmd_buffers);
+
+   for (uint32_t i = 0; i < num_cmd_buffers; i++) {
+      result = setup_execbuf_for_cmd_buffer(execbuf, cmd_buffers[i]);
+      if (result != VK_SUCCESS)
+         return result;
+   }
+
+   /* Add all the global BOs to the object list for softpin case. */
+   if (device->physical->use_softpin) {
+      anv_block_pool_foreach_bo(bo, &ss_pool->block_pool) {
+         result = anv_execbuf_add_bo(device, execbuf, bo, NULL, 0);
+         if (result != VK_SUCCESS)
+            return result;
+      }
+
+      struct anv_block_pool *pool;
+      pool = &device->dynamic_state_pool.block_pool;
+      anv_block_pool_foreach_bo(bo, pool) {
+         result = anv_execbuf_add_bo(device, execbuf, bo, NULL, 0);
+         if (result != VK_SUCCESS)
+            return result;
+      }
+
+      pool = &device->instruction_state_pool.block_pool;
+      anv_block_pool_foreach_bo(bo, pool) {
+         result = anv_execbuf_add_bo(device, execbuf, bo, NULL, 0);
+         if (result != VK_SUCCESS)
+            return result;
+      }
+
+      pool = &device->binding_table_pool.block_pool;
+      anv_block_pool_foreach_bo(bo, pool) {
+         result = anv_execbuf_add_bo(device, execbuf, bo, NULL, 0);
+         if (result != VK_SUCCESS)
+            return result;
+      }
+
+      /* Add the BOs for all user allocated memory objects because we can't
+       * track after binding updates of VK_EXT_descriptor_indexing.
+       */
+      list_for_each_entry(struct anv_device_memory, mem,
+                          &device->memory_objects, link) {
+         result = anv_execbuf_add_bo(device, execbuf, mem->bo, NULL, 0);
+         if (result != VK_SUCCESS)
+            return result;
+      }
+   } else {
+      /* We do not support chaining primary command buffers without
+       * softpin.
+       */
+      assert(num_cmd_buffers == 1);
+   }
+
+   bool no_reloc = true;
+   if (execbuf->has_relocs) {
+      no_reloc = execbuf_can_skip_relocations(execbuf);
+      if (no_reloc) {
+         /* If we were able to successfully relocate everything, tell the
+          * kernel that it can skip doing relocations. The requirement for
+          * using NO_RELOC is:
+          *
+          *    1) The addresses written in the objects must match the
+          *       corresponding reloc.presumed_offset which in turn must match
+          *       the corresponding execobject.offset.
+          *
+          *    2) To avoid stalling, execobject.offset should match the current
+          *       address of that object within the active context.
+          *
+          * In order to satisfy all of the invariants that make userspace
+          * relocations to be safe (see relocate_cmd_buffer()), we need to
+          * further ensure that the addresses we use match those used by the
+          * kernel for the most recent execbuf2.
+          *
+          * The kernel may still choose to do relocations anyway if something
+          * has moved in the GTT. In this case, the relocation list still
+          * needs to be valid. All relocations on the batch buffers are
+          * already valid and kept up-to-date. For surface state relocations,
+          * by applying the relocations in relocate_cmd_buffer, we ensured
+          * that the address in the RENDER_SURFACE_STATE matches
+          * presumed_offset, so it should be safe for the kernel to relocate
+          * them as needed.
+          */
+         for (uint32_t i = 0; i < num_cmd_buffers; i++) {
+            relocate_cmd_buffer(cmd_buffers[i], execbuf);
+
+            anv_reloc_list_apply(device, &cmd_buffers[i]->surface_relocs,
+                                 device->surface_state_pool.block_pool.bo,
+                                 true /* always relocate surface states */);
+         }
+      } else {
+         /* In the case where we fall back to doing kernel relocations, we
+          * need to ensure that the relocation list is valid. All relocations
+          * on the batch buffers are already valid and kept up-to-date. Since
+          * surface states are shared between command buffers and we don't
+          * know what order they will be submitted to the kernel, we don't
+          * know what address is actually written in the surface state object
+          * at any given time. The only option is to set a bogus presumed
+          * offset and let the kernel relocate them.
+          */
+         for (uint32_t i = 0; i < num_cmd_buffers; i++)
+            reset_cmd_buffer_surface_offsets(cmd_buffers[i]);
+      }
+   }
+
    struct anv_batch_bo *first_batch_bo =
-      list_first_entry(&cmd_buffer->batch_bos, struct anv_batch_bo, link);
+      list_first_entry(&cmd_buffers[0]->batch_bos, struct anv_batch_bo, link);
 
    /* The kernel requires that the last entry in the validation list be the
    * batch buffer to execute. We can simply swap the element
@@ -1609,28 +1753,34 @@
    }
 
    /* If we are pinning our BOs, we shouldn't have to relocate anything */
-   if (cmd_buffer->device->physical->use_softpin)
+   if (device->physical->use_softpin)
       assert(!execbuf->has_relocs);
 
-   /* Now we go through and fixup all of the relocation lists to point to
-    * the correct indices in the object array. We have to do this after we
-    * reorder the list above as some of the indices may have changed.
+   /* Now we go through and fixup all of the relocation lists to point to the
+    * correct indices in the object array (I915_EXEC_HANDLE_LUT). We have to
+    * do this after we reorder the list above as some of the indices may have
+    * changed.
     */
+   struct anv_batch_bo **bbo;
    if (execbuf->has_relocs) {
-      u_vector_foreach(bbo, &cmd_buffer->seen_bbos)
-         anv_cmd_buffer_process_relocs(cmd_buffer, &(*bbo)->relocs);
+      assert(num_cmd_buffers == 1);
+      u_vector_foreach(bbo, &cmd_buffers[0]->seen_bbos)
+         anv_cmd_buffer_process_relocs(cmd_buffers[0], &(*bbo)->relocs);
 
-      anv_cmd_buffer_process_relocs(cmd_buffer, &cmd_buffer->surface_relocs);
+      anv_cmd_buffer_process_relocs(cmd_buffers[0], &cmd_buffers[0]->surface_relocs);
    }
 
-   if (!cmd_buffer->device->info.has_llc) {
+   if (!device->info.has_llc) {
       __builtin_ia32_mfence();
-      u_vector_foreach(bbo, &cmd_buffer->seen_bbos) {
-         for (uint32_t i = 0; i < (*bbo)->length; i += CACHELINE_SIZE)
-            __builtin_ia32_clflush((*bbo)->bo->map + i);
+      for (uint32_t i = 0; i < num_cmd_buffers; i++) {
+         u_vector_foreach(bbo, &cmd_buffers[i]->seen_bbos) {
+            for (uint32_t i = 0; i < (*bbo)->length; i += CACHELINE_SIZE)
+               __builtin_ia32_clflush((*bbo)->bo->map + i);
+         }
       }
    }
 
+   struct anv_batch *batch = &cmd_buffers[0]->batch;
    execbuf->execbuf = (struct drm_i915_gem_execbuffer2) {
       .buffers_ptr = (uintptr_t) execbuf->objects,
       .buffer_count = execbuf->bo_count,
@@ -1640,51 +1790,11 @@ setup_execbuf_for_cmd_buffer(struct anv_execbuf *execbuf,
       .num_cliprects = 0,
       .DR1 = 0,
       .DR4 = 0,
-      .flags = I915_EXEC_HANDLE_LUT | queue->exec_flags,
-      .rsvd1 = cmd_buffer->device->context_id,
+      .flags = I915_EXEC_HANDLE_LUT | queue->exec_flags |
+               (no_reloc ? I915_EXEC_NO_RELOC : 0),
+      .rsvd1 = device->context_id,
       .rsvd2 = 0,
    };
 
-   if (relocate_cmd_buffer(cmd_buffer, execbuf)) {
-      /* If we were able to successfully relocate everything, tell the kernel
-       * that it can skip doing relocations. The requirement for using
-       * NO_RELOC is:
-       *
-       *    1) The addresses written in the objects must match the corresponding
-       *       reloc.presumed_offset which in turn must match the corresponding
-       *       execobject.offset.
-       *
-       *    2) To avoid stalling, execobject.offset should match the current
-       *       address of that object within the active context.
-       *
-       * In order to satisfy all of the invariants that make userspace
-       * relocations to be safe (see relocate_cmd_buffer()), we need to
-       * further ensure that the addresses we use match those used by the
-       * kernel for the most recent execbuf2.
-       *
-       * The kernel may still choose to do relocations anyway if something has
-       * moved in the GTT. In this case, the relocation list still needs to be
-       * valid. All relocations on the batch buffers are already valid and
-       * kept up-to-date. For surface state relocations, by applying the
-       * relocations in relocate_cmd_buffer, we ensured that the address in
-       * the RENDER_SURFACE_STATE matches presumed_offset, so it should be
-       * safe for the kernel to relocate them as needed.
-       */
-      execbuf->execbuf.flags |= I915_EXEC_NO_RELOC;
-   } else {
-      /* In the case where we fall back to doing kernel relocations, we need
-       * to ensure that the relocation list is valid. All relocations on the
-       * batch buffers are already valid and kept up-to-date. Since surface
-       * states are shared between command buffers and we don't know what
-       * order they will be submitted to the kernel, we don't know what
-       * address is actually written in the surface state object at any given
-       * time. The only option is to set a bogus presumed offset and let the
-       * kernel relocate them.
-       */
-      for (size_t i = 0; i < cmd_buffer->surface_relocs.num_relocs; i++)
-         cmd_buffer->surface_relocs.relocs[i].presumed_offset = -1;
-   }
-
    return VK_SUCCESS;
 }
 
@@ -1764,10 +1874,10 @@ anv_queue_execbuf_locked(struct anv_queue *queue,
       goto error;
    }
 
-   if (submit->cmd_buffer) {
-      if (!anv_cmd_buffer_is_chainable(submit->cmd_buffer))
-         anv_cmd_buffer_record_end_submit(submit->cmd_buffer);
-      result = setup_execbuf_for_cmd_buffer(&execbuf, queue, submit->cmd_buffer);
+   if (submit->cmd_buffer_count) {
+      result = setup_execbuf_for_cmd_buffers(&execbuf, queue,
+                                             submit->cmd_buffers,
+                                             submit->cmd_buffer_count);
    } else if (submit->simple_bo) {
       result = anv_execbuf_add_bo(device, &execbuf, submit->simple_bo, NULL, 0);
       if (result != VK_SUCCESS)
@@ -1791,14 +1901,14 @@ anv_queue_execbuf_locked(struct anv_queue *queue,
 
    const bool has_perf_query =
       submit->perf_query_pass >= 0 &&
-      submit->cmd_buffer &&
-      submit->cmd_buffer->perf_query_pool;
+      submit->cmd_buffer_count &&
+      submit->perf_query_pool;
 
    if (INTEL_DEBUG & DEBUG_BATCH) {
       fprintf(stderr, "Batch on queue %d\n", (int)(queue - device->queues));
-      if (submit->cmd_buffer) {
+      if (submit->cmd_buffer_count) {
          if (has_perf_query) {
-            struct anv_query_pool *query_pool = submit->cmd_buffer->perf_query_pool;
+            struct anv_query_pool *query_pool = submit->perf_query_pool;
            struct anv_bo *pass_batch_bo = query_pool->bo;
            uint64_t pass_batch_offset =
               khr_perf_query_preamble_offset(query_pool,
@@ -1809,11 +1919,14 @@ anv_queue_execbuf_locked(struct anv_queue *queue,
                                  pass_batch_bo->offset + pass_batch_offset,
                                  false);
         }
 
-         struct anv_batch_bo **bo = u_vector_tail(&submit->cmd_buffer->seen_bbos);
-         device->cmd_buffer_being_decoded = submit->cmd_buffer;
-         gen_print_batch(&device->decoder_ctx, (*bo)->bo->map,
-                         (*bo)->bo->size, (*bo)->bo->offset, false);
-         device->cmd_buffer_being_decoded = NULL;
+         for (uint32_t i = 0; i < submit->cmd_buffer_count; i++) {
+            struct anv_batch_bo **bo =
+               u_vector_tail(&submit->cmd_buffers[i]->seen_bbos);
+            device->cmd_buffer_being_decoded = submit->cmd_buffers[i];
+            gen_print_batch(&device->decoder_ctx, (*bo)->bo->map,
+                            (*bo)->bo->size, (*bo)->bo->offset, false);
+            device->cmd_buffer_being_decoded = NULL;
+         }
       } else if (submit->simple_bo) {
          gen_print_batch(&device->decoder_ctx, submit->simple_bo->map,
                          submit->simple_bo->size, submit->simple_bo->offset, false);
@@ -1853,7 +1966,7 @@ anv_queue_execbuf_locked(struct anv_queue *queue,
    }
 
    if (has_perf_query) {
-      struct anv_query_pool *query_pool = submit->cmd_buffer->perf_query_pool;
+      struct anv_query_pool *query_pool = submit->perf_query_pool;
       assert(submit->perf_query_pass < query_pool->n_passes);
       struct gen_perf_query_info *query_info =
          query_pool->pass_query[submit->perf_query_pass];
diff --git a/src/intel/vulkan/anv_private.h b/src/intel/vulkan/anv_private.h
index fc62304..85aa8b1 100644
--- a/src/intel/vulkan/anv_private.h
+++ b/src/intel/vulkan/anv_private.h
@@ -1090,7 +1090,9 @@ VkResult anv_init_wsi(struct anv_physical_device *physical_device);
 void anv_finish_wsi(struct anv_physical_device *physical_device);
 
 struct anv_queue_submit {
-   struct anv_cmd_buffer *                   cmd_buffer;
+   struct anv_cmd_buffer **                  cmd_buffers;
+   uint32_t                                  cmd_buffer_count;
+   uint32_t                                  cmd_buffer_array_length;
 
    uint32_t                                  fence_count;
    uint32_t                                  fence_array_length;
@@ -1132,6 +1134,7 @@ struct anv_queue_submit {
    uintptr_t *                               fence_bos;
 
    int                                       perf_query_pass;
+   struct anv_query_pool *                   perf_query_pool;
 
    const VkAllocationCallbacks *             alloc;
    VkSystemAllocationScope                   alloc_scope;
diff --git a/src/intel/vulkan/anv_queue.c b/src/intel/vulkan/anv_queue.c
index 33c680f..f532e03 100644
--- a/src/intel/vulkan/anv_queue.c
+++ b/src/intel/vulkan/anv_queue.c
@@ -112,6 +112,7 @@ anv_queue_submit_free(struct anv_device *device,
    vk_free(alloc, submit->signal_timelines);
    vk_free(alloc, submit->signal_timeline_values);
    vk_free(alloc, submit->fence_bos);
+   vk_free(alloc, submit->cmd_buffers);
    vk_free(alloc, submit);
 }
 
@@ -1208,6 +1209,29 @@ anv_post_queue_fence_update(struct anv_device *device, VkFence _fence)
 }
 
 static VkResult
+anv_queue_submit_add_cmd_buffer(struct anv_queue_submit *submit,
+                                struct anv_cmd_buffer *cmd_buffer)
+{
+   if (submit->cmd_buffer_count >= submit->cmd_buffer_array_length) {
+      uint32_t new_len = MAX2(submit->cmd_buffer_array_length * 2, 4);
+      struct anv_cmd_buffer **new_cmd_buffers =
+         vk_realloc(submit->alloc,
+                    submit->cmd_buffers, new_len * sizeof(*submit->cmd_buffers),
+                    8, submit->alloc_scope);
+      if (new_cmd_buffers == NULL)
+         return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY);
+
+      submit->cmd_buffers = new_cmd_buffers;
+      submit->cmd_buffer_array_length = new_len;
+   }
+
+   submit->cmd_buffers[submit->cmd_buffer_count++] = cmd_buffer;
+   submit->perf_query_pool = cmd_buffer->perf_query_pool;
+
+   return VK_SUCCESS;
+}
+
+static VkResult
 anv_queue_submit_empty(struct anv_queue *queue,
                        const VkSemaphore *in_semaphores,
                        const uint64_t *in_values,
@@ -1362,7 +1386,9 @@ VkResult anv_QueueSubmit(
             goto out;
          }
 
-         submit->cmd_buffer = cmd_buffer;
+         result = anv_queue_submit_add_cmd_buffer(submit, cmd_buffer);
+         if (result != VK_SUCCESS)
+            goto out;
 
          if (j == 0) {
             /* Only the first batch gets the in semaphores */
-- 
2.7.4
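Postscript: the caller-visible effect is that one vkQueueSubmit() carrying
several command buffers can now reach the kernel as a single execbuf2 ioctl
(when softpin is in use and the buffers are chainable), rather than one ioctl
per command buffer. A plain Vulkan usage sketch, assuming cb0..cb2 are
VkCommandBuffer handles recorded elsewhere:

  VkCommandBuffer cmd_buffers[] = { cb0, cb1, cb2 };
  VkSubmitInfo submit_info = {
     .sType = VK_STRUCTURE_TYPE_SUBMIT_INFO,
     .commandBufferCount = 3,
     .pCommandBuffers = cmd_buffers,
  };
  /* With this patch, all three batches end up chained into one submission. */
  vkQueueSubmit(queue, 1, &submit_info, VK_NULL_HANDLE);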