From a5f9e59ce357c2974a97004d943aae92ad6f5004 Mon Sep 17 00:00:00 2001 From: Chad Versace Date: Thu, 8 Dec 2022 11:28:22 -0800 Subject: [PATCH] anv: Use vma_heap for descriptor pool host allocation Pre-patch, anv_descriptor_pool used a free list for host allocations that never merged adjacent free blocks. If the pool only allocated fixed-sized blocks, then this would not be a problem. But the pool allocations are variable-sized, and this caused over half of the pool's memory to be consumed by unusable free blocks in some workloads, causing unnecessary memory footprint. Replacing the free list with util_vma_heap, which does merge adjacent free blocks, fixes the memory explosion in the target workload. Disadvantages of util_vma_heap compared to the free list: - The heap calls malloc() when a new hole is created. - The heap calls free() when a hole disappears or is merged with an adjacent hole. - The Vulkan spec expects descriptor set creation/destruction to be thread-local lockless in the common case. For workloads that create/destroy with high frequency, malloc/free may cause overhead. Profiling is needed. Tested with a ChromeOS internal TensorFlow benchmark, provided by package 'tensorflow', running with its OpenCL backend on clvk. cmdline: benchmark_model --graph=mn2.tflite --use_gpu=true --min_secs=60 gpu: adl memory footprint from start of benchmark: before: init=132.691MB max=227.684MB after: init=134.988MB max=134.988MB Reported-by: Romaric Jodin Reviewed-by: Lionel Landwerlin Part-of: --- src/intel/vulkan/anv_descriptor_set.c | 69 +++++++++++++---------------------- src/intel/vulkan/anv_private.h | 17 ++++++--- 2 files changed, 37 insertions(+), 49 deletions(-) diff --git a/src/intel/vulkan/anv_descriptor_set.c b/src/intel/vulkan/anv_descriptor_set.c index b067232..206c3ae 100644 --- a/src/intel/vulkan/anv_descriptor_set.c +++ b/src/intel/vulkan/anv_descriptor_set.c @@ -806,13 +806,13 @@ void anv_DestroyPipelineLayout( /* * Descriptor pools. 
* - * These are implemented using a big pool of memory and a free-list for the + * These are implemented using a big pool of memory and a vma heap for the * host memory allocations and a state_stream and a free list for the buffer * view surface state. The spec allows us to fail to allocate due to * fragmentation in all cases but two: 1) after pool reset, allocating up * until the pool size with no freeing must succeed and 2) allocating and * freeing only descriptor sets with the same layout. Case 1) is easy enough, - * and the free lists lets us recycle blocks for case 2). + * and the vma heap ensures case 2). */ /* The vma heap reserves 0 to mean NULL; we have to offset by some amount to @@ -897,25 +897,26 @@ VkResult anv_CreateDescriptorPool( const bool host_only = pCreateInfo->flags & VK_DESCRIPTOR_POOL_CREATE_HOST_ONLY_BIT_EXT; + /* For host_only pools, allocate some memory to hold the written surface * states of the internal anv_buffer_view. With normal pools, the memory * holding surface state is allocated from the device surface_state_pool. */ - const size_t pool_size = + const size_t host_mem_size = pCreateInfo->maxSets * sizeof(struct anv_descriptor_set) + descriptor_count * sizeof(struct anv_descriptor) + buffer_view_count * sizeof(struct anv_buffer_view) + (host_only ? 
buffer_view_count * ANV_SURFACE_STATE_SIZE : 0); - const size_t total_size = sizeof(*pool) + pool_size; - pool = vk_object_alloc(&device->vk, pAllocator, total_size, + pool = vk_object_alloc(&device->vk, pAllocator, + sizeof(*pool) + host_mem_size, VK_OBJECT_TYPE_DESCRIPTOR_POOL); if (!pool) return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY); - pool->size = pool_size; - pool->next = 0; - pool->free_list = EMPTY; + pool->host_mem_size = host_mem_size; + util_vma_heap_init(&pool->host_heap, POOL_HEAP_OFFSET, host_mem_size); + pool->host_only = host_only; if (descriptor_bo_size > 0) { @@ -990,8 +991,8 @@ VkResult anv_ResetDescriptorPool( } list_inithead(&pool->desc_sets); - pool->next = 0; - pool->free_list = EMPTY; + util_vma_heap_finish(&pool->host_heap); + util_vma_heap_init(&pool->host_heap, POOL_HEAP_OFFSET, pool->host_mem_size); if (pool->bo) { util_vma_heap_finish(&pool->bo_heap); @@ -1006,57 +1007,37 @@ VkResult anv_ResetDescriptorPool( return VK_SUCCESS; } -struct pool_free_list_entry { - uint32_t next; - uint32_t size; -}; - static VkResult anv_descriptor_pool_alloc_set(struct anv_descriptor_pool *pool, uint32_t size, struct anv_descriptor_set **set) { - if (size <= pool->size - pool->next) { - *set = (struct anv_descriptor_set *) (pool->data + pool->next); - (*set)->size = size; - pool->next += size; - return VK_SUCCESS; - } else { - struct pool_free_list_entry *entry; - uint32_t *link = &pool->free_list; - for (uint32_t f = pool->free_list; f != EMPTY; f = entry->next) { - entry = (struct pool_free_list_entry *) (pool->data + f); - if (size <= entry->size) { - *link = entry->next; - *set = (struct anv_descriptor_set *) entry; - (*set)->size = entry->size; - return VK_SUCCESS; - } - link = &entry->next; - } + uint64_t vma_offset = util_vma_heap_alloc(&pool->host_heap, size, 1); - if (pool->free_list != EMPTY) { + if (vma_offset == 0) { + if (size <= pool->host_heap.free_size) { return VK_ERROR_FRAGMENTED_POOL; } else { return VK_ERROR_OUT_OF_POOL_MEMORY; 
} } + + assert(vma_offset >= POOL_HEAP_OFFSET); + uint64_t host_mem_offset = vma_offset - POOL_HEAP_OFFSET; + + *set = (struct anv_descriptor_set *) (pool->host_mem + host_mem_offset); + (*set)->size = size; + + return VK_SUCCESS; } static void anv_descriptor_pool_free_set(struct anv_descriptor_pool *pool, struct anv_descriptor_set *set) { - /* Put the descriptor set allocation back on the free list. */ - const uint32_t index = (char *) set - pool->data; - if (index + set->size == pool->next) { - pool->next = index; - } else { - struct pool_free_list_entry *entry = (struct pool_free_list_entry *) set; - entry->next = pool->free_list; - entry->size = set->size; - pool->free_list = (char *) entry - pool->data; - } + util_vma_heap_free(&pool->host_heap, + ((char *) set - pool->host_mem) + POOL_HEAP_OFFSET, + set->size); } struct surface_state_free_list_entry { diff --git a/src/intel/vulkan/anv_private.h b/src/intel/vulkan/anv_private.h index 294897c..90921c4 100644 --- a/src/intel/vulkan/anv_private.h +++ b/src/intel/vulkan/anv_private.h @@ -1891,21 +1891,28 @@ anv_descriptor_set_address(struct anv_descriptor_set *set) struct anv_descriptor_pool { struct vk_object_base base; - uint32_t size; - uint32_t next; - uint32_t free_list; - struct anv_bo *bo; struct util_vma_heap bo_heap; struct anv_state_stream surface_state_stream; void *surface_state_free_list; + /** List of anv_descriptor_set. */ struct list_head desc_sets; + /** Heap over host_mem */ + struct util_vma_heap host_heap; + + /** Allocated size of host_mem */ + uint32_t host_mem_size; + + /** + * VK_DESCRIPTOR_POOL_CREATE_HOST_ONLY_BIT_EXT. If set, then + * surface_state_stream is unused. + */ bool host_only; - char data[0]; + char host_mem[0]; }; size_t -- 2.7.4