radv: Keep a global BO list for VkMemory.
author Bas Nieuwenhuizen <bas@basnieuwenhuizen.nl>
Mon, 9 Apr 2018 10:46:49 +0000 (12:46 +0200)
committer Bas Nieuwenhuizen <bas@basnieuwenhuizen.nl>
Wed, 18 Apr 2018 20:56:54 +0000 (22:56 +0200)
With update-after-bind we can't attach BOs to the command buffer
from the descriptor set anymore, so we have to keep a global BO
list.

I am somewhat surprised this works really well, even though we have
implicit synchronization in the WSI based on the BO list associations,
and with the new behavior every command buffer is associated with
every swapchain image. However, I could not find any slowdowns in
games because of it.

Reviewed-by: Samuel Pitoiset <samuel.pitoiset@gmail.com>
src/amd/vulkan/radv_device.c
src/amd/vulkan/radv_private.h
src/amd/vulkan/radv_radeon_winsys.h
src/amd/vulkan/winsys/amdgpu/radv_amdgpu_cs.c

index b8313b2..fda21fa 100644 (file)
@@ -1221,6 +1221,55 @@ radv_queue_finish(struct radv_queue *queue)
 }
 
 static void
+radv_bo_list_init(struct radv_bo_list *bo_list) /* Set up an empty BO list: mutex initialised, no storage allocated yet. */
+{
+       pthread_mutex_init(&bo_list->mutex, NULL); /* NULL attr: default mutex attributes */
+       bo_list->list.count = bo_list->capacity = 0;
+       bo_list->list.bos = NULL; /* storage is allocated by the first radv_bo_list_add() */
+}
+
+static void /* Tear down a BO list: releases the list's own storage and its mutex. */
+radv_bo_list_finish(struct radv_bo_list *bo_list)
+{
+       free(bo_list->list.bos); /* frees only the pointer array; the BOs it referenced are not destroyed here */
+       pthread_mutex_destroy(&bo_list->mutex);
+}
+
+static VkResult radv_bo_list_add(struct radv_bo_list *bo_list, struct radeon_winsys_bo *bo) /* Append bo to the list under the list mutex; returns VK_SUCCESS, or VK_ERROR_OUT_OF_HOST_MEMORY if the array cannot grow. */
+{
+       pthread_mutex_lock(&bo_list->mutex);
+       if (bo_list->list.count == bo_list->capacity) { /* array is full (or not yet allocated): grow it first */
+               unsigned capacity = MAX2(4, bo_list->capacity * 2); /* doubling growth; MAX2 presumably picks the larger value, giving a floor of 4 slots */
+               void *data = realloc(bo_list->list.bos, capacity * sizeof(struct radeon_winsys_bo*));
+
+               if (!data) { /* realloc failed: the existing array, count and capacity are left unchanged */
+                       pthread_mutex_unlock(&bo_list->mutex);
+                       return VK_ERROR_OUT_OF_HOST_MEMORY;
+               }
+
+               bo_list->list.bos = (struct radeon_winsys_bo**)data;
+               bo_list->capacity = capacity;
+       }
+
+       bo_list->list.bos[bo_list->list.count++] = bo; /* no duplicate check: the same bo may be stored more than once */
+       pthread_mutex_unlock(&bo_list->mutex);
+       return VK_SUCCESS;
+}
+
+static void radv_bo_list_remove(struct radv_bo_list *bo_list, struct radeon_winsys_bo *bo) /* Remove the first entry equal to bo under the list mutex; does nothing if bo is not in the list. */
+{
+       pthread_mutex_lock(&bo_list->mutex);
+       for(unsigned i = 0; i < bo_list->list.count; ++i) { /* linear search over the entries in use */
+               if (bo_list->list.bos[i] == bo) {
+                       bo_list->list.bos[i] = bo_list->list.bos[bo_list->list.count - 1]; /* fill the hole with the last entry; list order is not preserved */
+                       --bo_list->list.count; /* capacity is kept, the array is never shrunk here */
+                       break; /* at most one matching entry is removed per call */
+               }
+       }
+       pthread_mutex_unlock(&bo_list->mutex);
+}
+
+static void
 radv_device_init_gs_info(struct radv_device *device)
 {
        switch (device->physical_device->rad_info.family) {
@@ -1320,6 +1369,8 @@ VkResult radv_CreateDevice(
        mtx_init(&device->shader_slab_mutex, mtx_plain);
        list_inithead(&device->shader_slabs);
 
+       radv_bo_list_init(&device->bo_list);
+
        for (unsigned i = 0; i < pCreateInfo->queueCreateInfoCount; i++) {
                const VkDeviceQueueCreateInfo *queue_create = &pCreateInfo->pQueueCreateInfos[i];
                uint32_t qfi = queue_create->queueFamilyIndex;
@@ -1452,6 +1503,8 @@ VkResult radv_CreateDevice(
 fail_meta:
        radv_device_finish_meta(device);
 fail:
+       radv_bo_list_finish(&device->bo_list);
+
        if (device->trace_bo)
                device->ws->buffer_destroy(device->trace_bo);
 
@@ -1499,6 +1552,7 @@ void radv_DestroyDevice(
 
        radv_destroy_shader_slabs(device);
 
+       radv_bo_list_finish(&device->bo_list);
        vk_free(&device->alloc, device);
 }
 
@@ -2269,7 +2323,7 @@ static VkResult radv_signal_fence(struct radv_queue *queue,
 
        ret = queue->device->ws->cs_submit(queue->hw_ctx, queue->queue_idx,
                                           &queue->device->empty_cs[queue->queue_family_index],
-                                          1, NULL, NULL, &sem_info,
+                                          1, NULL, NULL, &sem_info, NULL,
                                           false, fence->fence);
        radv_free_sem_info(&sem_info);
 
@@ -2346,7 +2400,7 @@ VkResult radv_QueueSubmit(
                                ret = queue->device->ws->cs_submit(ctx, queue->queue_idx,
                                                                   &queue->device->empty_cs[queue->queue_family_index],
                                                                   1, NULL, NULL,
-                                                                  &sem_info,
+                                                                  &sem_info, NULL,
                                                                   false, base_fence);
                                if (ret) {
                                        radv_loge("failed to submit CS %d\n", i);
@@ -2384,11 +2438,15 @@ VkResult radv_QueueSubmit(
                        sem_info.cs_emit_wait = j == 0;
                        sem_info.cs_emit_signal = j + advance == pSubmits[i].commandBufferCount;
 
+                       pthread_mutex_lock(&queue->device->bo_list.mutex);
+
                        ret = queue->device->ws->cs_submit(ctx, queue->queue_idx, cs_array + j,
                                                        advance, initial_preamble, continue_preamble_cs,
-                                                          &sem_info,
+                                                       &sem_info, &queue->device->bo_list.list,
                                                        can_patch, base_fence);
 
+                       pthread_mutex_unlock(&queue->device->bo_list.mutex);
+
                        if (ret) {
                                radv_loge("failed to submit CS %d\n", i);
                                abort();
@@ -2594,11 +2652,8 @@ static VkResult radv_alloc_memory(struct radv_device *device,
                        goto fail;
                } else {
                        close(import_info->fd);
-                       goto out_success;
                }
-       }
-
-       if (host_ptr_info) {
+       } else if (host_ptr_info) {
                assert(host_ptr_info->handleType == VK_EXTERNAL_MEMORY_HANDLE_TYPE_HOST_ALLOCATION_BIT_EXT);
                assert(mem_type_index == RADV_MEM_TYPE_GTT_CACHED);
                mem->bo = device->ws->buffer_from_ptr(device->ws, host_ptr_info->pHostPointer,
@@ -2608,41 +2663,46 @@ static VkResult radv_alloc_memory(struct radv_device *device,
                        goto fail;
                } else {
                        mem->user_ptr = host_ptr_info->pHostPointer;
-                       goto out_success;
                }
-       }
-
-       uint64_t alloc_size = align_u64(pAllocateInfo->allocationSize, 4096);
-       if (mem_type_index == RADV_MEM_TYPE_GTT_WRITE_COMBINE ||
-           mem_type_index == RADV_MEM_TYPE_GTT_CACHED)
-               domain = RADEON_DOMAIN_GTT;
-       else
-               domain = RADEON_DOMAIN_VRAM;
+       } else {
+               uint64_t alloc_size = align_u64(pAllocateInfo->allocationSize, 4096);
+               if (mem_type_index == RADV_MEM_TYPE_GTT_WRITE_COMBINE ||
+                   mem_type_index == RADV_MEM_TYPE_GTT_CACHED)
+                       domain = RADEON_DOMAIN_GTT;
+               else
+                       domain = RADEON_DOMAIN_VRAM;
 
-       if (mem_type_index == RADV_MEM_TYPE_VRAM)
-               flags |= RADEON_FLAG_NO_CPU_ACCESS;
-       else
-               flags |= RADEON_FLAG_CPU_ACCESS;
+               if (mem_type_index == RADV_MEM_TYPE_VRAM)
+                       flags |= RADEON_FLAG_NO_CPU_ACCESS;
+               else
+                       flags |= RADEON_FLAG_CPU_ACCESS;
 
-       if (mem_type_index == RADV_MEM_TYPE_GTT_WRITE_COMBINE)
-               flags |= RADEON_FLAG_GTT_WC;
+               if (mem_type_index == RADV_MEM_TYPE_GTT_WRITE_COMBINE)
+                       flags |= RADEON_FLAG_GTT_WC;
 
-       if (!dedicate_info && !import_info && (!export_info || !export_info->handleTypes))
-               flags |= RADEON_FLAG_NO_INTERPROCESS_SHARING;
+               if (!dedicate_info && !import_info && (!export_info || !export_info->handleTypes))
+                       flags |= RADEON_FLAG_NO_INTERPROCESS_SHARING;
 
-       mem->bo = device->ws->buffer_create(device->ws, alloc_size, device->physical_device->rad_info.max_alignment,
-                                              domain, flags);
+               mem->bo = device->ws->buffer_create(device->ws, alloc_size, device->physical_device->rad_info.max_alignment,
+                                                   domain, flags);
 
-       if (!mem->bo) {
-               result = VK_ERROR_OUT_OF_DEVICE_MEMORY;
-               goto fail;
+               if (!mem->bo) {
+                       result = VK_ERROR_OUT_OF_DEVICE_MEMORY;
+                       goto fail;
+               }
+               mem->type_index = mem_type_index;
        }
-       mem->type_index = mem_type_index;
-out_success:
+
+       result = radv_bo_list_add(&device->bo_list, mem->bo);
+       if (result != VK_SUCCESS)
+               goto fail_bo;
+
        *pMem = radv_device_memory_to_handle(mem);
 
        return VK_SUCCESS;
 
+fail_bo:
+       device->ws->buffer_destroy(mem->bo);
 fail:
        vk_free2(&device->alloc, pAllocator, mem);
 
@@ -2670,6 +2730,7 @@ void radv_FreeMemory(
        if (mem == NULL)
                return;
 
+       radv_bo_list_remove(&device->bo_list, mem->bo);
        device->ws->buffer_destroy(mem->bo);
        mem->bo = NULL;
 
@@ -2989,7 +3050,7 @@ radv_sparse_image_opaque_bind_memory(struct radv_device *device,
                        queue->device->ws->cs_submit(queue->hw_ctx, queue->queue_idx,
                                                     &queue->device->empty_cs[queue->queue_family_index],
                                                     1, NULL, NULL,
-                                                    &sem_info,
+                                                    &sem_info, NULL,
                                                     false, base_fence);
                        fence_emitted = true;
                        if (fence)
index 1869604..35452b6 100644 (file)
@@ -598,6 +598,12 @@ struct radv_queue {
        struct radeon_winsys_cs *continue_preamble_cs;
 };
 
+struct radv_bo_list { /* Growable, mutex-protected array of BOs; embedded in struct radv_device as bo_list. */
+       struct radv_winsys_bo_list list; /* entries currently in use (bos/count); this part is what gets passed to the winsys cs_submit */
+       unsigned capacity; /* number of slots allocated in list.bos (always >= list.count) */
+       pthread_mutex_t mutex; /* taken by list add/remove and held across cs_submit in radv_QueueSubmit */
+};
+
 struct radv_device {
        VK_LOADER_DATA                              _loader_data;
 
@@ -660,6 +666,8 @@ struct radv_device {
        uint64_t dmesg_timestamp;
 
        struct radv_device_extension_table enabled_extensions;
+
+       struct radv_bo_list bo_list;
 };
 
 struct radv_device_memory {
index ba16bf3..7f19934 100644 (file)
@@ -178,6 +178,11 @@ struct radv_winsys_sem_info {
        struct radv_winsys_sem_counts signal;
 };
 
+struct radv_winsys_bo_list { /* Plain array of BOs that can be handed to radeon_winsys::cs_submit (optional argument). */
+       struct radeon_winsys_bo **bos; /* array of BO pointers */
+       unsigned count; /* number of valid entries in bos */
+};
+
 struct radeon_winsys {
        void (*destroy)(struct radeon_winsys *ws);
 
@@ -246,6 +251,7 @@ struct radeon_winsys {
                         struct radeon_winsys_cs *initial_preamble_cs,
                         struct radeon_winsys_cs *continue_preamble_cs,
                         struct radv_winsys_sem_info *sem_info,
+                        const struct radv_winsys_bo_list *bo_list, /* optional */
                         bool can_patch,
                         struct radeon_winsys_fence *fence);
 
index 9921d38..c4b2232 100644 (file)
@@ -552,6 +552,7 @@ static int radv_amdgpu_create_bo_list(struct radv_amdgpu_winsys *ws,
                                      unsigned count,
                                      struct radv_amdgpu_winsys_bo *extra_bo,
                                      struct radeon_winsys_cs *extra_cs,
+                                     const struct radv_winsys_bo_list *radv_bo_list,
                                      amdgpu_bo_list_handle *bo_list)
 {
        int r = 0;
@@ -579,7 +580,7 @@ static int radv_amdgpu_create_bo_list(struct radv_amdgpu_winsys *ws,
                                          bo_list);
                free(handles);
                pthread_mutex_unlock(&ws->global_bo_list_lock);
-       } else if (count == 1 && !extra_bo && !extra_cs &&
+       } else if (count == 1 && !extra_bo && !extra_cs && !radv_bo_list && 
                   !radv_amdgpu_cs(cs_array[0])->num_virtual_buffers) {
                struct radv_amdgpu_cs *cs = (struct radv_amdgpu_cs*)cs_array[0];
                if (cs->num_buffers == 0) {
@@ -601,6 +602,11 @@ static int radv_amdgpu_create_bo_list(struct radv_amdgpu_winsys *ws,
                if (extra_cs) {
                        total_buffer_count += ((struct radv_amdgpu_cs*)extra_cs)->num_buffers;
                }
+
+               if (radv_bo_list) {
+                       total_buffer_count += radv_bo_list->count;
+               }
+
                if (total_buffer_count == 0) {
                        *bo_list = 0;
                        return 0;
@@ -674,6 +680,27 @@ static int radv_amdgpu_create_bo_list(struct radv_amdgpu_winsys *ws,
                        }
                }
 
+               if (radv_bo_list) {
+                       unsigned unique_bo_so_far = unique_bo_count;
+                       const unsigned default_bo_priority = 7;
+                       for (unsigned i = 0; i < radv_bo_list->count; ++i) {
+                               struct radv_amdgpu_winsys_bo *bo = radv_amdgpu_winsys_bo(radv_bo_list->bos[i]);
+                               bool found = false;
+                               for (unsigned j = 0; j < unique_bo_so_far; ++j) {
+                                       if (bo->bo == handles[j]) {
+                                               found = true;
+                                               priorities[j] = MAX2(priorities[j], default_bo_priority);
+                                               break;
+                                       }
+                               }
+                               if (!found) {
+                                       handles[unique_bo_count] = bo->bo;
+                                       priorities[unique_bo_count] = default_bo_priority;
+                                       ++unique_bo_count;
+                               }
+                       }
+               }
+
                if (unique_bo_count > 0) {
                        r = amdgpu_bo_list_create(ws->dev, unique_bo_count, handles,
                                                  priorities, bo_list);
@@ -709,6 +736,7 @@ static void radv_assign_last_submit(struct radv_amdgpu_ctx *ctx,
 static int radv_amdgpu_winsys_cs_submit_chained(struct radeon_winsys_ctx *_ctx,
                                                int queue_idx,
                                                struct radv_winsys_sem_info *sem_info,
+                                               const struct radv_winsys_bo_list *radv_bo_list,
                                                struct radeon_winsys_cs **cs_array,
                                                unsigned cs_count,
                                                struct radeon_winsys_cs *initial_preamble_cs,
@@ -745,7 +773,8 @@ static int radv_amdgpu_winsys_cs_submit_chained(struct radeon_winsys_ctx *_ctx,
                }
        }
 
-       r = radv_amdgpu_create_bo_list(cs0->ws, cs_array, cs_count, NULL, initial_preamble_cs, &bo_list);
+       r = radv_amdgpu_create_bo_list(cs0->ws, cs_array, cs_count, NULL, initial_preamble_cs,
+                                      radv_bo_list, &bo_list);
        if (r) {
                fprintf(stderr, "amdgpu: buffer list creation failed for the "
                                "chained submission(%d)\n", r);
@@ -789,6 +818,7 @@ static int radv_amdgpu_winsys_cs_submit_chained(struct radeon_winsys_ctx *_ctx,
 static int radv_amdgpu_winsys_cs_submit_fallback(struct radeon_winsys_ctx *_ctx,
                                                 int queue_idx,
                                                 struct radv_winsys_sem_info *sem_info,
+                                                const struct radv_winsys_bo_list *radv_bo_list,
                                                 struct radeon_winsys_cs **cs_array,
                                                 unsigned cs_count,
                                                 struct radeon_winsys_cs *initial_preamble_cs,
@@ -813,7 +843,7 @@ static int radv_amdgpu_winsys_cs_submit_fallback(struct radeon_winsys_ctx *_ctx,
                memset(&request, 0, sizeof(request));
 
                r = radv_amdgpu_create_bo_list(cs0->ws, &cs_array[i], cnt, NULL,
-                                              preamble_cs, &bo_list);
+                                              preamble_cs, radv_bo_list, &bo_list);
                if (r) {
                        fprintf(stderr, "amdgpu: buffer list creation failed "
                                        "for the fallback submission (%d)\n", r);
@@ -870,6 +900,7 @@ static int radv_amdgpu_winsys_cs_submit_fallback(struct radeon_winsys_ctx *_ctx,
 static int radv_amdgpu_winsys_cs_submit_sysmem(struct radeon_winsys_ctx *_ctx,
                                               int queue_idx,
                                               struct radv_winsys_sem_info *sem_info,
+                                              const struct radv_winsys_bo_list *radv_bo_list,
                                               struct radeon_winsys_cs **cs_array,
                                               unsigned cs_count,
                                               struct radeon_winsys_cs *initial_preamble_cs,
@@ -939,7 +970,7 @@ static int radv_amdgpu_winsys_cs_submit_sysmem(struct radeon_winsys_ctx *_ctx,
 
                r = radv_amdgpu_create_bo_list(cs0->ws, &cs_array[i], cnt,
                                               (struct radv_amdgpu_winsys_bo*)bo,
-                                              preamble_cs, &bo_list);
+                                              preamble_cs, radv_bo_list, &bo_list);
                if (r) {
                        fprintf(stderr, "amdgpu: buffer list creation failed "
                                        "for the sysmem submission (%d)\n", r);
@@ -990,6 +1021,7 @@ static int radv_amdgpu_winsys_cs_submit(struct radeon_winsys_ctx *_ctx,
                                        struct radeon_winsys_cs *initial_preamble_cs,
                                        struct radeon_winsys_cs *continue_preamble_cs,
                                        struct radv_winsys_sem_info *sem_info,
+                                       const struct radv_winsys_bo_list *bo_list,
                                        bool can_patch,
                                        struct radeon_winsys_fence *_fence)
 {
@@ -999,13 +1031,13 @@ static int radv_amdgpu_winsys_cs_submit(struct radeon_winsys_ctx *_ctx,
 
        assert(sem_info);
        if (!cs->ws->use_ib_bos) {
-               ret = radv_amdgpu_winsys_cs_submit_sysmem(_ctx, queue_idx, sem_info, cs_array,
+               ret = radv_amdgpu_winsys_cs_submit_sysmem(_ctx, queue_idx, sem_info, bo_list, cs_array,
                                                           cs_count, initial_preamble_cs, continue_preamble_cs, _fence);
        } else if (can_patch && cs_count > AMDGPU_CS_MAX_IBS_PER_SUBMIT && cs->ws->batchchain) {
-               ret = radv_amdgpu_winsys_cs_submit_chained(_ctx, queue_idx, sem_info, cs_array,
+               ret = radv_amdgpu_winsys_cs_submit_chained(_ctx, queue_idx, sem_info, bo_list, cs_array,
                                                            cs_count, initial_preamble_cs, continue_preamble_cs, _fence);
        } else {
-               ret = radv_amdgpu_winsys_cs_submit_fallback(_ctx, queue_idx, sem_info, cs_array,
+               ret = radv_amdgpu_winsys_cs_submit_fallback(_ctx, queue_idx, sem_info, bo_list, cs_array,
                                                             cs_count, initial_preamble_cs, continue_preamble_cs, _fence);
        }