anv: add a third memory type for LLC configuration
author: Brian Paul <brianp@vmware.com>
Wed, 18 Jan 2023 16:01:15 +0000 (09:01 -0700)
committer: Marge Bot <emma+marge@anholt.net>
Thu, 26 Jan 2023 22:45:49 +0000 (22:45 +0000)
Commit 582bf4d9 turned on write-combining for most (all?) memory
allocations.  This caused a fairly large performance drop in some of
our VMware tests (application traces, such as Windows Metro Paint).

This patch adds a third memory type configuration: DEVICE_LOCAL,
HOST_VISIBLE, HOST_COHERENT.  This is uncached.  Then, in
anv_AllocateMemory() we only use write-combining for this uncached
type.  This memory type is found in the Intel Windows Vulkan driver.
Also, according to
<https://asawicki.info/news_1740_vulkan_memory_types_on_pc_and_how_to_use_them>,
uncached memory correlates to write-combined memory.

This fixes our performance regression (and actually produced the
fastest ever results for our test suite).

Signed-off-by: Brian Paul <brianp@vmware.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/20770>

src/intel/vulkan/anv_device.c

index 837f683..fad34e0 100644 (file)
@@ -514,19 +514,28 @@ anv_physical_device_init_heaps(struct anv_physical_device *device, int fd)
        * But some game engines can't handle single type well
        * https://gitlab.freedesktop.org/mesa/mesa/-/issues/7360#note_1719438
        *
-       * And Intel on Windows uses 3 types so it's better to add extra one here
+       * The second memory type w/out HOST_CACHED_BIT will get write-combining.
+       * See anv_AllocateMemory().
+       *
+       * The Intel Vulkan driver for Windows also advertises these memory types.
        */
-      device->memory.type_count = 2;
+      device->memory.type_count = 3;
       device->memory.types[0] = (struct anv_memory_type) {
-          .propertyFlags = VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT,
-          .heapIndex = 0,
+         .propertyFlags = VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT,
+         .heapIndex = 0,
       };
       device->memory.types[1] = (struct anv_memory_type) {
-          .propertyFlags = VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT |
-                           VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT |
-                           VK_MEMORY_PROPERTY_HOST_COHERENT_BIT |
-                           VK_MEMORY_PROPERTY_HOST_CACHED_BIT,
-          .heapIndex = 0,
+         .propertyFlags = VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT |
+                          VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT |
+                          VK_MEMORY_PROPERTY_HOST_COHERENT_BIT,
+         .heapIndex = 0,
+      };
+      device->memory.types[2] = (struct anv_memory_type) {
+         .propertyFlags = VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT |
+                          VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT |
+                          VK_MEMORY_PROPERTY_HOST_COHERENT_BIT |
+                          VK_MEMORY_PROPERTY_HOST_CACHED_BIT,
+         .heapIndex = 0,
       };
    } else {
       device->memory.heap_count = 1;
@@ -3819,7 +3828,7 @@ VkResult anv_AllocateMemory(
       return vk_error(device, VK_ERROR_OUT_OF_DEVICE_MEMORY);
 
    assert(pAllocateInfo->memoryTypeIndex < pdevice->memory.type_count);
-   struct anv_memory_type *mem_type =
+   const struct anv_memory_type *mem_type =
       &pdevice->memory.types[pAllocateInfo->memoryTypeIndex];
    assert(mem_type->heapIndex < pdevice->memory.heap_count);
    struct anv_memory_heap *mem_heap =
@@ -3914,9 +3923,10 @@ VkResult anv_AllocateMemory(
       alloc_flags |= ANV_BO_ALLOC_NO_LOCAL_MEM;
 
    /* If the allocated buffer might end up in local memory and it's host
-    * visible, make CPU writes are combined, it should be faster.
+    * visible and uncached, enable CPU write-combining. It should be faster.
     */
    if (!(alloc_flags & ANV_BO_ALLOC_NO_LOCAL_MEM) &&
+       (mem_type->propertyFlags & VK_MEMORY_PROPERTY_HOST_CACHED_BIT) == 0 &&
        (mem_type->propertyFlags & VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT))
       alloc_flags |= ANV_BO_ALLOC_WRITE_COMBINE;