From f6981ac595cc5e85880f190a0c898049a93ccd31 Mon Sep 17 00:00:00 2001 From: Lei Zhang Date: Tue, 3 Mar 2020 11:25:10 -0500 Subject: [PATCH] [mlir][vulkan-runner] Add basic timing for compute pipeline This commit adds timestamp query commands in the Vulkan runner's compute pipeline to gain insights into how long it takes to run the compute shader. This commit also adds timing from the CPU side for vkQueueSubmit and vkQueueWaitIdle. Differential Revision: https://reviews.llvm.org/D75531 --- mlir/tools/mlir-vulkan-runner/VulkanRuntime.cpp | 118 ++++++++++++++++++------ mlir/tools/mlir-vulkan-runner/VulkanRuntime.h | 26 ++++-- 2 files changed, 108 insertions(+), 36 deletions(-) diff --git a/mlir/tools/mlir-vulkan-runner/VulkanRuntime.cpp b/mlir/tools/mlir-vulkan-runner/VulkanRuntime.cpp index 9112566..d85db94 100644 --- a/mlir/tools/mlir-vulkan-runner/VulkanRuntime.cpp +++ b/mlir/tools/mlir-vulkan-runner/VulkanRuntime.cpp @@ -13,6 +13,9 @@ #include "VulkanRuntime.h" +#include "llvm/Support/Format.h" +#include + using namespace mlir; void VulkanRuntime::setNumWorkGroups(const NumWorkGroups &numberWorkGroups) { @@ -120,6 +123,7 @@ LogicalResult VulkanRuntime::destroy() { // Free and destroy. vkFreeCommandBuffers(device, commandPool, commandBuffers.size(), commandBuffers.data()); + vkDestroyQueryPool(device, queryPool, nullptr); vkDestroyCommandPool(device, commandPool, nullptr); vkFreeDescriptorSets(device, descriptorPool, descriptorSets.size(), descriptorSets.data()); @@ -162,18 +166,46 @@ LogicalResult VulkanRuntime::run() { failed(createComputePipeline()) || failed(createDescriptorPool()) || failed(allocateDescriptorSets()) || failed(setWriteDescriptors()) || // Create command buffer. - failed(createCommandPool()) || failed(createComputeCommandBuffer())) { + failed(createCommandPool()) || failed(createQueryPool()) || + failed(createComputeCommandBuffer())) { return failure(); } // Get working queue. 
vkGetDeviceQueue(device, queueFamilyIndex, 0, &queue); + auto submitStart = std::chrono::high_resolution_clock::now(); // Submit command buffer into the queue. if (failed(submitCommandBuffersToQueue())) return failure(); + auto submitEnd = std::chrono::high_resolution_clock::now(); RETURN_ON_VULKAN_ERROR(vkQueueWaitIdle(queue), "vkQueueWaitIdle"); + auto execEnd = std::chrono::high_resolution_clock::now(); + + auto submitDuration = std::chrono::duration_cast( + submitEnd - submitStart); + auto execDuration = std::chrono::duration_cast( + execEnd - submitEnd); + + if (queryPool != VK_NULL_HANDLE) { + uint64_t timestamps[2]; + RETURN_ON_VULKAN_ERROR( + vkGetQueryPoolResults( + device, queryPool, /*firstQuery=*/0, /*queryCount=*/2, + /*dataSize=*/sizeof(timestamps), + /*pData=*/reinterpret_cast(timestamps), + /*stride=*/sizeof(uint64_t), + VK_QUERY_RESULT_64_BIT | VK_QUERY_RESULT_WAIT_BIT), + "vkGetQueryPoolResults"); + float microsec = (timestamps[1] - timestamps[0]) * timestampPeriod / 1000; + llvm::outs() << "Compute shader execution time: " + << llvm::format("%0.3fus\n", microsec); + } + + llvm::outs() << "Command buffer submit time: " << submitDuration.count() + << "us\nWait idle time: " << execDuration.count() << "us\n"; + return success(); } @@ -218,8 +250,9 @@ LogicalResult VulkanRuntime::createDevice() { "physicalDeviceCount"); // TODO(denis0x0D): find the best device. 
- const auto &physicalDevice = physicalDevices.front(); - getBestComputeQueue(physicalDevice); + physicalDevice = physicalDevices.front(); + if (failed(getBestComputeQueue())) + return failure(); const float queuePrioritory = 1.0f; VkDeviceQueueCreateInfo deviceQueueCreateInfo = {}; @@ -275,39 +308,33 @@ LogicalResult VulkanRuntime::createDevice() { return success(); } -LogicalResult -VulkanRuntime::getBestComputeQueue(const VkPhysicalDevice &physicalDevice) { +LogicalResult VulkanRuntime::getBestComputeQueue() { uint32_t queueFamilyPropertiesCount = 0; vkGetPhysicalDeviceQueueFamilyProperties(physicalDevice, &queueFamilyPropertiesCount, 0); - SmallVector queueFamilyProperties( - queueFamilyPropertiesCount); - vkGetPhysicalDeviceQueueFamilyProperties(physicalDevice, - &queueFamilyPropertiesCount, - queueFamilyProperties.data()); + SmallVector familyProperties( + queueFamilyPropertiesCount); + vkGetPhysicalDeviceQueueFamilyProperties( + physicalDevice, &queueFamilyPropertiesCount, familyProperties.data()); // VK_QUEUE_COMPUTE_BIT specifies that queues in this queue family support - // compute operations. + // compute operations. Try to find a compute-only queue first if possible. for (uint32_t i = 0; i < queueFamilyPropertiesCount; ++i) { - const VkQueueFlags maskedFlags = - (~(VK_QUEUE_TRANSFER_BIT | VK_QUEUE_SPARSE_BINDING_BIT) & - queueFamilyProperties[i].queueFlags); - - if (!(VK_QUEUE_GRAPHICS_BIT & maskedFlags) && - (VK_QUEUE_COMPUTE_BIT & maskedFlags)) { + auto flags = familyProperties[i].queueFlags; + if ((flags & VK_QUEUE_COMPUTE_BIT) && !(flags & VK_QUEUE_GRAPHICS_BIT)) { queueFamilyIndex = i; + queueFamilyProperties = familyProperties[i]; return success(); } } + // Otherwise use a queue that can also support graphics. 
for (uint32_t i = 0; i < queueFamilyPropertiesCount; ++i) { - const VkQueueFlags maskedFlags = - (~(VK_QUEUE_TRANSFER_BIT | VK_QUEUE_SPARSE_BINDING_BIT) & - queueFamilyProperties[i].queueFlags); - - if (VK_QUEUE_COMPUTE_BIT & maskedFlags) { + auto flags = familyProperties[i].queueFlags; + if ((flags & VK_QUEUE_COMPUTE_BIT)) { queueFamilyIndex = i; + queueFamilyProperties = familyProperties[i]; return success(); } } @@ -627,21 +654,48 @@ LogicalResult VulkanRuntime::createCommandPool() { commandPoolCreateInfo.pNext = nullptr; commandPoolCreateInfo.flags = 0; commandPoolCreateInfo.queueFamilyIndex = queueFamilyIndex; - RETURN_ON_VULKAN_ERROR( - vkCreateCommandPool(device, &commandPoolCreateInfo, 0, &commandPool), - "vkCreateCommandPool"); + RETURN_ON_VULKAN_ERROR(vkCreateCommandPool(device, &commandPoolCreateInfo, + /*pAllocator=*/nullptr, + &commandPool), + "vkCreateCommandPool"); + return success(); +} + +LogicalResult VulkanRuntime::createQueryPool() { + // Return directly if timestamp query is not supported. + if (queueFamilyProperties.timestampValidBits == 0) + return success(); + + // Get timestamp period for this physical device. + VkPhysicalDeviceProperties deviceProperties = {}; + vkGetPhysicalDeviceProperties(physicalDevice, &deviceProperties); + timestampPeriod = deviceProperties.limits.timestampPeriod; + + // Create query pool. 
+ VkQueryPoolCreateInfo queryPoolCreateInfo = {}; + queryPoolCreateInfo.sType = VK_STRUCTURE_TYPE_QUERY_POOL_CREATE_INFO; + queryPoolCreateInfo.pNext = nullptr; + queryPoolCreateInfo.flags = 0; + queryPoolCreateInfo.queryType = VK_QUERY_TYPE_TIMESTAMP; + queryPoolCreateInfo.queryCount = 2; + queryPoolCreateInfo.pipelineStatistics = 0; + RETURN_ON_VULKAN_ERROR(vkCreateQueryPool(device, &queryPoolCreateInfo, + /*pAllocator=*/nullptr, &queryPool), + "vkCreateQueryPool"); + return success(); } LogicalResult VulkanRuntime::createComputeCommandBuffer() { VkCommandBufferAllocateInfo commandBufferAllocateInfo = {}; - VkCommandBuffer commandBuffer; commandBufferAllocateInfo.sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_ALLOCATE_INFO; commandBufferAllocateInfo.pNext = nullptr; commandBufferAllocateInfo.commandPool = commandPool; commandBufferAllocateInfo.level = VK_COMMAND_BUFFER_LEVEL_PRIMARY; commandBufferAllocateInfo.commandBufferCount = 1; + + VkCommandBuffer commandBuffer; RETURN_ON_VULKAN_ERROR(vkAllocateCommandBuffers(device, &commandBufferAllocateInfo, &commandBuffer), @@ -658,13 +712,23 @@ LogicalResult VulkanRuntime::createComputeCommandBuffer() { vkBeginCommandBuffer(commandBuffer, &commandBufferBeginInfo), "vkBeginCommandBuffer"); - // Commands. + if (queryPool != VK_NULL_HANDLE) + vkCmdResetQueryPool(commandBuffer, queryPool, 0, 2); + vkCmdBindPipeline(commandBuffer, VK_PIPELINE_BIND_POINT_COMPUTE, pipeline); vkCmdBindDescriptorSets(commandBuffer, VK_PIPELINE_BIND_POINT_COMPUTE, pipelineLayout, 0, descriptorSets.size(), descriptorSets.data(), 0, 0); + // Get a timestamp before invoking the compute shader. + if (queryPool != VK_NULL_HANDLE) + vkCmdWriteTimestamp(commandBuffer, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, + queryPool, 0); vkCmdDispatch(commandBuffer, numWorkGroups.x, numWorkGroups.y, numWorkGroups.z); + // Get another timestamp after invoking the compute shader. 
+ if (queryPool != VK_NULL_HANDLE) + vkCmdWriteTimestamp(commandBuffer, VK_PIPELINE_STAGE_BOTTOM_OF_PIPE_BIT, + queryPool, 1); // Commands end. RETURN_ON_VULKAN_ERROR(vkEndCommandBuffer(commandBuffer), diff --git a/mlir/tools/mlir-vulkan-runner/VulkanRuntime.h b/mlir/tools/mlir-vulkan-runner/VulkanRuntime.h index c137b67..9c63714 100644 --- a/mlir/tools/mlir-vulkan-runner/VulkanRuntime.h +++ b/mlir/tools/mlir-vulkan-runner/VulkanRuntime.h @@ -130,7 +130,7 @@ private: LogicalResult createInstance(); LogicalResult createDevice(); - LogicalResult getBestComputeQueue(const VkPhysicalDevice &physicalDevice); + LogicalResult getBestComputeQueue(); LogicalResult createMemoryBuffers(); LogicalResult createShaderModule(); void initDescriptorSetLayoutBindingMap(); @@ -141,6 +141,7 @@ private: LogicalResult allocateDescriptorSets(); LogicalResult setWriteDescriptors(); LogicalResult createCommandPool(); + LogicalResult createQueryPool(); LogicalResult createComputeCommandBuffer(); LogicalResult submitCommandBuffersToQueue(); @@ -164,9 +165,10 @@ private: // Vulkan objects. //===--------------------------------------------------------------------===// - VkInstance instance; - VkDevice device; - VkQueue queue; + VkInstance instance{VK_NULL_HANDLE}; + VkPhysicalDevice physicalDevice{VK_NULL_HANDLE}; + VkDevice device{VK_NULL_HANDLE}; + VkQueue queue{VK_NULL_HANDLE}; /// Specifies VulkanDeviceMemoryBuffers divided into sets. llvm::DenseMap descriptorSetLayouts; - VkPipelineLayout pipelineLayout; + VkPipelineLayout pipelineLayout{VK_NULL_HANDLE}; /// Specifies descriptor sets. llvm::SmallVector descriptorSets; @@ -191,11 +193,16 @@ private: /// Specifies a pool of descriptor set info, each descriptor set must have /// information such as type, index and amount of bindings. llvm::SmallVector descriptorSetInfoPool; - VkDescriptorPool descriptorPool; + VkDescriptorPool descriptorPool{VK_NULL_HANDLE}; + + /// Timestamp query. 
+ VkQueryPool queryPool{VK_NULL_HANDLE}; + // Number of nanoseconds for timestamp to increase by 1 + float timestampPeriod{0.f}; /// Computation pipeline. - VkPipeline pipeline; - VkCommandPool commandPool; + VkPipeline pipeline{VK_NULL_HANDLE}; + VkCommandPool commandPool{VK_NULL_HANDLE}; llvm::SmallVector commandBuffers; //===--------------------------------------------------------------------===// @@ -203,6 +210,7 @@ private: //===--------------------------------------------------------------------===// uint32_t queueFamilyIndex{0}; + VkQueueFamilyProperties queueFamilyProperties{}; uint32_t memoryTypeIndex{VK_MAX_MEMORY_TYPES}; VkDeviceSize memorySize{0}; -- 2.7.4