#include "VulkanRuntime.h"
+#include "llvm/Support/Format.h"
+#include <chrono>
+
using namespace mlir;
void VulkanRuntime::setNumWorkGroups(const NumWorkGroups &numberWorkGroups) {
// Free and destroy.
vkFreeCommandBuffers(device, commandPool, commandBuffers.size(),
commandBuffers.data());
+ vkDestroyQueryPool(device, queryPool, nullptr);
vkDestroyCommandPool(device, commandPool, nullptr);
vkFreeDescriptorSets(device, descriptorPool, descriptorSets.size(),
descriptorSets.data());
failed(createComputePipeline()) || failed(createDescriptorPool()) ||
failed(allocateDescriptorSets()) || failed(setWriteDescriptors()) ||
// Create command buffer.
- failed(createCommandPool()) || failed(createComputeCommandBuffer())) {
+ failed(createCommandPool()) || failed(createQueryPool()) ||
+ failed(createComputeCommandBuffer())) {
return failure();
}
// Get working queue.
vkGetDeviceQueue(device, queueFamilyIndex, 0, &queue);
+ auto submitStart = std::chrono::high_resolution_clock::now();
// Submit command buffer into the queue.
if (failed(submitCommandBuffersToQueue()))
return failure();
+ auto submitEnd = std::chrono::high_resolution_clock::now();
RETURN_ON_VULKAN_ERROR(vkQueueWaitIdle(queue), "vkQueueWaitIdle");
+ auto execEnd = std::chrono::high_resolution_clock::now();
+
+ auto submitDuration = std::chrono::duration_cast<std::chrono::microseconds>(
+ submitEnd - submitStart);
+ auto execDuration = std::chrono::duration_cast<std::chrono::microseconds>(
+ execEnd - submitEnd);
+
+ if (queryPool != VK_NULL_HANDLE) {
+ uint64_t timestamps[2];
+ RETURN_ON_VULKAN_ERROR(
+ vkGetQueryPoolResults(
+ device, queryPool, /*firstQuery=*/0, /*queryCount=*/2,
+ /*dataSize=*/sizeof(timestamps),
+ /*pData=*/reinterpret_cast<void *>(timestamps),
+ /*stride=*/sizeof(uint64_t),
+ VK_QUERY_RESULT_64_BIT | VK_QUERY_RESULT_WAIT_BIT),
+ "vkGetQueryPoolResults");
+ float microsec = (timestamps[1] - timestamps[0]) * timestampPeriod / 1000;
+ llvm::outs() << "Compute shader execution time: "
+ << llvm::format("%0.3fus\n", microsec);
+ }
+
+ llvm::outs() << "Command buffer submit time: " << submitDuration.count()
+ << "us\nWait idle time: " << execDuration.count() << "us\n";
+
return success();
}
"physicalDeviceCount");
// TODO(denis0x0D): find the best device.
- const auto &physicalDevice = physicalDevices.front();
- getBestComputeQueue(physicalDevice);
+ physicalDevice = physicalDevices.front();
+ if (failed(getBestComputeQueue()))
+ return failure();
const float queuePrioritory = 1.0f;
VkDeviceQueueCreateInfo deviceQueueCreateInfo = {};
return success();
}
-LogicalResult
-VulkanRuntime::getBestComputeQueue(const VkPhysicalDevice &physicalDevice) {
+LogicalResult VulkanRuntime::getBestComputeQueue() {
uint32_t queueFamilyPropertiesCount = 0;
vkGetPhysicalDeviceQueueFamilyProperties(physicalDevice,
&queueFamilyPropertiesCount, 0);
- SmallVector<VkQueueFamilyProperties, 1> queueFamilyProperties(
- queueFamilyPropertiesCount);
- vkGetPhysicalDeviceQueueFamilyProperties(physicalDevice,
- &queueFamilyPropertiesCount,
- queueFamilyProperties.data());
+ SmallVector<VkQueueFamilyProperties, 1> familyProperties(
+ queueFamilyPropertiesCount);
+ vkGetPhysicalDeviceQueueFamilyProperties(
+ physicalDevice, &queueFamilyPropertiesCount, familyProperties.data());
// VK_QUEUE_COMPUTE_BIT specifies that queues in this queue family support
- // compute operations.
+ // compute operations. Try to find a compute-only queue first if possible.
for (uint32_t i = 0; i < queueFamilyPropertiesCount; ++i) {
- const VkQueueFlags maskedFlags =
- (~(VK_QUEUE_TRANSFER_BIT | VK_QUEUE_SPARSE_BINDING_BIT) &
- queueFamilyProperties[i].queueFlags);
-
- if (!(VK_QUEUE_GRAPHICS_BIT & maskedFlags) &&
- (VK_QUEUE_COMPUTE_BIT & maskedFlags)) {
+ auto flags = familyProperties[i].queueFlags;
+ if ((flags & VK_QUEUE_COMPUTE_BIT) && !(flags & VK_QUEUE_GRAPHICS_BIT)) {
queueFamilyIndex = i;
+ queueFamilyProperties = familyProperties[i];
return success();
}
}
+ // Otherwise use a queue that can also support graphics.
for (uint32_t i = 0; i < queueFamilyPropertiesCount; ++i) {
- const VkQueueFlags maskedFlags =
- (~(VK_QUEUE_TRANSFER_BIT | VK_QUEUE_SPARSE_BINDING_BIT) &
- queueFamilyProperties[i].queueFlags);
-
- if (VK_QUEUE_COMPUTE_BIT & maskedFlags) {
+ auto flags = familyProperties[i].queueFlags;
+ if ((flags & VK_QUEUE_COMPUTE_BIT)) {
queueFamilyIndex = i;
+ queueFamilyProperties = familyProperties[i];
return success();
}
}
commandPoolCreateInfo.pNext = nullptr;
commandPoolCreateInfo.flags = 0;
commandPoolCreateInfo.queueFamilyIndex = queueFamilyIndex;
- RETURN_ON_VULKAN_ERROR(
- vkCreateCommandPool(device, &commandPoolCreateInfo, 0, &commandPool),
- "vkCreateCommandPool");
+ RETURN_ON_VULKAN_ERROR(vkCreateCommandPool(device, &commandPoolCreateInfo,
+ /*pAllocator=*/nullptr,
+ &commandPool),
+ "vkCreateCommandPool");
+ return success();
+}
+
+LogicalResult VulkanRuntime::createQueryPool() {
+ // Return directly if timestamp query is not supported: the selected queue family reports timestampValidBits == 0. No pool is created and queryPool is left untouched.
+ if (queueFamilyProperties.timestampValidBits == 0)
+ return success();
+
+ // Get timestamp period for this physical device and cache it in the timestampPeriod member.
+ VkPhysicalDeviceProperties deviceProperties = {};
+ vkGetPhysicalDeviceProperties(physicalDevice, &deviceProperties);
+ timestampPeriod = deviceProperties.limits.timestampPeriod;
+
+ // Create a timestamp query pool holding two queries and store its handle in queryPool.
+ VkQueryPoolCreateInfo queryPoolCreateInfo = {};
+ queryPoolCreateInfo.sType = VK_STRUCTURE_TYPE_QUERY_POOL_CREATE_INFO;
+ queryPoolCreateInfo.pNext = nullptr;
+ queryPoolCreateInfo.flags = 0;
+ queryPoolCreateInfo.queryType = VK_QUERY_TYPE_TIMESTAMP;
+ queryPoolCreateInfo.queryCount = 2;
+ queryPoolCreateInfo.pipelineStatistics = 0;
+ RETURN_ON_VULKAN_ERROR(vkCreateQueryPool(device, &queryPoolCreateInfo,
+ /*pAllocator=*/nullptr, &queryPool),
+ "vkCreateQueryPool");
+
return success();
}
LogicalResult VulkanRuntime::createComputeCommandBuffer() {
VkCommandBufferAllocateInfo commandBufferAllocateInfo = {};
- VkCommandBuffer commandBuffer;
commandBufferAllocateInfo.sType =
VK_STRUCTURE_TYPE_COMMAND_BUFFER_ALLOCATE_INFO;
commandBufferAllocateInfo.pNext = nullptr;
commandBufferAllocateInfo.commandPool = commandPool;
commandBufferAllocateInfo.level = VK_COMMAND_BUFFER_LEVEL_PRIMARY;
commandBufferAllocateInfo.commandBufferCount = 1;
+
+ VkCommandBuffer commandBuffer;
RETURN_ON_VULKAN_ERROR(vkAllocateCommandBuffers(device,
&commandBufferAllocateInfo,
&commandBuffer),
vkBeginCommandBuffer(commandBuffer, &commandBufferBeginInfo),
"vkBeginCommandBuffer");
- // Commands.
+ if (queryPool != VK_NULL_HANDLE)
+ vkCmdResetQueryPool(commandBuffer, queryPool, 0, 2);
+
vkCmdBindPipeline(commandBuffer, VK_PIPELINE_BIND_POINT_COMPUTE, pipeline);
vkCmdBindDescriptorSets(commandBuffer, VK_PIPELINE_BIND_POINT_COMPUTE,
pipelineLayout, 0, descriptorSets.size(),
descriptorSets.data(), 0, 0);
+ // Get a timestamp before invoking the compute shader.
+ if (queryPool != VK_NULL_HANDLE)
+ vkCmdWriteTimestamp(commandBuffer, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT,
+ queryPool, 0);
vkCmdDispatch(commandBuffer, numWorkGroups.x, numWorkGroups.y,
numWorkGroups.z);
+ // Get another timestamp after invoking the compute shader.
+ if (queryPool != VK_NULL_HANDLE)
+ vkCmdWriteTimestamp(commandBuffer, VK_PIPELINE_STAGE_BOTTOM_OF_PIPE_BIT,
+ queryPool, 1);
// Commands end.
RETURN_ON_VULKAN_ERROR(vkEndCommandBuffer(commandBuffer),
LogicalResult createInstance();
LogicalResult createDevice();
- LogicalResult getBestComputeQueue(const VkPhysicalDevice &physicalDevice);
+ LogicalResult getBestComputeQueue();
LogicalResult createMemoryBuffers();
LogicalResult createShaderModule();
void initDescriptorSetLayoutBindingMap();
LogicalResult allocateDescriptorSets();
LogicalResult setWriteDescriptors();
LogicalResult createCommandPool();
+ LogicalResult createQueryPool();
LogicalResult createComputeCommandBuffer();
LogicalResult submitCommandBuffersToQueue();
// Vulkan objects.
//===--------------------------------------------------------------------===//
- VkInstance instance;
- VkDevice device;
- VkQueue queue;
+ VkInstance instance{VK_NULL_HANDLE};
+ VkPhysicalDevice physicalDevice{VK_NULL_HANDLE};
+ VkDevice device{VK_NULL_HANDLE};
+ VkQueue queue{VK_NULL_HANDLE};
/// Specifies VulkanDeviceMemoryBuffers divided into sets.
llvm::DenseMap<DescriptorSetIndex,
deviceMemoryBufferMap;
/// Specifies shader module.
- VkShaderModule shaderModule;
+ VkShaderModule shaderModule{VK_NULL_HANDLE};
/// Specifies layout bindings.
llvm::DenseMap<DescriptorSetIndex,
/// Specifies layouts of descriptor sets.
llvm::SmallVector<VkDescriptorSetLayout, 1> descriptorSetLayouts;
- VkPipelineLayout pipelineLayout;
+ VkPipelineLayout pipelineLayout{VK_NULL_HANDLE};
/// Specifies descriptor sets.
llvm::SmallVector<VkDescriptorSet, 1> descriptorSets;
/// Specifies a pool of descriptor set info, each descriptor set must have
/// information such as type, index and amount of bindings.
llvm::SmallVector<DescriptorSetInfo, 1> descriptorSetInfoPool;
- VkDescriptorPool descriptorPool;
+ VkDescriptorPool descriptorPool{VK_NULL_HANDLE};
+
+ /// Timestamp query.
+ VkQueryPool queryPool{VK_NULL_HANDLE};
+ // Number of nanoseconds for the timestamp to increase by 1.
+ float timestampPeriod{0.f};
/// Computation pipeline.
- VkPipeline pipeline;
- VkCommandPool commandPool;
+ VkPipeline pipeline{VK_NULL_HANDLE};
+ VkCommandPool commandPool{VK_NULL_HANDLE};
llvm::SmallVector<VkCommandBuffer, 1> commandBuffers;
//===--------------------------------------------------------------------===//
//===--------------------------------------------------------------------===//
uint32_t queueFamilyIndex{0};
+ VkQueueFamilyProperties queueFamilyProperties{};
uint32_t memoryTypeIndex{VK_MAX_MEMORY_TYPES};
VkDeviceSize memorySize{0};