From 37a619f517a913f1a32297f5f3b08775347bddd0 Mon Sep 17 00:00:00 2001
From: Bas Nieuwenhuizen
Date: Mon, 27 Jun 2022 23:21:08 +0200
Subject: [PATCH] radv: Implement DGC generated command layout structure.

Reviewed-by: Samuel Pitoiset
Part-of:
---
 src/amd/vulkan/radv_device_generated_commands.c | 196 ++++++++++++++++++++++++
 src/amd/vulkan/radv_private.h                   |  30 ++++
 2 files changed, 226 insertions(+)

diff --git a/src/amd/vulkan/radv_device_generated_commands.c b/src/amd/vulkan/radv_device_generated_commands.c
index 8e41883..a6b2168 100644
--- a/src/amd/vulkan/radv_device_generated_commands.c
+++ b/src/amd/vulkan/radv_device_generated_commands.c
@@ -26,6 +26,88 @@
 
 #include "nir_builder.h"
 
+static void
+radv_get_sequence_size(const struct radv_indirect_command_layout *layout,
+                       const struct radv_graphics_pipeline *pipeline, uint32_t *cmd_size,
+                       uint32_t *upload_size)
+{
+   *cmd_size = 0;
+   *upload_size = 0;
+
+   if (layout->bind_vbo_mask) {
+      *upload_size += 16 * util_bitcount(pipeline->vb_desc_usage_mask);
+
+      /* One PKT3_SET_SH_REG for emitting VBO pointer (32-bit) */
+      *cmd_size += 3 * 4;
+   }
+
+   if (layout->push_constant_mask) {
+      bool need_copy = false;
+
+      for (unsigned i = 0; i < ARRAY_SIZE(pipeline->base.shaders); ++i) {
+         if (!pipeline->base.shaders[i])
+            continue;
+
+         struct radv_userdata_locations *locs = &pipeline->base.shaders[i]->info.user_sgprs_locs;
+         if (locs->shader_data[AC_UD_PUSH_CONSTANTS].sgpr_idx >= 0) {
+            /* One PKT3_SET_SH_REG for emitting push constants pointer (32-bit) */
+            *cmd_size += 3 * 4;
+            need_copy = true;
+         }
+         if (locs->shader_data[AC_UD_INLINE_PUSH_CONSTANTS].sgpr_idx >= 0)
+            /* One PKT3_SET_SH_REG writing all inline push constants. */
+            *cmd_size += (2 + locs->shader_data[AC_UD_INLINE_PUSH_CONSTANTS].num_sgprs) * 4;
+      }
+      if (need_copy)
+         *upload_size +=
+            align(pipeline->base.push_constant_size + 16 * pipeline->base.dynamic_offset_count, 16);
+   }
+
+   if (layout->binds_index_buffer) {
+      /* Index type write (normal reg write) + index buffer base write (64-bits, but special packet
+       * so only 1 word overhead) + index buffer size (again, special packet so only 1 word
+       * overhead)
+       */
+      *cmd_size += (3 + 3 + 2) * 4;
+   }
+
+   if (layout->indexed) {
+      /* userdata writes + instance count + indexed draw */
+      *cmd_size += (5 + 2 + 5) * 4;
+   } else {
+      /* userdata writes + instance count + non-indexed draw */
+      *cmd_size += (5 + 2 + 3) * 4;
+   }
+
+   if (layout->binds_state) {
+      /* One PKT3_SET_CONTEXT_REG (PA_SU_SC_MODE_CNTL) */
+      *cmd_size += 3 * 4;
+
+      if (pipeline->base.device->physical_device->rad_info.has_gfx9_scissor_bug) {
+         /* 1 reg write of 4 regs + 1 reg write of 2 regs per scissor */
+         *cmd_size += (8 + 2 * MAX_SCISSORS) * 4;
+      }
+   }
+}
+
+static uint32_t
+radv_align_cmdbuf_size(uint32_t size)
+{
+   return align(MAX2(1, size), 256);
+}
+
+uint32_t
+radv_get_indirect_cmdbuf_size(const VkGeneratedCommandsInfoNV *cmd_info)
+{
+   VK_FROM_HANDLE(radv_indirect_command_layout, layout, cmd_info->indirectCommandsLayout);
+   VK_FROM_HANDLE(radv_pipeline, pipeline, cmd_info->pipeline);
+   struct radv_graphics_pipeline *graphics_pipeline = radv_pipeline_to_graphics(pipeline);
+
+   uint32_t cmd_size, upload_size;
+   radv_get_sequence_size(layout, graphics_pipeline, &cmd_size, &upload_size);
+   return radv_align_cmdbuf_size(cmd_size * cmd_info->sequencesCount);
+}
+
 enum radv_dgc_token_type {
    RADV_DGC_INDEX_BUFFER,
    RADV_DGC_DRAW,
@@ -920,3 +1002,117 @@ fail:
    ralloc_free(cs);
    return result;
 }
+
+VkResult
+radv_CreateIndirectCommandsLayoutNV(VkDevice _device,
+                                    const VkIndirectCommandsLayoutCreateInfoNV *pCreateInfo,
+                                    const VkAllocationCallbacks *pAllocator,
+                                    VkIndirectCommandsLayoutNV *pIndirectCommandsLayout)
+{
+   RADV_FROM_HANDLE(radv_device, device, _device);
+   struct radv_indirect_command_layout *layout;
+
+   size_t size =
+      sizeof(*layout) + pCreateInfo->tokenCount * sizeof(VkIndirectCommandsLayoutTokenNV);
+
+   layout =
+      vk_zalloc2(&device->vk.alloc, pAllocator, size, alignof(struct radv_indirect_command_layout),
+                 VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
+   if (!layout)
+      return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
+
+   vk_object_base_init(&device->vk, &layout->base, VK_OBJECT_TYPE_INDIRECT_COMMANDS_LAYOUT_NV);
+
+   layout->input_stride = pCreateInfo->pStreamStrides[0];
+   layout->token_count = pCreateInfo->tokenCount;
+   typed_memcpy(layout->tokens, pCreateInfo->pTokens, pCreateInfo->tokenCount);
+
+   layout->ibo_type_32 = VK_INDEX_TYPE_UINT32;
+   layout->ibo_type_8 = VK_INDEX_TYPE_UINT8_EXT;
+
+   for (unsigned i = 0; i < pCreateInfo->tokenCount; ++i) {
+      switch (pCreateInfo->pTokens[i].tokenType) {
+      case VK_INDIRECT_COMMANDS_TOKEN_TYPE_DRAW_NV:
+         layout->draw_params_offset = pCreateInfo->pTokens[i].offset;
+         break;
+      case VK_INDIRECT_COMMANDS_TOKEN_TYPE_DRAW_INDEXED_NV:
+         layout->indexed = true;
+         layout->draw_params_offset = pCreateInfo->pTokens[i].offset;
+         break;
+      case VK_INDIRECT_COMMANDS_TOKEN_TYPE_INDEX_BUFFER_NV:
+         layout->binds_index_buffer = true;
+         layout->index_buffer_offset = pCreateInfo->pTokens[i].offset;
+         /* 16-bit is implied if we find no match. */
+         for (unsigned j = 0; j < pCreateInfo->pTokens[i].indexTypeCount; j++) {
+            if (pCreateInfo->pTokens[i].pIndexTypes[j] == VK_INDEX_TYPE_UINT32)
+               layout->ibo_type_32 = pCreateInfo->pTokens[i].pIndexTypeValues[j];
+            else if (pCreateInfo->pTokens[i].pIndexTypes[j] == VK_INDEX_TYPE_UINT8_EXT)
+               layout->ibo_type_8 = pCreateInfo->pTokens[i].pIndexTypeValues[j];
+         }
+         break;
+      case VK_INDIRECT_COMMANDS_TOKEN_TYPE_VERTEX_BUFFER_NV:
+         layout->bind_vbo_mask |= 1u << pCreateInfo->pTokens[i].vertexBindingUnit;
+         layout->vbo_offsets[pCreateInfo->pTokens[i].vertexBindingUnit] =
+            pCreateInfo->pTokens[i].offset;
+         if (pCreateInfo->pTokens[i].vertexDynamicStride)
+            layout->vbo_offsets[pCreateInfo->pTokens[i].vertexBindingUnit] |= 1u << 15;
+         break;
+      case VK_INDIRECT_COMMANDS_TOKEN_TYPE_PUSH_CONSTANT_NV:
+         for (unsigned j = pCreateInfo->pTokens[i].pushconstantOffset / 4, k = 0;
+              k < pCreateInfo->pTokens[i].pushconstantSize / 4; ++j, ++k) {
+            layout->push_constant_mask |= 1ull << j;
+            layout->push_constant_offsets[j] = pCreateInfo->pTokens[i].offset + k * 4;
+         }
+         break;
+      case VK_INDIRECT_COMMANDS_TOKEN_TYPE_STATE_FLAGS_NV:
+         layout->binds_state = true;
+         layout->state_offset = pCreateInfo->pTokens[i].offset;
+         break;
+      default:
+         unreachable("Unhandled token type");
+      }
+   }
+   if (!layout->indexed)
+      layout->binds_index_buffer = false;
+
+   *pIndirectCommandsLayout = radv_indirect_command_layout_to_handle(layout);
+   return VK_SUCCESS;
+}
+
+void
+radv_DestroyIndirectCommandsLayoutNV(VkDevice _device,
+                                     VkIndirectCommandsLayoutNV indirectCommandsLayout,
+                                     const VkAllocationCallbacks *pAllocator)
+{
+   RADV_FROM_HANDLE(radv_device, device, _device);
+   VK_FROM_HANDLE(radv_indirect_command_layout, layout, indirectCommandsLayout);
+
+   if (!layout)
+      return;
+
+   vk_object_base_finish(&layout->base);
+   vk_free2(&device->vk.alloc, pAllocator, layout);
+}
+
+void
+radv_GetGeneratedCommandsMemoryRequirementsNV(
+   VkDevice _device, const VkGeneratedCommandsMemoryRequirementsInfoNV *pInfo,
+   VkMemoryRequirements2 *pMemoryRequirements)
+{
+   RADV_FROM_HANDLE(radv_device, device, _device);
+   VK_FROM_HANDLE(radv_indirect_command_layout, layout, pInfo->indirectCommandsLayout);
+   VK_FROM_HANDLE(radv_pipeline, pipeline, pInfo->pipeline);
+   struct radv_graphics_pipeline *graphics_pipeline = radv_pipeline_to_graphics(pipeline);
+
+   uint32_t cmd_stride, upload_stride;
+   radv_get_sequence_size(layout, graphics_pipeline, &cmd_stride, &upload_stride);
+
+   VkDeviceSize cmd_buf_size = radv_align_cmdbuf_size(cmd_stride * pInfo->maxSequencesCount);
+   VkDeviceSize upload_buf_size = upload_stride * pInfo->maxSequencesCount;
+
+   pMemoryRequirements->memoryRequirements.memoryTypeBits =
+      device->physical_device->memory_types_32bit;
+   pMemoryRequirements->memoryRequirements.alignment = 256;
+   pMemoryRequirements->memoryRequirements.size =
+      align(cmd_buf_size + upload_buf_size, pMemoryRequirements->memoryRequirements.alignment);
+}
diff --git a/src/amd/vulkan/radv_private.h b/src/amd/vulkan/radv_private.h
index 900942b..f87724f 100644
--- a/src/amd/vulkan/radv_private.h
+++ b/src/amd/vulkan/radv_private.h
@@ -2991,6 +2991,34 @@ void radv_describe_barrier_end_delayed(struct radv_cmd_buffer *cmd_buffer);
 void radv_describe_layout_transition(struct radv_cmd_buffer *cmd_buffer,
                                      const struct radv_barrier_data *barrier);
 
+struct radv_indirect_command_layout {
+   struct vk_object_base base;
+
+   uint32_t input_stride;
+   uint32_t token_count;
+
+   bool indexed;
+   bool binds_index_buffer;
+   bool binds_state;
+   uint16_t draw_params_offset;
+   uint16_t index_buffer_offset;
+
+   uint16_t state_offset;
+
+   uint32_t bind_vbo_mask;
+   uint32_t vbo_offsets[MAX_VBS];
+
+   uint64_t push_constant_mask;
+   uint32_t push_constant_offsets[MAX_PUSH_CONSTANTS_SIZE / 4];
+
+   uint32_t ibo_type_32;
+   uint32_t ibo_type_8;
+
+   VkIndirectCommandsLayoutTokenNV tokens[0];
+};
+
+uint32_t radv_get_indirect_cmdbuf_size(const VkGeneratedCommandsInfoNV *cmd_info);
+
 uint64_t radv_get_current_time(void);
 
 static inline uint32_t
@@ -3256,6 +3284,8 @@
 VK_DEFINE_NONDISP_HANDLE_CASTS(radv_event, base, VkEvent, VK_OBJECT_TYPE_EVENT)
 VK_DEFINE_NONDISP_HANDLE_CASTS(radv_image, vk.base, VkImage, VK_OBJECT_TYPE_IMAGE)
 VK_DEFINE_NONDISP_HANDLE_CASTS(radv_image_view, vk.base, VkImageView, VK_OBJECT_TYPE_IMAGE_VIEW);
+VK_DEFINE_NONDISP_HANDLE_CASTS(radv_indirect_command_layout, base, VkIndirectCommandsLayoutNV,
+                               VK_OBJECT_TYPE_INDIRECT_COMMANDS_LAYOUT_NV)
 VK_DEFINE_NONDISP_HANDLE_CASTS(radv_pipeline_cache, base, VkPipelineCache,
                                VK_OBJECT_TYPE_PIPELINE_CACHE)
 VK_DEFINE_NONDISP_HANDLE_CASTS(radv_pipeline, base, VkPipeline,
-- 
2.7.4
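
Reviewer notes (illustrative only, not part of the commit):

The sketch below is a guess at the application-side layout creation that
radv_CreateIndirectCommandsLayoutNV consumes; the token mix, offsets and
stride are invented for the example. Note that the implementation only
reads pStreamStrides[0], i.e. a single input stream is assumed.

   /* Hypothetical per-sequence input record: an index-buffer bind followed
    * by an indexed draw. VkBindIndexBufferIndirectCommandNV is 16 bytes, so
    * the draw parameters start at offset 16 within each record. */
   VkIndirectCommandsLayoutTokenNV tokens[2] = {
      {
         .sType = VK_STRUCTURE_TYPE_INDIRECT_COMMANDS_LAYOUT_TOKEN_NV,
         .tokenType = VK_INDIRECT_COMMANDS_TOKEN_TYPE_INDEX_BUFFER_NV,
         .stream = 0,
         .offset = 0,  /* recorded as layout->index_buffer_offset */
      },
      {
         .sType = VK_STRUCTURE_TYPE_INDIRECT_COMMANDS_LAYOUT_TOKEN_NV,
         .tokenType = VK_INDIRECT_COMMANDS_TOKEN_TYPE_DRAW_INDEXED_NV,
         .stream = 0,
         .offset = 16, /* recorded as layout->draw_params_offset */
      },
   };
   uint32_t stride = 16 + sizeof(VkDrawIndexedIndirectCommand);

   VkIndirectCommandsLayoutCreateInfoNV info = {
      .sType = VK_STRUCTURE_TYPE_INDIRECT_COMMANDS_LAYOUT_CREATE_INFO_NV,
      .pipelineBindPoint = VK_PIPELINE_BIND_POINT_GRAPHICS,
      .tokenCount = 2,
      .pTokens = tokens,
      .streamCount = 1,
      .pStreamStrides = &stride, /* recorded as layout->input_stride */
   };

   VkIndirectCommandsLayoutNV layout;
   VkResult result = vkCreateIndirectCommandsLayoutNV(device, &info, NULL, &layout);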
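
The PUSH_CONSTANT token case builds a per-dword mask over the pipeline's
push-constant space plus, for each dword, the stream offset it is sourced
from. For a hypothetical token with pushconstantOffset = 8,
pushconstantSize = 12 and offset = 4, the loop runs j = 2..4, k = 0..2 and
produces:

   push_constant_mask       |= (1ull << 2) | (1ull << 3) | (1ull << 4)  /* 0x1c */
   push_constant_offsets[2]  = 4
   push_constant_offsets[3]  = 8
   push_constant_offsets[4]  = 12

i.e. dwords 2..4 of the push constants are refreshed from stream offsets
4, 8 and 12 of every sequence.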
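
As a cross-check on radv_get_sequence_size and the memory requirements,
the arithmetic for the two-token layout sketched above (index-buffer bind
plus indexed draw; no VBO, push-constant or state tokens), with a made-up
maxSequencesCount of 1024, works out to:

   cmd_size    = (3 + 3 + 2) * 4   /* index type + base + size writes  */
               + (5 + 2 + 5) * 4   /* userdata + instance count + draw */
               = 80 bytes per sequence
   upload_size = 0

   cmd_buf_size    = radv_align_cmdbuf_size(80 * 1024) = 81920  (already 256-aligned)
   upload_buf_size = 0
   reported size   = align(81920 + 0, 256) = 81920 bytes

The MAX2(1, size) in radv_align_cmdbuf_size keeps the buffer non-empty even
for a degenerate layout, and restricting memoryTypeBits to
memory_types_32bit is presumably tied to the "(32-bit)" pointer comments in
radv_get_sequence_size: the generated packets reference this memory with
32-bit addresses.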