From 57c38a56696cd8a49786a884a9720e63c7ae0161 Mon Sep 17 00:00:00 2001 From: Faith Ekstrand Date: Wed, 2 Aug 2023 11:48:22 -0500 Subject: [PATCH] nvk: Implement CopyQueryPoolResults with a compute shader Part-of: --- src/nouveau/vulkan/nvk_query_pool.c | 325 +++++++++++++++++++++++++++++++++++- 1 file changed, 322 insertions(+), 3 deletions(-) diff --git a/src/nouveau/vulkan/nvk_query_pool.c b/src/nouveau/vulkan/nvk_query_pool.c index 397786e..cb2dc0a 100644 --- a/src/nouveau/vulkan/nvk_query_pool.c +++ b/src/nouveau/vulkan/nvk_query_pool.c @@ -6,10 +6,15 @@ #include "nvk_event.h" #include "nvk_mme.h" #include "nvk_physical_device.h" -#include "util/os_time.h" +#include "nvk_pipeline.h" +#include "compiler/nir/nir.h" +#include "compiler/nir/nir_builder.h" #include "nouveau_bo.h" #include "nouveau_context.h" +#include "util/os_time.h" +#include "vk_meta.h" +#include "vk_pipeline.h" #include "nvk_cl906f.h" #include "nvk_cl9097.h" @@ -105,6 +110,14 @@ nvk_query_available_addr(struct nvk_query_pool *pool, uint32_t query) return pool->bo->offset + query * sizeof(uint32_t); } +static nir_ssa_def * +nvk_nir_available_addr(nir_builder *b, nir_ssa_def *pool_addr, + nir_ssa_def *query) +{ + nir_ssa_def *offset = nir_imul_imm(b, query, sizeof(uint32_t)); + return nir_iadd(b, pool_addr, nir_u2u64(b, offset)); +} + static uint32_t * nvk_query_available_map(struct nvk_query_pool *pool, uint32_t query) { @@ -125,6 +138,16 @@ nvk_query_report_addr(struct nvk_query_pool *pool, uint32_t query) return pool->bo->offset + nvk_query_offset(pool, query); } +static nir_ssa_def * +nvk_nir_query_report_addr(nir_builder *b, nir_ssa_def *pool_addr, + nir_ssa_def *query_start, nir_ssa_def *query_stride, + nir_ssa_def *query) +{ + nir_ssa_def *offset = + nir_iadd(b, query_start, nir_umul_2x32_64(b, query, query_stride)); + return nir_iadd(b, pool_addr, offset); +} + static struct nvk_query_report * nvk_query_report_map(struct nvk_query_pool *pool, uint32_t query) { @@ -646,6 +669,302 @@ nvk_GetQueryPoolResults(VkDevice device, return status; } +struct nvk_copy_query_push { + uint64_t pool_addr; + uint32_t query_start; + uint32_t query_stride; + uint32_t first_query; + uint32_t query_count; + uint64_t dst_addr; + uint64_t dst_stride; + uint32_t flags; +}; + +static nir_ssa_def * +load_struct_var(nir_builder *b, nir_variable *var, uint32_t field) +{ + nir_deref_instr *deref = + nir_build_deref_struct(b, nir_build_deref_var(b, var), field); + return nir_load_deref(b, deref); +} + +static void +nir_write_query_result(nir_builder *b, nir_ssa_def *dst_addr, + nir_ssa_def *idx, nir_ssa_def *flags, + nir_ssa_def *result) +{ + assert(result->num_components == 1); + assert(result->bit_size == 64); + + nir_push_if(b, nir_test_mask(b, flags, VK_QUERY_RESULT_64_BIT)); + { + nir_ssa_def *offset = nir_i2i64(b, nir_imul_imm(b, idx, 8)); + nir_store_global(b, nir_iadd(b, dst_addr, offset), 8, result, 0x1); + } + nir_push_else(b, NULL); + { + nir_ssa_def *result32 = nir_u2u32(b, result); + nir_ssa_def *offset = nir_i2i64(b, nir_imul_imm(b, idx, 4)); + nir_store_global(b, nir_iadd(b, dst_addr, offset), 4, result32, 0x1); + } + nir_pop_if(b, NULL); +} + +static void +nir_get_query_delta(nir_builder *b, nir_ssa_def *dst_addr, + nir_ssa_def *report_addr, nir_ssa_def *idx, + nir_ssa_def *flags) +{ + nir_ssa_def *offset = + nir_imul_imm(b, idx, 2 * sizeof(struct nvk_query_report)); + nir_ssa_def *begin_addr = + nir_iadd(b, report_addr, nir_i2i64(b, offset)); + nir_ssa_def *end_addr = + nir_iadd_imm(b, begin_addr, sizeof(struct nvk_query_report)); + + /* nvk_query_report::timestamp is the first uint64_t */ + nir_ssa_def *begin = nir_load_global(b, begin_addr, 16, 1, 64); + nir_ssa_def *end = nir_load_global(b, end_addr, 16, 1, 64); + + nir_ssa_def *delta = nir_isub(b, end, begin); + + nir_write_query_result(b, dst_addr, idx, flags, delta); +} + +static void +nvk_nir_copy_query(nir_builder *b, nir_variable *push, nir_ssa_def *i) +{ + nir_ssa_def *pool_addr = load_struct_var(b, push, 0); + nir_ssa_def *query_start = nir_u2u64(b, load_struct_var(b, push, 1)); + nir_ssa_def *query_stride = load_struct_var(b, push, 2); + nir_ssa_def *first_query = load_struct_var(b, push, 3); + nir_ssa_def *dst_addr = load_struct_var(b, push, 5); + nir_ssa_def *dst_stride = load_struct_var(b, push, 6); + nir_ssa_def *flags = load_struct_var(b, push, 7); + + nir_ssa_def *query = nir_iadd(b, first_query, i); + + nir_ssa_def *avail_addr = nvk_nir_available_addr(b, pool_addr, query); + nir_ssa_def *available = + nir_i2b(b, nir_load_global(b, avail_addr, 4, 1, 32)); + + nir_ssa_def *partial = nir_test_mask(b, flags, VK_QUERY_RESULT_PARTIAL_BIT); + nir_ssa_def *write_results = nir_ior(b, available, partial); + + nir_ssa_def *report_addr = + nvk_nir_query_report_addr(b, pool_addr, query_start, query_stride, + query); + nir_ssa_def *dst_offset = nir_imul(b, nir_u2u64(b, i), dst_stride); + + /* Timestamp queries are the only ones use a single report */ + nir_ssa_def *is_timestamp = + nir_ieq_imm(b, query_stride, sizeof(struct nvk_query_report)); + + nir_ssa_def *one = nir_imm_int(b, 1); + nir_ssa_def *num_reports; + nir_push_if(b, is_timestamp); + { + nir_push_if(b, write_results); + { + /* This is the timestamp case. We add 8 because we're loading + * nvk_query_report::timestamp. + */ + nir_ssa_def *timestamp = + nir_load_global(b, nir_iadd_imm(b, report_addr, 8), 8, 1, 64); + + nir_write_query_result(b, nir_iadd(b, dst_addr, dst_offset), + nir_imm_int(b, 0), flags, timestamp); + } + nir_pop_if(b, NULL); + } + nir_push_else(b, NULL); + { + /* Everything that isn't a timestamp has the invariant that the + * number of destination entries is equal to the query stride divided + * by the size of two reports. + */ + num_reports = nir_udiv_imm(b, query_stride, + 2 * sizeof(struct nvk_query_report)); + + nir_push_if(b, write_results); + { + nir_variable *r = + nir_local_variable_create(b->impl, glsl_uint_type(), "r"); + nir_store_var(b, r, nir_imm_int(b, 0), 0x1); + + nir_push_loop(b); + { + nir_push_if(b, nir_ige(b, nir_load_var(b, r), num_reports)); + { + nir_jump(b, nir_jump_break); + } + nir_pop_if(b, NULL); + + nir_get_query_delta(b, nir_iadd(b, dst_addr, dst_offset), + report_addr, nir_load_var(b, r), flags); + + nir_store_var(b, r, nir_iadd_imm(b, nir_load_var(b, r), 1), 0x1); + } + nir_pop_loop(b, NULL); + } + nir_pop_if(b, NULL); + } + nir_pop_if(b, NULL); + + num_reports = nir_if_phi(b, one, num_reports); + + nir_push_if(b, nir_test_mask(b, flags, VK_QUERY_RESULT_WITH_AVAILABILITY_BIT)); + { + nir_write_query_result(b, nir_iadd(b, dst_addr, dst_offset), + num_reports, flags, nir_b2i64(b, available)); + } + nir_pop_if(b, NULL); +} + +static nir_shader * +build_copy_queries_shader(void) +{ + nir_builder build = + nir_builder_init_simple_shader(MESA_SHADER_COMPUTE, NULL, + "nvk-meta-copy-queries"); + nir_builder *b = &build; + + struct glsl_struct_field push_fields[] = { + { .type = glsl_uint64_t_type(), .name = "pool_addr", .offset = 0 }, + { .type = glsl_uint_type(), .name = "query_start", .offset = 8 }, + { .type = glsl_uint_type(), .name = "query_stride", .offset = 12 }, + { .type = glsl_uint_type(), .name = "first_query", .offset = 16 }, + { .type = glsl_uint_type(), .name = "query_count", .offset = 20 }, + { .type = glsl_uint64_t_type(), .name = "dst_addr", .offset = 24 }, + { .type = glsl_uint64_t_type(), .name = "dst_stride", .offset = 32 }, + { .type = glsl_uint_type(), .name = "flags", .offset = 40 }, + }; + const struct glsl_type *push_iface_type = + glsl_interface_type(push_fields, ARRAY_SIZE(push_fields), + GLSL_INTERFACE_PACKING_STD140, + false /* row_major */, "push"); + nir_variable *push = nir_variable_create(b->shader, nir_var_mem_push_const, + push_iface_type, "push"); + + nir_ssa_def *query_count = load_struct_var(b, push, 4); + + nir_variable *i = nir_local_variable_create(b->impl, glsl_uint_type(), "i"); + nir_store_var(b, i, nir_imm_int(b, 0), 0x1); + + nir_push_loop(b); + { + nir_push_if(b, nir_ige(b, nir_load_var(b, i), query_count)); + { + nir_jump(b, nir_jump_break); + } + nir_pop_if(b, NULL); + + nvk_nir_copy_query(b, push, nir_load_var(b, i)); + + nir_store_var(b, i, nir_iadd_imm(b, nir_load_var(b, i), 1), 0x1); + } + nir_pop_loop(b, NULL); + + return build.shader; +} + +static VkResult +get_copy_queries_pipeline(struct nvk_device *dev, + VkPipelineLayout layout, + VkPipeline *pipeline_out) +{ + const char key[] = "nvk-meta-copy-query-pool-results"; + VkPipeline cached = vk_meta_lookup_pipeline(&dev->meta, key, sizeof(key)); + if (cached != VK_NULL_HANDLE) { + *pipeline_out = cached; + return VK_SUCCESS; + } + + const VkPipelineShaderStageNirCreateInfoMESA nir_info = { + .sType = VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_NIR_CREATE_INFO_MESA, + .nir = build_copy_queries_shader(), + }; + const VkComputePipelineCreateInfo info = { + .sType = VK_STRUCTURE_TYPE_COMPUTE_PIPELINE_CREATE_INFO, + .stage = { + .sType = VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO, + .pNext = &nir_info, + .stage = VK_SHADER_STAGE_COMPUTE_BIT, + .pName = "main", + }, + .layout = layout, + }; + + return vk_meta_create_compute_pipeline(&dev->vk, &dev->meta, &info, + key, sizeof(key), pipeline_out); +} + +static void +nvk_meta_copy_query_pool_results(struct nvk_cmd_buffer *cmd, + struct nvk_query_pool *pool, + uint32_t first_query, + uint32_t query_count, + uint64_t dst_addr, + uint64_t dst_stride, + VkQueryResultFlags flags) +{ + struct nvk_device *dev = nvk_cmd_buffer_device(cmd); + struct nvk_descriptor_state *desc = &cmd->state.cs.descriptors; + VkResult result; + + const struct nvk_copy_query_push push = { + .pool_addr = pool->bo->offset, + .query_start = pool->query_start, + .query_stride = pool->query_stride, + .first_query = first_query, + .query_count = query_count, + .dst_addr = dst_addr, + .dst_stride = dst_stride, + .flags = flags, + }; + + const char key[] = "nvk-meta-copy-query-pool-results"; + const VkPushConstantRange push_range = { + .stageFlags = VK_SHADER_STAGE_COMPUTE_BIT, + .size = sizeof(push), + }; + VkPipelineLayout layout; + result = vk_meta_get_pipeline_layout(&dev->vk, &dev->meta, NULL, &push_range, + key, sizeof(key), &layout); + if (result != VK_SUCCESS) { + vk_command_buffer_set_error(&cmd->vk, result); + return; + } + + VkPipeline pipeline; + result = get_copy_queries_pipeline(dev, layout, &pipeline); + if (result != VK_SUCCESS) { + vk_command_buffer_set_error(&cmd->vk, result); + return; + } + + /* Save pipeline and push constants */ + struct nvk_compute_pipeline *pipeline_save = cmd->state.cs.pipeline; + uint8_t push_save[NVK_MAX_PUSH_SIZE]; + memcpy(push_save, desc->root.push, NVK_MAX_PUSH_SIZE); + + nvk_CmdBindPipeline(nvk_cmd_buffer_to_handle(cmd), + VK_PIPELINE_BIND_POINT_COMPUTE, pipeline); + + nvk_CmdPushConstants(nvk_cmd_buffer_to_handle(cmd), layout, + VK_SHADER_STAGE_COMPUTE_BIT, 0, sizeof(push), &push); + + nvk_CmdDispatchBase(nvk_cmd_buffer_to_handle(cmd), 0, 0, 0, 1, 1, 1); + + /* Restore pipeline and push constants */ + if (pipeline_save) { + nvk_CmdBindPipeline(nvk_cmd_buffer_to_handle(cmd), + VK_PIPELINE_BIND_POINT_COMPUTE, + nvk_pipeline_to_handle(&pipeline_save->base)); + } + memcpy(desc->root.push, push_save, NVK_MAX_PUSH_SIZE); +} + void nvk_mme_copy_queries(struct mme_builder *b) { @@ -812,6 +1131,6 @@ nvk_CmdCopyQueryPoolResults(VkCommandBuffer commandBuffer, } uint64_t dst_addr = nvk_buffer_address(dst_buffer, dstOffset); - nvk_cmd_copy_query_pool_results_mme(cmd, pool, firstQuery, queryCount, - dst_addr, stride, flags); + nvk_meta_copy_query_pool_results(cmd, pool, firstQuery, queryCount, + dst_addr, stride, flags); } -- 2.7.4