From b1ddb3504027c23560b19bdb6291a817b45ab166 Mon Sep 17 00:00:00 2001
From: Bas Nieuwenhuizen
Date: Mon, 19 Sep 2022 23:56:59 +0200
Subject: [PATCH] radv: Translate the BVH copy shader to glsl from nir_builder.

Much easier to change.

Reviewed-by: Konstantin Seurer
Part-of:
---
 src/amd/vulkan/bvh/build_helpers.h           |   2 +
 src/amd/vulkan/bvh/build_interface.h         |  10 +
 src/amd/vulkan/bvh/copy.comp                 | 105 ++++++++++
 src/amd/vulkan/bvh/meson.build               |   3 +-
 src/amd/vulkan/radv_acceleration_structure.c | 291 ++-------------------------
 5 files changed, 132 insertions(+), 279 deletions(-)
 create mode 100644 src/amd/vulkan/bvh/copy.comp

diff --git a/src/amd/vulkan/bvh/build_helpers.h b/src/amd/vulkan/bvh/build_helpers.h
index e78188a..08fc354 100644
--- a/src/amd/vulkan/bvh/build_helpers.h
+++ b/src/amd/vulkan/bvh/build_helpers.h
@@ -192,6 +192,8 @@ TYPE(vec2, 4);
 TYPE(vec3, 4);
 TYPE(vec4, 4);
 
+TYPE(uvec4, 16);
+
 TYPE(VOID_REF, 8);
 
 void
diff --git a/src/amd/vulkan/bvh/build_interface.h b/src/amd/vulkan/bvh/build_interface.h
index 50ca014..cc40430 100644
--- a/src/amd/vulkan/bvh/build_interface.h
+++ b/src/amd/vulkan/bvh/build_interface.h
@@ -65,4 +65,14 @@ struct internal_args {
    uint32_t fill_count;
 };
 
+#define RADV_COPY_MODE_COPY 0
+#define RADV_COPY_MODE_SERIALIZE 1
+#define RADV_COPY_MODE_DESERIALIZE 2
+
+struct copy_args {
+   VOID_REF src_addr;
+   VOID_REF dst_addr;
+   uint32_t mode;
+};
+
 #endif
diff --git a/src/amd/vulkan/bvh/copy.comp b/src/amd/vulkan/bvh/copy.comp
new file mode 100644
index 0000000..6bd5a73
--- /dev/null
+++ b/src/amd/vulkan/bvh/copy.comp
@@ -0,0 +1,105 @@
+/*
+ * Copyright © 2022 Bas Nieuwenhuizen
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#version 460
+
+#extension GL_GOOGLE_include_directive : require
+
+#extension GL_EXT_shader_explicit_arithmetic_types_int8 : require
+#extension GL_EXT_shader_explicit_arithmetic_types_int16 : require
+#extension GL_EXT_shader_explicit_arithmetic_types_int32 : require
+#extension GL_EXT_shader_explicit_arithmetic_types_int64 : require
+#extension GL_EXT_scalar_block_layout : require
+#extension GL_EXT_buffer_reference : require
+#extension GL_EXT_buffer_reference2 : require
+
+layout(local_size_x = 64, local_size_y = 1, local_size_z = 1) in;
+
+#include "build_interface.h"
+
+layout(push_constant) uniform CONSTS {
+   copy_args args;
+};
+
+void
+main(void)
+{
+   uint32_t global_id = gl_GlobalInvocationID.x;
+   uint32_t lanes = gl_NumWorkGroups.x * 64;
+   uint32_t increment = lanes * 16;
+
+   uint64_t copy_src_addr = args.src_addr;
+   uint64_t copy_dst_addr = args.dst_addr;
+
+   if (args.mode == RADV_COPY_MODE_DESERIALIZE) {
+      copy_src_addr += SIZEOF(radv_accel_struct_serialization_header) +
+                       DEREF(REF(radv_accel_struct_serialization_header)(args.src_addr)).instance_count * SIZEOF(uint64_t);
+   }
+
+   REF(radv_accel_struct_header) header = REF(radv_accel_struct_header)(copy_src_addr);
+
+   uint64_t instance_base = args.src_addr + SIZEOF(radv_accel_struct_serialization_header);
+   uint64_t node_offset = DEREF(header).instance_offset;
+   uint64_t node_end = DEREF(header).instance_count * SIZEOF(radv_bvh_instance_node);
+   if (node_end > 0)
+      node_end += node_offset;
+
+   if (args.mode == RADV_COPY_MODE_SERIALIZE) {
+      copy_dst_addr += SIZEOF(radv_accel_struct_serialization_header) +
+                       DEREF(REF(radv_accel_struct_header)(args.src_addr)).instance_count * SIZEOF(uint64_t);
+
+      if (global_id == 0) {
+         REF(radv_accel_struct_serialization_header) ser_header =
+            REF(radv_accel_struct_serialization_header)(args.dst_addr);
+         DEREF(ser_header).serialization_size = DEREF(header).serialization_size;
+         DEREF(ser_header).compacted_size = DEREF(header).compacted_size;
+         DEREF(ser_header).instance_count = DEREF(header).instance_count;
+      }
+
+      instance_base = args.dst_addr + SIZEOF(radv_accel_struct_serialization_header);
+   } else if (args.mode == RADV_COPY_MODE_COPY)
+      node_end = 0;
+
+   uint64_t size = DEREF(header).compacted_size;
+   for (uint64_t offset = global_id * 16; offset < size; offset += increment) {
+      DEREF(REF(uvec4)(copy_dst_addr + offset)) =
+         DEREF(REF(uvec4)(copy_src_addr + offset));
+
+      /* Do the adjustment inline in the same invocation that copies the data so that we don't have
+       * to synchronize. */
+      if (offset < node_end && offset >= node_offset &&
+          (offset - node_offset) % SIZEOF(radv_bvh_instance_node) == 0) {
+         uint64_t idx = (offset - node_offset) / SIZEOF(radv_bvh_instance_node);
+
+         if (args.mode == RADV_COPY_MODE_SERIALIZE) {
+            DEREF(INDEX(uint64_t, instance_base, idx)) =
+               DEREF(REF(radv_bvh_instance_node)(copy_src_addr + offset)).base_ptr;
+         } else { /* RADV_COPY_MODE_DESERIALIZE */
+            uint64_t blas_addr = DEREF(INDEX(uint64_t, instance_base, idx));
+
+            DEREF(REF(radv_bvh_instance_node)(copy_dst_addr + offset)).base_ptr = blas_addr;
+         }
+      }
+   }
+}
diff --git a/src/amd/vulkan/bvh/meson.build b/src/amd/vulkan/bvh/meson.build
index 6b785bb..21a709d 100644
--- a/src/amd/vulkan/bvh/meson.build
+++ b/src/amd/vulkan/bvh/meson.build
@@ -19,9 +19,10 @@
 # SOFTWARE.
 
 bvh_shaders = [
+  'copy.comp',
   'internal.comp',
   'leaf.comp',
-  'morton.comp'
+  'morton.comp',
 ]
 
 bvh_include_dir = meson.source_root() + '/src/amd/vulkan/bvh'
diff --git a/src/amd/vulkan/radv_acceleration_structure.c b/src/amd/vulkan/radv_acceleration_structure.c
index 71b0cbc..c6cee6d 100644
--- a/src/amd/vulkan/radv_acceleration_structure.c
+++ b/src/amd/vulkan/radv_acceleration_structure.c
@@ -43,6 +43,10 @@ static const uint32_t internal_spv[] = {
 #include "bvh/internal.comp.spv.h"
 };
 
+static const uint32_t copy_spv[] = {
+#include "bvh/copy.comp.spv.h"
+};
+
 /* Min and max bounds of the bvh used to compute morton codes */
 #define SCRATCH_TOTAL_BOUNDS_SIZE (6 * sizeof(float))
 
@@ -209,228 +213,6 @@ create_accel_build_shader(struct radv_device *device, const char *name)
    return b;
 }
 
-enum copy_mode {
-   COPY_MODE_COPY,
-   COPY_MODE_SERIALIZE,
-   COPY_MODE_DESERIALIZE,
-};
-
-struct copy_constants {
-   uint64_t src_addr;
-   uint64_t dst_addr;
-   uint32_t mode;
-};
-
-static nir_shader *
-build_copy_shader(struct radv_device *dev)
-{
-   nir_builder b = create_accel_build_shader(dev, "accel_copy");
-
-   nir_ssa_def *invoc_id = nir_load_local_invocation_id(&b);
-   nir_ssa_def *wg_id = nir_load_workgroup_id(&b, 32);
-   nir_ssa_def *block_size =
-      nir_imm_ivec4(&b, b.shader->info.workgroup_size[0], b.shader->info.workgroup_size[1],
-                    b.shader->info.workgroup_size[2], 0);
-
-   nir_ssa_def *global_id =
-      nir_channel(&b, nir_iadd(&b, nir_imul(&b, wg_id, block_size), invoc_id), 0);
-
-   nir_variable *offset_var =
-      nir_variable_create(b.shader, nir_var_shader_temp, glsl_uint_type(), "offset");
-   nir_ssa_def *offset = nir_imul_imm(&b, global_id, 16);
-   nir_store_var(&b, offset_var, offset, 1);
-
-   nir_ssa_def *increment = nir_imul_imm(&b, nir_channel(&b, nir_load_num_workgroups(&b, 32), 0),
-                                         b.shader->info.workgroup_size[0] * 16);
-
-   nir_ssa_def *pconst0 =
-      nir_load_push_constant(&b, 4, 32, nir_imm_int(&b, 0), .base = 0, .range = 16);
-   nir_ssa_def *pconst1 =
-      nir_load_push_constant(&b, 1, 32, nir_imm_int(&b, 0), .base = 16, .range = 4);
-   nir_ssa_def *src_base_addr = nir_pack_64_2x32(&b, nir_channels(&b, pconst0, 0b0011));
-   nir_ssa_def *dst_base_addr = nir_pack_64_2x32(&b, nir_channels(&b, pconst0, 0b1100));
-   nir_ssa_def *mode = nir_channel(&b, pconst1, 0);
-
-   nir_variable *compacted_size_var =
-      nir_variable_create(b.shader, nir_var_shader_temp, glsl_uint64_t_type(), "compacted_size");
-   nir_variable *src_offset_var =
-      nir_variable_create(b.shader, nir_var_shader_temp, glsl_uint_type(), "src_offset");
-   nir_variable *dst_offset_var =
-      nir_variable_create(b.shader, nir_var_shader_temp, glsl_uint_type(), "dst_offset");
-   nir_variable *instance_offset_var =
-      nir_variable_create(b.shader, nir_var_shader_temp, glsl_uint_type(), "instance_offset");
-   nir_variable *instance_count_var =
-      nir_variable_create(b.shader, nir_var_shader_temp, glsl_uint_type(), "instance_count");
-   nir_variable *value_var =
-      nir_variable_create(b.shader, nir_var_shader_temp, glsl_vec4_type(), "value");
-
-   nir_push_if(&b, nir_ieq_imm(&b, mode, COPY_MODE_SERIALIZE));
-   {
-      nir_ssa_def *instance_count = nir_build_load_global(
-         &b, 1, 32,
-         nir_iadd_imm(&b, src_base_addr,
-                      offsetof(struct radv_accel_struct_header, instance_count)));
-      nir_ssa_def *compacted_size = nir_build_load_global(
-         &b, 1, 64,
-         nir_iadd_imm(&b, src_base_addr,
-                      offsetof(struct radv_accel_struct_header, compacted_size)));
-      nir_ssa_def *serialization_size = nir_build_load_global(
-         &b, 1, 64,
-         nir_iadd_imm(&b, src_base_addr,
-                      offsetof(struct radv_accel_struct_header, serialization_size)));
-
-      nir_store_var(&b, compacted_size_var, compacted_size, 1);
-      nir_store_var(&b, instance_offset_var,
-                    nir_build_load_global(
-                       &b, 1, 32,
-                       nir_iadd_imm(&b, src_base_addr,
-                                    offsetof(struct radv_accel_struct_header, instance_offset))),
-                    1);
-      nir_store_var(&b, instance_count_var, instance_count, 1);
-
-      nir_ssa_def *dst_offset = nir_iadd_imm(&b, nir_imul_imm(&b, instance_count, sizeof(uint64_t)),
-                                             sizeof(struct radv_accel_struct_serialization_header));
-      nir_store_var(&b, src_offset_var, nir_imm_int(&b, 0), 1);
-      nir_store_var(&b, dst_offset_var, dst_offset, 1);
-
-      nir_push_if(&b, nir_ieq_imm(&b, global_id, 0));
-      {
-         nir_build_store_global(&b, serialization_size,
-                                nir_iadd_imm(&b, dst_base_addr,
-                                             offsetof(struct radv_accel_struct_serialization_header,
-                                                      serialization_size)));
-         nir_build_store_global(
-            &b, compacted_size,
-            nir_iadd_imm(&b, dst_base_addr,
-                         offsetof(struct radv_accel_struct_serialization_header, compacted_size)));
-         nir_build_store_global(
-            &b, nir_u2u64(&b, instance_count),
-            nir_iadd_imm(&b, dst_base_addr,
-                         offsetof(struct radv_accel_struct_serialization_header, instance_count)));
-      }
-      nir_pop_if(&b, NULL);
-   }
-   nir_push_else(&b, NULL);
-   nir_push_if(&b, nir_ieq_imm(&b, mode, COPY_MODE_DESERIALIZE));
-   {
-      nir_ssa_def *instance_count = nir_build_load_global(
-         &b, 1, 32,
-         nir_iadd_imm(&b, src_base_addr,
-                      offsetof(struct radv_accel_struct_serialization_header, instance_count)));
-      nir_ssa_def *src_offset = nir_iadd_imm(&b, nir_imul_imm(&b, instance_count, sizeof(uint64_t)),
-                                             sizeof(struct radv_accel_struct_serialization_header));
-
-      nir_ssa_def *header_addr = nir_iadd(&b, src_base_addr, nir_u2u64(&b, src_offset));
-      nir_store_var(&b, compacted_size_var,
-                    nir_build_load_global(
-                       &b, 1, 64,
-                       nir_iadd_imm(&b, header_addr,
-                                    offsetof(struct radv_accel_struct_header, compacted_size))),
-                    1);
-      nir_store_var(&b, instance_offset_var,
-                    nir_build_load_global(
-                       &b, 1, 32,
-                       nir_iadd_imm(&b, header_addr,
-                                    offsetof(struct radv_accel_struct_header, instance_offset))),
-                    1);
-      nir_store_var(&b, instance_count_var, instance_count, 1);
-      nir_store_var(&b, src_offset_var, src_offset, 1);
-      nir_store_var(&b, dst_offset_var, nir_imm_int(&b, 0), 1);
-   }
-   nir_push_else(&b, NULL); /* COPY_MODE_COPY */
-   {
-      nir_store_var(&b, compacted_size_var,
-                    nir_build_load_global(
-                       &b, 1, 64,
-                       nir_iadd_imm(&b, src_base_addr,
-                                    offsetof(struct radv_accel_struct_header, compacted_size))),
-                    1);
-
-      nir_store_var(&b, src_offset_var, nir_imm_int(&b, 0), 1);
-      nir_store_var(&b, dst_offset_var, nir_imm_int(&b, 0), 1);
-      nir_store_var(&b, instance_offset_var, nir_imm_int(&b, 0), 1);
-      nir_store_var(&b, instance_count_var, nir_imm_int(&b, 0), 1);
-   }
-   nir_pop_if(&b, NULL);
-   nir_pop_if(&b, NULL);
-
-   nir_ssa_def *instance_bound =
-      nir_imul_imm(&b, nir_load_var(&b, instance_count_var), sizeof(struct radv_bvh_instance_node));
-   nir_ssa_def *compacted_size = nir_build_load_global(
-      &b, 1, 32,
-      nir_iadd_imm(&b, src_base_addr, offsetof(struct radv_accel_struct_header, compacted_size)));
-
-   nir_push_loop(&b);
-   {
-      offset = nir_load_var(&b, offset_var);
-      nir_push_if(&b, nir_ilt(&b, offset, compacted_size));
-      {
-         nir_ssa_def *src_offset = nir_iadd(&b, offset, nir_load_var(&b, src_offset_var));
-         nir_ssa_def *dst_offset = nir_iadd(&b, offset, nir_load_var(&b, dst_offset_var));
-         nir_ssa_def *src_addr = nir_iadd(&b, src_base_addr, nir_u2u64(&b, src_offset));
-         nir_ssa_def *dst_addr = nir_iadd(&b, dst_base_addr, nir_u2u64(&b, dst_offset));
-
-         nir_ssa_def *value = nir_build_load_global(&b, 4, 32, src_addr, .align_mul = 16);
-         nir_store_var(&b, value_var, value, 0xf);
-
-         nir_ssa_def *instance_offset = nir_isub(&b, offset, nir_load_var(&b, instance_offset_var));
-         nir_ssa_def *in_instance_bound =
-            nir_iand(&b, nir_uge(&b, offset, nir_load_var(&b, instance_offset_var)),
-                     nir_ult(&b, instance_offset, instance_bound));
-         nir_ssa_def *instance_start = nir_ieq_imm(
-            &b, nir_iand_imm(&b, instance_offset, sizeof(struct radv_bvh_instance_node) - 1), 0);
-
-         nir_push_if(&b, nir_iand(&b, in_instance_bound, instance_start));
-         {
-            nir_ssa_def *instance_id = nir_ushr_imm(&b, instance_offset, 7);
-
-            nir_push_if(&b, nir_ieq_imm(&b, mode, COPY_MODE_SERIALIZE));
-            {
-               nir_ssa_def *instance_addr = nir_imul_imm(&b, instance_id, sizeof(uint64_t));
-               instance_addr = nir_iadd_imm(&b, instance_addr,
-                                            sizeof(struct radv_accel_struct_serialization_header));
-               instance_addr = nir_iadd(&b, dst_base_addr, nir_u2u64(&b, instance_addr));
-
-               nir_build_store_global(&b, nir_channels(&b, value, 3), instance_addr,
-                                      .align_mul = 8);
-            }
-            nir_push_else(&b, NULL);
-            {
-               nir_ssa_def *instance_addr = nir_imul_imm(&b, instance_id, sizeof(uint64_t));
-               instance_addr = nir_iadd_imm(&b, instance_addr,
-                                            sizeof(struct radv_accel_struct_serialization_header));
-               instance_addr = nir_iadd(&b, src_base_addr, nir_u2u64(&b, instance_addr));
-
-               nir_ssa_def *instance_value =
-                  nir_build_load_global(&b, 2, 32, instance_addr, .align_mul = 8);
-
-               nir_ssa_def *values[] = {
-                  nir_channel(&b, instance_value, 0),
-                  nir_channel(&b, instance_value, 1),
-                  nir_channel(&b, value, 2),
-                  nir_channel(&b, value, 3),
-               };
-
-               nir_store_var(&b, value_var, nir_vec(&b, values, 4), 0xf);
-            }
-            nir_pop_if(&b, NULL);
-         }
-         nir_pop_if(&b, NULL);
-
-         nir_store_var(&b, offset_var, nir_iadd(&b, offset, increment), 1);
-
-         nir_build_store_global(&b, nir_load_var(&b, value_var), dst_addr, .align_mul = 16);
-      }
-      nir_push_else(&b, NULL);
-      {
-         nir_jump(&b, nir_jump_break);
-      }
-      nir_pop_if(&b, NULL);
-   }
-   nir_pop_loop(&b, NULL);
-   return b.shader;
-}
-
 void
 radv_device_finish_accel_struct_build_state(struct radv_device *device)
 {
@@ -458,52 +240,6 @@ radv_device_finish_accel_struct_build_state(struct radv_device *device)
 }
 
 static VkResult
-create_build_pipeline(struct radv_device *device, nir_shader *shader, unsigned push_constant_size,
-                      VkPipeline *pipeline, VkPipelineLayout *layout)
-{
-   const VkPipelineLayoutCreateInfo pl_create_info = {
-      .sType = VK_STRUCTURE_TYPE_PIPELINE_LAYOUT_CREATE_INFO,
-      .setLayoutCount = 0,
-      .pushConstantRangeCount = 1,
-      .pPushConstantRanges =
-         &(VkPushConstantRange){VK_SHADER_STAGE_COMPUTE_BIT, 0, push_constant_size},
-   };
-
-   VkResult result = radv_CreatePipelineLayout(radv_device_to_handle(device), &pl_create_info,
-                                               &device->meta_state.alloc, layout);
-   if (result != VK_SUCCESS) {
-      ralloc_free(shader);
-      return result;
-   }
-
-   VkPipelineShaderStageCreateInfo shader_stage = {
-      .sType = VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO,
-      .stage = VK_SHADER_STAGE_COMPUTE_BIT,
-      .module = vk_shader_module_handle_from_nir(shader),
-      .pName = "main",
-      .pSpecializationInfo = NULL,
-   };
-
-   VkComputePipelineCreateInfo pipeline_info = {
-      .sType = VK_STRUCTURE_TYPE_COMPUTE_PIPELINE_CREATE_INFO,
-      .stage = shader_stage,
-      .flags = 0,
-      .layout = *layout,
-   };
-
-   result = radv_CreateComputePipelines(radv_device_to_handle(device),
-                                        radv_pipeline_cache_to_handle(&device->meta_state.cache), 1,
-                                        &pipeline_info, &device->meta_state.alloc, pipeline);
-
-   if (result != VK_SUCCESS) {
-      ralloc_free(shader);
-      return result;
-   }
-
-   return VK_SUCCESS;
-}
-
-static VkResult
 create_build_pipeline_spv(struct radv_device *device, const uint32_t *spv, uint32_t spv_size,
                           unsigned push_constant_size, VkPipeline *pipeline,
                           VkPipelineLayout *layout)
@@ -577,7 +313,6 @@ VkResult
 radv_device_init_accel_struct_build_state(struct radv_device *device)
 {
    VkResult result;
-   nir_shader *copy_cs = build_copy_shader(device);
 
    result = create_build_pipeline_spv(device, leaf_spv, sizeof(leaf_spv), sizeof(struct leaf_args),
                                       &device->meta_state.accel_struct_build.leaf_pipeline,
@@ -592,9 +327,9 @@ radv_device_init_accel_struct_build_state(struct radv_device *device)
    if (result != VK_SUCCESS)
       return result;
 
-   result = create_build_pipeline(device, copy_cs, sizeof(struct copy_constants),
-                                  &device->meta_state.accel_struct_build.copy_pipeline,
-                                  &device->meta_state.accel_struct_build.copy_p_layout);
+   result = create_build_pipeline_spv(device, copy_spv, sizeof(copy_spv), sizeof(struct copy_args),
+                                      &device->meta_state.accel_struct_build.copy_pipeline,
+                                      &device->meta_state.accel_struct_build.copy_p_layout);
    if (result != VK_SUCCESS)
       return result;
 
@@ -950,10 +685,10 @@ radv_CmdCopyAccelerationStructureKHR(VkCommandBuffer commandBuffer,
    radv_CmdBindPipeline(radv_cmd_buffer_to_handle(cmd_buffer), VK_PIPELINE_BIND_POINT_COMPUTE,
                         cmd_buffer->device->meta_state.accel_struct_build.copy_pipeline);
 
-   const struct copy_constants consts = {
+   const struct copy_args consts = {
       .src_addr = src->va,
       .dst_addr = dst->va,
-      .mode = COPY_MODE_COPY,
+      .mode = RADV_COPY_MODE_COPY,
    };
 
    radv_CmdPushConstants(radv_cmd_buffer_to_handle(cmd_buffer),
@@ -1017,10 +752,10 @@ radv_CmdCopyMemoryToAccelerationStructureKHR(
    radv_CmdBindPipeline(radv_cmd_buffer_to_handle(cmd_buffer), VK_PIPELINE_BIND_POINT_COMPUTE,
                         cmd_buffer->device->meta_state.accel_struct_build.copy_pipeline);
 
-   const struct copy_constants consts = {
+   const struct copy_args consts = {
      .src_addr = pInfo->src.deviceAddress,
      .dst_addr = dst->va,
-     .mode = COPY_MODE_DESERIALIZE,
+     .mode = RADV_COPY_MODE_DESERIALIZE,
    };
 
    radv_CmdPushConstants(radv_cmd_buffer_to_handle(cmd_buffer),
@@ -1046,10 +781,10 @@ radv_CmdCopyAccelerationStructureToMemoryKHR(
    radv_CmdBindPipeline(radv_cmd_buffer_to_handle(cmd_buffer), VK_PIPELINE_BIND_POINT_COMPUTE,
                         cmd_buffer->device->meta_state.accel_struct_build.copy_pipeline);
 
-   const struct copy_constants consts = {
+   const struct copy_args consts = {
      .src_addr = src->va,
      .dst_addr = pInfo->dst.deviceAddress,
-     .mode = COPY_MODE_SERIALIZE,
+     .mode = RADV_COPY_MODE_SERIALIZE,
    };
 
    radv_CmdPushConstants(radv_cmd_buffer_to_handle(cmd_buffer),
-- 
2.7.4
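
Note on the offset math in copy.comp, for review: serialize and deserialize both
assume the serialized blob layout RADV uses, i.e. a serialization header, then
one 64-bit BLAS pointer per instance, then the BVH data itself. A minimal
host-side sketch of the same arithmetic (illustrative only: it assumes just the
three header fields this patch writes; the real
radv_accel_struct_serialization_header carries additional identification fields
not shown in this diff):

   #include <stdint.h>

   /* Illustrative stand-in for the driver's serialization header. */
   struct example_serialization_header {
      uint64_t serialization_size;
      uint64_t compacted_size;
      uint64_t instance_count;
   };

   /* Byte offset of the BVH payload inside a serialized blob: header first,
    * then instance_count 64-bit BLAS pointers. This is the same adjustment
    * copy.comp applies to copy_src_addr (deserialize) and copy_dst_addr
    * (serialize). */
   static uint64_t bvh_payload_offset(uint64_t instance_count)
   {
      return sizeof(struct example_serialization_header) +
             instance_count * sizeof(uint64_t);
   }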
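
The shader copies with a grid-stride loop: invocation i handles the 16-byte
chunk at offset i * 16 and then strides by gl_NumWorkGroups.x * 64 * 16 bytes,
so any dispatch size yields a complete copy with no inter-workgroup
synchronization. A sketch of the workgroup count this implies (the dispatch
code itself is unchanged by this patch and not shown; the helper name is
hypothetical):

   /* One 64-lane workgroup covers 64 * 16 = 1024 bytes per loop iteration.
    * Dispatching ceil(size / 1024) groups lets every invocation's strided
    * loop finish after its first iteration while still visiting each
    * 16-byte chunk exactly once. */
   static uint32_t copy_workgroup_count(uint64_t compacted_size)
   {
      const uint64_t bytes_per_group = 64 * 16;
      return (uint32_t)((compacted_size + bytes_per_group - 1) / bytes_per_group);
   }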
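
The instance-pointer fixup works because the 16-byte copy granularity divides
the instance node stride: the lane that copies the first chunk of a
radv_bvh_instance_node is the same lane that patches its base_ptr, so no other
lane depends on that write. A sketch of the predicate (the 128-byte stride is
taken from the removed NIR, which derived the instance id with a shift by 7,
i.e. sizeof(struct radv_bvh_instance_node) == 128):

   /* True iff offset is the first byte of an instance node, given nodes
    * packed with a 128-byte stride starting at node_offset. Mirrors the
    * "(offset - node_offset) % SIZEOF(radv_bvh_instance_node) == 0" test
    * in copy.comp; node_end == 0 (copy mode) disables the fixup. */
   static int is_instance_node_start(uint64_t offset, uint64_t node_offset,
                                     uint64_t node_end)
   {
      return offset >= node_offset && offset < node_end &&
             (offset - node_offset) % 128 == 0;
   }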