radv: Add PLOC shader
author Bas Nieuwenhuizen <bas@basnieuwenhuizen.nl>
Thu, 29 Sep 2022 09:03:30 +0000 (11:03 +0200)
committer Marge Bot <emma+marge@anholt.net>
Sun, 30 Oct 2022 19:48:46 +0000 (19:48 +0000)
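
PLOC (Parallel Locally-Ordered Clustering) builds the BVH's internal nodes
by repeatedly merging each node with its lowest-cost neighbour within a
small window of the sorted node array, compacting the array after every
iteration until only the root remains.
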
Reviewed-by: Konstantin Seurer <konstantin.seurer@gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/19292>

src/amd/vulkan/bvh/build_interface.h
src/amd/vulkan/bvh/meson.build
src/amd/vulkan/bvh/ploc_internal.comp [new file with mode: 0644]
src/amd/vulkan/radv_acceleration_structure.c
src/amd/vulkan/radv_private.h

diff --git a/src/amd/vulkan/bvh/build_interface.h b/src/amd/vulkan/bvh/build_interface.h
index 7b8aee8..cc829ca 100644
@@ -90,4 +90,20 @@ struct convert_leaf_args {
    uint32_t geometry_type;
 };
 
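+/* Per-workgroup partition descriptor for the decoupled look-back prefix scan used
+ * during PLOC compaction. An inclusive_sum of 0xFFFFFFFF means the workgroup has
+ * not published its inclusive sum yet.
+ */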
+struct ploc_prefix_scan_partition {
+   uint32_t aggregate;
+   uint32_t inclusive_sum;
+};
+
+#define PLOC_WORKGROUP_SIZE 1024
+
+struct ploc_args {
+   VOID_REF bvh;
+   VOID_REF prefix_scan_partitions;
+   REF(radv_ir_header) header;
+   VOID_REF ids_0;
+   VOID_REF ids_1;
+   uint32_t internal_node_offset;
+};
+
 #endif
diff --git a/src/amd/vulkan/bvh/meson.build b/src/amd/vulkan/bvh/meson.build
index 8f93f8a..e01926a 100644
@@ -23,6 +23,7 @@ bvh_shaders = [
   'lbvh_internal.comp',
   'leaf.comp',
   'morton.comp',
+  'ploc_internal.comp',
   'converter_internal.comp',
   'converter_leaf.comp',
 ]
@@ -42,7 +43,7 @@ foreach s : bvh_shaders
     input : s,
     output : s + '.spv.h',
     command : [
-      prog_glslang, '-V', '-I' + bvh_include_dir, '--target-env', 'spirv1.4', '-x', '-o', '@OUTPUT@', '@INPUT@'
+      prog_glslang, '-V', '-I' + bvh_include_dir, '--target-env', 'vulkan1.2', '-x', '-o', '@OUTPUT@', '@INPUT@'
     ] + glslang_quiet,
     depend_files: bvh_includes
   )
diff --git a/src/amd/vulkan/bvh/ploc_internal.comp b/src/amd/vulkan/bvh/ploc_internal.comp
new file mode 100644
index 0000000..e6189e7
--- /dev/null
+++ b/src/amd/vulkan/bvh/ploc_internal.comp
@@ -0,0 +1,373 @@
+/*
+ * Copyright © 2022 Bas Nieuwenhuizen
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#version 460
+
+#extension GL_GOOGLE_include_directive : require
+
+#extension GL_EXT_shader_explicit_arithmetic_types_int8 : require
+#extension GL_EXT_shader_explicit_arithmetic_types_int16 : require
+#extension GL_EXT_shader_explicit_arithmetic_types_int32 : require
+#extension GL_EXT_shader_explicit_arithmetic_types_int64 : require
+#extension GL_EXT_shader_explicit_arithmetic_types_float16 : require
+#extension GL_EXT_scalar_block_layout : require
+#extension GL_EXT_buffer_reference : require
+#extension GL_EXT_buffer_reference2 : require
+#extension GL_KHR_memory_scope_semantics : require
+#extension GL_KHR_shader_subgroup_vote : require
+#extension GL_KHR_shader_subgroup_arithmetic : require
+#extension GL_KHR_shader_subgroup_ballot : require
+
+layout(local_size_x = 1024, local_size_y = 1, local_size_z = 1) in;
+
+#define USE_GLOBAL_SYNC
+#include "build_interface.h"
+
+TYPE(ploc_prefix_scan_partition, 4);
+
+layout(push_constant) uniform CONSTS
+{
+   ploc_args args;
+};
+
+shared uint32_t exclusive_prefix_sum;
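+/* One partial sum per subgroup; the array size assumes 64-wide subgroups. */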
+shared uint32_t aggregate_sums[PLOC_WORKGROUP_SIZE / 64];
+
+/*
+ * Global prefix scan over all workgroups, used to compute the compacted index that
+ * each surviving node id is written to during compaction.
+ * See https://research.nvidia.com/sites/default/files/publications/nvr-2016-002.pdf
+ * One partition = one workgroup in this case.
+ */
+uint32_t
+prefix_scan(uvec4 ballot, REF(ploc_prefix_scan_partition) partitions, uint32_t task_index)
+{
+   if (gl_LocalInvocationIndex == 0) {
+      exclusive_prefix_sum = 0;
+      if (task_index >= gl_WorkGroupSize.x) {
+         REF(ploc_prefix_scan_partition) current_partition =
+            REF(ploc_prefix_scan_partition)(INDEX(ploc_prefix_scan_partition, partitions, task_index / gl_WorkGroupSize.x));
+
+         REF(ploc_prefix_scan_partition) previous_partition = current_partition - 1;
+
+         while (true) {
+            /* Check whether this previous workgroup has already set its inclusive sum */
+            if (atomicLoad(DEREF(previous_partition).inclusive_sum, gl_ScopeDevice,
+                           gl_StorageSemanticsBuffer,
+                           gl_SemanticsAcquire | gl_SemanticsMakeVisible) != 0xFFFFFFFF) {
+               atomicAdd(exclusive_prefix_sum, DEREF(previous_partition).inclusive_sum);
+               break;
+            } else {
+               atomicAdd(exclusive_prefix_sum, DEREF(previous_partition).aggregate);
+               previous_partition -= 1;
+            }
+         }
+         /* Set the inclusive sum for the next workgroups */
+         atomicStore(DEREF(current_partition).inclusive_sum,
+                     DEREF(current_partition).aggregate + exclusive_prefix_sum, gl_ScopeDevice,
+                     gl_StorageSemanticsBuffer, gl_SemanticsRelease | gl_SemanticsMakeAvailable);
+      }
+   }
+
+   if (subgroupElect())
+      aggregate_sums[gl_SubgroupID] = subgroupBallotBitCount(ballot);
+   barrier();
+
+   if (gl_LocalInvocationID.x < PLOC_WORKGROUP_SIZE / 64) {
+      aggregate_sums[gl_LocalInvocationID.x] =
+         exclusive_prefix_sum + subgroupExclusiveAdd(aggregate_sums[gl_LocalInvocationID.x]);
+   }
+   barrier();
+
+   return aggregate_sums[gl_SubgroupID] + subgroupBallotExclusiveBitCount(ballot);
+}
+
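+/* Allocate a new internal (box) node in the IR BVH, set its bounds to the union of
+ * the two children and return its packed node id.
+ */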
+uint32_t
+push_node(uint32_t children[2])
+{
+   uint32_t internal_node_index = atomicAdd(DEREF(args.header).ir_internal_node_count, 1);
+   uint32_t dst_offset = args.internal_node_offset + internal_node_index * SIZEOF(radv_ir_box_node);
+   uint32_t dst_id = pack_ir_node_id(dst_offset, radv_ir_node_internal);
+   REF(radv_ir_box_node) dst_node = REF(radv_ir_box_node)(OFFSET(args.bvh, dst_offset));
+
+   radv_aabb total_bounds;
+   total_bounds.min = vec3(INFINITY);
+   total_bounds.max = vec3(-INFINITY);
+
+   for (uint i = 0; i < 2; ++i) {
+      if (children[i] != RADV_BVH_INVALID_NODE) {
+         VOID_REF node = OFFSET(args.bvh, ir_id_to_offset(children[i]));
+         REF(radv_ir_node) child = REF(radv_ir_node)(node);
+         radv_aabb bounds = DEREF(child).aabb;
+
+         total_bounds.min = min(total_bounds.min, bounds.min);
+         total_bounds.max = max(total_bounds.max, bounds.max);
+      }
+
+      DEREF(dst_node).children[i] = children[i];
+   }
+
+   DEREF(dst_node).base.aabb = total_bounds;
+   DEREF(dst_node).in_final_tree = FINAL_TREE_UNKNOWN;
+   return dst_id;
+}
+
+#define PLOC_NEIGHBOURHOOD 16
+#define PLOC_OFFSET_MASK   ((1 << 5) - 1)
+
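+/* Nearest-neighbour candidates are encoded in a single uint32_t: the upper bits hold
+ * the bit pattern of the SAH cost and the low 5 bits hold the relative offset to the
+ * neighbour, so a single atomicMin per candidate picks the lowest-cost neighbour.
+ */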
+uint32_t
+encode_neighbour_offset(float sah, uint32_t i, uint32_t j)
+{
+   int32_t offset = int32_t(j) - int32_t(i);
+   uint32_t encoded_offset = offset + PLOC_NEIGHBOURHOOD - (offset > 0 ? 1 : 0);
+   return (floatBitsToUint(sah) & (~PLOC_OFFSET_MASK)) | encoded_offset;
+}
+
+int32_t
+decode_neighbour_offset(uint32_t encoded_offset)
+{
+   int32_t offset = int32_t(encoded_offset & PLOC_OFFSET_MASK) - PLOC_NEIGHBOURHOOD;
+   return offset + (offset >= 0 ? 1 : 0);
+}
+
+#define NUM_PLOC_LDS_ITEMS (PLOC_WORKGROUP_SIZE + 4 * PLOC_NEIGHBOURHOOD)
+
+shared radv_aabb shared_bounds[NUM_PLOC_LDS_ITEMS];
+shared uint32_t nearest_neighbour_indices[NUM_PLOC_LDS_ITEMS];
+
+uint32_t
+load_id(VOID_REF ids, uint32_t iter, uint32_t index)
+{
+   if (iter == 0)
+      return DEREF(REF(key_id_pair)(INDEX(key_id_pair, ids, index))).id;
+   else
+      return DEREF(REF(uint32_t)(INDEX(uint32_t, ids, index)));
+}
+
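+/* Cache the AABBs of every node this workgroup may inspect (its own range plus up to
+ * two neighbourhoods of overlap on either side) in shared memory.
+ */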
+void
+load_bounds(VOID_REF ids, uint32_t iter, uint32_t task_index, uint32_t lds_base,
+            uint32_t neighbourhood_overlap, uint32_t search_bound)
+{
+   for (uint32_t i = task_index - 2 * neighbourhood_overlap; i < search_bound;
+        i += gl_WorkGroupSize.x) {
+      uint32_t id = load_id(ids, iter, i);
+      if (id == RADV_BVH_INVALID_NODE)
+         continue;
+
+      VOID_REF addr = OFFSET(args.bvh, ir_id_to_offset(id));
+      REF(radv_ir_node) node = REF(radv_ir_node)(addr);
+
+      shared_bounds[i - lds_base] = DEREF(node).aabb;
+   }
+}
+
+float
+combined_node_area(uint32_t lds_base, uint32_t i, uint32_t j)
+{
+   radv_aabb combined_bounds;
+   combined_bounds.min = min(shared_bounds[i - lds_base].min, shared_bounds[j - lds_base].min);
+   combined_bounds.max = max(shared_bounds[i - lds_base].max, shared_bounds[j - lds_base].max);
+   vec3 size = combined_bounds.max - combined_bounds.min;
+
+   return size.x * size.y + size.y * size.z + size.z * size.x;
+}
+
+shared uint32_t shared_aggregate_sum;
+
+void
+main(void)
+{
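+   /* Publish the initial task count (the number of live leaf nodes) exactly once:
+    * the compare-and-swap only succeeds while task_counts[0] still holds 0xFFFFFFFF. */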
+   atomicCompSwap(DEREF(args.header).sync_data.task_counts[0], 0xFFFFFFFF,
+                  DEREF(args.header).active_leaf_count);
+   REF(ploc_prefix_scan_partition)
+   partitions = REF(ploc_prefix_scan_partition)(args.prefix_scan_partitions);
+
+   uint32_t task_index = fetch_task(args.header, false);
+
+   VOID_REF src_ids = args.ids_0;
+   VOID_REF dst_ids = args.ids_1;
+
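+   /* Every iteration runs two globally synchronized phases: merge nearest-neighbour
+    * pairs into new internal nodes, then compact the id array. The loop ends once
+    * only the root node is left. */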
+   for (uint iter = 0;; ++iter) {
+      uint32_t current_task_count = task_count(args.header);
+      if (current_task_count <= 1)
+         break;
+
+      /* Find preferred partners and merge them */
+      PHASE (args.header) {
+         uint32_t base_index = task_index - gl_LocalInvocationID.x;
+         uint32_t neighbourhood_overlap = min(PLOC_NEIGHBOURHOOD, base_index);
+         uint32_t double_neighbourhood_overlap = min(2 * PLOC_NEIGHBOURHOOD, base_index);
+         /* Upper bound to where valid nearest node indices are written. */
+         uint32_t write_bound =
+            min(current_task_count, base_index + gl_WorkGroupSize.x + PLOC_NEIGHBOURHOOD);
+         /* Upper bound to where valid nearest node indices are searched. */
+         uint32_t search_bound =
+            min(current_task_count, base_index + gl_WorkGroupSize.x + 2 * PLOC_NEIGHBOURHOOD);
+         uint32_t lds_base = base_index - double_neighbourhood_overlap;
+
+         load_bounds(src_ids, iter, task_index, lds_base, neighbourhood_overlap, search_bound);
+
+         for (uint32_t i = gl_LocalInvocationID.x; i < NUM_PLOC_LDS_ITEMS; i += gl_WorkGroupSize.x)
+            nearest_neighbour_indices[i] = 0xFFFFFFFF;
+         barrier();
+
+         for (uint32_t i = task_index - double_neighbourhood_overlap; i < write_bound;
+              i += gl_WorkGroupSize.x) {
+            uint32_t right_bound = min(search_bound - 1 - i, PLOC_NEIGHBOURHOOD);
+
+            uint32_t fallback_pair = i == 0 ? (i + 1) : (i - 1);
+            uint32_t min_offset = encode_neighbour_offset(INFINITY, i, fallback_pair);
+
+            for (uint32_t j = max(i + 1, base_index - neighbourhood_overlap); j <= i + right_bound;
+                 ++j) {
+
+               float sah = combined_node_area(lds_base, i, j);
+               uint32_t i_encoded_offset = encode_neighbour_offset(sah, i, j);
+               uint32_t j_encoded_offset = encode_neighbour_offset(sah, j, i);
+               min_offset = min(min_offset, i_encoded_offset);
+               atomicMin(nearest_neighbour_indices[j - lds_base], j_encoded_offset);
+            }
+            if (i >= base_index - neighbourhood_overlap)
+               atomicMin(nearest_neighbour_indices[i - lds_base], min_offset);
+         }
+
+         if (gl_LocalInvocationID.x == 0)
+            shared_aggregate_sum = 0;
+         barrier();
+
+         for (uint32_t i = task_index - neighbourhood_overlap; i < write_bound;
+              i += gl_WorkGroupSize.x) {
+            uint32_t left_bound = min(i, PLOC_NEIGHBOURHOOD);
+            uint32_t right_bound = min(search_bound - 1 - i, PLOC_NEIGHBOURHOOD);
+            /*
+             * Workaround for a worst-case scenario in PLOC: if the combined area is
+             * the same for every candidate pair in the neighbourhood, each node picks
+             * the first node of its neighbourhood, so only the first two nodes find
+             * each other as mutual nearest neighbours. Only one pair would then be
+             * merged per iteration, leaving a single internal node per BVH level. By
+             * preferring the immediate neighbour on one (alternating) side whenever
+             * its cost is no worse than the best candidate, all immediate neighbours
+             * are merged in every step.
+             */
+            uint32_t preferred_pair;
+            if ((i & 1) != 0)
+               preferred_pair = i - min(left_bound, 1);
+            else
+               preferred_pair = i + min(right_bound, 1);
+
+            if (preferred_pair != i) {
+               uint32_t encoded_min_sah =
+                  nearest_neighbour_indices[i - lds_base] & (~PLOC_OFFSET_MASK);
+               float sah = combined_node_area(lds_base, i, preferred_pair);
+               uint32_t encoded_sah = floatBitsToUint(sah) & (~PLOC_OFFSET_MASK);
+               uint32_t encoded_offset = encode_neighbour_offset(sah, i, preferred_pair);
+               if (encoded_sah <= encoded_min_sah) {
+                  nearest_neighbour_indices[i - lds_base] = encoded_offset;
+               }
+            }
+         }
+         barrier();
+
+         bool has_valid_node = true;
+
+         if (task_index < current_task_count) {
+            uint32_t base_index = task_index - gl_LocalInvocationID.x;
+
+            uint32_t neighbour_index =
+               task_index +
+               decode_neighbour_offset(nearest_neighbour_indices[task_index - lds_base]);
+            uint32_t other_neighbour_index =
+               neighbour_index +
+               decode_neighbour_offset(nearest_neighbour_indices[neighbour_index - lds_base]);
+            uint32_t id = load_id(src_ids, iter, task_index);
+            if (other_neighbour_index == task_index) {
+               if (task_index < neighbour_index) {
+                  uint32_t neighbour_id = load_id(src_ids, iter, neighbour_index);
+                  uint32_t children[2] = {id, neighbour_id};
+
+                  DEREF(REF(uint32_t)(INDEX(uint32_t, dst_ids, task_index))) = push_node(children);
+                  DEREF(REF(uint32_t)(INDEX(uint32_t, dst_ids, neighbour_index))) =
+                     RADV_BVH_INVALID_NODE;
+               } else {
+                  /* The lower-indexed invocation of this pair creates the merged node
+                   * and writes RADV_BVH_INVALID_NODE into this slot, so nothing is
+                   * stored here; just mark this invocation as having no valid node. */
+                  has_valid_node = false;
+               }
+            } else {
+               DEREF(REF(uint32_t)(INDEX(uint32_t, dst_ids, task_index))) = id;
+            }
+
+            /* Compact - prepare prefix scan */
+            uvec4 ballot = subgroupBallot(has_valid_node);
+
+            uint32_t aggregate_sum = subgroupBallotBitCount(ballot);
+            if (subgroupElect())
+               atomicAdd(shared_aggregate_sum, aggregate_sum);
+         }
+
+         barrier();
+         /*
+          * The paper proposes initializing all partitions to an invalid state
+          * and only computing aggregates afterwards. We skip that step and
+          * initialize the partitions to a valid state. This also simplifies
+          * the look-back, as there will never be any blocking due to invalid
+          * partitions.
+          */
+         if (gl_LocalInvocationIndex == 0) {
+            REF(ploc_prefix_scan_partition)
+            current_partition = REF(ploc_prefix_scan_partition)(
+               INDEX(ploc_prefix_scan_partition, partitions, task_index / gl_WorkGroupSize.x));
+            DEREF(current_partition).aggregate = shared_aggregate_sum;
+            if (task_index < gl_WorkGroupSize.x) {
+               DEREF(current_partition).inclusive_sum = shared_aggregate_sum;
+            } else {
+               DEREF(current_partition).inclusive_sum = 0xFFFFFFFF;
+            }
+         }
+
+         if (task_index == 0)
+            set_next_task_count(args.header, task_count(args.header));
+      }
+
+      /* Compact - prefix scan and update */
+      PHASE (args.header) {
+         uint32_t current_task_count = task_count(args.header);
+
+         uint32_t id = task_index < current_task_count
+                          ? DEREF(REF(uint32_t)(INDEX(uint32_t, dst_ids, task_index)))
+                          : RADV_BVH_INVALID_NODE;
+         uvec4 ballot = subgroupBallot(id != RADV_BVH_INVALID_NODE);
+
+         uint32_t new_offset = prefix_scan(ballot, partitions, task_index);
+         if (task_index >= current_task_count)
+            continue;
+
+         if (id != RADV_BVH_INVALID_NODE) {
+            DEREF(REF(uint32_t)(INDEX(uint32_t, src_ids, new_offset))) = id;
+            ++new_offset;
+         }
+
+         if (task_index == current_task_count - 1) {
+            set_next_task_count(args.header, new_offset);
+         }
+      }
+   }
+}
diff --git a/src/amd/vulkan/radv_acceleration_structure.c b/src/amd/vulkan/radv_acceleration_structure.c
index 06a9722..73c3680 100644
@@ -43,6 +43,10 @@ static const uint32_t lbvh_internal_spv[] = {
 #include "bvh/lbvh_internal.comp.spv.h"
 };
 
+static const uint32_t ploc_spv[] = {
+#include "bvh/ploc_internal.comp.spv.h"
+};
+
 static const uint32_t copy_spv[] = {
 #include "bvh/copy.comp.spv.h"
 };
@@ -272,6 +276,8 @@ radv_device_finish_accel_struct_build_state(struct radv_device *device)
    struct radv_meta_state *state = &device->meta_state;
    radv_DestroyPipeline(radv_device_to_handle(device), state->accel_struct_build.copy_pipeline,
                         &state->alloc);
+   radv_DestroyPipeline(radv_device_to_handle(device), state->accel_struct_build.ploc_pipeline,
+                        &state->alloc);
    radv_DestroyPipeline(radv_device_to_handle(device),
                         state->accel_struct_build.lbvh_internal_pipeline, &state->alloc);
    radv_DestroyPipeline(radv_device_to_handle(device), state->accel_struct_build.leaf_pipeline,
@@ -285,6 +291,8 @@ radv_device_finish_accel_struct_build_state(struct radv_device *device)
    radv_DestroyPipelineLayout(radv_device_to_handle(device),
                               state->accel_struct_build.copy_p_layout, &state->alloc);
    radv_DestroyPipelineLayout(radv_device_to_handle(device),
+                              state->accel_struct_build.ploc_p_layout, &state->alloc);
+   radv_DestroyPipelineLayout(radv_device_to_handle(device),
                               state->accel_struct_build.lbvh_internal_p_layout, &state->alloc);
    radv_DestroyPipelineLayout(radv_device_to_handle(device),
                               state->accel_struct_build.leaf_p_layout, &state->alloc);
@@ -388,6 +396,12 @@ radv_device_init_accel_struct_build_state(struct radv_device *device)
    if (result != VK_SUCCESS)
       return result;
 
+   result = create_build_pipeline_spv(device, ploc_spv, sizeof(ploc_spv), sizeof(struct ploc_args),
+                                      &device->meta_state.accel_struct_build.ploc_pipeline,
+                                      &device->meta_state.accel_struct_build.ploc_p_layout);
+   if (result != VK_SUCCESS)
+      return result;
+
    result = create_build_pipeline_spv(device, convert_leaf_spv, sizeof(convert_leaf_spv),
                                       sizeof(struct convert_leaf_args),
                                       &device->meta_state.accel_struct_build.convert_leaf_pipeline,
diff --git a/src/amd/vulkan/radv_private.h b/src/amd/vulkan/radv_private.h
index c708b11..20145d8 100644
@@ -668,6 +668,8 @@ struct radv_meta_state {
       VkPipeline morton_pipeline;
       VkPipelineLayout lbvh_internal_p_layout;
       VkPipeline lbvh_internal_pipeline;
+      VkPipelineLayout ploc_p_layout;
+      VkPipeline ploc_pipeline;
       VkPipelineLayout convert_leaf_p_layout;
       VkPipeline convert_leaf_pipeline;
       VkPipelineLayout convert_internal_p_layout;