From 0c0f179037603b44a3ce092cc4c20d8bdb84220d Mon Sep 17 00:00:00 2001
From: Friedrich Vock
Date: Tue, 25 Oct 2022 17:33:58 +0200
Subject: [PATCH] radv: Add global sync utilities

Reviewed-by: Konstantin Seurer
Part-of:
---
 src/amd/.clang-format              |   1 +
 src/amd/vulkan/bvh/build_helpers.h | 125 +++++++++++++++++++++++++++++++++++++
 src/amd/vulkan/bvh/bvh.h           |  15 +++++
 3 files changed, 141 insertions(+)

diff --git a/src/amd/.clang-format b/src/amd/.clang-format
index db9ecef..38f047e 100644
--- a/src/amd/.clang-format
+++ b/src/amd/.clang-format
@@ -81,6 +81,7 @@ ForEachMacros:
   - set_foreach
   - u_foreach_bit
   - u_foreach_bit64
+  - PHASE
 IncludeBlocks: Preserve
 IncludeCategories:
diff --git a/src/amd/vulkan/bvh/build_helpers.h b/src/amd/vulkan/bvh/build_helpers.h
index 765154b..a8e17b5 100644
--- a/src/amd/vulkan/bvh/build_helpers.h
+++ b/src/amd/vulkan/bvh/build_helpers.h
@@ -239,6 +239,8 @@ TYPE(radv_ir_triangle_node, 4);
 TYPE(radv_ir_aabb_node, 4);
 TYPE(radv_ir_instance_node, 8);
 
+TYPE(radv_global_sync_data, 4);
+
 uint32_t
 id_to_offset(uint32_t id)
 {
@@ -361,4 +363,127 @@ aabb_surface_area(radv_aabb aabb)
    return 2 * diagonal.x * diagonal.y + 2 * diagonal.y * diagonal.z + 2 * diagonal.x * diagonal.z;
 }
 
+/** Compute ceiling of integer quotient of A divided by B.
+    From macros.h */
+#define DIV_ROUND_UP(A, B) (((A) + (B)-1) / (B))
+
+#ifdef USE_GLOBAL_SYNC
+
+/* There might be more invocations available than tasks to do.
+ * In that case, the fetched task index is greater than the
+ * counter offset for the next phase. To avoid out-of-bounds
+ * accesses, phases are skipped until the task index is
+ * in bounds again. */
+uint32_t num_tasks_to_skip = 0;
+uint32_t phase_index = 0;
+bool should_skip = false;
+shared uint32_t global_task_index;
+
+shared uint32_t shared_phase_index;
+
+uint32_t
+task_count(REF(radv_ir_header) header)
+{
+   uint32_t phase_index = DEREF(header).sync_data.phase_index;
+   return DEREF(header).sync_data.task_counts[phase_index & 1];
+}
+
+/* Sets the task count for the next phase. */
+void
+set_next_task_count(REF(radv_ir_header) header, uint32_t new_count)
+{
+   uint32_t phase_index = DEREF(header).sync_data.phase_index;
+   DEREF(header).sync_data.task_counts[(phase_index + 1) & 1] = new_count;
+}
+
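+/* task_counts[] is double-buffered by phase parity: in phase N, task_count() reads
+ * task_counts[N & 1] while set_next_task_count() writes task_counts[(N + 1) & 1],
+ * which becomes the active count once fetch_task bumps phase_index to N + 1. */
+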
+/*
+ * This function has two main objectives:
+ * Firstly, it partitions pending work among free invocations.
+ * Secondly, it guarantees global synchronization between different phases.
+ *
+ * After every call to fetch_task, a new task index is returned.
+ * fetch_task will also set num_tasks_to_skip. Use should_execute_phase
+ * to determine if the current phase should be executed or skipped.
+ *
+ * Since tasks are assigned per-workgroup, the task index can be greater
+ * than the total task count.
+ */
+uint32_t
+fetch_task(REF(radv_ir_header) header, bool did_work)
+{
+   /* Perform a memory + control barrier for all buffer writes for the entire workgroup.
+    * This guarantees that once the workgroup leaves the PHASE loop, all invocations have
+    * finished and their results are written to memory.
+    */
+   controlBarrier(gl_ScopeWorkgroup, gl_ScopeDevice, gl_StorageSemanticsBuffer,
+                  gl_SemanticsAcquireRelease | gl_SemanticsMakeAvailable | gl_SemanticsMakeVisible);
+   if (gl_LocalInvocationIndex == 0) {
+      if (did_work)
+         atomicAdd(DEREF(header).sync_data.task_done_counter, 1);
+      global_task_index = atomicAdd(DEREF(header).sync_data.task_started_counter, 1);
+
+      do {
+         /* Perform a memory barrier to refresh the current phase's end counter, in case
+          * another workgroup changed it. */
+         memoryBarrier(
+            gl_ScopeDevice, gl_StorageSemanticsBuffer,
+            gl_SemanticsAcquireRelease | gl_SemanticsMakeAvailable | gl_SemanticsMakeVisible);
+
+         /* The first invocation of the first workgroup in a new phase is responsible for
+          * initiating the switch to a new phase. It is only possible to switch to a new phase
+          * if all tasks of the previous phase have been completed. Switching to a new phase
+          * and incrementing the phase end counter in turn notifies all invocations of that
+          * phase that it is safe to execute.
+          */
+         if (global_task_index == DEREF(header).sync_data.current_phase_end_counter &&
+             DEREF(header).sync_data.task_done_counter ==
+                DEREF(header).sync_data.current_phase_end_counter) {
+            atomicAdd(DEREF(header).sync_data.phase_index, 1);
+            DEREF(header).sync_data.current_phase_start_counter =
+               DEREF(header).sync_data.current_phase_end_counter;
+            atomicAdd(DEREF(header).sync_data.current_phase_end_counter,
+                      DIV_ROUND_UP(task_count(header), gl_WorkGroupSize.x));
+            /* Ensure the changes to the phase index and start/end counters are visible to
+             * other workgroups waiting in the loop. */
+            memoryBarrier(
+               gl_ScopeDevice, gl_StorageSemanticsBuffer,
+               gl_SemanticsAcquireRelease | gl_SemanticsMakeAvailable | gl_SemanticsMakeVisible);
+            break;
+         }
+
+         /* If other invocations have finished all nodes, break out; there is no work to do. */
+         if (task_count(header) == 1) {
+            break;
+         }
+      } while (global_task_index >= DEREF(header).sync_data.current_phase_end_counter);
+
+      shared_phase_index = DEREF(header).sync_data.phase_index;
+   }
+
+   barrier();
+   if (task_count(header) == 1)
+      return TASK_INDEX_INVALID;
+
+   num_tasks_to_skip = shared_phase_index - phase_index;
+
+   uint32_t local_task_index =
+      global_task_index - DEREF(header).sync_data.current_phase_start_counter;
+   return local_task_index * gl_WorkGroupSize.x + gl_LocalInvocationID.x;
+}
+
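+/* When fetch_task observes that the global phase has advanced past this invocation's
+ * phase_index, num_tasks_to_skip holds the difference: e.g. a difference of 2 makes the
+ * next two should_execute_phase() calls return false, so the intervening PHASE loops do
+ * not execute and phase_index catches up with the global phase. */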
+bool
+should_execute_phase()
+{
+   if (num_tasks_to_skip > 0) {
+      /* Skip to the next phase. */
+      ++phase_index;
+      --num_tasks_to_skip;
+      return false;
+   }
+   return true;
+}
+
+#define PHASE(header)                                                             \
+   for (; task_index != TASK_INDEX_INVALID && should_execute_phase();             \
+        task_index = fetch_task(header, true))
+#endif
+
 #endif
diff --git a/src/amd/vulkan/bvh/bvh.h b/src/amd/vulkan/bvh/bvh.h
index 901a4b3..8cd90ea 100644
--- a/src/amd/vulkan/bvh/bvh.h
+++ b/src/amd/vulkan/bvh/bvh.h
@@ -40,6 +40,7 @@
 #else
 #include <stdint.h>
 typedef struct radv_ir_node radv_ir_node;
+typedef struct radv_global_sync_data radv_global_sync_data;
 
 typedef uint16_t float16_t;
 
@@ -132,6 +133,15 @@ struct radv_ir_instance_node {
    uint32_t instance_id;
 };
 
+struct radv_global_sync_data {
+   uint32_t task_counts[2];
+   uint32_t task_started_counter;
+   uint32_t task_done_counter;
+   uint32_t current_phase_start_counter;
+   uint32_t current_phase_end_counter;
+   uint32_t phase_index;
+};
+
 struct radv_ir_header {
    int32_t min_bounds[3];
    int32_t max_bounds[3];
@@ -142,6 +152,7 @@ struct radv_ir_header {
    uint32_t ir_internal_node_count;
    uint32_t dispatch_size_y;
    uint32_t dispatch_size_z;
+   radv_global_sync_data sync_data;
 };
 
 struct radv_bvh_triangle_node {
@@ -193,4 +204,8 @@ struct radv_bvh_box32_node {
 #define RADV_BVH_ROOT_NODE radv_bvh_node_box32
 #define RADV_BVH_INVALID_NODE 0xffffffffu
 
+/* If the task index is set to this value, there is no
+ * more work to do. */
+#define TASK_INDEX_INVALID 0xFFFFFFFF
+
 #endif
-- 
2.7.4
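
For illustration, a rough usage sketch (not part of the patch) of how a build shader compiled with USE_GLOBAL_SYNC might drive these helpers. The REF(radv_ir_header) named "header" (e.g. obtained from the shader's push constants), the per-invocation task_index variable that the PHASE macro expects to find in scope, and the two placeholder phase bodies are assumptions for the sketch, not code from this series.

void
main()
{
   /* The initial fetch only waits for the first task; no prior work is reported. */
   uint32_t task_index = fetch_task(header, false);

   PHASE (header)
   {
      /* Phase 0: one task per loop iteration. Indices past the real workload can
       * occur, so bound-check before touching memory. */
      if (task_index < task_count(header)) {
         /* ... process element task_index ... */
      }
      /* Tell the next phase how many tasks it will have. */
      set_next_task_count(header, DIV_ROUND_UP(task_count(header), 2));
   }

   PHASE (header)
   {
      /* Phase 1: only starts once every phase-0 task has been counted as done. */
      if (task_index < task_count(header)) {
         /* ... process element task_index ... */
      }
   }
}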