From 0c0f179037603b44a3ce092cc4c20d8bdb84220d Mon Sep 17 00:00:00 2001
From: Friedrich Vock
Date: Tue, 25 Oct 2022 17:33:58 +0200
Subject: [PATCH] radv: Add global sync utilities

Reviewed-by: Konstantin Seurer
Part-of:
---
 src/amd/.clang-format              |   1 +
 src/amd/vulkan/bvh/build_helpers.h | 125 +++++++++++++++++++++++++++++++++++++
 src/amd/vulkan/bvh/bvh.h           |  15 +++++
 3 files changed, 141 insertions(+)

diff --git a/src/amd/.clang-format b/src/amd/.clang-format
index db9ecef..38f047e 100644
--- a/src/amd/.clang-format
+++ b/src/amd/.clang-format
@@ -81,6 +81,7 @@ ForEachMacros:
   - set_foreach
   - u_foreach_bit
   - u_foreach_bit64
+  - PHASE
 IncludeBlocks: Preserve
 IncludeCategories:
diff --git a/src/amd/vulkan/bvh/build_helpers.h b/src/amd/vulkan/bvh/build_helpers.h
index 765154b..a8e17b5 100644
--- a/src/amd/vulkan/bvh/build_helpers.h
+++ b/src/amd/vulkan/bvh/build_helpers.h
@@ -239,6 +239,8 @@ TYPE(radv_ir_triangle_node, 4);
 TYPE(radv_ir_aabb_node, 4);
 TYPE(radv_ir_instance_node, 8);
 
+TYPE(radv_global_sync_data, 4);
+
 uint32_t
 id_to_offset(uint32_t id)
 {
@@ -361,4 +363,127 @@ aabb_surface_area(radv_aabb aabb)
    return 2 * diagonal.x * diagonal.y + 2 * diagonal.y * diagonal.z + 2 * diagonal.x * diagonal.z;
 }
 
+/** Compute ceiling of integer quotient of A divided by B.
+    From macros.h */
+#define DIV_ROUND_UP(A, B) (((A) + (B)-1) / (B))
+
+#ifdef USE_GLOBAL_SYNC
+
+/* There might be more invocations available than tasks to do.
+ * In that case, the fetched task index is greater than the
+ * counter offset for the next phase. To avoid out-of-bounds
+ * accesses, phases are skipped until the task index is
+ * in bounds again. */
+uint32_t num_tasks_to_skip = 0;
+uint32_t phase_index = 0;
+bool should_skip = false;
+shared uint32_t global_task_index;
+
+shared uint32_t shared_phase_index;
+
+uint32_t
+task_count(REF(radv_ir_header) header)
+{
+   uint32_t phase_index = DEREF(header).sync_data.phase_index;
+   return DEREF(header).sync_data.task_counts[phase_index & 1];
+}
+
+/* Sets the task count for the next phase. */
+void
+set_next_task_count(REF(radv_ir_header) header, uint32_t new_count)
+{
+   uint32_t phase_index = DEREF(header).sync_data.phase_index;
+   DEREF(header).sync_data.task_counts[(phase_index + 1) & 1] = new_count;
+}
+
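+/* task_counts[] is double-buffered by phase parity: in phase N, task_count() reads
+ * task_counts[N & 1] while set_next_task_count() writes task_counts[(N + 1) & 1],
+ * which becomes the active count once fetch_task bumps phase_index to N + 1. */
+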
+/*
+ * This function has two main objectives:
+ * Firstly, it partitions pending work among free invocations.
+ * Secondly, it guarantees global synchronization between different phases.
+ *
+ * After every call to fetch_task, a new task index is returned.
+ * fetch_task will also set num_tasks_to_skip. Use should_execute_phase
+ * to determine if the current phase should be executed or skipped.
+ *
+ * Since tasks are assigned per-workgroup, the task index can be greater
+ * than the total task count.
+ */
+uint32_t
+fetch_task(REF(radv_ir_header) header, bool did_work)
+{
+   /* Perform a memory + control barrier for all buffer writes for the entire workgroup.
+    * This guarantees that once the workgroup leaves the PHASE loop, all invocations have
+    * finished and their results are written to memory.
+    */
+   controlBarrier(gl_ScopeWorkgroup, gl_ScopeDevice, gl_StorageSemanticsBuffer,
+                  gl_SemanticsAcquireRelease | gl_SemanticsMakeAvailable | gl_SemanticsMakeVisible);
+   if (gl_LocalInvocationIndex == 0) {
+      if (did_work)
+         atomicAdd(DEREF(header).sync_data.task_done_counter, 1);
+      global_task_index = atomicAdd(DEREF(header).sync_data.task_started_counter, 1);
+
+      do {
+         /* Perform a memory barrier to refresh the current phase's end counter, in case
+          * another workgroup changed it. */
+         memoryBarrier(
+            gl_ScopeDevice, gl_StorageSemanticsBuffer,
+            gl_SemanticsAcquireRelease | gl_SemanticsMakeAvailable | gl_SemanticsMakeVisible);
+
+         /* The first invocation of the first workgroup in a new phase is responsible for
+          * initiating the switch to a new phase. It is only possible to switch to a new phase
+          * if all tasks of the previous phase have been completed. Switching to a new phase
+          * and incrementing the phase end counter in turn notifies all invocations of that
+          * phase that it is safe to execute.
+          */
+         if (global_task_index == DEREF(header).sync_data.current_phase_end_counter &&
+             DEREF(header).sync_data.task_done_counter ==
+                DEREF(header).sync_data.current_phase_end_counter) {
+            atomicAdd(DEREF(header).sync_data.phase_index, 1);
+            DEREF(header).sync_data.current_phase_start_counter =
+               DEREF(header).sync_data.current_phase_end_counter;
+            atomicAdd(DEREF(header).sync_data.current_phase_end_counter,
+                      DIV_ROUND_UP(task_count(header), gl_WorkGroupSize.x));
+            /* Ensure the changes to the phase index and start/end counters are visible to
+             * other workgroups waiting in the loop. */
+            memoryBarrier(
+               gl_ScopeDevice, gl_StorageSemanticsBuffer,
+               gl_SemanticsAcquireRelease | gl_SemanticsMakeAvailable | gl_SemanticsMakeVisible);
+            break;
+         }
+
+         /* If other invocations have finished all nodes, break out; there is no work to do. */
+         if (task_count(header) == 1) {
+            break;
+         }
+      } while (global_task_index >= DEREF(header).sync_data.current_phase_end_counter);
+
+      shared_phase_index = DEREF(header).sync_data.phase_index;
+   }
+
+   barrier();
+   if (task_count(header) == 1)
+      return TASK_INDEX_INVALID;
+
+   num_tasks_to_skip = shared_phase_index - phase_index;
+
+   uint32_t local_task_index =
+      global_task_index - DEREF(header).sync_data.current_phase_start_counter;
+   return local_task_index * gl_WorkGroupSize.x + gl_LocalInvocationID.x;
+}
+
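+/* When fetch_task observes that the global phase has advanced past this invocation's
+ * phase_index, num_tasks_to_skip holds the difference: e.g. a difference of 2 makes the
+ * next two should_execute_phase() calls return false, so the intervening PHASE loops do
+ * not execute and phase_index catches up with the global phase. */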
+bool
+should_execute_phase()
+{
+   if (num_tasks_to_skip > 0) {
+      /* Skip to the next phase. */
+      ++phase_index;
+      --num_tasks_to_skip;
+      return false;
+   }
+   return true;
+}
+
+#define PHASE(header)                                                             \
+   for (; task_index != TASK_INDEX_INVALID && should_execute_phase();             \
+        task_index = fetch_task(header, true))
+#endif
+
 #endif
diff --git a/src/amd/vulkan/bvh/bvh.h b/src/amd/vulkan/bvh/bvh.h
index 901a4b3..8cd90ea 100644
--- a/src/amd/vulkan/bvh/bvh.h
+++ b/src/amd/vulkan/bvh/bvh.h
@@ -40,6 +40,7 @@
 #else
 #include <stdint.h>
 typedef struct radv_ir_node radv_ir_node;
+typedef struct radv_global_sync_data radv_global_sync_data;
 
 typedef uint16_t float16_t;
 
@@ -132,6 +133,15 @@ struct radv_ir_instance_node {
    uint32_t instance_id;
 };
 
+struct radv_global_sync_data {
+   uint32_t task_counts[2];
+   uint32_t task_started_counter;
+   uint32_t task_done_counter;
+   uint32_t current_phase_start_counter;
+   uint32_t current_phase_end_counter;
+   uint32_t phase_index;
+};
+
 struct radv_ir_header {
    int32_t min_bounds[3];
    int32_t max_bounds[3];
@@ -142,6 +152,7 @@ struct radv_ir_header {
    uint32_t ir_internal_node_count;
    uint32_t dispatch_size_y;
    uint32_t dispatch_size_z;
+   radv_global_sync_data sync_data;
 };
 
 struct radv_bvh_triangle_node {
@@ -193,4 +204,8 @@ struct radv_bvh_box32_node {
 #define RADV_BVH_ROOT_NODE radv_bvh_node_box32
 #define RADV_BVH_INVALID_NODE 0xffffffffu
 
+/* If the task index is set to this value, there is no
+ * more work to do. */
+#define TASK_INDEX_INVALID 0xFFFFFFFF
+
 #endif
-- 
2.7.4
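
For illustration, a rough usage sketch (not part of the patch) of how a build shader compiled with USE_GLOBAL_SYNC might drive these helpers. The REF(radv_ir_header) named "header" (e.g. obtained from the shader's push constants), the per-invocation task_index variable that the PHASE macro expects to find in scope, and the two placeholder phase bodies are assumptions for the sketch, not code from this series.

void
main()
{
   /* The initial fetch only waits for the first task; no prior work is reported. */
   uint32_t task_index = fetch_task(header, false);

   PHASE (header)
   {
      /* Phase 0: one task per loop iteration. Indices past the real workload can
       * occur, so bound-check before touching memory. */
      if (task_index < task_count(header)) {
         /* ... process element task_index ... */
      }
      /* Tell the next phase how many tasks it will have. */
      set_next_task_count(header, DIV_ROUND_UP(task_count(header), 2));
   }

   PHASE (header)
   {
      /* Phase 1: only starts once every phase-0 task has been counted as done. */
      if (task_index < task_count(header)) {
         /* ... process element task_index ... */
      }
   }
}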