return __builtin_amdgcn_grid_size_z() / __builtin_amdgcn_workgroup_size_z();
}
+/// Returns the total number of workgruops in the grid.
+LIBC_INLINE uint64_t get_num_blocks() {
+ return get_num_blocks_x() * get_num_blocks_y() * get_num_blocks_z();
+}
+
/// Returns the 'x' dimension of the current AMD workgroup's id.
LIBC_INLINE uint32_t get_block_id_x() {
return __builtin_amdgcn_workgroup_id_x();
return __builtin_amdgcn_workgroup_size_z();
}
+/// Returns the total number of workitems in the workgroup.
+LIBC_INLINE uint64_t get_num_threads() {
+ return get_num_threads_x() * get_num_threads_y() * get_num_threads_z();
+}
+
/// Returns the 'x' dimension id of the workitem in the current AMD workgroup.
LIBC_INLINE uint32_t get_thread_id_x() {
return __builtin_amdgcn_workitem_id_x();
}
/// Wait for all threads in the wavefront to converge, this is a noop on AMDGPU.
-[[clang::convergent]] LIBC_INLINE void sync_lane(uint64_t) {}
+[[clang::convergent]] LIBC_INLINE void sync_lane(uint64_t) {
+ __builtin_amdgcn_wave_barrier();
+}
} // namespace gpu
} // namespace __llvm_libc
LIBC_INLINE uint32_t get_num_blocks_x() { return 1; }
-LIBC_INLINE uint32_t get_num_blocks_y() { return 0; }
+LIBC_INLINE uint32_t get_num_blocks_y() { return 1; }
-LIBC_INLINE uint32_t get_num_blocks_z() { return 0; }
+LIBC_INLINE uint32_t get_num_blocks_z() { return 1; }
+
+LIBC_INLINE uint64_t get_num_blocks() { return 1; }
LIBC_INLINE uint32_t get_block_id_x() { return 0; }
LIBC_INLINE uint32_t get_num_threads_x() { return 1; }
-LIBC_INLINE uint32_t get_num_threads_y() { return 0; }
+LIBC_INLINE uint32_t get_num_threads_y() { return 1; }
+
+LIBC_INLINE uint32_t get_num_threads_z() { return 1; }
-LIBC_INLINE uint32_t get_num_threads_z() { return 0; }
+LIBC_INLINE uint64_t get_num_threads() { return 1; }
LIBC_INLINE uint32_t get_thread_id_x() { return 0; }
return __nvvm_read_ptx_sreg_nctaid_z();
}
+/// Returns the total number of CUDA blocks.
+LIBC_INLINE uint64_t get_num_blocks() {
+ return get_num_blocks_x() * get_num_blocks_y() * get_num_blocks_z();
+}
+
/// Returns the 'x' dimension of the current CUDA block's id.
LIBC_INLINE uint32_t get_block_id_x() { return __nvvm_read_ptx_sreg_ctaid_x(); }
return __nvvm_read_ptx_sreg_ntid_z();
}
+/// Returns the total number of threads in the block.
+LIBC_INLINE uint64_t get_num_threads() {
+ return get_num_threads_x() * get_num_threads_y() * get_num_threads_z();
+}
+
/// Returns the 'x' dimension id of the thread in the current CUDA block.
LIBC_INLINE uint32_t get_thread_id_x() { return __nvvm_read_ptx_sreg_tid_x(); }