// Implementations initially derived from hcc
-static DEVICE uint32_t getLaneId(void) {
- return __builtin_amdgcn_mbcnt_hi(~0u, __builtin_amdgcn_mbcnt_lo(~0u, 0u));
-}
-
// Initialized with a 64-bit mask with bits set in positions less than the
// thread's lane number in the warp
DEVICE __kmpc_impl_lanemask_t __kmpc_impl_lanemask_lt() {
- uint32_t lane = getLaneId();
+ uint32_t lane = GetLaneId();
int64_t ballot = __kmpc_impl_activemask();
uint64_t mask = ((uint64_t)1 << lane) - (uint64_t)1;
return mask & ballot;
// Initialized with a 64-bit mask with bits set in positions greater than the
// thread's lane number in the warp
DEVICE __kmpc_impl_lanemask_t __kmpc_impl_lanemask_gt() {
- uint32_t lane = getLaneId();
+ uint32_t lane = GetLaneId();
if (lane == (WARPSIZE - 1))
return 0;
uint64_t ballot = __kmpc_impl_activemask();
// Shuffle: returns the result of __builtin_amdgcn_ds_bpermute for `var`,
// addressed by a lane index derived from `srcLane`.
//
// The first parameter (a __kmpc_impl_lanemask_t) is unnamed and is not used
// by this implementation.
//   var     - 32-bit value handed to the bpermute builtin.
//   srcLane - lane offset added to the base computed from the caller's lane id.
DEVICE int32_t __kmpc_impl_shfl_sync(__kmpc_impl_lanemask_t, int32_t var,
int32_t srcLane) {
int width = WARPSIZE;
- int self = getLaneId();
+ int self = GetLaneId();
// (self & ~(width - 1)) clears the bits of the caller's lane id that are set
// in (width - 1); srcLane is then added on top of that base.
// NOTE(review): assumes WARPSIZE is a power of two so the mask rounds `self`
// down to a multiple of width - WARPSIZE is defined outside this view.
int index = srcLane + (self & ~(width - 1));
// The index is multiplied by 4 (<< 2) before being passed to the builtin.
// NOTE(review): presumably the builtin expects a byte address with 4 bytes
// per lane - confirm against the AMDGPU builtin documentation.
return __builtin_amdgcn_ds_bpermute(index << 2, var);
}
DEVICE int32_t __kmpc_impl_shfl_down_sync(__kmpc_impl_lanemask_t, int32_t var,
uint32_t laneDelta, int32_t width) {
- int self = getLaneId();
+ int self = GetLaneId();
int index = self + laneDelta;
index = (int)(laneDelta + (self & (width - 1))) >= width ? self : index;
return __builtin_amdgcn_ds_bpermute(index << 2, var);
EXTERN uint64_t __ockl_get_num_groups(uint32_t);
// Returns __ockl_get_num_groups(0). The callee is declared as returning
// uint64_t, so the value is implicitly converted to int on return.
// NOTE(review): the argument 0 presumably selects the first (x) dimension,
// matching the "1D layout" assumption used in this file - confirm.
DEVICE int GetNumberOfBlocksInKernel() { return __ockl_get_num_groups(0); }
// Returns __ockl_get_local_size(0), converted to int.
// NOTE(review): the argument 0 presumably selects the first (x) dimension -
// confirm against the declaration of __ockl_get_local_size (not visible here).
DEVICE int GetNumberOfThreadsInBlock() { return __ockl_get_local_size(0); }
// Returns GetThreadIdInBlock() / WARPSIZE as an unsigned value, i.e. the
// index of the WARPSIZE-sized group that the calling thread's in-block id
// falls into.
+DEVICE unsigned GetWarpId() { return GetThreadIdInBlock() / WARPSIZE; }
// Returns the value produced by chaining the mbcnt builtins with an all-ones
// mask: mbcnt_lo(~0u, 0u) is evaluated first and its result is passed as the
// second argument of mbcnt_hi(~0u, ...).
// NOTE(review): with both masks all-ones this presumably yields the calling
// thread's lane index within the wavefront - confirm against the AMDGPU
// builtin documentation.
+DEVICE unsigned GetLaneId() {
+ return __builtin_amdgcn_mbcnt_hi(~0u, __builtin_amdgcn_mbcnt_lo(~0u, 0u));
+}
////////////////////////////////////////////////////////////////////////////////
//
-// Calls to the NVPTX layer (assuming 1D layout)
-//
-////////////////////////////////////////////////////////////////////////////////
-
-DEVICE unsigned GetWarpId() { return GetThreadIdInBlock() / WARPSIZE; }
-
-DEVICE unsigned GetLaneId() { return GetThreadIdInBlock() & (WARPSIZE - 1); }
-
-////////////////////////////////////////////////////////////////////////////////
-//
// Calls to the Generic Scheme Implementation Layer (assuming 1D layout)
//
////////////////////////////////////////////////////////////////////////////////