// Process schedule.
if (tnum == 1 || tripCount <= 1 || OrderedSchedule(schedule)) {
- if (OrderedSchedule(schedule)) {
- if (isSPMDMode())
- __syncthreads();
- else
- __kmpc_barrier(loc, threadId);
- }
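+ // __kmpc_barrier selects the SPMD or generic barrier internally.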
+ if (OrderedSchedule(schedule))
+ __kmpc_barrier(loc, threadId);
PRINT(LD_LOOP,
"go sequential as tnum=%ld, trip count %lld, ordered sched=%d\n",
(long)tnum, P64(tripCount), schedule);
} else if (schedule == kmp_sched_dynamic || schedule == kmp_sched_guided) {
- if (isSPMDMode())
- __syncthreads();
- else
- __kmpc_barrier(loc, threadId);
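+ // All threads synchronize before the team-wide bounds below are (re)written.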
+ __kmpc_barrier(loc, threadId);
// save sched state
int teamId = GetOmpTeamId();
omptarget_nvptx_threadPrivateContext->ScheduleType(tid) = schedule;
omptarget_nvptx_threadPrivateContext->LoopUpperBound(teamId) = ub;
omptarget_nvptx_threadPrivateContext->NextLowerBound(teamId) = lb;
}
- if (isSPMDMode())
- __syncthreads();
- else
- __kmpc_barrier(loc, threadId);
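+ // Synchronize again so every thread sees the saved bounds before dispatching.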
+ __kmpc_barrier(loc, threadId);
PRINT(LD_LOOP,
"dispatch init (dyn) : num threads = %d, lb = %llu, ub = %" PRId64
", chunk %" PRIu64 "\n",
if (nt != blockDim.x)
return 0;
unsigned tnum = __ACTIVEMASK();
- if (tnum != (~0x0)) { // assume swapSize is 32
+ if (tnum != (~0x0)) // assume warpSize is 32
return 0;
- }
return 1;
}
if (numthread == 1)
return 1;
- else if (!__gpu_block_reduce())
+ if (!__gpu_block_reduce())
return 2;
- else {
- if (threadIdx.x == 0)
- return 1;
- else
- return 0;
- }
+ if (threadIdx.x == 0)
+ return 1;
+ return 0;
}
EXTERN
int32_t __kmpc_reduce_combined(kmp_Indent *loc) {
- if (threadIdx.x == 0) {
- return 2;
- } else {
- return 0;
- }
+ return threadIdx.x == 0 ? 2 : 0;
}
EXTERN
int32_t __kmpc_reduce_simd(kmp_Indent *loc) {
- if (threadIdx.x % 32 == 0) {
- return 1;
- } else {
- return 0;
- }
+ return (threadIdx.x % 32 == 0) ? 1 : 0;
}
EXTERN
}
EXTERN void __kmpc_barrier(kmp_Indent *loc_ref, int32_t tid) {
- if (isSPMDMode()) {
- __kmpc_barrier_simple_spmd(loc_ref, tid);
- } else if (isRuntimeUninitialized()) {
- __kmpc_barrier_simple_generic(loc_ref, tid);
+ if (isRuntimeUninitialized()) {
+ if (isSPMDMode())
+ __kmpc_barrier_simple_spmd(loc_ref, tid);
+ else
+ __kmpc_barrier_simple_generic(loc_ref, tid);
} else {
tid = GetLogicalThreadIdInBlock();
- omptarget_nvptx_TaskDescr *currTaskDescr =
-     omptarget_nvptx_threadPrivateContext->GetTopLevelTaskDescr(tid);
- if (!currTaskDescr->InL2OrHigherParallelRegion()) {
- int numberOfActiveOMPThreads =
- GetNumberOfOmpThreads(tid, isSPMDMode(), isRuntimeUninitialized());
-#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700
- // On Volta and newer architectures we require that all lanes in
- // a warp (at least, all present for the kernel launch) participate in the
- // barrier. This is enforced when launching the parallel region. An
- // exception is when there are < WARPSIZE workers. In this case only 1
- // worker is started, so we don't need a barrier.
- if (numberOfActiveOMPThreads > 1) {
-#endif
+ int numberOfActiveOMPThreads = GetNumberOfOmpThreads(
+ tid, isSPMDMode(), /*isRuntimeUninitialized=*/false);
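+ // With fewer than WARPSIZE workers only a single worker is started, so the
+ // barrier is needed only when more than one OpenMP thread is active.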
+ if (numberOfActiveOMPThreads > 1) {
+ if (isSPMDMode()) {
+ __kmpc_barrier_simple_spmd(loc_ref, tid);
+ } else {
// The #threads parameter must be rounded up to the WARPSIZE.
int threads =
WARPSIZE * ((numberOfActiveOMPThreads + WARPSIZE - 1) / WARPSIZE);
PRINT(LD_SYNC,
      "call kmpc_barrier with %d omp threads, sync parameter %d\n",
      numberOfActiveOMPThreads, threads);
// Barrier #1 is for synchronization among active threads.
named_sync(L1_BARRIER, threads);
-#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700
- } // numberOfActiveOMPThreads > 1
-#endif
- }
+ }
+ } // numberOfActiveOMPThreads > 1
PRINT0(LD_SYNC, "completed kmpc_barrier\n");
}
}