// is started, so we don't need a barrier.
if (NumThreads > 1) {
#endif
- named_sync(L1_BARRIER, WARPSIZE * NumWarps);
+ __kmpc_impl_named_sync(L1_BARRIER, WARPSIZE * NumWarps);
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700
}
#endif
// If we guard this barrier as follows it leads to deadlock, probably
// because of a compiler bug: if (!IsGenericMode()) __syncthreads();
uint16_t SyncWarps = (NumThreads + WARPSIZE - 1) / WARPSIZE;
- named_sync(L1_BARRIER, SyncWarps * WARPSIZE);
+ __kmpc_impl_named_sync(L1_BARRIER, SyncWarps * WARPSIZE);
// If this team is not the last, quit.
if (/* Volatile read by all threads */ !IsLastTeam)
((void *)((char *)((void *)(_addr)) - (_bytes)))
////////////////////////////////////////////////////////////////////////////////
-// Named Barrier Routines
-////////////////////////////////////////////////////////////////////////////////
-INLINE void named_sync(const int barrier, const int num_threads);
-
-////////////////////////////////////////////////////////////////////////////////
// Teams Reduction Scratchpad Helpers
////////////////////////////////////////////////////////////////////////////////
INLINE unsigned int *GetTeamsReductionTimestamp();
}
////////////////////////////////////////////////////////////////////////////////
-// Named Barrier Routines
-////////////////////////////////////////////////////////////////////////////////
-
-INLINE void named_sync(const int barrier, const int num_threads) {
- asm volatile("bar.sync %0, %1;"
- :
- : "r"(barrier), "r"(num_threads)
- : "memory");
-}
-
-////////////////////////////////////////////////////////////////////////////////
// Teams Reduction Scratchpad Helpers
////////////////////////////////////////////////////////////////////////////////
"call kmpc_barrier with %d omp threads, sync parameter %d\n",
(int)numberOfActiveOMPThreads, (int)threads);
// Barrier #1 is for synchronization among active threads.
- named_sync(L1_BARRIER, threads);
+ __kmpc_impl_named_sync(L1_BARRIER, threads);
}
} else {
// Still need to flush the memory per the standard.
"%d\n",
(int)numberOfActiveOMPThreads, (int)threads);
// Barrier #1 is for synchronization among active threads.
- named_sync(L1_BARRIER, threads);
+ __kmpc_impl_named_sync(L1_BARRIER, threads);
PRINT0(LD_SYNC, "completed kmpc_barrier_simple_generic\n");
}
#endif // CUDA_VERSION
}
+INLINE void __kmpc_impl_named_sync(int barrier, uint32_t num_threads) { // Named barrier: issues PTX `bar.sync barrier, num_threads`.
+ asm volatile("bar.sync %0, %1;" // %0 = barrier id, %1 = thread count (per PTX ISA: number of threads participating)
+ : // no output operands
+ : "r"(barrier), "r"(num_threads) // both inputs are passed in 32-bit registers
+ : "memory"); // "memory" clobber: compiler must not cache or reorder memory accesses across this asm
+}
+
#endif