#
# Regular entry points
- __kmp_wait_yield_4
+ __kmp_wait_4
__kmp_fork_call
__kmp_invoke_microtask
%ifdef KMP_USE_MONITOR
__kmp_reap_worker;
__kmp_release_64;
__kmp_wait_64;
- __kmp_wait_yield_4;
+ __kmp_wait_4;
# ittnotify symbols to be used by debugger
__kmp_itt_fini_ittlib;
(KMP_BLOCKTIME(team, tid) * KMP_USEC_PER_SEC)
#define KMP_BLOCKING(goal, count) ((count) % 1000 != 0 || (goal) > KMP_NOW())
#endif
-#define KMP_YIELD_NOW() \
- (KMP_NOW_MSEC() / KMP_MAX(__kmp_dflt_blocktime, 1) % \
- (__kmp_yield_on_count + __kmp_yield_off_count) < \
- (kmp_uint32)__kmp_yield_on_count)
#endif // KMP_USE_MONITOR
#define KMP_MIN_STATSCOLS 40
#define KMP_MAX_CHUNK (INT_MAX - 1)
#define KMP_DEFAULT_CHUNK 1
-#define KMP_MIN_INIT_WAIT 1
-#define KMP_MAX_INIT_WAIT (INT_MAX / 2)
-#define KMP_DEFAULT_INIT_WAIT 2048U
-
-#define KMP_MIN_NEXT_WAIT 1
-#define KMP_MAX_NEXT_WAIT (INT_MAX / 2)
-#define KMP_DEFAULT_NEXT_WAIT 1024U
-
#define KMP_DFLT_DISP_NUM_BUFF 7
#define KMP_MAX_ORDERED 8
extern void __kmp_x86_pause(void);
#elif KMP_MIC
// Performance testing on KNC (C0QS-7120 P/A/X/D, 61-core, 16 GB Memory) showed
-// regression after removal of extra PAUSE from KMP_YIELD_SPIN(). Changing
+// regression after removal of extra PAUSE from spin loops. Changing
// the delay from 100 to 300 showed even better performance than double PAUSE
// on Spec OMP2001 and LCPC tasking tests, no regressions on EPCC.
static inline void __kmp_x86_pause(void) { _mm_delay_32(300); }
#define KMP_INIT_YIELD(count) \
{ (count) = __kmp_yield_init; }
+#define KMP_OVERSUBSCRIBED \
+ (TCR_4(__kmp_nth) > (__kmp_avail_proc ? __kmp_avail_proc : __kmp_xproc))
+
+#define KMP_TRY_YIELD \
+ ((__kmp_use_yield == 1) || (__kmp_use_yield == 2 && (KMP_OVERSUBSCRIBED)))
+
+#define KMP_TRY_YIELD_OVERSUB \
+ ((__kmp_use_yield == 1 || __kmp_use_yield == 2) && (KMP_OVERSUBSCRIBED))
+
#define KMP_YIELD(cond) \
{ \
KMP_CPU_PAUSE(); \
- __kmp_yield((cond)); \
+ if ((cond) && (KMP_TRY_YIELD)) \
+ __kmp_yield(); \
+ }
+
+#define KMP_YIELD_OVERSUB() \
+ { \
+ KMP_CPU_PAUSE(); \
+ if ((KMP_TRY_YIELD_OVERSUB)) \
+ __kmp_yield(); \
}
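
For readers skimming the new macros: the gating reduces to two tiny predicates. The sketch below is illustrative only; plain parameters stand in for the __kmp_* globals and the TCR_4 relaxed load:

// Minimal model of KMP_OVERSUBSCRIBED and KMP_TRY_YIELD; not runtime code.
static inline int kmp_oversubscribed(int nth, int avail_proc, int xproc) {
  // More live threads than processors available to this process?
  return nth > (avail_proc ? avail_proc : xproc);
}
static inline int kmp_try_yield(int use_yield, int oversub) {
  // 0 = never yield, 1 = always yield, 2 = yield only when oversubscribed.
  return use_yield == 1 || (use_yield == 2 && oversub);
}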
// Note the decrement of 2 in the following macros: it preserves the parity of
// the spin count seeded by KMP_INIT_YIELD(). Whether a thread yields at all is
// now governed by __kmp_use_yield (see KMP_TRY_YIELD above), not KMP_LIBRARY.
-
-#define KMP_YIELD_WHEN(cond, count) \
+#define KMP_YIELD_SPIN(count) \
{ \
KMP_CPU_PAUSE(); \
- (count) -= 2; \
- if (!(count)) { \
- __kmp_yield(cond); \
- (count) = __kmp_yield_next; \
+ if (KMP_TRY_YIELD) { \
+ (count) -= 2; \
+ if (!(count)) { \
+ __kmp_yield(); \
+ (count) = __kmp_yield_next; \
+ } \
} \
}
-#define KMP_YIELD_SPIN(count) \
+
+#define KMP_YIELD_OVERSUB_ELSE_SPIN(count) \
{ \
KMP_CPU_PAUSE(); \
- (count) -= 2; \
- if (!(count)) { \
- __kmp_yield(1); \
- (count) = __kmp_yield_next; \
+ if ((KMP_TRY_YIELD_OVERSUB)) \
+ __kmp_yield(); \
+ else if (__kmp_use_yield == 1) { \
+ (count) -= 2; \
+ if (!(count)) { \
+ __kmp_yield(); \
+ (count) = __kmp_yield_next; \
+ } \
} \
}
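
Every converted call site follows the same shape: seed a spin counter once with KMP_INIT_YIELD, then let KMP_YIELD_OVERSUB_ELSE_SPIN choose between an immediate yield (oversubscribed) and a counted pause on each iteration. A hedged sketch, assuming kmp.h is included; `flag` and `wait_for_flag` are invented for illustration:

static void wait_for_flag(volatile kmp_uint32 *flag) {
  kmp_uint32 spins;
  KMP_INIT_YIELD(spins); // seed the counter once, outside the loop
  while (TCR_4(*flag) == 0) {
    // Pause; yield immediately if oversubscribed, otherwise decrement the
    // counter and yield periodically when __kmp_use_yield == 1.
    KMP_YIELD_OVERSUB_ELSE_SPIN(spins);
  }
}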
extern kmp_queuing_lock_t __kmp_dispatch_lock; /* control dispatch access */
extern kmp_lock_t __kmp_debug_lock; /* control I/O access for KMP_DEBUG */
-/* used for yielding spin-waits */
-extern unsigned int __kmp_init_wait; /* initial number of spin-tests */
-extern unsigned int __kmp_next_wait; /* susequent number of spin-tests */
-
extern enum library_type __kmp_library;
extern enum sched_type __kmp_sched; /* default runtime scheduling */
extern int __kmp_suspend_count; /* count inside __kmp_suspend_template() */
#endif
+extern kmp_int32 __kmp_use_yield;
+extern kmp_int32 __kmp_use_yield_exp_set;
extern kmp_uint32 __kmp_yield_init;
extern kmp_uint32 __kmp_yield_next;
-#if KMP_USE_MONITOR
-extern kmp_uint32 __kmp_yielding_on;
-#endif
-extern kmp_uint32 __kmp_yield_cycle;
-extern kmp_int32 __kmp_yield_on_count;
-extern kmp_int32 __kmp_yield_off_count;
-
/* ------------------------------------------------------------------------- */
extern int __kmp_allThreadsSpecified;
int num_threads);
#endif
-extern void __kmp_yield(int cond);
+extern void __kmp_yield();
extern void __kmpc_dispatch_init_4(ident_t *loc, kmp_int32 gtid,
enum sched_type schedule, kmp_int32 lb,
extern kmp_uint32 __kmp_lt_4(kmp_uint32 value, kmp_uint32 checker);
extern kmp_uint32 __kmp_ge_4(kmp_uint32 value, kmp_uint32 checker);
extern kmp_uint32 __kmp_le_4(kmp_uint32 value, kmp_uint32 checker);
-extern kmp_uint32 __kmp_wait_yield_4(kmp_uint32 volatile *spinner,
- kmp_uint32 checker,
- kmp_uint32 (*pred)(kmp_uint32, kmp_uint32),
- void *obj);
-extern void __kmp_wait_yield_4_ptr(void *spinner, kmp_uint32 checker,
- kmp_uint32 (*pred)(void *, kmp_uint32),
- void *obj);
+extern kmp_uint32 __kmp_wait_4(kmp_uint32 volatile *spinner, kmp_uint32 checker,
+ kmp_uint32 (*pred)(kmp_uint32, kmp_uint32),
+ void *obj);
+extern void __kmp_wait_4_ptr(void *spinner, kmp_uint32 checker,
+ kmp_uint32 (*pred)(void *, kmp_uint32), void *obj);
class kmp_flag_32;
class kmp_flag_64;
// }
// and adding the yield here is good for at least a 10x speedup
// when running >2 threads per core (on the NAS LU benchmark).
- __kmp_yield(TRUE);
+ __kmp_yield();
#endif
#else
#error Unknown or unsupported architecture
kmp_uint32 spins; \
KMP_FSYNC_PREPARE(l); \
KMP_INIT_YIELD(spins); \
- if (TCR_4(__kmp_nth) > \
- (__kmp_avail_proc ? __kmp_avail_proc : __kmp_xproc)) { \
- KMP_YIELD(TRUE); \
- } else { \
- KMP_YIELD_SPIN(spins); \
- } \
kmp_backoff_t backoff = __kmp_spin_backoff_params; \
- while ( \
- KMP_ATOMIC_LD_RLX(&l->lk.poll) != tas_free || \
- !__kmp_atomic_compare_store_acq(&l->lk.poll, tas_free, tas_busy)) { \
- __kmp_spin_backoff(&backoff); \
+ do { \
if (TCR_4(__kmp_nth) > \
(__kmp_avail_proc ? __kmp_avail_proc : __kmp_xproc)) { \
KMP_YIELD(TRUE); \
} else { \
KMP_YIELD_SPIN(spins); \
} \
- } \
+ __kmp_spin_backoff(&backoff); \
+ } while ( \
+ KMP_ATOMIC_LD_RLX(&l->lk.poll) != tas_free || \
+ !__kmp_atomic_compare_store_acq(&l->lk.poll, tas_free, tas_busy)); \
} \
KMP_FSYNC_ACQUIRED(l); \
}
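
Note the control-flow change above: the old code yielded once before the while loop and duplicated the same yield block inside it; the new do/while pauses, yields, and backs off before each acquisition attempt, so the duplication disappears. Stripped of the FSYNC plumbing, the acquire loop is equivalent to this sketch (lock_is_busy and try_acquire are invented placeholders for the poll load and compare-exchange):

do {
  KMP_YIELD_OVERSUB_ELSE_SPIN(spins); // pause/yield between attempts
  __kmp_spin_backoff(&backoff);       // exponential backoff between attempts
} while (lock_is_busy() || !try_acquire());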
KMP_LOCK_BUSY(1, futex), NULL, NULL, 0); \
} \
KMP_MB(); \
- KMP_YIELD(TCR_4(__kmp_nth) > \
- (__kmp_avail_proc ? __kmp_avail_proc : __kmp_xproc)); \
+ KMP_YIELD_OVERSUB(); \
}
#endif // KMP_USE_FUTEX
// __kmp_dispatch_num_buffers)
if (idx != sh_buf->doacross_buf_idx) {
// Shared buffer is occupied, wait for it to be free
- __kmp_wait_yield_4((volatile kmp_uint32 *)&sh_buf->doacross_buf_idx, idx,
- __kmp_eq_4, NULL);
+ __kmp_wait_4((volatile kmp_uint32 *)&sh_buf->doacross_buf_idx, idx,
+ __kmp_eq_4, NULL);
}
#if KMP_32_BIT_ARCH
// Check if we are the first thread. After the CAS the first thread gets 0,
KD_TRACE(100, ("__kmp_dispatch_init: T#%d before wait: my_buffer_index:%d "
"sh->buffer_index:%d\n",
gtid, my_buffer_index, sh->buffer_index));
- __kmp_wait_yield<kmp_uint32>(&sh->buffer_index, my_buffer_index,
- __kmp_eq<kmp_uint32> USE_ITT_BUILD_ARG(NULL));
- // Note: KMP_WAIT_YIELD() cannot be used there: buffer index and
+ __kmp_wait<kmp_uint32>(&sh->buffer_index, my_buffer_index,
+ __kmp_eq<kmp_uint32> USE_ITT_BUILD_ARG(NULL));
+ // Note: KMP_WAIT() cannot be used there: buffer index and
// my_buffer_index are *always* 32-bit integers.
KMP_MB(); /* is this necessary? */
KD_TRACE(100, ("__kmp_dispatch_init: T#%d after wait: my_buffer_index:%d "
}
#endif
- __kmp_wait_yield<UT>(&sh->u.s.ordered_iteration, lower,
- __kmp_ge<UT> USE_ITT_BUILD_ARG(NULL));
+ __kmp_wait<UT>(&sh->u.s.ordered_iteration, lower,
+ __kmp_ge<UT> USE_ITT_BUILD_ARG(NULL));
KMP_MB(); /* is this necessary? */
#ifdef KMP_DEBUG
{
}
#endif
- __kmp_wait_yield<UT>(&sh->u.s.ordered_iteration, lower,
- __kmp_ge<UT> USE_ITT_BUILD_ARG(NULL));
+ __kmp_wait<UT>(&sh->u.s.ordered_iteration, lower,
+ __kmp_ge<UT> USE_ITT_BUILD_ARG(NULL));
KMP_MB(); /* is this necessary? */
KD_TRACE(1000, ("__kmp_dispatch_finish_chunk: T#%d resetting "
}
kmp_uint32
-__kmp_wait_yield_4(volatile kmp_uint32 *spinner, kmp_uint32 checker,
- kmp_uint32 (*pred)(kmp_uint32, kmp_uint32),
- void *obj // Higher-level synchronization object, or NULL.
- ) {
+__kmp_wait_4(volatile kmp_uint32 *spinner, kmp_uint32 checker,
+ kmp_uint32 (*pred)(kmp_uint32, kmp_uint32),
+ void *obj // Higher-level synchronization object, or NULL.
+ ) {
// note: we may not belong to a team at this point
volatile kmp_uint32 *spin = spinner;
kmp_uint32 check = checker;
split. It causes problems with infinite recursion because of exit lock */
/* if ( TCR_4(__kmp_global.g.g_done) && __kmp_global.g.g_abort)
__kmp_abort_thread(); */
-
- /* if we have waited a bit, or are oversubscribed, yield */
- /* pause is in the following code */
- KMP_YIELD(TCR_4(__kmp_nth) > __kmp_avail_proc);
- KMP_YIELD_SPIN(spins);
+ KMP_YIELD_OVERSUB_ELSE_SPIN(spins);
}
KMP_FSYNC_SPIN_ACQUIRED(obj);
return r;
}
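
A usage sketch for the renamed primitive, spinning (never sleeping) until a shared 32-bit value reaches a target; `phase` and `wait_for_phase` are invented for illustration:

static volatile kmp_uint32 phase = 0; // advanced by another thread

static void wait_for_phase(kmp_uint32 target) {
  // Returns once __kmp_eq_4(phase, target) holds; the NULL obj means no
  // higher-level synchronization object is reported to ittnotify.
  __kmp_wait_4(&phase, target, __kmp_eq_4, NULL);
}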
-void __kmp_wait_yield_4_ptr(
- void *spinner, kmp_uint32 checker, kmp_uint32 (*pred)(void *, kmp_uint32),
- void *obj // Higher-level synchronization object, or NULL.
- ) {
+void __kmp_wait_4_ptr(void *spinner, kmp_uint32 checker,
+ kmp_uint32 (*pred)(void *, kmp_uint32),
+ void *obj // Higher-level synchronization object, or NULL.
+ ) {
// note: we may not belong to a team at this point
void *spin = spinner;
kmp_uint32 check = checker;
// main wait spin loop
while (!f(spin, check)) {
KMP_FSYNC_SPIN_PREPARE(obj);
/* if we have waited a bit, or are oversubscribed, yield */
/* pause is in the following code */
- KMP_YIELD(TCR_4(__kmp_nth) > __kmp_avail_proc);
- KMP_YIELD_SPIN(spins);
+ KMP_YIELD_OVERSUB_ELSE_SPIN(spins);
}
KMP_FSYNC_SPIN_ACQUIRED(obj);
}
}
/*
- Spin wait loop that first does pause, then yield.
+ Spin wait loop that pauses between checks.
Waits until function returns non-zero when called with *spinner and check.
Does NOT put threads to sleep.
Arguments:
is used to report locks consistently. For example, if a lock is acquired
immediately, its address is reported to ittnotify via
KMP_FSYNC_ACQUIRED(). However, if the lock cannot be acquired immediately
- and lock routine calls to KMP_WAIT_YIELD(), the later should report the
+ and the lock routine calls KMP_WAIT(), the latter should report the
same address, not the address of the low-level spinner.
#endif // USE_ITT_BUILD
TODO: make inline function (move to header file for icl)
*/
template <typename UT>
-static UT __kmp_wait_yield(volatile UT *spinner, UT checker,
- kmp_uint32 (*pred)(UT, UT)
- USE_ITT_BUILD_ARG(void *obj)) {
+static UT __kmp_wait(volatile UT *spinner, UT checker,
+ kmp_uint32 (*pred)(UT, UT) USE_ITT_BUILD_ARG(void *obj)) {
// note: we may not belong to a team at this point
volatile UT *spin = spinner;
UT check = checker;
It causes problems with infinite recursion because of exit lock */
/* if ( TCR_4(__kmp_global.g.g_done) && __kmp_global.g.g_abort)
__kmp_abort_thread(); */
-
- // if we are oversubscribed,
- // or have waited a bit (and KMP_LIBRARY=throughput, then yield
- // pause is in the following code
- KMP_YIELD(TCR_4(__kmp_nth) > __kmp_avail_proc);
- KMP_YIELD_SPIN(spins);
+ // If oversubscribed, or have waited a bit, then yield.
+ KMP_YIELD_OVERSUB_ELSE_SPIN(spins);
}
KMP_FSYNC_SPIN_ACQUIRED(obj);
return r;
__kmp_str_free(&buff);
}
#endif
- __kmp_wait_yield<UT>(&sh->u.s.ordered_iteration, lower,
- __kmp_ge<UT> USE_ITT_BUILD_ARG(NULL));
+ __kmp_wait<UT>(&sh->u.s.ordered_iteration, lower,
+ __kmp_ge<UT> USE_ITT_BUILD_ARG(NULL));
KMP_MB(); /* is this necessary? */
#ifdef KMP_DEBUG
{
next_wait_value));
char v = (current_wait_value ? 0x1 : 0x0);
(RCAST(volatile char *, &(bdata->val[current_index])))[id] = v;
- __kmp_wait_yield<kmp_uint64>(&(bdata->val[current_index]), current_wait_value,
- __kmp_eq<kmp_uint64> USE_ITT_BUILD_ARG(NULL));
+ __kmp_wait<kmp_uint64>(&(bdata->val[current_index]), current_wait_value,
+ __kmp_eq<kmp_uint64> USE_ITT_BUILD_ARG(NULL));
tdata->wait_val[current_index] = next_wait_value;
tdata->index = next_index;
}
next_wait_value));
val = RCAST(volatile kmp_int64 *, &(bdata->val[current_index]));
KMP_TEST_THEN_INC64(val);
- __kmp_wait_yield<kmp_uint64>(&(bdata->val[current_index]), current_wait_value,
- __kmp_ge<kmp_uint64> USE_ITT_BUILD_ARG(NULL));
+ __kmp_wait<kmp_uint64>(&(bdata->val[current_index]), current_wait_value,
+ __kmp_ge<kmp_uint64> USE_ITT_BUILD_ARG(NULL));
tdata->wait_val[current_index] = next_wait_value;
tdata->index = next_index;
}
std::atomic<kmp_int32> __kmp_team_counter = ATOMIC_VAR_INIT(0);
std::atomic<kmp_int32> __kmp_task_counter = ATOMIC_VAR_INIT(0);
-unsigned int __kmp_init_wait =
- KMP_DEFAULT_INIT_WAIT; /* initial number of spin-tests */
-unsigned int __kmp_next_wait =
- KMP_DEFAULT_NEXT_WAIT; /* susequent number of spin-tests */
-
size_t __kmp_stksize = KMP_DEFAULT_STKSIZE;
#if KMP_USE_MONITOR
size_t __kmp_monitor_stksize = 0; // auto adjust
int __kmp_env_checks = FALSE; /* KMP_CHECKS specified? */
int __kmp_env_consistency_check = FALSE; /* KMP_CONSISTENCY_CHECK specified? */
+// From KMP_USE_YIELD:
+// 0 = never yield;
+// 1 = always yield (default);
+// 2 = yield only if oversubscribed
+kmp_int32 __kmp_use_yield = 1;
+// This will be 1 if the KMP_USE_YIELD environment variable was set explicitly
+kmp_int32 __kmp_use_yield_exp_set = 0;
+
kmp_uint32 __kmp_yield_init = KMP_INIT_WAIT;
kmp_uint32 __kmp_yield_next = KMP_NEXT_WAIT;
-#if KMP_USE_MONITOR
-kmp_uint32 __kmp_yielding_on = 1;
-#endif
-#if KMP_OS_CNK
-kmp_uint32 __kmp_yield_cycle = 0;
-#else
-kmp_uint32 __kmp_yield_cycle = 1; /* Yield-cycle is on by default */
-#endif
-kmp_int32 __kmp_yield_on_count =
- 10; /* By default, yielding is on for 10 monitor periods. */
-kmp_int32 __kmp_yield_off_count =
- 1; /* By default, yielding is off for 1 monitor periods. */
-
/* ------------------------------------------------------ */
/* STATE mostly synchronized with global lock */
/* data written to rarely by masters, read often by workers */
with a delay (and not called at all if waiting time is small). So, in spin
loops, do not use KMP_FSYNC_PREPARE(), but use KMP_FSYNC_SPIN_INIT() (before
spin loop), KMP_FSYNC_SPIN_PREPARE() (within the spin loop), and
- KMP_FSYNC_SPIN_ACQUIRED(). See KMP_WAIT_YIELD() for example. */
+ KMP_FSYNC_SPIN_ACQUIRED(). See KMP_WAIT() for an example. */
#undef KMP_FSYNC_SPIN_INIT
#define KMP_FSYNC_SPIN_INIT(obj, spin) \
kmp_uint32 spins;
KMP_FSYNC_PREPARE(lck);
KMP_INIT_YIELD(spins);
- if (TCR_4(__kmp_nth) > (__kmp_avail_proc ? __kmp_avail_proc : __kmp_xproc)) {
- KMP_YIELD(TRUE);
- } else {
- KMP_YIELD_SPIN(spins);
- }
-
kmp_backoff_t backoff = __kmp_spin_backoff_params;
- while (KMP_ATOMIC_LD_RLX(&lck->lk.poll) != tas_free ||
- !__kmp_atomic_compare_store_acq(&lck->lk.poll, tas_free, tas_busy)) {
+ do {
__kmp_spin_backoff(&backoff);
- if (TCR_4(__kmp_nth) >
- (__kmp_avail_proc ? __kmp_avail_proc : __kmp_xproc)) {
- KMP_YIELD(TRUE);
- } else {
- KMP_YIELD_SPIN(spins);
- }
- }
+ KMP_YIELD_OVERSUB_ELSE_SPIN(spins);
+ } while (KMP_ATOMIC_LD_RLX(&lck->lk.poll) != tas_free ||
+ !__kmp_atomic_compare_store_acq(&lck->lk.poll, tas_free, tas_busy));
KMP_FSYNC_ACQUIRED(lck);
return KMP_LOCK_ACQUIRED_FIRST;
}
KMP_ATOMIC_ST_REL(&lck->lk.poll, KMP_LOCK_FREE(tas));
KMP_MB(); /* Flush all pending memory write invalidates. */
- KMP_YIELD(TCR_4(__kmp_nth) >
- (__kmp_avail_proc ? __kmp_avail_proc : __kmp_xproc));
+ KMP_YIELD_OVERSUB();
return KMP_LOCK_RELEASED;
}
KA_TRACE(1000, ("__kmp_release_futex_lock: lck:%p(0x%x), T#%d exiting\n", lck,
lck->lk.poll, gtid));
- KMP_YIELD(TCR_4(__kmp_nth) >
- (__kmp_avail_proc ? __kmp_avail_proc : __kmp_xproc));
+ KMP_YIELD_OVERSUB();
return KMP_LOCK_RELEASED;
}
std::memory_order_acquire) == my_ticket) {
return KMP_LOCK_ACQUIRED_FIRST;
}
- KMP_WAIT_YIELD_PTR(&lck->lk.now_serving, my_ticket, __kmp_bakery_check, lck);
+ KMP_WAIT_PTR(&lck->lk.now_serving, my_ticket, __kmp_bakery_check, lck);
return KMP_LOCK_ACQUIRED_FIRST;
}
("__kmp_acquire_queuing_lock: lck:%p, T#%d waiting for lock\n",
lck, gtid));
- /* ToDo: May want to consider using __kmp_wait_sleep or something that
- sleeps for throughput only here. */
KMP_MB();
- KMP_WAIT_YIELD(spin_here_p, FALSE, KMP_EQ, lck);
+ // ToDo: Use __kmp_wait_sleep or similar when blocktime != inf
+ KMP_WAIT(spin_here_p, FALSE, KMP_EQ, lck);
#ifdef DEBUG_QUEUING_LOCKS
TRACE_LOCK(gtid + 1, "acq spin");
/* Yield if number of threads > number of logical processors */
/* ToDo: Not sure why this should only be in the oversubscription case;
maybe it should be a traditional KMP_INIT_YIELD/KMP_YIELD_SPIN loop */
- KMP_YIELD(TCR_4(__kmp_nth) >
- (__kmp_avail_proc ? __kmp_avail_proc : __kmp_xproc));
+ KMP_YIELD_OVERSUB();
+
#ifdef DEBUG_QUEUING_LOCKS
TRACE_LOCK(gtid + 1, "acq retry");
#endif
KMP_MB();
/* make sure enqueuing thread has time to update next waiting thread
* field */
- *head_id_p = KMP_WAIT_YIELD((volatile kmp_uint32 *)waiting_id_p, 0,
- KMP_NEQ, NULL);
+ *head_id_p =
+ KMP_WAIT((volatile kmp_uint32 *)waiting_id_p, 0, KMP_NEQ, NULL);
#ifdef DEBUG_QUEUING_LOCKS
TRACE_LOCK(gtid + 1, "rel deq: (h,t)->(h',t)");
#endif
// lock from now on.
while (!__kmp_is_unlocked_queuing_lock(GET_QLK_PTR(lck))) {
KMP_INC_STAT(lck, lemmingYields);
- __kmp_yield(TRUE);
+ KMP_YIELD(TRUE);
}
if (__kmp_test_adaptive_lock_only(lck, gtid))
// polling area has been reconfigured. Unless it is reconfigured, the
// reloads stay in L1 cache and are cheap.
//
- // Keep this code in sync with KMP_WAIT_YIELD, in kmp_dispatch.cpp !!!
- //
- // The current implementation of KMP_WAIT_YIELD doesn't allow for mask
+ // Keep this code in sync with KMP_WAIT, in kmp_dispatch.cpp !!!
+ // The current implementation of KMP_WAIT doesn't allow for mask
// and poll to be re-read every spin iteration.
kmp_uint32 spins;
-
KMP_FSYNC_PREPARE(lck);
KMP_INIT_YIELD(spins);
while (polls[ticket & mask] < ticket) { // atomic load
- // If we are oversubscribed,
- // or have waited a bit (and KMP_LIBRARY=turnaround), then yield.
- // CPU Pause is in the macros for yield.
- //
- KMP_YIELD(TCR_4(__kmp_nth) >
- (__kmp_avail_proc ? __kmp_avail_proc : __kmp_xproc));
- KMP_YIELD_SPIN(spins);
-
+ KMP_YIELD_OVERSUB_ELSE_SPIN(spins);
// Re-read the mask and the poll pointer from the lock structure.
//
// Make certain that "mask" is read before "polls" !!!
}
if ((status & _XABORT_EXPLICIT) && _XABORT_CODE(status) == 0xff) {
// Wait until lock becomes free
- while (!__kmp_is_unlocked_queuing_lock(lck))
- __kmp_yield(TRUE);
+ while (!__kmp_is_unlocked_queuing_lock(lck)) {
+ KMP_YIELD(TRUE);
+ }
} else if (!(status & _XABORT_RETRY))
break;
} while (retries--);
kmp_uint32 spins; \
KMP_FSYNC_PREPARE(lck); \
KMP_INIT_YIELD(spins); \
- if (TCR_4(__kmp_nth) > \
- (__kmp_avail_proc ? __kmp_avail_proc : __kmp_xproc)) { \
- KMP_YIELD(TRUE); \
- } else { \
- KMP_YIELD_SPIN(spins); \
- } \
- while (lck->tas.lk.poll != 0 || !__kmp_atomic_compare_store_acq( \
- &lck->tas.lk.poll, 0, gtid + 1)) { \
- if (TCR_4(__kmp_nth) > \
- (__kmp_avail_proc ? __kmp_avail_proc : __kmp_xproc)) { \
- KMP_YIELD(TRUE); \
- } else { \
- KMP_YIELD_SPIN(spins); \
- } \
- } \
+ do { \
+ KMP_YIELD_OVERSUB_ELSE_SPIN(spins); \
+ } while ( \
+ lck->tas.lk.poll != 0 || \
+ !__kmp_atomic_compare_store_acq(&lck->tas.lk.poll, 0, gtid + 1)); \
} \
KMP_FSYNC_ACQUIRED(lck); \
} else { \
kmp_uint32 spins; \
KMP_FSYNC_PREPARE(lck); \
KMP_INIT_YIELD(spins); \
- if (TCR_4(__kmp_nth) > \
- (__kmp_avail_proc ? __kmp_avail_proc : __kmp_xproc)) { \
- KMP_YIELD(TRUE); \
- } else { \
- KMP_YIELD_SPIN(spins); \
- } \
- while ( \
+ do { \
+ KMP_YIELD_OVERSUB_ELSE_SPIN(spins); \
+ } while ( \
(lck->tas.lk.poll != 0) || \
- !__kmp_atomic_compare_store_acq(&lck->tas.lk.poll, 0, gtid + 1)) { \
- if (TCR_4(__kmp_nth) > \
- (__kmp_avail_proc ? __kmp_avail_proc : __kmp_xproc)) { \
- KMP_YIELD(TRUE); \
- } else { \
- KMP_YIELD_SPIN(spins); \
- } \
- } \
+ !__kmp_atomic_compare_store_acq(&lck->tas.lk.poll, 0, gtid + 1)); \
} \
lck->tas.lk.depth_locked = 1; \
*depth = KMP_LOCK_ACQUIRED_FIRST; \
#define KMP_CACHE_PREFETCH(ADDR) /* nothing */
// Define attribute that indicates that the fall through from the previous
// case label is intentional and should not be diagnosed by a compiler
// Code from libcxx/include/__config
// Use a function like macro to imply that it must be followed by a semicolon
#define VOLATILE_CAST(x) (x)
#endif
-#define KMP_WAIT_YIELD __kmp_wait_yield_4
-#define KMP_WAIT_YIELD_PTR __kmp_wait_yield_4_ptr
+#define KMP_WAIT __kmp_wait_4
+#define KMP_WAIT_PTR __kmp_wait_4_ptr
#define KMP_EQ __kmp_eq_4
#define KMP_NEQ __kmp_neq_4
#define KMP_LT __kmp_lt_4
static int done = FALSE;
while (!done) {
- KMP_YIELD(1);
+ KMP_YIELD(TRUE);
}
}
#endif /* KMP_OS_WINDOWS */
#endif /* KMP_DYNAMIC_LIB */
-/* Change the library type to "status" and return the old type */
-/* called from within initialization routines where __kmp_initz_lock is held */
-int __kmp_change_library(int status) {
- int old_status;
-
- old_status = __kmp_yield_init &
- 1; // check whether KMP_LIBRARY=throughput (even init count)
-
- if (status) {
- __kmp_yield_init |= 1; // throughput => turnaround (odd init count)
- } else {
- __kmp_yield_init &= ~1; // turnaround => throughput (even init count)
- }
-
- return old_status; // return previous setting of whether
- // KMP_LIBRARY=throughput
-}
-
/* __kmp_parallel_deo -- Wait until it's our turn. */
void __kmp_parallel_deo(int *gtid_ref, int *cid_ref, ident_t *loc_ref) {
int gtid = *gtid_ref;
#ifdef BUILD_PARALLEL_ORDERED
if (!team->t.t_serialized) {
KMP_MB();
- KMP_WAIT_YIELD(&team->t.t_ordered.dt.t_value, __kmp_tid_from_gtid(gtid),
- KMP_EQ, NULL);
+ KMP_WAIT(&team->t.t_ordered.dt.t_value, __kmp_tid_from_gtid(gtid), KMP_EQ,
+ NULL);
KMP_MB();
}
#endif /* BUILD_PARALLEL_ORDERED */
switch (__kmp_library) {
case library_serial: {
KMP_INFORM(LibraryIsSerial);
- (void)__kmp_change_library(TRUE);
} break;
case library_turnaround:
- (void)__kmp_change_library(TRUE);
+ if (__kmp_use_yield == 1 && !__kmp_use_yield_exp_set)
+ __kmp_use_yield = 2; // only yield when oversubscribed
break;
case library_throughput:
- (void)__kmp_change_library(FALSE);
+ if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME)
+ __kmp_dflt_blocktime = 200;
break;
default:
KMP_FATAL(UnknownLibraryType, arg);
} // __kmp_stg_print_teams_thread_limit
// -----------------------------------------------------------------------------
+// KMP_USE_YIELD
+static void __kmp_stg_parse_use_yield(char const *name, char const *value,
+ void *data) {
+ __kmp_stg_parse_int(name, value, 0, 2, &__kmp_use_yield);
+ __kmp_use_yield_exp_set = 1;
+} // __kmp_stg_parse_use_yield
+
+static void __kmp_stg_print_use_yield(kmp_str_buf_t *buffer, char const *name,
+ void *data) {
+ __kmp_stg_print_int(buffer, name, __kmp_use_yield);
+} // __kmp_stg_print_use_yield
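
As a self-contained illustration of the new knob's range and default (a toy model; the real parser goes through __kmp_stg_parse_int, which also clamps and warns):

#include <stdio.h>
#include <stdlib.h>

// Toy model of KMP_USE_YIELD parsing: accept 0..2, clamp out-of-range
// values, and default to 1 (always yield) when the variable is unset.
static int parse_use_yield(const char *value) {
  long v = value ? strtol(value, NULL, 10) : 1;
  if (v < 0) v = 0;
  if (v > 2) v = 2;
  return (int)v;
}

int main(void) {
  printf("effective KMP_USE_YIELD = %d\n",
         parse_use_yield(getenv("KMP_USE_YIELD")));
  return 0;
}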
+
+// -----------------------------------------------------------------------------
// KMP_BLOCKTIME
static void __kmp_stg_parse_blocktime(char const *name, char const *value,
__kmp_library = library_serial;
} else if (__kmp_str_match("throughput", 2, value)) { /* TH */
__kmp_library = library_throughput;
+ if (blocktime_str == NULL) {
+ // KMP_BLOCKTIME not specified, so set default to 0.
+ __kmp_dflt_blocktime = 0;
+ }
} else if (__kmp_str_match("turnaround", 2, value)) { /* TU */
__kmp_library = library_turnaround;
} else if (__kmp_str_match("dedicated", 1, value)) { /* D */
__kmp_library = library_turnaround;
} else if (__kmp_str_match("multiuser", 1, value)) { /* M */
__kmp_library = library_throughput;
+ if (blocktime_str == NULL) {
+ // KMP_BLOCKTIME not specified, so set default to 0.
+ __kmp_dflt_blocktime = 0;
+ }
} else {
KMP_WARNING(StgInvalidValue, name, value);
}
}
- __kmp_aux_set_library(__kmp_library);
-
} // __kmp_stg_parse_wait_policy
static void __kmp_stg_print_wait_policy(kmp_str_buf_t *buffer, char const *name,
}
} // __kmp_stg_print_par_range_env
-// -----------------------------------------------------------------------------
-// KMP_YIELD_CYCLE, KMP_YIELD_ON, KMP_YIELD_OFF
-
-static void __kmp_stg_parse_yield_cycle(char const *name, char const *value,
- void *data) {
- int flag = __kmp_yield_cycle;
- __kmp_stg_parse_bool(name, value, &flag);
- __kmp_yield_cycle = flag;
-} // __kmp_stg_parse_yield_cycle
-
-static void __kmp_stg_print_yield_cycle(kmp_str_buf_t *buffer, char const *name,
- void *data) {
- __kmp_stg_print_bool(buffer, name, __kmp_yield_cycle);
-} // __kmp_stg_print_yield_cycle
-
-static void __kmp_stg_parse_yield_on(char const *name, char const *value,
- void *data) {
- __kmp_stg_parse_int(name, value, 2, INT_MAX, &__kmp_yield_on_count);
-} // __kmp_stg_parse_yield_on
-
-static void __kmp_stg_print_yield_on(kmp_str_buf_t *buffer, char const *name,
- void *data) {
- __kmp_stg_print_int(buffer, name, __kmp_yield_on_count);
-} // __kmp_stg_print_yield_on
-
-static void __kmp_stg_parse_yield_off(char const *name, char const *value,
- void *data) {
- __kmp_stg_parse_int(name, value, 2, INT_MAX, &__kmp_yield_off_count);
-} // __kmp_stg_parse_yield_off
-
-static void __kmp_stg_print_yield_off(kmp_str_buf_t *buffer, char const *name,
- void *data) {
- __kmp_stg_print_int(buffer, name, __kmp_yield_off_count);
-} // __kmp_stg_print_yield_off
-
#endif
// -----------------------------------------------------------------------------
-// KMP_INIT_WAIT, KMP_NEXT_WAIT
-
-static void __kmp_stg_parse_init_wait(char const *name, char const *value,
- void *data) {
- int wait;
- KMP_ASSERT((__kmp_init_wait & 1) == 0);
- wait = __kmp_init_wait / 2;
- __kmp_stg_parse_int(name, value, KMP_MIN_INIT_WAIT, KMP_MAX_INIT_WAIT, &wait);
- __kmp_init_wait = wait * 2;
- KMP_ASSERT((__kmp_init_wait & 1) == 0);
- __kmp_yield_init = __kmp_init_wait;
-} // __kmp_stg_parse_init_wait
-
-static void __kmp_stg_print_init_wait(kmp_str_buf_t *buffer, char const *name,
- void *data) {
- __kmp_stg_print_int(buffer, name, __kmp_init_wait);
-} // __kmp_stg_print_init_wait
-
-static void __kmp_stg_parse_next_wait(char const *name, char const *value,
- void *data) {
- int wait;
- KMP_ASSERT((__kmp_next_wait & 1) == 0);
- wait = __kmp_next_wait / 2;
- __kmp_stg_parse_int(name, value, KMP_MIN_NEXT_WAIT, KMP_MAX_NEXT_WAIT, &wait);
- __kmp_next_wait = wait * 2;
- KMP_ASSERT((__kmp_next_wait & 1) == 0);
- __kmp_yield_next = __kmp_next_wait;
-} // __kmp_stg_parse_next_wait
-
-static void __kmp_stg_print_next_wait(kmp_str_buf_t *buffer, char const *name,
- void *data) {
- __kmp_stg_print_int(buffer, name, __kmp_next_wait);
-} //__kmp_stg_print_next_wait
-
-// -----------------------------------------------------------------------------
// KMP_GTID_MODE
static void __kmp_stg_parse_gtid_mode(char const *name, char const *value,
{"KMP_ALL_THREADS", __kmp_stg_parse_device_thread_limit, NULL, NULL, 0, 0},
{"KMP_BLOCKTIME", __kmp_stg_parse_blocktime, __kmp_stg_print_blocktime,
NULL, 0, 0},
+ {"KMP_USE_YIELD", __kmp_stg_parse_use_yield, __kmp_stg_print_use_yield,
+ NULL, 0, 0},
{"KMP_DUPLICATE_LIB_OK", __kmp_stg_parse_duplicate_lib_ok,
__kmp_stg_print_duplicate_lib_ok, NULL, 0, 0},
{"KMP_LIBRARY", __kmp_stg_parse_wait_policy, __kmp_stg_print_wait_policy,
{"KMP_PAR_RANGE", __kmp_stg_parse_par_range_env,
__kmp_stg_print_par_range_env, NULL, 0, 0},
- {"KMP_YIELD_CYCLE", __kmp_stg_parse_yield_cycle,
- __kmp_stg_print_yield_cycle, NULL, 0, 0},
- {"KMP_YIELD_ON", __kmp_stg_parse_yield_on, __kmp_stg_print_yield_on, NULL,
- 0, 0},
- {"KMP_YIELD_OFF", __kmp_stg_parse_yield_off, __kmp_stg_print_yield_off,
- NULL, 0, 0},
#endif // KMP_DEBUG
{"KMP_ALIGN_ALLOC", __kmp_stg_parse_align_alloc,
#endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
{"KMP_MALLOC_POOL_INCR", __kmp_stg_parse_malloc_pool_incr,
__kmp_stg_print_malloc_pool_incr, NULL, 0, 0},
- {"KMP_INIT_WAIT", __kmp_stg_parse_init_wait, __kmp_stg_print_init_wait,
- NULL, 0, 0},
- {"KMP_NEXT_WAIT", __kmp_stg_parse_next_wait, __kmp_stg_print_next_wait,
- NULL, 0, 0},
{"KMP_GTID_MODE", __kmp_stg_parse_gtid_mode, __kmp_stg_print_gtid_mode,
NULL, 0, 0},
{"OMP_DYNAMIC", __kmp_stg_parse_omp_dynamic, __kmp_stg_print_omp_dynamic,
if (thread->th.th_task_team == NULL) {
break;
}
- // Yield before executing next task
- KMP_YIELD(__kmp_library == library_throughput);
+ KMP_YIELD(__kmp_library == library_throughput); // Yield before next task
// If execution of a stolen task results in more tasks being placed on our
// run queue, reset use_own_tasks
if (!use_own_tasks && TCR_4(threads_data[tid].td.td_deque_ntasks) != 0) {
break;
}
- // If we are oversubscribed, or have waited a bit (and library mode is
- // throughput), yield. Pause is in the following code.
- KMP_YIELD(TCR_4(__kmp_nth) > __kmp_avail_proc);
- KMP_YIELD_SPIN(spins); // Yields only if KMP_LIBRARY=throughput
+ // If oversubscribed or have waited a bit, yield.
+ KMP_YIELD_OVERSUB_ELSE_SPIN(spins);
}
}
__kmp_abort_thread();
break;
}
- KMP_YIELD(TRUE); // GH: We always yield here
+ KMP_YIELD(TRUE);
}
#if USE_ITT_BUILD
KMP_FSYNC_SPIN_ACQUIRED(RCAST(void *, spin));
taskq = tq->tq_curr_thunk[tid]->th.th_shareds->sv_queue;
- KMP_WAIT_YIELD(&taskq->tq_tasknum_serving, my_token, KMP_EQ, NULL);
+ KMP_WAIT(&taskq->tq_tasknum_serving, my_token, KMP_EQ, NULL);
KMP_MB();
}
}
taskq = thunk->th.th_shareds->sv_queue;
if (taskq->tq_tasknum_serving <= my_token) {
- KMP_WAIT_YIELD(&taskq->tq_tasknum_serving, my_token, KMP_GE, NULL);
+ KMP_WAIT(&taskq->tq_tasknum_serving, my_token, KMP_GE, NULL);
KMP_MB();
taskq->tq_tasknum_serving = my_token + 1;
KMP_MB();
while (queue->tq_ref_count > 1) {
__kmp_release_lock(&queue->tq.tq_parent->tq_link_lck, global_tid);
- KMP_WAIT_YIELD((volatile kmp_uint32 *)&queue->tq_ref_count, 1, KMP_LE,
- NULL);
+ KMP_WAIT((volatile kmp_uint32 *)&queue->tq_ref_count, 1, KMP_LE, NULL);
__kmp_acquire_lock(&queue->tq.tq_parent->tq_link_lck, global_tid);
// Make sure data structures are in consistent state before querying them
in_parallel = (queue->tq_flags & TQF_PARALLEL_CONTEXT);
if (in_parallel) {
- kmp_uint32 spins;
-
/* this is just a safeguard to release the waiting threads if */
/* the outermost taskq never queues a task */
do {
/* wait until something is available to dequeue */
- KMP_INIT_YIELD(spins);
-
while ((queue->tq_nfull == 0) && (queue->tq_taskq_slot == NULL) &&
(!__kmp_taskq_has_any_children(queue)) &&
(!(queue->tq_flags & TQF_ALL_TASKS_QUEUED))) {
- KMP_YIELD_WHEN(TRUE, spins);
+ KMP_CPU_PAUSE();
}
/* check to see if we can execute tasks in the queue */
/* WAIT until all tasks are finished and no child queues exist before
* proceeding */
- KMP_INIT_YIELD(spins);
while (!__kmp_taskq_tasks_finished(queue) ||
__kmp_taskq_has_any_children(queue)) {
in_parallel);
}
- KMP_YIELD_WHEN(thunk == NULL, spins);
+ if (thunk == NULL)
+ KMP_CPU_PAUSE();
__kmp_find_and_remove_finished_child_taskq(tq, global_tid, queue);
}
// Outermost Queue: steal work from descendants until all tasks are finished
- KMP_INIT_YIELD(spins);
-
while (!__kmp_taskq_tasks_finished(queue)) {
thunk = __kmp_find_task_in_descendant_queue(global_tid, queue);
__kmp_execute_task_from_queue(tq, loc, global_tid, thunk, in_parallel);
}
- KMP_YIELD_WHEN(thunk == NULL, spins);
+ if (thunk == NULL)
+ KMP_CPU_PAUSE();
}
/* Need this barrier to prevent destruction of queue before threads have all
}
#endif
-/* Spin wait loop that first does pause, then yield, then sleep. A thread that
- calls __kmp_wait_* must make certain that another thread calls __kmp_release
+/* Spin wait loop that first does pause/yield, then sleep. A thread that calls
+ __kmp_wait_* must make certain that another thread calls __kmp_release
to wake it back up to prevent deadlocks!
NOTE: We may not belong to a team at this point. */
}
#endif
- // Setup for waiting
- KMP_INIT_YIELD(spins);
+ KMP_INIT_YIELD(spins); // Setup for waiting
if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME
#if OMP_50_ENABLED
// If we are oversubscribed, or have waited a bit (and
// __kmp_use_yield == 1), then yield.
- // TODO: Should it be number of cores instead of thread contexts? Like:
- // KMP_YIELD(TCR_4(__kmp_nth) > __kmp_ncores);
- // Need performance improvement data to make the change...
- if (oversubscribed) {
- KMP_YIELD(1);
- } else {
- KMP_YIELD_SPIN(spins);
- }
+ KMP_YIELD_OVERSUB_ELSE_SPIN(spins);
+
// Check if this thread was transferred from a team
// to the thread pool (or vice-versa) while spinning.
in_pool = !!TCR_4(this_thr->th.th_in_pool);
__kmp_msg_null);
}
#endif
- __kmp_yield(TRUE);
+ KMP_YIELD(TRUE);
} //
/* Set thread stack info according to values returned by pthread_getattr_np().
sigset_t new_set;
#endif /* KMP_BLOCK_SIGNALS */
struct timespec interval;
- int yield_count;
- int yield_cycles = 0;
KMP_MB(); /* Flush all pending memory write invalidates. */
KA_TRACE(10, ("__kmp_launch_monitor: #2 monitor\n"));
- if (__kmp_yield_cycle) {
- __kmp_yielding_on = 0; /* Start out with yielding shut off */
- yield_count = __kmp_yield_off_count;
- } else {
- __kmp_yielding_on = 1; /* Yielding is on permanently */
- }
-
while (!TCR_4(__kmp_global.g.g_done)) {
struct timespec now;
struct timeval tval;
status = pthread_mutex_unlock(&__kmp_wait_mx.m_mutex);
KMP_CHECK_SYSFAIL("pthread_mutex_unlock", status);
- if (__kmp_yield_cycle) {
- yield_cycles++;
- if ((yield_cycles % yield_count) == 0) {
- if (__kmp_yielding_on) {
- __kmp_yielding_on = 0; /* Turn it off now */
- yield_count = __kmp_yield_off_count;
- } else {
- __kmp_yielding_on = 1; /* Turn it on now */
- yield_count = __kmp_yield_on_count;
- }
- yield_cycles = 0;
- }
- } else {
- __kmp_yielding_on = 1;
- }
-
TCW_4(__kmp_global.g.g_time.dt.t_value,
TCR_4(__kmp_global.g.g_time.dt.t_value) + 1);
// Wait for the monitor thread is really started and set its *priority*.
KMP_DEBUG_ASSERT(sizeof(kmp_uint32) ==
sizeof(__kmp_global.g.g_time.dt.t_value));
- __kmp_wait_yield_4((kmp_uint32 volatile *)&__kmp_global.g.g_time.dt.t_value,
- -1, &__kmp_neq_4, NULL);
+ __kmp_wait_4((kmp_uint32 volatile *)&__kmp_global.g.g_time.dt.t_value, -1,
+ &__kmp_neq_4, NULL);
#endif // KMP_REAL_TIME_FIX
#ifdef KMP_THREAD_ATTR
}
#endif // KMP_USE_MONITOR
-void __kmp_yield(int cond) {
- if (!cond)
- return;
-#if KMP_USE_MONITOR
- if (!__kmp_yielding_on)
- return;
-#else
- if (__kmp_yield_cycle && !KMP_YIELD_NOW())
- return;
-#endif
- sched_yield();
-}
+void __kmp_yield() { sched_yield(); }
void __kmp_gtid_set_specific(int gtid) {
if (__kmp_init_gtid) {
__kmp_resume_template(target_gtid, flag);
}
-void __kmp_yield(int cond) {
- if (cond)
- Sleep(0);
-}
+void __kmp_yield() { Sleep(0); }
void __kmp_gtid_set_specific(int gtid) {
if (__kmp_init_gtid) {
Right solution seems to be waiting for *either* thread termination *or*
ds_alive resetting. */
{
- // TODO: This code is very similar to KMP_WAIT_YIELD. Need to generalize
- // KMP_WAIT_YIELD to cover this usage also.
+ // TODO: This code is very similar to KMP_WAIT. Need to generalize
+ // KMP_WAIT to cover this usage also.
void *obj = NULL;
kmp_uint32 spins;
#if USE_ITT_BUILD
KMP_FSYNC_SPIN_PREPARE(obj);
#endif /* USE_ITT_BUILD */
__kmp_is_thread_alive(th, &exit_val);
- KMP_YIELD(TCR_4(__kmp_nth) > __kmp_avail_proc);
- KMP_YIELD_SPIN(spins);
+ KMP_YIELD_OVERSUB_ELSE_SPIN(spins);
} while (exit_val == STILL_ACTIVE && TCR_4(th->th.th_info.ds.ds_alive));
#if USE_ITT_BUILD
if (exit_val == STILL_ACTIVE) {