[OpenMP] Make use of sched_yield optional in runtime
author    Jonathan Peyton <jonathan.l.peyton@intel.com>
          Thu, 28 Feb 2019 19:11:29 +0000 (19:11 +0000)
committer Jonathan Peyton <jonathan.l.peyton@intel.com>
          Thu, 28 Feb 2019 19:11:29 +0000 (19:11 +0000)
This patch cleans up the yielding code and makes it optional. A new
environment variable, KMP_USE_YIELD, controls it. Yielding is still on
by default (KMP_USE_YIELD=1), but it can be turned off completely
(KMP_USE_YIELD=0) or enabled only when oversubscription is detected
(KMP_USE_YIELD=2). Note that the runtime cannot always detect
oversubscription; for example, when the runtime is initialized and the
process then forks, oversubscription currently cannot be detected
across the multiple instances of the runtime.
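For illustration, the new control reduces to a small decision; this is a
sketch of the semantics in plain C (our names, not the runtime's internals):

    /* Sketch of the KMP_USE_YIELD decision: 0 = never yield,
       1 = always allow yielding (default), 2 = only when oversubscribed. */
    static int should_yield(int use_yield, int nthreads, int nprocs) {
      int oversubscribed = nthreads > nprocs;
      if (use_yield == 1)
        return 1;
      if (use_yield == 2)
        return oversubscribed;
      return 0; /* use_yield == 0 */
    }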

Because yielding can now be controlled by the user, the library mode
settings (from KMP_LIBRARY) for throughput and turnaround have been
adjusted: throughput now lowers the default blocktime, and turnaround
now yields only under oversubscription, unless the corresponding
setting was also given explicitly.

In the original code, there were a number of places where a double
yield could occur under oversubscription. This version checks for
oversubscription first and, only if that check will not yield, falls
back to the spin check.

Patch by Terry Wilmarth

Differential Revision: https://reviews.llvm.org/D58148

llvm-svn: 355120

19 files changed:
openmp/runtime/src/dllexports
openmp/runtime/src/exports_so.txt
openmp/runtime/src/kmp.h
openmp/runtime/src/kmp_csupport.cpp
openmp/runtime/src/kmp_dispatch.cpp
openmp/runtime/src/kmp_dispatch.h
openmp/runtime/src/kmp_dispatch_hier.h
openmp/runtime/src/kmp_global.cpp
openmp/runtime/src/kmp_itt.h
openmp/runtime/src/kmp_lock.cpp
openmp/runtime/src/kmp_lock.h
openmp/runtime/src/kmp_os.h
openmp/runtime/src/kmp_runtime.cpp
openmp/runtime/src/kmp_settings.cpp
openmp/runtime/src/kmp_tasking.cpp
openmp/runtime/src/kmp_taskq.cpp
openmp/runtime/src/kmp_wait_release.h
openmp/runtime/src/z_Linux_util.cpp
openmp/runtime/src/z_Windows_NT_util.cpp

diff --git a/openmp/runtime/src/dllexports b/openmp/runtime/src/dllexports
index b03cbae..c0b0b60 100644
     #
 
     # Regular entry points
-        __kmp_wait_yield_4
+        __kmp_wait_4
         __kmp_fork_call
         __kmp_invoke_microtask
     %ifdef KMP_USE_MONITOR
diff --git a/openmp/runtime/src/exports_so.txt b/openmp/runtime/src/exports_so.txt
index e23fa0c..4926697 100644
@@ -83,7 +83,7 @@ VERSION {
         __kmp_reap_worker;
         __kmp_release_64;
         __kmp_wait_64;
-        __kmp_wait_yield_4;
+        __kmp_wait_4;
 
         # ittnotify symbols to be used by debugger
         __kmp_itt_fini_ittlib;
diff --git a/openmp/runtime/src/kmp.h b/openmp/runtime/src/kmp.h
index 4dd6cf0..5125eca 100644
@@ -981,10 +981,6 @@ extern kmp_uint64 __kmp_now_nsec();
   (KMP_BLOCKTIME(team, tid) * KMP_USEC_PER_SEC)
 #define KMP_BLOCKING(goal, count) ((count) % 1000 != 0 || (goal) > KMP_NOW())
 #endif
-#define KMP_YIELD_NOW()                                                        \
-  (KMP_NOW_MSEC() / KMP_MAX(__kmp_dflt_blocktime, 1) %                         \
-       (__kmp_yield_on_count + __kmp_yield_off_count) <                        \
-   (kmp_uint32)__kmp_yield_on_count)
 #endif // KMP_USE_MONITOR
 
 #define KMP_MIN_STATSCOLS 40
@@ -999,14 +995,6 @@ extern kmp_uint64 __kmp_now_nsec();
 #define KMP_MAX_CHUNK (INT_MAX - 1)
 #define KMP_DEFAULT_CHUNK 1
 
-#define KMP_MIN_INIT_WAIT 1
-#define KMP_MAX_INIT_WAIT (INT_MAX / 2)
-#define KMP_DEFAULT_INIT_WAIT 2048U
-
-#define KMP_MIN_NEXT_WAIT 1
-#define KMP_MAX_NEXT_WAIT (INT_MAX / 2)
-#define KMP_DEFAULT_NEXT_WAIT 1024U
-
 #define KMP_DFLT_DISP_NUM_BUFF 7
 #define KMP_MAX_ORDERED 8
 
@@ -1090,7 +1078,7 @@ extern void __kmp_x86_cpuid(int mode, int mode2, struct kmp_cpuid *p);
 extern void __kmp_x86_pause(void);
 #elif KMP_MIC
 // Performance testing on KNC (C0QS-7120 P/A/X/D, 61-core, 16 GB Memory) showed
-// regression after removal of extra PAUSE from KMP_YIELD_SPIN(). Changing
+// regression after removal of extra PAUSE from spin loops. Changing
 // the delay from 100 to 300 showed even better performance than double PAUSE
 // on Spec OMP2001 and LCPC tasking tests, no regressions on EPCC.
 static inline void __kmp_x86_pause(void) { _mm_delay_32(300); }
@@ -1115,31 +1103,54 @@ static inline void __kmp_x86_pause(void) { _mm_pause(); }
 #define KMP_INIT_YIELD(count)                                                  \
   { (count) = __kmp_yield_init; }
 
+#define KMP_OVERSUBSCRIBED                                                     \
+  (TCR_4(__kmp_nth) > (__kmp_avail_proc ? __kmp_avail_proc : __kmp_xproc))
+
+#define KMP_TRY_YIELD                                                          \
+  ((__kmp_use_yield == 1) || (__kmp_use_yield == 2 && (KMP_OVERSUBSCRIBED)))
+
+#define KMP_TRY_YIELD_OVERSUB                                                  \
+  ((__kmp_use_yield == 1 || __kmp_use_yield == 2) && (KMP_OVERSUBSCRIBED))
+
 #define KMP_YIELD(cond)                                                        \
   {                                                                            \
     KMP_CPU_PAUSE();                                                           \
-    __kmp_yield((cond));                                                       \
+    if ((cond) && (KMP_TRY_YIELD))                                             \
+      __kmp_yield();                                                           \
+  }
+
+#define KMP_YIELD_OVERSUB()                                                    \
+  {                                                                            \
+    KMP_CPU_PAUSE();                                                           \
+    if ((KMP_TRY_YIELD_OVERSUB))                                               \
+      __kmp_yield();                                                           \
   }
 
 // Note the decrement of 2 in the following Macros. With KMP_LIBRARY=turnaround,
 // there should be no yielding since initial value from KMP_INIT_YIELD() is odd.
-
-#define KMP_YIELD_WHEN(cond, count)                                            \
+#define KMP_YIELD_SPIN(count)                                                  \
   {                                                                            \
     KMP_CPU_PAUSE();                                                           \
-    (count) -= 2;                                                              \
-    if (!(count)) {                                                            \
-      __kmp_yield(cond);                                                       \
-      (count) = __kmp_yield_next;                                              \
+    if (KMP_TRY_YIELD) {                                                       \
+      (count) -= 2;                                                            \
+      if (!(count)) {                                                          \
+        __kmp_yield();                                                         \
+        (count) = __kmp_yield_next;                                            \
+      }                                                                        \
     }                                                                          \
   }
-#define KMP_YIELD_SPIN(count)                                                  \
+
+#define KMP_YIELD_OVERSUB_ELSE_SPIN(count)                                     \
   {                                                                            \
     KMP_CPU_PAUSE();                                                           \
-    (count) -= 2;                                                              \
-    if (!(count)) {                                                            \
-      __kmp_yield(1);                                                          \
-      (count) = __kmp_yield_next;                                              \
+    if ((KMP_TRY_YIELD_OVERSUB))                                               \
+      __kmp_yield();                                                           \
+    else if (__kmp_use_yield == 1) {                                           \
+      (count) -= 2;                                                            \
+      if (!(count)) {                                                          \
+        __kmp_yield();                                                         \
+        (count) = __kmp_yield_next;                                            \
+      }                                                                        \
     }                                                                          \
   }
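For reference, KMP_YIELD_OVERSUB_ELSE_SPIN above makes one pause plus at
most one yield decision per call; a self-contained C sketch of the same
logic, with plain stand-ins for the __kmp_* globals:

    #include <sched.h>

    static int use_yield = 1;          /* stand-in for __kmp_use_yield */
    static int oversubscribed = 0;     /* stand-in for KMP_OVERSUBSCRIBED */
    static unsigned yield_next = 1024; /* stand-in for __kmp_yield_next */

    static void yield_oversub_else_spin(unsigned *count) {
      /* KMP_CPU_PAUSE() would go here (e.g. _mm_pause on x86) */
      if ((use_yield == 1 || use_yield == 2) && oversubscribed) {
        sched_yield();             /* oversubscribed: yield right away */
      } else if (use_yield == 1) { /* otherwise yield only periodically */
        *count -= 2;               /* counter seeded by KMP_INIT_YIELD();  */
        if (*count == 0) {         /* an odd seed never reaches zero,      */
          sched_yield();           /* matching the turnaround note above   */
          *count = yield_next;
        }
      }
    }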
 
@@ -2945,10 +2956,6 @@ extern kmp_lock_t __kmp_global_lock; /* control OS/global access  */
 extern kmp_queuing_lock_t __kmp_dispatch_lock; /* control dispatch access  */
 extern kmp_lock_t __kmp_debug_lock; /* control I/O access for KMP_DEBUG */
 
-/* used for yielding spin-waits */
-extern unsigned int __kmp_init_wait; /* initial number of spin-tests   */
-extern unsigned int __kmp_next_wait; /* susequent number of spin-tests */
-
 extern enum library_type __kmp_library;
 
 extern enum sched_type __kmp_sched; /* default runtime scheduling */
@@ -2977,16 +2984,11 @@ extern int __kmp_reserve_warn; /* have we issued reserve_threads warning? */
 extern int __kmp_suspend_count; /* count inside __kmp_suspend_template() */
 #endif
 
+extern kmp_int32 __kmp_use_yield;
+extern kmp_int32 __kmp_use_yield_exp_set;
 extern kmp_uint32 __kmp_yield_init;
 extern kmp_uint32 __kmp_yield_next;
 
-#if KMP_USE_MONITOR
-extern kmp_uint32 __kmp_yielding_on;
-#endif
-extern kmp_uint32 __kmp_yield_cycle;
-extern kmp_int32 __kmp_yield_on_count;
-extern kmp_int32 __kmp_yield_off_count;
-
 /* ------------------------------------------------------------------------- */
 extern int __kmp_allThreadsSpecified;
 
@@ -3309,7 +3311,7 @@ extern void __kmp_push_num_teams(ident_t *loc, int gtid, int num_teams,
                                  int num_threads);
 #endif
 
-extern void __kmp_yield(int cond);
+extern void __kmp_yield();
 
 extern void __kmpc_dispatch_init_4(ident_t *loc, kmp_int32 gtid,
                                    enum sched_type schedule, kmp_int32 lb,
@@ -3374,13 +3376,11 @@ extern kmp_uint32 __kmp_neq_4(kmp_uint32 value, kmp_uint32 checker);
 extern kmp_uint32 __kmp_lt_4(kmp_uint32 value, kmp_uint32 checker);
 extern kmp_uint32 __kmp_ge_4(kmp_uint32 value, kmp_uint32 checker);
 extern kmp_uint32 __kmp_le_4(kmp_uint32 value, kmp_uint32 checker);
-extern kmp_uint32 __kmp_wait_yield_4(kmp_uint32 volatile *spinner,
-                                     kmp_uint32 checker,
-                                     kmp_uint32 (*pred)(kmp_uint32, kmp_uint32),
-                                     void *obj);
-extern void __kmp_wait_yield_4_ptr(void *spinner, kmp_uint32 checker,
-                                   kmp_uint32 (*pred)(void *, kmp_uint32),
-                                   void *obj);
+extern kmp_uint32 __kmp_wait_4(kmp_uint32 volatile *spinner, kmp_uint32 checker,
+                               kmp_uint32 (*pred)(kmp_uint32, kmp_uint32),
+                               void *obj);
+extern void __kmp_wait_4_ptr(void *spinner, kmp_uint32 checker,
+                             kmp_uint32 (*pred)(void *, kmp_uint32), void *obj);
 
 class kmp_flag_32;
 class kmp_flag_64;
diff --git a/openmp/runtime/src/kmp_csupport.cpp b/openmp/runtime/src/kmp_csupport.cpp
index 455cbe2..592a266 100644
@@ -683,7 +683,7 @@ void __kmpc_flush(ident_t *loc) {
   // }
   // and adding the yield here is good for at least a 10x speedup
   // when running >2 threads per core (on the NAS LU benchmark).
-  __kmp_yield(TRUE);
+  __kmp_yield();
 #endif
 #else
 #error Unknown or unsupported architecture
@@ -993,24 +993,18 @@ __kmp_init_indirect_csptr(kmp_critical_name *crit, ident_t const *loc,
       kmp_uint32 spins;                                                        \
       KMP_FSYNC_PREPARE(l);                                                    \
       KMP_INIT_YIELD(spins);                                                   \
-      if (TCR_4(__kmp_nth) >                                                   \
-          (__kmp_avail_proc ? __kmp_avail_proc : __kmp_xproc)) {               \
-        KMP_YIELD(TRUE);                                                       \
-      } else {                                                                 \
-        KMP_YIELD_SPIN(spins);                                                 \
-      }                                                                        \
       kmp_backoff_t backoff = __kmp_spin_backoff_params;                       \
-      while (                                                                  \
-          KMP_ATOMIC_LD_RLX(&l->lk.poll) != tas_free ||                        \
-          !__kmp_atomic_compare_store_acq(&l->lk.poll, tas_free, tas_busy)) {  \
-        __kmp_spin_backoff(&backoff);                                          \
+      do {                                                                     \
         if (TCR_4(__kmp_nth) >                                                 \
             (__kmp_avail_proc ? __kmp_avail_proc : __kmp_xproc)) {             \
           KMP_YIELD(TRUE);                                                     \
         } else {                                                               \
           KMP_YIELD_SPIN(spins);                                               \
         }                                                                      \
-      }                                                                        \
+        __kmp_spin_backoff(&backoff);                                          \
+      } while (                                                                \
+          KMP_ATOMIC_LD_RLX(&l->lk.poll) != tas_free ||                        \
+          !__kmp_atomic_compare_store_acq(&l->lk.poll, tas_free, tas_busy));   \
     }                                                                          \
     KMP_FSYNC_ACQUIRED(l);                                                     \
   }
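The restructuring above is part of the double-yield fix from the commit
message: the unconditional yield ahead of the loop is folded into a
do-while, so each acquisition attempt makes a single yield-or-spin
decision (plus backoff) before retesting the lock word. In sketch form:

    /* old: yield once up front; while (!cas) { backoff; yield again } */
    /* new: do { yield-or-spin once; backoff; } while (!cas);          */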
@@ -1096,8 +1090,7 @@ __kmp_init_indirect_csptr(kmp_critical_name *crit, ident_t const *loc,
               KMP_LOCK_BUSY(1, futex), NULL, NULL, 0);                         \
     }                                                                          \
     KMP_MB();                                                                  \
-    KMP_YIELD(TCR_4(__kmp_nth) >                                               \
-              (__kmp_avail_proc ? __kmp_avail_proc : __kmp_xproc));            \
+    KMP_YIELD_OVERSUB();                                                       \
   }
 
 #endif // KMP_USE_FUTEX
@@ -3976,8 +3969,8 @@ void __kmpc_doacross_init(ident_t *loc, int gtid, int num_dims,
   // __kmp_dispatch_num_buffers)
   if (idx != sh_buf->doacross_buf_idx) {
     // Shared buffer is occupied, wait for it to be free
-    __kmp_wait_yield_4((volatile kmp_uint32 *)&sh_buf->doacross_buf_idx, idx,
-                       __kmp_eq_4, NULL);
+    __kmp_wait_4((volatile kmp_uint32 *)&sh_buf->doacross_buf_idx, idx,
+                 __kmp_eq_4, NULL);
   }
 #if KMP_32_BIT_ARCH
   // Check if we are the first thread. After the CAS the first thread gets 0,
diff --git a/openmp/runtime/src/kmp_dispatch.cpp b/openmp/runtime/src/kmp_dispatch.cpp
index 872b96d..564e73b 100644
@@ -858,9 +858,9 @@ __kmp_dispatch_init(ident_t *loc, int gtid, enum sched_type schedule, T lb,
     KD_TRACE(100, ("__kmp_dispatch_init: T#%d before wait: my_buffer_index:%d "
                    "sh->buffer_index:%d\n",
                    gtid, my_buffer_index, sh->buffer_index));
-    __kmp_wait_yield<kmp_uint32>(&sh->buffer_index, my_buffer_index,
-                                 __kmp_eq<kmp_uint32> USE_ITT_BUILD_ARG(NULL));
-    // Note: KMP_WAIT_YIELD() cannot be used there: buffer index and
+    __kmp_wait<kmp_uint32>(&sh->buffer_index, my_buffer_index,
+                           __kmp_eq<kmp_uint32> USE_ITT_BUILD_ARG(NULL));
+    // Note: KMP_WAIT() cannot be used there: buffer index and
     // my_buffer_index are *always* 32-bit integers.
     KMP_MB(); /* is this necessary? */
     KD_TRACE(100, ("__kmp_dispatch_init: T#%d after wait: my_buffer_index:%d "
@@ -1004,8 +1004,8 @@ static void __kmp_dispatch_finish(int gtid, ident_t *loc) {
       }
 #endif
 
-      __kmp_wait_yield<UT>(&sh->u.s.ordered_iteration, lower,
-                           __kmp_ge<UT> USE_ITT_BUILD_ARG(NULL));
+      __kmp_wait<UT>(&sh->u.s.ordered_iteration, lower,
+                     __kmp_ge<UT> USE_ITT_BUILD_ARG(NULL));
       KMP_MB(); /* is this necessary? */
 #ifdef KMP_DEBUG
       {
@@ -1073,8 +1073,8 @@ static void __kmp_dispatch_finish_chunk(int gtid, ident_t *loc) {
       }
 #endif
 
-      __kmp_wait_yield<UT>(&sh->u.s.ordered_iteration, lower,
-                           __kmp_ge<UT> USE_ITT_BUILD_ARG(NULL));
+      __kmp_wait<UT>(&sh->u.s.ordered_iteration, lower,
+                     __kmp_ge<UT> USE_ITT_BUILD_ARG(NULL));
 
       KMP_MB(); /* is this necessary? */
       KD_TRACE(1000, ("__kmp_dispatch_finish_chunk: T#%d resetting "
@@ -2489,10 +2489,10 @@ kmp_uint32 __kmp_le_4(kmp_uint32 value, kmp_uint32 checker) {
 }
 
 kmp_uint32
-__kmp_wait_yield_4(volatile kmp_uint32 *spinner, kmp_uint32 checker,
-                   kmp_uint32 (*pred)(kmp_uint32, kmp_uint32),
-                   void *obj // Higher-level synchronization object, or NULL.
-                   ) {
+__kmp_wait_4(volatile kmp_uint32 *spinner, kmp_uint32 checker,
+             kmp_uint32 (*pred)(kmp_uint32, kmp_uint32),
+             void *obj // Higher-level synchronization object, or NULL.
+             ) {
   // note: we may not belong to a team at this point
   volatile kmp_uint32 *spin = spinner;
   kmp_uint32 check = checker;
@@ -2509,20 +2509,16 @@ __kmp_wait_yield_4(volatile kmp_uint32 *spinner, kmp_uint32 checker,
        split. It causes problems with infinite recursion because of exit lock */
     /* if ( TCR_4(__kmp_global.g.g_done) && __kmp_global.g.g_abort)
         __kmp_abort_thread(); */
-
-    /* if we have waited a bit, or are oversubscribed, yield */
-    /* pause is in the following code */
-    KMP_YIELD(TCR_4(__kmp_nth) > __kmp_avail_proc);
-    KMP_YIELD_SPIN(spins);
+    KMP_YIELD_OVERSUB_ELSE_SPIN(spins);
   }
   KMP_FSYNC_SPIN_ACQUIRED(obj);
   return r;
 }
 
-void __kmp_wait_yield_4_ptr(
-    void *spinner, kmp_uint32 checker, kmp_uint32 (*pred)(void *, kmp_uint32),
-    void *obj // Higher-level synchronization object, or NULL.
-    ) {
+void __kmp_wait_4_ptr(void *spinner, kmp_uint32 checker,
+                      kmp_uint32 (*pred)(void *, kmp_uint32),
+                      void *obj // Higher-level synchronization object, or NULL.
+                      ) {
   // note: we may not belong to a team at this point
   void *spin = spinner;
   kmp_uint32 check = checker;
@@ -2534,10 +2530,9 @@ void __kmp_wait_yield_4_ptr(
   // main wait spin loop
   while (!f(spin, check)) {
     KMP_FSYNC_SPIN_PREPARE(obj);
    /* if we have waited a bit, or are oversubscribed, yield */
     /* pause is in the following code */
-    KMP_YIELD(TCR_4(__kmp_nth) > __kmp_avail_proc);
-    KMP_YIELD_SPIN(spins);
+    KMP_YIELD_OVERSUB_ELSE_SPIN(spins);
   }
   KMP_FSYNC_SPIN_ACQUIRED(obj);
 }
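The old loop here was one of the double-yield sites: KMP_YIELD(...)
followed unconditionally by KMP_YIELD_SPIN(...) could yield twice per
iteration under oversubscription, whereas KMP_YIELD_OVERSUB_ELSE_SPIN
makes the two cases exclusive. For callers, a minimal usage sketch of
the renamed wait (the predicate is applied to the current value and the
checker):

    /* Sketch: block until another thread makes flag nonzero. */
    volatile kmp_uint32 flag = 0;
    /* ... a producer thread eventually stores: flag = 1; ... */
    __kmp_wait_4(&flag, 0, __kmp_neq_4, NULL); /* spins while flag == 0 */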
diff --git a/openmp/runtime/src/kmp_dispatch.h b/openmp/runtime/src/kmp_dispatch.h
index 84603d1..98979fa 100644
@@ -269,7 +269,7 @@ template <typename T> kmp_uint32 __kmp_eq(T value, T checker) {
 }
 
 /*
-    Spin wait loop that first does pause, then yield.
+    Spin wait loop that pauses between checks.
     Waits until function returns non-zero when called with *spinner and check.
     Does NOT put threads to sleep.
     Arguments:
@@ -282,15 +282,14 @@ template <typename T> kmp_uint32 __kmp_eq(T value, T checker) {
         is used to report locks consistently. For example, if lock is acquired
         immediately, its address is reported to ittnotify via
        KMP_FSYNC_ACQUIRED(). However, if the lock cannot be acquired immediately
-        and lock routine calls to KMP_WAIT_YIELD(), the later should report the
+        and the lock routine calls KMP_WAIT(), the latter should report the
         same address, not an address of low-level spinner.
 #endif // USE_ITT_BUILD
     TODO: make inline function (move to header file for icl)
 */
 template <typename UT>
-static UT __kmp_wait_yield(volatile UT *spinner, UT checker,
-                           kmp_uint32 (*pred)(UT, UT)
-                               USE_ITT_BUILD_ARG(void *obj)) {
+static UT __kmp_wait(volatile UT *spinner, UT checker,
+                     kmp_uint32 (*pred)(UT, UT) USE_ITT_BUILD_ARG(void *obj)) {
   // note: we may not belong to a team at this point
   volatile UT *spin = spinner;
   UT check = checker;
@@ -308,12 +307,8 @@ static UT __kmp_wait_yield(volatile UT *spinner, UT checker,
        It causes problems with infinite recursion because of exit lock */
     /* if ( TCR_4(__kmp_global.g.g_done) && __kmp_global.g.g_abort)
         __kmp_abort_thread(); */
-
-    // if we are oversubscribed,
-    // or have waited a bit (and KMP_LIBRARY=throughput, then yield
-    // pause is in the following code
-    KMP_YIELD(TCR_4(__kmp_nth) > __kmp_avail_proc);
-    KMP_YIELD_SPIN(spins);
+    // If oversubscribed, or have waited a bit then yield.
+    KMP_YIELD_OVERSUB_ELSE_SPIN(spins);
   }
   KMP_FSYNC_SPIN_ACQUIRED(obj);
   return r;
@@ -379,8 +374,8 @@ void __kmp_dispatch_deo(int *gtid_ref, int *cid_ref, ident_t *loc_ref) {
       __kmp_str_free(&buff);
     }
 #endif
-    __kmp_wait_yield<UT>(&sh->u.s.ordered_iteration, lower,
-                         __kmp_ge<UT> USE_ITT_BUILD_ARG(NULL));
+    __kmp_wait<UT>(&sh->u.s.ordered_iteration, lower,
+                   __kmp_ge<UT> USE_ITT_BUILD_ARG(NULL));
     KMP_MB(); /* is this necessary? */
 #ifdef KMP_DEBUG
     {
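A hedged sketch of the templated form, matching the ordered-iteration
call sites above and the barrier call sites in kmp_dispatch_hier.h below:

    /* Sketch: wait until a 64-bit counter reaches at least 5. */
    volatile kmp_uint64 counter = 0;
    __kmp_wait<kmp_uint64>(&counter, 5,
                           __kmp_ge<kmp_uint64> USE_ITT_BUILD_ARG(NULL));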
diff --git a/openmp/runtime/src/kmp_dispatch_hier.h b/openmp/runtime/src/kmp_dispatch_hier.h
index 48b164e..3f1cc61 100644
@@ -263,8 +263,8 @@ void core_barrier_impl<T>::barrier(kmp_int32 id,
                 next_wait_value));
   char v = (current_wait_value ? 0x1 : 0x0);
   (RCAST(volatile char *, &(bdata->val[current_index])))[id] = v;
-  __kmp_wait_yield<kmp_uint64>(&(bdata->val[current_index]), current_wait_value,
-                               __kmp_eq<kmp_uint64> USE_ITT_BUILD_ARG(NULL));
+  __kmp_wait<kmp_uint64>(&(bdata->val[current_index]), current_wait_value,
+                         __kmp_eq<kmp_uint64> USE_ITT_BUILD_ARG(NULL));
   tdata->wait_val[current_index] = next_wait_value;
   tdata->index = next_index;
 }
@@ -310,8 +310,8 @@ void counter_barrier_impl<T>::barrier(kmp_int32 id,
                 next_wait_value));
   val = RCAST(volatile kmp_int64 *, &(bdata->val[current_index]));
   KMP_TEST_THEN_INC64(val);
-  __kmp_wait_yield<kmp_uint64>(&(bdata->val[current_index]), current_wait_value,
-                               __kmp_ge<kmp_uint64> USE_ITT_BUILD_ARG(NULL));
+  __kmp_wait<kmp_uint64>(&(bdata->val[current_index]), current_wait_value,
+                         __kmp_ge<kmp_uint64> USE_ITT_BUILD_ARG(NULL));
   tdata->wait_val[current_index] = next_wait_value;
   tdata->index = next_index;
 }
diff --git a/openmp/runtime/src/kmp_global.cpp b/openmp/runtime/src/kmp_global.cpp
index ee321d7..627724c 100644
@@ -62,11 +62,6 @@ int __kmp_version = 0;
 std::atomic<kmp_int32> __kmp_team_counter = ATOMIC_VAR_INIT(0);
 std::atomic<kmp_int32> __kmp_task_counter = ATOMIC_VAR_INIT(0);
 
-unsigned int __kmp_init_wait =
-    KMP_DEFAULT_INIT_WAIT; /* initial number of spin-tests   */
-unsigned int __kmp_next_wait =
-    KMP_DEFAULT_NEXT_WAIT; /* susequent number of spin-tests */
-
 size_t __kmp_stksize = KMP_DEFAULT_STKSIZE;
 #if KMP_USE_MONITOR
 size_t __kmp_monitor_stksize = 0; // auto adjust
@@ -395,22 +390,17 @@ int __kmp_env_blocktime = FALSE; /* KMP_BLOCKTIME specified? */
 int __kmp_env_checks = FALSE; /* KMP_CHECKS specified?    */
 int __kmp_env_consistency_check = FALSE; /* KMP_CONSISTENCY_CHECK specified? */
 
+// From KMP_USE_YIELD:
+// 0 = never yield;
+// 1 = always yield (default);
+// 2 = yield only if oversubscribed
+kmp_int32 __kmp_use_yield = 1;
+// This will be 1 if KMP_USE_YIELD environment variable was set explicitly
+kmp_int32 __kmp_use_yield_exp_set = 0;
+
 kmp_uint32 __kmp_yield_init = KMP_INIT_WAIT;
 kmp_uint32 __kmp_yield_next = KMP_NEXT_WAIT;
 
-#if KMP_USE_MONITOR
-kmp_uint32 __kmp_yielding_on = 1;
-#endif
-#if KMP_OS_CNK
-kmp_uint32 __kmp_yield_cycle = 0;
-#else
-kmp_uint32 __kmp_yield_cycle = 1; /* Yield-cycle is on by default */
-#endif
-kmp_int32 __kmp_yield_on_count =
-    10; /* By default, yielding is on for 10 monitor periods. */
-kmp_int32 __kmp_yield_off_count =
-    1; /* By default, yielding is off for 1 monitor periods. */
-
 /* ------------------------------------------------------ */
 /* STATE mostly syncronized with global lock */
 /* data written to rarely by masters, read often by workers */
diff --git a/openmp/runtime/src/kmp_itt.h b/openmp/runtime/src/kmp_itt.h
index 94719f8..b14a193 100644
@@ -219,7 +219,7 @@ __kmp_inline void __kmp_itt_stack_callee_leave(__itt_caller);
    with a delay (and not called at all if waiting time is small). So, in spin
    loops, do not use KMP_FSYNC_PREPARE(), but use KMP_FSYNC_SPIN_INIT() (before
   spin loop), KMP_FSYNC_SPIN_PREPARE() (within the spin loop), and
-   KMP_FSYNC_SPIN_ACQUIRED(). See KMP_WAIT_YIELD() for example. */
+   KMP_FSYNC_SPIN_ACQUIRED(). See KMP_WAIT() for example. */
 
 #undef KMP_FSYNC_SPIN_INIT
 #define KMP_FSYNC_SPIN_INIT(obj, spin)                                         \
diff --git a/openmp/runtime/src/kmp_lock.cpp b/openmp/runtime/src/kmp_lock.cpp
index af91644..78d63c6 100644
@@ -100,23 +100,12 @@ __kmp_acquire_tas_lock_timed_template(kmp_tas_lock_t *lck, kmp_int32 gtid) {
   kmp_uint32 spins;
   KMP_FSYNC_PREPARE(lck);
   KMP_INIT_YIELD(spins);
-  if (TCR_4(__kmp_nth) > (__kmp_avail_proc ? __kmp_avail_proc : __kmp_xproc)) {
-    KMP_YIELD(TRUE);
-  } else {
-    KMP_YIELD_SPIN(spins);
-  }
-
   kmp_backoff_t backoff = __kmp_spin_backoff_params;
-  while (KMP_ATOMIC_LD_RLX(&lck->lk.poll) != tas_free ||
-         !__kmp_atomic_compare_store_acq(&lck->lk.poll, tas_free, tas_busy)) {
+  do {
     __kmp_spin_backoff(&backoff);
-    if (TCR_4(__kmp_nth) >
-        (__kmp_avail_proc ? __kmp_avail_proc : __kmp_xproc)) {
-      KMP_YIELD(TRUE);
-    } else {
-      KMP_YIELD_SPIN(spins);
-    }
-  }
+    KMP_YIELD_OVERSUB_ELSE_SPIN(spins);
+  } while (KMP_ATOMIC_LD_RLX(&lck->lk.poll) != tas_free ||
+           !__kmp_atomic_compare_store_acq(&lck->lk.poll, tas_free, tas_busy));
   KMP_FSYNC_ACQUIRED(lck);
   return KMP_LOCK_ACQUIRED_FIRST;
 }
@@ -169,8 +158,7 @@ int __kmp_release_tas_lock(kmp_tas_lock_t *lck, kmp_int32 gtid) {
   KMP_ATOMIC_ST_REL(&lck->lk.poll, KMP_LOCK_FREE(tas));
   KMP_MB(); /* Flush all pending memory write invalidates.  */
 
-  KMP_YIELD(TCR_4(__kmp_nth) >
-            (__kmp_avail_proc ? __kmp_avail_proc : __kmp_xproc));
+  KMP_YIELD_OVERSUB();
   return KMP_LOCK_RELEASED;
 }
 
@@ -474,8 +462,7 @@ int __kmp_release_futex_lock(kmp_futex_lock_t *lck, kmp_int32 gtid) {
   KA_TRACE(1000, ("__kmp_release_futex_lock: lck:%p(0x%x), T#%d exiting\n", lck,
                   lck->lk.poll, gtid));
 
-  KMP_YIELD(TCR_4(__kmp_nth) >
-            (__kmp_avail_proc ? __kmp_avail_proc : __kmp_xproc));
+  KMP_YIELD_OVERSUB();
   return KMP_LOCK_RELEASED;
 }
 
@@ -651,7 +638,7 @@ __kmp_acquire_ticket_lock_timed_template(kmp_ticket_lock_t *lck,
                                 std::memory_order_acquire) == my_ticket) {
     return KMP_LOCK_ACQUIRED_FIRST;
   }
-  KMP_WAIT_YIELD_PTR(&lck->lk.now_serving, my_ticket, __kmp_bakery_check, lck);
+  KMP_WAIT_PTR(&lck->lk.now_serving, my_ticket, __kmp_bakery_check, lck);
   return KMP_LOCK_ACQUIRED_FIRST;
 }
 
@@ -1249,10 +1236,9 @@ __kmp_acquire_queuing_lock_timed_template(kmp_queuing_lock_t *lck,
                ("__kmp_acquire_queuing_lock: lck:%p, T#%d waiting for lock\n",
                 lck, gtid));
 
-      /* ToDo: May want to consider using __kmp_wait_sleep  or something that
-         sleeps for throughput only here. */
       KMP_MB();
-      KMP_WAIT_YIELD(spin_here_p, FALSE, KMP_EQ, lck);
+      // ToDo: Use __kmp_wait_sleep or similar when blocktime != inf
+      KMP_WAIT(spin_here_p, FALSE, KMP_EQ, lck);
 
 #ifdef DEBUG_QUEUING_LOCKS
       TRACE_LOCK(gtid + 1, "acq spin");
@@ -1282,8 +1268,8 @@ __kmp_acquire_queuing_lock_timed_template(kmp_queuing_lock_t *lck,
     /* Yield if number of threads > number of logical processors */
     /* ToDo: Not sure why this should only be in oversubscription case,
        maybe should be traditional YIELD_INIT/YIELD_WHEN loop */
-    KMP_YIELD(TCR_4(__kmp_nth) >
-              (__kmp_avail_proc ? __kmp_avail_proc : __kmp_xproc));
+    KMP_YIELD_OVERSUB();
+
 #ifdef DEBUG_QUEUING_LOCKS
     TRACE_LOCK(gtid + 1, "acq retry");
 #endif
@@ -1462,8 +1448,8 @@ int __kmp_release_queuing_lock(kmp_queuing_lock_t *lck, kmp_int32 gtid) {
         KMP_MB();
         /* make sure enqueuing thread has time to update next waiting thread
          * field */
-        *head_id_p = KMP_WAIT_YIELD((volatile kmp_uint32 *)waiting_id_p, 0,
-                                    KMP_NEQ, NULL);
+        *head_id_p =
+            KMP_WAIT((volatile kmp_uint32 *)waiting_id_p, 0, KMP_NEQ, NULL);
 #ifdef DEBUG_QUEUING_LOCKS
         TRACE_LOCK(gtid + 1, "rel deq: (h,t)->(h',t)");
 #endif
@@ -2131,7 +2117,7 @@ static void __kmp_acquire_adaptive_lock(kmp_adaptive_lock_t *lck,
       // lock from now on.
       while (!__kmp_is_unlocked_queuing_lock(GET_QLK_PTR(lck))) {
         KMP_INC_STAT(lck, lemmingYields);
-        __kmp_yield(TRUE);
+        KMP_YIELD(TRUE);
       }
 
       if (__kmp_test_adaptive_lock_only(lck, gtid))
@@ -2259,23 +2245,14 @@ __kmp_acquire_drdpa_lock_timed_template(kmp_drdpa_lock_t *lck, kmp_int32 gtid) {
   // polling area has been reconfigured.  Unless it is reconfigured, the
   // reloads stay in L1 cache and are cheap.
   //
-  // Keep this code in sync with KMP_WAIT_YIELD, in kmp_dispatch.cpp !!!
-  //
-  // The current implementation of KMP_WAIT_YIELD doesn't allow for mask
+  // Keep this code in sync with KMP_WAIT, in kmp_dispatch.cpp !!!
+  // The current implementation of KMP_WAIT doesn't allow for mask
   // and poll to be re-read every spin iteration.
   kmp_uint32 spins;
-
   KMP_FSYNC_PREPARE(lck);
   KMP_INIT_YIELD(spins);
   while (polls[ticket & mask] < ticket) { // atomic load
-    // If we are oversubscribed,
-    // or have waited a bit (and KMP_LIBRARY=turnaround), then yield.
-    // CPU Pause is in the macros for yield.
-    //
-    KMP_YIELD(TCR_4(__kmp_nth) >
-              (__kmp_avail_proc ? __kmp_avail_proc : __kmp_xproc));
-    KMP_YIELD_SPIN(spins);
-
+    KMP_YIELD_OVERSUB_ELSE_SPIN(spins);
     // Re-read the mask and the poll pointer from the lock structure.
     //
     // Make certain that "mask" is read before "polls" !!!
@@ -2807,8 +2784,9 @@ static void __kmp_acquire_rtm_lock(kmp_queuing_lock_t *lck, kmp_int32 gtid) {
     }
     if ((status & _XABORT_EXPLICIT) && _XABORT_CODE(status) == 0xff) {
       // Wait until lock becomes free
-      while (!__kmp_is_unlocked_queuing_lock(lck))
-        __kmp_yield(TRUE);
+      while (!__kmp_is_unlocked_queuing_lock(lck)) {
+        KMP_YIELD(TRUE);
+      }
     } else if (!(status & _XABORT_RETRY))
       break;
   } while (retries--);
diff --git a/openmp/runtime/src/kmp_lock.h b/openmp/runtime/src/kmp_lock.h
index 8ce500c..ccd84eb 100644
@@ -652,21 +652,11 @@ extern int (*__kmp_acquire_user_lock_with_checks_)(kmp_user_lock_p lck,
       kmp_uint32 spins;                                                        \
       KMP_FSYNC_PREPARE(lck);                                                  \
       KMP_INIT_YIELD(spins);                                                   \
-      if (TCR_4(__kmp_nth) >                                                   \
-          (__kmp_avail_proc ? __kmp_avail_proc : __kmp_xproc)) {               \
-        KMP_YIELD(TRUE);                                                       \
-      } else {                                                                 \
-        KMP_YIELD_SPIN(spins);                                                 \
-      }                                                                        \
-      while (lck->tas.lk.poll != 0 || !__kmp_atomic_compare_store_acq(         \
-                                          &lck->tas.lk.poll, 0, gtid + 1)) {   \
-        if (TCR_4(__kmp_nth) >                                                 \
-            (__kmp_avail_proc ? __kmp_avail_proc : __kmp_xproc)) {             \
-          KMP_YIELD(TRUE);                                                     \
-        } else {                                                               \
-          KMP_YIELD_SPIN(spins);                                               \
-        }                                                                      \
-      }                                                                        \
+      do {                                                                     \
+        KMP_YIELD_OVERSUB_ELSE_SPIN(spins);                                    \
+      } while (                                                                \
+          lck->tas.lk.poll != 0 ||                                             \
+          !__kmp_atomic_compare_store_acq(&lck->tas.lk.poll, 0, gtid + 1));    \
     }                                                                          \
     KMP_FSYNC_ACQUIRED(lck);                                                   \
   } else {                                                                     \
@@ -770,22 +760,11 @@ extern int (*__kmp_acquire_nested_user_lock_with_checks_)(kmp_user_lock_p lck,
         kmp_uint32 spins;                                                      \
         KMP_FSYNC_PREPARE(lck);                                                \
         KMP_INIT_YIELD(spins);                                                 \
-        if (TCR_4(__kmp_nth) >                                                 \
-            (__kmp_avail_proc ? __kmp_avail_proc : __kmp_xproc)) {             \
-          KMP_YIELD(TRUE);                                                     \
-        } else {                                                               \
-          KMP_YIELD_SPIN(spins);                                               \
-        }                                                                      \
-        while (                                                                \
+        do {                                                                   \
+          KMP_YIELD_OVERSUB_ELSE_SPIN(spins);                                  \
+        } while (                                                              \
             (lck->tas.lk.poll != 0) ||                                         \
-            !__kmp_atomic_compare_store_acq(&lck->tas.lk.poll, 0, gtid + 1)) { \
-          if (TCR_4(__kmp_nth) >                                               \
-              (__kmp_avail_proc ? __kmp_avail_proc : __kmp_xproc)) {           \
-            KMP_YIELD(TRUE);                                                   \
-          } else {                                                             \
-            KMP_YIELD_SPIN(spins);                                             \
-          }                                                                    \
-        }                                                                      \
+            !__kmp_atomic_compare_store_acq(&lck->tas.lk.poll, 0, gtid + 1));  \
       }                                                                        \
       lck->tas.lk.depth_locked = 1;                                            \
       *depth = KMP_LOCK_ACQUIRED_FIRST;                                        \
diff --git a/openmp/runtime/src/kmp_os.h b/openmp/runtime/src/kmp_os.h
index a553463..529c218 100644
@@ -304,7 +304,7 @@ extern "C" {
 
 #define KMP_CACHE_PREFETCH(ADDR) /* nothing */
 
-// Define attribute that indicates that the fall through from the previous 
+// Define attribute that indicates that the fall through from the previous
 // case label is intentional and should not be diagnosed by a compiler
 //   Code from libcxx/include/__config
 // Use a function like macro to imply that it must be followed by a semicolon
@@ -882,8 +882,8 @@ typedef void (*microtask_t)(int *gtid, int *npr, ...);
 #define VOLATILE_CAST(x) (x)
 #endif
 
-#define KMP_WAIT_YIELD __kmp_wait_yield_4
-#define KMP_WAIT_YIELD_PTR __kmp_wait_yield_4_ptr
+#define KMP_WAIT __kmp_wait_4
+#define KMP_WAIT_PTR __kmp_wait_4_ptr
 #define KMP_EQ __kmp_eq_4
 #define KMP_NEQ __kmp_neq_4
 #define KMP_LT __kmp_lt_4
diff --git a/openmp/runtime/src/kmp_runtime.cpp b/openmp/runtime/src/kmp_runtime.cpp
index ead122f..2a9e31d 100644
@@ -327,7 +327,7 @@ void __kmp_infinite_loop(void) {
   static int done = FALSE;
 
   while (!done) {
-    KMP_YIELD(1);
+    KMP_YIELD(TRUE);
   }
 }
 
@@ -672,24 +672,6 @@ BOOL WINAPI DllMain(HINSTANCE hInstDLL, DWORD fdwReason, LPVOID lpReserved) {
 #endif /* KMP_OS_WINDOWS */
 #endif /* KMP_DYNAMIC_LIB */
 
-/* Change the library type to "status" and return the old type */
-/* called from within initialization routines where __kmp_initz_lock is held */
-int __kmp_change_library(int status) {
-  int old_status;
-
-  old_status = __kmp_yield_init &
-               1; // check whether KMP_LIBRARY=throughput (even init count)
-
-  if (status) {
-    __kmp_yield_init |= 1; // throughput => turnaround (odd init count)
-  } else {
-    __kmp_yield_init &= ~1; // turnaround => throughput (even init count)
-  }
-
-  return old_status; // return previous setting of whether
-  // KMP_LIBRARY=throughput
-}
-
 /* __kmp_parallel_deo -- Wait until it's our turn. */
 void __kmp_parallel_deo(int *gtid_ref, int *cid_ref, ident_t *loc_ref) {
   int gtid = *gtid_ref;
@@ -708,8 +690,8 @@ void __kmp_parallel_deo(int *gtid_ref, int *cid_ref, ident_t *loc_ref) {
 #ifdef BUILD_PARALLEL_ORDERED
   if (!team->t.t_serialized) {
     KMP_MB();
-    KMP_WAIT_YIELD(&team->t.t_ordered.dt.t_value, __kmp_tid_from_gtid(gtid),
-                   KMP_EQ, NULL);
+    KMP_WAIT(&team->t.t_ordered.dt.t_value, __kmp_tid_from_gtid(gtid), KMP_EQ,
+             NULL);
     KMP_MB();
   }
 #endif /* BUILD_PARALLEL_ORDERED */
@@ -7735,13 +7717,14 @@ void __kmp_aux_set_library(enum library_type arg) {
   switch (__kmp_library) {
   case library_serial: {
     KMP_INFORM(LibraryIsSerial);
-    (void)__kmp_change_library(TRUE);
   } break;
   case library_turnaround:
-    (void)__kmp_change_library(TRUE);
+    if (__kmp_use_yield == 1 && !__kmp_use_yield_exp_set)
+      __kmp_use_yield = 2; // only yield when oversubscribed
     break;
   case library_throughput:
-    (void)__kmp_change_library(FALSE);
+    if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME)
+      __kmp_dflt_blocktime = 200;
     break;
   default:
     KMP_FATAL(UnknownLibraryType, arg);
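Taken together with the settings changes below, the library modes now
map to yield/blocktime defaults roughly as follows (our summary of this
patch, not text from it):

    /* KMP_LIBRARY=turnaround: if KMP_USE_YIELD was not set explicitly,
       __kmp_use_yield goes 1 -> 2 (yield only when oversubscribed).
       KMP_LIBRARY=throughput: if KMP_BLOCKTIME was not set, the default
       blocktime drops (to 0 via the env parser below, or to 200 ms via
       this kmp_set_library path when it was still "infinite").
       KMP_LIBRARY=serial: unchanged here. */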
diff --git a/openmp/runtime/src/kmp_settings.cpp b/openmp/runtime/src/kmp_settings.cpp
index b2e300f..68e36f5 100644
@@ -629,6 +629,19 @@ static void __kmp_stg_print_teams_thread_limit(kmp_str_buf_t *buffer,
 } // __kmp_stg_print_teams_thread_limit
 
 // -----------------------------------------------------------------------------
+// KMP_USE_YIELD
+static void __kmp_stg_parse_use_yield(char const *name, char const *value,
+                                      void *data) {
+  __kmp_stg_parse_int(name, value, 0, 2, &__kmp_use_yield);
+  __kmp_use_yield_exp_set = 1;
+} // __kmp_stg_parse_use_yield
+
+static void __kmp_stg_print_use_yield(kmp_str_buf_t *buffer, char const *name,
+                                      void *data) {
+  __kmp_stg_print_int(buffer, name, __kmp_use_yield);
+} // __kmp_stg_print_use_yield
+
+// -----------------------------------------------------------------------------
 // KMP_BLOCKTIME
 
 static void __kmp_stg_parse_blocktime(char const *name, char const *value,
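A minimal way to exercise the new knob from a test program; a sketch
that assumes POSIX setenv and that the variable is read when the
runtime initializes at the first OpenMP construct in the process:

    #include <stdlib.h>

    int main(void) {
      /* Must run before the runtime reads its environment, i.e. before
         the first OpenMP construct in this process. */
      setenv("KMP_USE_YIELD", "2", 1); /* yield only when oversubscribed */
    #pragma omp parallel
      { /* runtime initializes here and picks up the setting */ }
      return 0;
    }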
@@ -745,18 +758,24 @@ static void __kmp_stg_parse_wait_policy(char const *name, char const *value,
       __kmp_library = library_serial;
     } else if (__kmp_str_match("throughput", 2, value)) { /* TH */
       __kmp_library = library_throughput;
+      if (blocktime_str == NULL) {
+        // KMP_BLOCKTIME not specified, so set default to 0.
+        __kmp_dflt_blocktime = 0;
+      }
     } else if (__kmp_str_match("turnaround", 2, value)) { /* TU */
       __kmp_library = library_turnaround;
     } else if (__kmp_str_match("dedicated", 1, value)) { /* D */
       __kmp_library = library_turnaround;
     } else if (__kmp_str_match("multiuser", 1, value)) { /* M */
       __kmp_library = library_throughput;
+      if (blocktime_str == NULL) {
+        // KMP_BLOCKTIME not specified, so set default to 0.
+        __kmp_dflt_blocktime = 0;
+      }
     } else {
       KMP_WARNING(StgInvalidValue, name, value);
     }
   }
-  __kmp_aux_set_library(__kmp_library);
-
 } // __kmp_stg_parse_wait_policy
 
 static void __kmp_stg_print_wait_policy(kmp_str_buf_t *buffer, char const *name,
@@ -3944,79 +3963,9 @@ static void __kmp_stg_print_par_range_env(kmp_str_buf_t *buffer,
   }
 } // __kmp_stg_print_par_range_env
 
-// -----------------------------------------------------------------------------
-// KMP_YIELD_CYCLE, KMP_YIELD_ON, KMP_YIELD_OFF
-
-static void __kmp_stg_parse_yield_cycle(char const *name, char const *value,
-                                        void *data) {
-  int flag = __kmp_yield_cycle;
-  __kmp_stg_parse_bool(name, value, &flag);
-  __kmp_yield_cycle = flag;
-} // __kmp_stg_parse_yield_cycle
-
-static void __kmp_stg_print_yield_cycle(kmp_str_buf_t *buffer, char const *name,
-                                        void *data) {
-  __kmp_stg_print_bool(buffer, name, __kmp_yield_cycle);
-} // __kmp_stg_print_yield_cycle
-
-static void __kmp_stg_parse_yield_on(char const *name, char const *value,
-                                     void *data) {
-  __kmp_stg_parse_int(name, value, 2, INT_MAX, &__kmp_yield_on_count);
-} // __kmp_stg_parse_yield_on
-
-static void __kmp_stg_print_yield_on(kmp_str_buf_t *buffer, char const *name,
-                                     void *data) {
-  __kmp_stg_print_int(buffer, name, __kmp_yield_on_count);
-} // __kmp_stg_print_yield_on
-
-static void __kmp_stg_parse_yield_off(char const *name, char const *value,
-                                      void *data) {
-  __kmp_stg_parse_int(name, value, 2, INT_MAX, &__kmp_yield_off_count);
-} // __kmp_stg_parse_yield_off
-
-static void __kmp_stg_print_yield_off(kmp_str_buf_t *buffer, char const *name,
-                                      void *data) {
-  __kmp_stg_print_int(buffer, name, __kmp_yield_off_count);
-} // __kmp_stg_print_yield_off
-
 #endif
 
 // -----------------------------------------------------------------------------
-// KMP_INIT_WAIT, KMP_NEXT_WAIT
-
-static void __kmp_stg_parse_init_wait(char const *name, char const *value,
-                                      void *data) {
-  int wait;
-  KMP_ASSERT((__kmp_init_wait & 1) == 0);
-  wait = __kmp_init_wait / 2;
-  __kmp_stg_parse_int(name, value, KMP_MIN_INIT_WAIT, KMP_MAX_INIT_WAIT, &wait);
-  __kmp_init_wait = wait * 2;
-  KMP_ASSERT((__kmp_init_wait & 1) == 0);
-  __kmp_yield_init = __kmp_init_wait;
-} // __kmp_stg_parse_init_wait
-
-static void __kmp_stg_print_init_wait(kmp_str_buf_t *buffer, char const *name,
-                                      void *data) {
-  __kmp_stg_print_int(buffer, name, __kmp_init_wait);
-} // __kmp_stg_print_init_wait
-
-static void __kmp_stg_parse_next_wait(char const *name, char const *value,
-                                      void *data) {
-  int wait;
-  KMP_ASSERT((__kmp_next_wait & 1) == 0);
-  wait = __kmp_next_wait / 2;
-  __kmp_stg_parse_int(name, value, KMP_MIN_NEXT_WAIT, KMP_MAX_NEXT_WAIT, &wait);
-  __kmp_next_wait = wait * 2;
-  KMP_ASSERT((__kmp_next_wait & 1) == 0);
-  __kmp_yield_next = __kmp_next_wait;
-} // __kmp_stg_parse_next_wait
-
-static void __kmp_stg_print_next_wait(kmp_str_buf_t *buffer, char const *name,
-                                      void *data) {
-  __kmp_stg_print_int(buffer, name, __kmp_next_wait);
-} //__kmp_stg_print_next_wait
-
-// -----------------------------------------------------------------------------
 // KMP_GTID_MODE
 
 static void __kmp_stg_parse_gtid_mode(char const *name, char const *value,
@@ -4726,6 +4675,8 @@ static kmp_setting_t __kmp_stg_table[] = {
     {"KMP_ALL_THREADS", __kmp_stg_parse_device_thread_limit, NULL, NULL, 0, 0},
     {"KMP_BLOCKTIME", __kmp_stg_parse_blocktime, __kmp_stg_print_blocktime,
      NULL, 0, 0},
+    {"KMP_USE_YIELD", __kmp_stg_parse_use_yield, __kmp_stg_print_use_yield,
+     NULL, 0, 0},
     {"KMP_DUPLICATE_LIB_OK", __kmp_stg_parse_duplicate_lib_ok,
      __kmp_stg_print_duplicate_lib_ok, NULL, 0, 0},
     {"KMP_LIBRARY", __kmp_stg_parse_wait_policy, __kmp_stg_print_wait_policy,
@@ -4830,12 +4781,6 @@ static kmp_setting_t __kmp_stg_table[] = {
 
     {"KMP_PAR_RANGE", __kmp_stg_parse_par_range_env,
      __kmp_stg_print_par_range_env, NULL, 0, 0},
-    {"KMP_YIELD_CYCLE", __kmp_stg_parse_yield_cycle,
-     __kmp_stg_print_yield_cycle, NULL, 0, 0},
-    {"KMP_YIELD_ON", __kmp_stg_parse_yield_on, __kmp_stg_print_yield_on, NULL,
-     0, 0},
-    {"KMP_YIELD_OFF", __kmp_stg_parse_yield_off, __kmp_stg_print_yield_off,
-     NULL, 0, 0},
 #endif // KMP_DEBUG
 
     {"KMP_ALIGN_ALLOC", __kmp_stg_parse_align_alloc,
@@ -4927,10 +4872,6 @@ static kmp_setting_t __kmp_stg_table[] = {
 #endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
     {"KMP_MALLOC_POOL_INCR", __kmp_stg_parse_malloc_pool_incr,
      __kmp_stg_print_malloc_pool_incr, NULL, 0, 0},
-    {"KMP_INIT_WAIT", __kmp_stg_parse_init_wait, __kmp_stg_print_init_wait,
-     NULL, 0, 0},
-    {"KMP_NEXT_WAIT", __kmp_stg_parse_next_wait, __kmp_stg_print_next_wait,
-     NULL, 0, 0},
     {"KMP_GTID_MODE", __kmp_stg_parse_gtid_mode, __kmp_stg_print_gtid_mode,
      NULL, 0, 0},
     {"OMP_DYNAMIC", __kmp_stg_parse_omp_dynamic, __kmp_stg_print_omp_dynamic,
diff --git a/openmp/runtime/src/kmp_tasking.cpp b/openmp/runtime/src/kmp_tasking.cpp
index 9e8b22a..f6d6ae1 100644
@@ -2705,8 +2705,7 @@ static inline int __kmp_execute_tasks_template(
       if (thread->th.th_task_team == NULL) {
         break;
       }
-      // Yield before executing next task
-      KMP_YIELD(__kmp_library == library_throughput);
+      KMP_YIELD(__kmp_library == library_throughput); // Yield before next task
       // If execution of a stolen task results in more tasks being placed on our
       // run queue, reset use_own_tasks
       if (!use_own_tasks && TCR_4(threads_data[tid].td.td_deque_ntasks) != 0) {
@@ -3242,10 +3241,8 @@ void __kmp_wait_to_unref_task_teams(void) {
       break;
     }
 
-    // If we are oversubscribed, or have waited a bit (and library mode is
-    // throughput), yield. Pause is in the following code.
-    KMP_YIELD(TCR_4(__kmp_nth) > __kmp_avail_proc);
-    KMP_YIELD_SPIN(spins); // Yields only if KMP_LIBRARY=throughput
+    // If oversubscribed or have waited a bit, yield.
+    KMP_YIELD_OVERSUB_ELSE_SPIN(spins);
   }
 }
 
@@ -3410,7 +3407,7 @@ void __kmp_tasking_barrier(kmp_team_t *team, kmp_info_t *thread, int gtid) {
         __kmp_abort_thread();
       break;
     }
-    KMP_YIELD(TRUE); // GH: We always yield here
+    KMP_YIELD(TRUE);
   }
 #if USE_ITT_BUILD
   KMP_FSYNC_SPIN_ACQUIRED(RCAST(void *, spin));
diff --git a/openmp/runtime/src/kmp_taskq.cpp b/openmp/runtime/src/kmp_taskq.cpp
index 2b01174..442ad4e 100644
@@ -51,7 +51,7 @@ static void __kmp_taskq_eo(int *gtid_ref, int *cid_ref, ident_t *loc_ref) {
 
     taskq = tq->tq_curr_thunk[tid]->th.th_shareds->sv_queue;
 
-    KMP_WAIT_YIELD(&taskq->tq_tasknum_serving, my_token, KMP_EQ, NULL);
+    KMP_WAIT(&taskq->tq_tasknum_serving, my_token, KMP_EQ, NULL);
     KMP_MB();
   }
 }
@@ -95,7 +95,7 @@ static void __kmp_taskq_check_ordered(kmp_int32 gtid, kmpc_thunk_t *thunk) {
   taskq = thunk->th.th_shareds->sv_queue;
 
   if (taskq->tq_tasknum_serving <= my_token) {
-    KMP_WAIT_YIELD(&taskq->tq_tasknum_serving, my_token, KMP_GE, NULL);
+    KMP_WAIT(&taskq->tq_tasknum_serving, my_token, KMP_GE, NULL);
     KMP_MB();
     taskq->tq_tasknum_serving = my_token + 1;
     KMP_MB();
@@ -1056,8 +1056,7 @@ static void __kmp_remove_queue_from_tree(kmp_taskq_t *tq, kmp_int32 global_tid,
     while (queue->tq_ref_count > 1) {
       __kmp_release_lock(&queue->tq.tq_parent->tq_link_lck, global_tid);
 
-      KMP_WAIT_YIELD((volatile kmp_uint32 *)&queue->tq_ref_count, 1, KMP_LE,
-                     NULL);
+      KMP_WAIT((volatile kmp_uint32 *)&queue->tq_ref_count, 1, KMP_LE, NULL);
 
       __kmp_acquire_lock(&queue->tq.tq_parent->tq_link_lck, global_tid);
       // Make sure data structures are in consistent state before querying them
@@ -1538,8 +1537,6 @@ void __kmpc_end_taskq(ident_t *loc, kmp_int32 global_tid,
   in_parallel = (queue->tq_flags & TQF_PARALLEL_CONTEXT);
 
   if (in_parallel) {
-    kmp_uint32 spins;
-
     /* this is just a safeguard to release the waiting threads if */
     /* the outermost taskq never queues a task                    */
 
@@ -1556,12 +1553,10 @@ void __kmpc_end_taskq(ident_t *loc, kmp_int32 global_tid,
 
     do {
       /* wait until something is available to dequeue */
-      KMP_INIT_YIELD(spins);
-
       while ((queue->tq_nfull == 0) && (queue->tq_taskq_slot == NULL) &&
              (!__kmp_taskq_has_any_children(queue)) &&
              (!(queue->tq_flags & TQF_ALL_TASKS_QUEUED))) {
-        KMP_YIELD_WHEN(TRUE, spins);
+        KMP_CPU_PAUSE();
       }
 
       /* check to see if we can execute tasks in the queue */
@@ -1628,7 +1623,6 @@ void __kmpc_end_taskq(ident_t *loc, kmp_int32 global_tid,
 
       /* WAIT until all tasks are finished and no child queues exist before
        * proceeding */
-      KMP_INIT_YIELD(spins);
 
       while (!__kmp_taskq_tasks_finished(queue) ||
              __kmp_taskq_has_any_children(queue)) {
@@ -1643,7 +1637,8 @@ void __kmpc_end_taskq(ident_t *loc, kmp_int32 global_tid,
                                         in_parallel);
         }
 
-        KMP_YIELD_WHEN(thunk == NULL, spins);
+        if (thunk == NULL)
+          KMP_CPU_PAUSE();
 
         __kmp_find_and_remove_finished_child_taskq(tq, global_tid, queue);
       }
@@ -1669,8 +1664,6 @@ void __kmpc_end_taskq(ident_t *loc, kmp_int32 global_tid,
 
     // Outermost Queue: steal work from descendants until all tasks are finished
 
-    KMP_INIT_YIELD(spins);
-
     while (!__kmp_taskq_tasks_finished(queue)) {
       thunk = __kmp_find_task_in_descendant_queue(global_tid, queue);
 
@@ -1683,7 +1676,8 @@ void __kmpc_end_taskq(ident_t *loc, kmp_int32 global_tid,
         __kmp_execute_task_from_queue(tq, loc, global_tid, thunk, in_parallel);
       }
 
-      KMP_YIELD_WHEN(thunk == NULL, spins);
+      if (thunk == NULL)
+        KMP_CPU_PAUSE();
     }
 
     /* Need this barrier to prevent destruction of queue before threads have all
diff --git a/openmp/runtime/src/kmp_wait_release.h b/openmp/runtime/src/kmp_wait_release.h
index 21f3610..d1120d4 100644
@@ -150,8 +150,8 @@ static void __ompt_implicit_task_end(kmp_info_t *this_thr,
 }
 #endif
 
-/* Spin wait loop that first does pause, then yield, then sleep. A thread that
-   calls __kmp_wait_*  must make certain that another thread calls __kmp_release
+/* Spin wait loop that first does pause/yield, then sleep. A thread that calls
+   __kmp_wait_*  must make certain that another thread calls __kmp_release
    to wake it back up to prevent deadlocks!
 
    NOTE: We may not belong to a team at this point.  */
@@ -270,8 +270,7 @@ final_spin=FALSE)
   }
 #endif
 
-  // Setup for waiting
-  KMP_INIT_YIELD(spins);
+  KMP_INIT_YIELD(spins); // Setup for waiting
 
   if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME
 #if OMP_50_ENABLED
@@ -368,14 +367,8 @@ final_spin=FALSE)
 
     // If we are oversubscribed, or have waited a bit (and
     // KMP_LIBRARY=throughput), then yield
-    // TODO: Should it be number of cores instead of thread contexts? Like:
-    // KMP_YIELD(TCR_4(__kmp_nth) > __kmp_ncores);
-    // Need performance improvement data to make the change...
-    if (oversubscribed) {
-      KMP_YIELD(1);
-    } else {
-      KMP_YIELD_SPIN(spins);
-    }
+    KMP_YIELD_OVERSUB_ELSE_SPIN(spins);
+
     // Check if this thread was transferred from a team
     // to the thread pool (or vice-versa) while spinning.
     in_pool = !!TCR_4(this_thr->th.th_in_pool);
diff --git a/openmp/runtime/src/z_Linux_util.cpp b/openmp/runtime/src/z_Linux_util.cpp
index 08b9742..df1c47b 100644
@@ -437,7 +437,7 @@ void __kmp_terminate_thread(int gtid) {
                 __kmp_msg_null);
   }
 #endif
-  __kmp_yield(TRUE);
+  KMP_YIELD(TRUE);
 } //
 
 /* Set thread stack info according to values returned by pthread_getattr_np().
@@ -580,8 +580,6 @@ static void *__kmp_launch_monitor(void *thr) {
   sigset_t new_set;
 #endif /* KMP_BLOCK_SIGNALS */
   struct timespec interval;
-  int yield_count;
-  int yield_cycles = 0;
 
   KMP_MB(); /* Flush all pending memory write invalidates.  */
 
@@ -665,13 +663,6 @@ static void *__kmp_launch_monitor(void *thr) {
 
   KA_TRACE(10, ("__kmp_launch_monitor: #2 monitor\n"));
 
-  if (__kmp_yield_cycle) {
-    __kmp_yielding_on = 0; /* Start out with yielding shut off */
-    yield_count = __kmp_yield_off_count;
-  } else {
-    __kmp_yielding_on = 1; /* Yielding is on permanently */
-  }
-
   while (!TCR_4(__kmp_global.g.g_done)) {
     struct timespec now;
     struct timeval tval;
@@ -707,22 +698,6 @@ static void *__kmp_launch_monitor(void *thr) {
     status = pthread_mutex_unlock(&__kmp_wait_mx.m_mutex);
     KMP_CHECK_SYSFAIL("pthread_mutex_unlock", status);
 
-    if (__kmp_yield_cycle) {
-      yield_cycles++;
-      if ((yield_cycles % yield_count) == 0) {
-        if (__kmp_yielding_on) {
-          __kmp_yielding_on = 0; /* Turn it off now */
-          yield_count = __kmp_yield_off_count;
-        } else {
-          __kmp_yielding_on = 1; /* Turn it on now */
-          yield_count = __kmp_yield_on_count;
-        }
-        yield_cycles = 0;
-      }
-    } else {
-      __kmp_yielding_on = 1;
-    }
-
     TCW_4(__kmp_global.g.g_time.dt.t_value,
           TCR_4(__kmp_global.g.g_time.dt.t_value) + 1);
 
@@ -1011,8 +986,8 @@ retry:
   // Wait for the monitor thread is really started and set its *priority*.
   KMP_DEBUG_ASSERT(sizeof(kmp_uint32) ==
                    sizeof(__kmp_global.g.g_time.dt.t_value));
-  __kmp_wait_yield_4((kmp_uint32 volatile *)&__kmp_global.g.g_time.dt.t_value,
-                     -1, &__kmp_neq_4, NULL);
+  __kmp_wait_4((kmp_uint32 volatile *)&__kmp_global.g.g_time.dt.t_value, -1,
+               &__kmp_neq_4, NULL);
 #endif // KMP_REAL_TIME_FIX
 
 #ifdef KMP_THREAD_ATTR
@@ -1688,18 +1663,7 @@ void __kmp_resume_monitor() {
 }
 #endif // KMP_USE_MONITOR
 
-void __kmp_yield(int cond) {
-  if (!cond)
-    return;
-#if KMP_USE_MONITOR
-  if (!__kmp_yielding_on)
-    return;
-#else
-  if (__kmp_yield_cycle && !KMP_YIELD_NOW())
-    return;
-#endif
-  sched_yield();
-}
+void __kmp_yield() { sched_yield(); }
 
 void __kmp_gtid_set_specific(int gtid) {
   if (__kmp_init_gtid) {
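With the gating moved into the KMP_YIELD* macros via KMP_TRY_YIELD, the
platform layer reduces to the bare system call; the contract change, in
sketch form:

    /* old: callers passed the condition in, and __kmp_yield() also
       consulted the monitor/yield-cycle globals:
         __kmp_yield(TCR_4(__kmp_nth) > __kmp_avail_proc);
       new: callers decide first (KMP_TRY_YIELD / KMP_TRY_YIELD_OVERSUB),
       then call the unconditional primitive:
         __kmp_yield();   sched_yield() on Linux, Sleep(0) on Windows */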
diff --git a/openmp/runtime/src/z_Windows_NT_util.cpp b/openmp/runtime/src/z_Windows_NT_util.cpp
index 038ac86..0049ca8 100644
@@ -483,10 +483,7 @@ void __kmp_resume_oncore(int target_gtid, kmp_flag_oncore *flag) {
   __kmp_resume_template(target_gtid, flag);
 }
 
-void __kmp_yield(int cond) {
-  if (cond)
-    Sleep(0);
-}
+void __kmp_yield() { Sleep(0); }
 
 void __kmp_gtid_set_specific(int gtid) {
   if (__kmp_init_gtid) {
@@ -1245,8 +1242,8 @@ static void __kmp_reap_common(kmp_info_t *th) {
      Right solution seems to be waiting for *either* thread termination *or*
      ds_alive resetting. */
   {
-    // TODO: This code is very similar to KMP_WAIT_YIELD. Need to generalize
-    // KMP_WAIT_YIELD to cover this usage also.
+    // TODO: This code is very similar to KMP_WAIT. Need to generalize
+    // KMP_WAIT to cover this usage also.
     void *obj = NULL;
     kmp_uint32 spins;
 #if USE_ITT_BUILD
@@ -1258,8 +1255,7 @@ static void __kmp_reap_common(kmp_info_t *th) {
       KMP_FSYNC_SPIN_PREPARE(obj);
 #endif /* USE_ITT_BUILD */
       __kmp_is_thread_alive(th, &exit_val);
-      KMP_YIELD(TCR_4(__kmp_nth) > __kmp_avail_proc);
-      KMP_YIELD_SPIN(spins);
+      KMP_YIELD_OVERSUB_ELSE_SPIN(spins);
     } while (exit_val == STILL_ACTIVE && TCR_4(th->th.th_info.ds.ds_alive));
 #if USE_ITT_BUILD
     if (exit_val == STILL_ACTIVE) {