nptl/pthread_cond_wait.c

   1 /* Copyright (C) 2003-2019 Free Software Foundation, Inc.
   2    This file is part of the GNU C Library.
   3    Contributed by Martin Schwidefsky <schwidefsky@de.ibm.com>, 2003.
   4
   5    The GNU C Library is free software; you can redistribute it and/or
   6    modify it under the terms of the GNU Lesser General Public
   7    License as published by the Free Software Foundation; either
   8    version 2.1 of the License, or (at your option) any later version.
   9
  10    The GNU C Library is distributed in the hope that it will be useful,
  11    but WITHOUT ANY WARRANTY; without even the implied warranty of
  12    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  13    Lesser General Public License for more details.
  14
  15    You should have received a copy of the GNU Lesser General Public
  16    License along with the GNU C Library; if not, see
  17    <http://www.gnu.org/licenses/>.  */
  18
  19 #include <endian.h>
  20 #include <errno.h>
  21 #include <sysdep.h>
  22 #include <futex-internal.h>
  23 #include <pthread.h>
  24 #include <pthreadP.h>
  25 #include <sys/time.h>
  26 #include <atomic.h>
  27 #include <stdint.h>
  28 #include <stdbool.h>
  29
  30 #include <shlib-compat.h>
  31 #include <stap-probe.h>
  32 #include <time.h>
  33
  34 #include "pthread_cond_common.c"
  35
  36
  37 struct _condvar_cleanup_buffer
  38 {
  39   uint64_t wseq;
  40   pthread_cond_t *cond;
  41   pthread_mutex_t *mutex;
  42   int private;
  43 };
  44
  45
  46 /* Decrease the waiter reference count.  */
  47 static void
  48 __condvar_confirm_wakeup (pthread_cond_t *cond, int private)
  49 {
  50   /* If destruction is pending (i.e., the wake-request flag is nonzero) and we
  51      are the last waiter (prior value of __wrefs was 1 << 3), then wake any
  52      threads waiting in pthread_cond_destroy.  Release MO to synchronize with
  53      these threads.  Don't bother clearing the wake-up request flag.  */
  54   if ((atomic_fetch_add_release (&cond->__data.__wrefs, -8) >> 2) == 3)
  55     futex_wake (&cond->__data.__wrefs, INT_MAX, private);
  56 }
  57
  58
  59 /* Cancel waiting after having registered as a waiter previously.  SEQ is our
  60    position and G is our group index.
  61    The goal of cancellation is to make our group smaller if that is still
  62    possible.  If we are in a closed group, this is not possible anymore; in
  63    this case, we need to send a replacement signal for the one we effectively
  64    consumed because the signal should have gotten consumed by another waiter
  65    instead; we must not both cancel waiting and consume a signal.
  66
  67    Must not be called while still holding a reference on the group.
  68
  69    Returns true iff we consumed a signal.
  70
  71    On some kind of timeouts, we may be able to pretend that a signal we
  72    effectively consumed happened before the timeout (i.e., similarly to first
  73    spinning on signals before actually checking whether the timeout has
  74    passed already).  Doing this would allow us to skip sending a replacement
  75    signal, but this case might happen rarely because the end of the timeout
  76    must race with someone else sending a signal.  Therefore, we don't bother
  77    trying to optimize this.  */
  78 static void
  79 __condvar_cancel_waiting (pthread_cond_t *cond, uint64_t seq, unsigned int g,
  80                           int private)
  81 {
  82   bool consumed_signal = false;
  83
  84   /* No deadlock with group switching is possible here because we have do
  85      not hold a reference on the group.  */
  86   __condvar_acquire_lock (cond, private);
  87
  88   uint64_t g1_start = __condvar_load_g1_start_relaxed (cond) >> 1;
  89   if (g1_start > seq)
  90     {
  91       /* Our group is closed, so someone provided enough signals for it.
  92          Thus, we effectively consumed a signal.  */
  93       consumed_signal = true;
  94     }
  95   else
  96     {
  97       if (g1_start + __condvar_get_orig_size (cond) <= seq)
  98         {
  99           /* We are in the current G2 and thus cannot have consumed a signal.
 100              Reduce its effective size or handle overflow.  Remember that in
 101              G2, unsigned int size is zero or a negative value.  */
 102           if (cond->__data.__g_size[g] + __PTHREAD_COND_MAX_GROUP_SIZE > 0)
 103             {
 104               cond->__data.__g_size[g]--;
 105             }
 106           else
 107             {
 108               /* Cancellations would overflow the maximum group size.  Just
 109                  wake up everyone spuriously to create a clean state.  This
 110                  also means we do not consume a signal someone else sent.  */
 111               __condvar_release_lock (cond, private);
 112               __pthread_cond_broadcast (cond);
 113               return;
 114             }
 115         }
 116       else
 117         {
 118           /* We are in current G1.  If the group's size is zero, someone put
 119              a signal in the group that nobody else but us can consume.  */
 120           if (cond->__data.__g_size[g] == 0)
 121             consumed_signal = true;
 122           else
 123             {
 124               /* Otherwise, we decrease the size of the group.  This is
 125                  equivalent to atomically putting in a signal just for us and
 126                  consuming it right away.  We do not consume a signal sent
 127                  by someone else.  We also cannot have consumed a futex
 128                  wake-up because if we were cancelled or timed out in a futex
 129                  call, the futex will wake another waiter.  */
 130               cond->__data.__g_size[g]--;
 131             }
 132         }
 133     }
 134
 135   __condvar_release_lock (cond, private);
 136
 137   if (consumed_signal)
 138     {
 139       /* We effectively consumed a signal even though we didn't want to.
 140          Therefore, we need to send a replacement signal.
 141          If we would want to optimize this, we could do what
 142          pthread_cond_signal does right in the critical section above.  */
 143       __pthread_cond_signal (cond);
 144     }
 145 }
 146
 147 /* Wake up any signalers that might be waiting.  */
 148 static void
 149 __condvar_dec_grefs (pthread_cond_t *cond, unsigned int g, int private)
 150 {
 151   /* Release MO to synchronize-with the acquire load in
 152      __condvar_quiesce_and_switch_g1.  */
 153   if (atomic_fetch_add_release (cond->__data.__g_refs + g, -2) == 3)
 154     {
 155       /* Clear the wake-up request flag before waking up.  We do not need more
 156          than relaxed MO and it doesn't matter if we apply this for an aliased
 157          group because we wake all futex waiters right after clearing the
 158          flag.  */
 159       atomic_fetch_and_relaxed (cond->__data.__g_refs + g, ~(unsigned int) 1);
 160       futex_wake (cond->__data.__g_refs + g, INT_MAX, private);
 161     }
 162 }
 163
 164 /* Clean-up for cancellation of waiters waiting for normal signals.  We cancel
 165    our registration as a waiter, confirm we have woken up, and re-acquire the
 166    mutex.  */
 167 static void
 168 __condvar_cleanup_waiting (void *arg)
 169 {
 170   struct _condvar_cleanup_buffer *cbuffer =
 171     (struct _condvar_cleanup_buffer *) arg;
 172   pthread_cond_t *cond = cbuffer->cond;
 173   unsigned g = cbuffer->wseq & 1;
 174
 175   __condvar_dec_grefs (cond, g, cbuffer->private);
 176
 177   __condvar_cancel_waiting (cond, cbuffer->wseq >> 1, g, cbuffer->private);
 178   /* FIXME With the current cancellation implementation, it is possible that
 179      a thread is cancelled after it has returned from a syscall.  This could
 180      result in a cancelled waiter consuming a futex wake-up that is then
 181      causing another waiter in the same group to not wake up.  To work around
 182      this issue until we have fixed cancellation, just add a futex wake-up
 183      conservatively.  */
 184   futex_wake (cond->__data.__g_signals + g, 1, cbuffer->private);
 185
 186   __condvar_confirm_wakeup (cond, cbuffer->private);
 187
 188   /* XXX If locking the mutex fails, should we just stop execution?  This
 189      might be better than silently ignoring the error.  */
 190   __pthread_mutex_cond_lock (cbuffer->mutex);
 191 }
 192
 193 /* This condvar implementation guarantees that all calls to signal and
 194    broadcast and all of the three virtually atomic parts of each call to wait
 195    (i.e., (1) releasing the mutex and blocking, (2) unblocking, and (3) re-
 196    acquiring the mutex) happen in some total order that is consistent with the
 197    happens-before relations in the calling program.  However, this order does
 198    not necessarily result in additional happens-before relations being
 199    established (which aligns well with spurious wake-ups being allowed).
 200
 201    All waiters acquire a certain position in a 64b waiter sequence (__wseq).
 202    This sequence determines which waiters are allowed to consume signals.
 203    A broadcast is equal to sending as many signals as are unblocked waiters.
 204    When a signal arrives, it samples the current value of __wseq with a
 205    relaxed-MO load (i.e., the position the next waiter would get).  (This is
 206    sufficient because it is consistent with happens-before; the caller can
 207    enforce stronger ordering constraints by calling signal while holding the
 208    mutex.)  Only waiters with a position less than the __wseq value observed
 209    by the signal are eligible to consume this signal.
 210
 211    This would be straight-forward to implement if waiters would just spin but
 212    we need to let them block using futexes.  Futexes give no guarantee of
 213    waking in FIFO order, so we cannot reliably wake eligible waiters if we
 214    just use a single futex.  Also, futex words are 32b in size, but we need
 215    to distinguish more than 1<<32 states because we need to represent the
 216    order of wake-up (and thus which waiters are eligible to consume signals);
 217    blocking in a futex is not atomic with a waiter determining its position in
 218    the waiter sequence, so we need the futex word to reliably notify waiters
 219    that they should not attempt to block anymore because they have been
 220    already signaled in the meantime.  While an ABA issue on a 32b value will
 221    be rare, ignoring it when we are aware of it is not the right thing to do
 222    either.
 223
 224    Therefore, we use a 64b counter to represent the waiter sequence (on
 225    architectures which only support 32b atomics, we use a few bits less).
 226    To deal with the blocking using futexes, we maintain two groups of waiters:
 227    * Group G1 consists of waiters that are all eligible to consume signals;
 228      incoming signals will always signal waiters in this group until all
 229      waiters in G1 have been signaled.
 230    * Group G2 consists of waiters that arrive when a G1 is present and still
 231      contains waiters that have not been signaled.  When all waiters in G1
 232      are signaled and a new signal arrives, the new signal will convert G2
 233      into the new G1 and create a new G2 for future waiters.
 234
 235    We cannot allocate new memory because of process-shared condvars, so we
 236    have just two slots of groups that change their role between G1 and G2.
 237    Each has a separate futex word, a number of signals available for
 238    consumption, a size (number of waiters in the group that have not been
 239    signaled), and a reference count.
 240
 241    The group reference count is used to maintain the number of waiters that
 242    are using the group's futex.  Before a group can change its role, the
 243    reference count must show that no waiters are using the futex anymore; this
 244    prevents ABA issues on the futex word.
 245
 246    To represent which intervals in the waiter sequence the groups cover (and
 247    thus also which group slot contains G1 or G2), we use a 64b counter to
 248    designate the start position of G1 (inclusive), and a single bit in the
 249    waiter sequence counter to represent which group slot currently contains
 250    G2.  This allows us to switch group roles atomically wrt. waiters obtaining
 251    a position in the waiter sequence.  The G1 start position allows waiters to
 252    figure out whether they are in a group that has already been completely
 253    signaled (i.e., if the current G1 starts at a later position that the
 254    waiter's position).  Waiters cannot determine whether they are currently
 255    in G2 or G1 -- but they do not have too because all they are interested in
 256    is whether there are available signals, and they always start in G2 (whose
 257    group slot they know because of the bit in the waiter sequence.  Signalers
 258    will simply fill the right group until it is completely signaled and can
 259    be closed (they do not switch group roles until they really have to to
 260    decrease the likelihood of having to wait for waiters still holding a
 261    reference on the now-closed G1).
 262
 263    Signalers maintain the initial size of G1 to be able to determine where
 264    G2 starts (G2 is always open-ended until it becomes G1).  They track the
 265    remaining size of a group; when waiters cancel waiting (due to PThreads
 266    cancellation or timeouts), they will decrease this remaining size as well.
 267
 268    To implement condvar destruction requirements (i.e., that
 269    pthread_cond_destroy can be called as soon as all waiters have been
 270    signaled), waiters increment a reference count before starting to wait and
 271    decrement it after they stopped waiting but right before they acquire the
 272    mutex associated with the condvar.
 273
 274    pthread_cond_t thus consists of the following (bits that are used for
 275    flags and are not part of the primary value of each field but necessary
 276    to make some things atomic or because there was no space for them
 277    elsewhere in the data structure):
 278
 279    __wseq: Waiter sequence counter
 280      * LSB is index of current G2.
 281      * Waiters fetch-add while having acquire the mutex associated with the
 282        condvar.  Signalers load it and fetch-xor it concurrently.
 283    __g1_start: Starting position of G1 (inclusive)
 284      * LSB is index of current G2.
 285      * Modified by signalers while having acquired the condvar-internal lock
 286        and observed concurrently by waiters.
 287    __g1_orig_size: Initial size of G1
 288      * The two least-significant bits represent the condvar-internal lock.
 289      * Only accessed while having acquired the condvar-internal lock.
 290    __wrefs: Waiter reference counter.
 291      * Bit 2 is true if waiters should run futex_wake when they remove the
 292        last reference.  pthread_cond_destroy uses this as futex word.
 293      * Bit 1 is the clock ID (0 == CLOCK_REALTIME, 1 == CLOCK_MONOTONIC).
 294      * Bit 0 is true iff this is a process-shared condvar.
 295      * Simple reference count used by both waiters and pthread_cond_destroy.
 296      (If the format of __wrefs is changed, update nptl_lock_constants.pysym
 297       and the pretty printers.)
 298    For each of the two groups, we have:
 299    __g_refs: Futex waiter reference count.
 300      * LSB is true if waiters should run futex_wake when they remove the
 301        last reference.
 302      * Reference count used by waiters concurrently with signalers that have
 303        acquired the condvar-internal lock.
 304    __g_signals: The number of signals that can still be consumed.
 305      * Used as a futex word by waiters.  Used concurrently by waiters and
 306        signalers.
 307      * LSB is true iff this group has been completely signaled (i.e., it is
 308        closed).
 309    __g_size: Waiters remaining in this group (i.e., which have not been
 310      signaled yet.
 311      * Accessed by signalers and waiters that cancel waiting (both do so only
 312        when having acquired the condvar-internal lock.
 313      * The size of G2 is always zero because it cannot be determined until
 314        the group becomes G1.
 315      * Although this is of unsigned type, we rely on using unsigned overflow
 316        rules to make this hold effectively negative values too (in
 317        particular, when waiters in G2 cancel waiting).
 318
 319    A PTHREAD_COND_INITIALIZER condvar has all fields set to zero, which yields
 320    a condvar that has G2 starting at position 0 and a G1 that is closed.
 321
 322    Because waiters do not claim ownership of a group right when obtaining a
 323    position in __wseq but only reference count the group when using futexes
 324    to block, it can happen that a group gets closed before a waiter can
 325    increment the reference count.  Therefore, waiters have to check whether
 326    their group is already closed using __g1_start.  They also have to perform
 327    this check when spinning when trying to grab a signal from __g_signals.
 328    Note that for these checks, using relaxed MO to load __g1_start is
 329    sufficient because if a waiter can see a sufficiently large value, it could
 330    have also consume a signal in the waiters group.
 331
 332    Waiters try to grab a signal from __g_signals without holding a reference
 333    count, which can lead to stealing a signal from a more recent group after
 334    their own group was already closed.  They cannot always detect whether they
 335    in fact did because they do not know when they stole, but they can
 336    conservatively add a signal back to the group they stole from; if they
 337    did so unnecessarily, all that happens is a spurious wake-up.  To make this
 338    even less likely, __g1_start contains the index of the current g2 too,
 339    which allows waiters to check if there aliasing on the group slots; if
 340    there wasn't, they didn't steal from the current G1, which means that the
 341    G1 they stole from must have been already closed and they do not need to
 342    fix anything.
 343
 344    It is essential that the last field in pthread_cond_t is __g_signals[1]:
 345    The previous condvar used a pointer-sized field in pthread_cond_t, so a
 346    PTHREAD_COND_INITIALIZER from that condvar implementation might only
 347    initialize 4 bytes to zero instead of the 8 bytes we need (i.e., 44 bytes
 348    in total instead of the 48 we need).  __g_signals[1] is not accessed before
 349    the first group switch (G2 starts at index 0), which will set its value to
 350    zero after a harmless fetch-or whose return value is ignored.  This
 351    effectively completes initialization.
 352
 353
 354    Limitations:
 355    * This condvar isn't designed to allow for more than
 356      __PTHREAD_COND_MAX_GROUP_SIZE * (1 << 31) calls to __pthread_cond_wait.
 357    * More than __PTHREAD_COND_MAX_GROUP_SIZE concurrent waiters are not
 358      supported.
 359    * Beyond what is allowed as errors by POSIX or documented, we can also
 360      return the following errors:
 361      * EPERM if MUTEX is a recursive mutex and the caller doesn't own it.
 362      * EOWNERDEAD or ENOTRECOVERABLE when using robust mutexes.  Unlike
 363        for other errors, this can happen when we re-acquire the mutex; this
 364        isn't allowed by POSIX (which requires all errors to virtually happen
 365        before we release the mutex or change the condvar state), but there's
 366        nothing we can do really.
 367      * When using PTHREAD_MUTEX_PP_* mutexes, we can also return all errors
 368        returned by __pthread_tpp_change_priority.  We will already have
 369        released the mutex in such cases, so the caller cannot expect to own
 370        MUTEX.
 371
 372    Other notes:
 373    * Instead of the normal mutex unlock / lock functions, we use
 374      __pthread_mutex_unlock_usercnt(m, 0) / __pthread_mutex_cond_lock(m)
 375      because those will not change the mutex-internal users count, so that it
 376      can be detected when a condvar is still associated with a particular
 377      mutex because there is a waiter blocked on this condvar using this mutex.
 378 */
 379 static __always_inline int
 380 __pthread_cond_wait_common (pthread_cond_t *cond, pthread_mutex_t *mutex,
 381     clockid_t clockid,
 382     const struct timespec *abstime)
 383 {
 384   const int maxspin = 0;
 385   int err;
 386   int result = 0;
 387
 388   LIBC_PROBE (cond_wait, 2, cond, mutex);
 389
 390   /* clockid will already have been checked by
 391      __pthread_cond_clockwait or pthread_condattr_setclock, or we
 392      don't use it if abstime is NULL, so we don't need to check it
 393      here. */
 394
 395   /* Acquire a position (SEQ) in the waiter sequence (WSEQ).  We use an
 396      atomic operation because signals and broadcasts may update the group
 397      switch without acquiring the mutex.  We do not need release MO here
 398      because we do not need to establish any happens-before relation with
 399      signalers (see __pthread_cond_signal); modification order alone
 400      establishes a total order of waiters/signals.  We do need acquire MO
 401      to synchronize with group reinitialization in
 402      __condvar_quiesce_and_switch_g1.  */
 403   uint64_t wseq = __condvar_fetch_add_wseq_acquire (cond, 2);
 404   /* Find our group's index.  We always go into what was G2 when we acquired
 405      our position.  */
 406   unsigned int g = wseq & 1;
 407   uint64_t seq = wseq >> 1;
 408
 409   /* Increase the waiter reference count.  Relaxed MO is sufficient because
 410      we only need to synchronize when decrementing the reference count.  */
 411   unsigned int flags = atomic_fetch_add_relaxed (&cond->__data.__wrefs, 8);
 412   int private = __condvar_get_private (flags);
 413
 414   /* Now that we are registered as a waiter, we can release the mutex.
 415      Waiting on the condvar must be atomic with releasing the mutex, so if
 416      the mutex is used to establish a happens-before relation with any
 417      signaler, the waiter must be visible to the latter; thus, we release the
 418      mutex after registering as waiter.
 419      If releasing the mutex fails, we just cancel our registration as a
 420      waiter and confirm that we have woken up.  */
 421   err = __pthread_mutex_unlock_usercnt (mutex, 0);
 422   if (__glibc_unlikely (err != 0))
 423     {
 424       __condvar_cancel_waiting (cond, seq, g, private);
 425       __condvar_confirm_wakeup (cond, private);
 426       return err;
 427     }
 428
 429   /* Now wait until a signal is available in our group or it is closed.
 430      Acquire MO so that if we observe a value of zero written after group
 431      switching in __condvar_quiesce_and_switch_g1, we synchronize with that
 432      store and will see the prior update of __g1_start done while switching
 433      groups too.  */
 434   unsigned int signals = atomic_load_acquire (cond->__data.__g_signals + g);
 435
 436   do
 437     {
 438       while (1)
 439         {
 440           /* Spin-wait first.
 441              Note that spinning first without checking whether a timeout
 442              passed might lead to what looks like a spurious wake-up even
 443              though we should return ETIMEDOUT (e.g., if the caller provides
 444              an absolute timeout that is clearly in the past).  However,
 445              (1) spurious wake-ups are allowed, (2) it seems unlikely that a
 446              user will (ab)use pthread_cond_wait as a check for whether a
 447              point in time is in the past, and (3) spinning first without
 448              having to compare against the current time seems to be the right
 449              choice from a performance perspective for most use cases.  */
 450           unsigned int spin = maxspin;
 451           while (signals == 0 && spin > 0)
 452             {
 453               /* Check that we are not spinning on a group that's already
 454                  closed.  */
 455               if (seq < (__condvar_load_g1_start_relaxed (cond) >> 1))
 456                 goto done;
 457
 458               /* TODO Back off.  */
 459
 460               /* Reload signals.  See above for MO.  */
 461               signals = atomic_load_acquire (cond->__data.__g_signals + g);
 462               spin--;
 463             }
 464
 465           /* If our group will be closed as indicated by the flag on signals,
 466              don't bother grabbing a signal.  */
 467           if (signals & 1)
 468             goto done;
 469
 470           /* If there is an available signal, don't block.  */
 471           if (signals != 0)
 472             break;
 473
 474           /* No signals available after spinning, so prepare to block.
 475              We first acquire a group reference and use acquire MO for that so
 476              that we synchronize with the dummy read-modify-write in
 477              __condvar_quiesce_and_switch_g1 if we read from that.  In turn,
 478              in this case this will make us see the closed flag on __g_signals
 479              that designates a concurrent attempt to reuse the group's slot.
 480              We use acquire MO for the __g_signals check to make the
 481              __g1_start check work (see spinning above).
 482              Note that the group reference acquisition will not mask the
 483              release MO when decrementing the reference count because we use
 484              an atomic read-modify-write operation and thus extend the release
 485              sequence.  */
 486           atomic_fetch_add_acquire (cond->__data.__g_refs + g, 2);
 487           if (((atomic_load_acquire (cond->__data.__g_signals + g) & 1) != 0)
 488               || (seq < (__condvar_load_g1_start_relaxed (cond) >> 1)))
 489             {
 490               /* Our group is closed.  Wake up any signalers that might be
 491                  waiting.  */
 492               __condvar_dec_grefs (cond, g, private);
 493               goto done;
 494             }
 495
 496           // Now block.
 497           struct _pthread_cleanup_buffer buffer;
 498           struct _condvar_cleanup_buffer cbuffer;
 499           cbuffer.wseq = wseq;
 500           cbuffer.cond = cond;
 501           cbuffer.mutex = mutex;
 502           cbuffer.private = private;
 503           __pthread_cleanup_push (&buffer, __condvar_cleanup_waiting, &cbuffer);
 504
 505           if (abstime == NULL)
 506             {
 507               /* Block without a timeout.  */
 508               err = futex_wait_cancelable (
 509                   cond->__data.__g_signals + g, 0, private);
 510             }
 511           else
 512             {
 513               /* Block, but with a timeout.
 514                  Work around the fact that the kernel rejects negative timeout
 515                  values despite them being valid.  */
 516               if (__glibc_unlikely (abstime->tv_sec < 0))
 517                 err = ETIMEDOUT;
 518               else
 519                 {
 520                   err = futex_abstimed_wait_cancelable
 521                     (cond->__data.__g_signals + g, 0, clockid, abstime,
 522                      private);
 523                 }
 524             }
 525
 526           __pthread_cleanup_pop (&buffer, 0);
 527
 528           if (__glibc_unlikely (err == ETIMEDOUT))
 529             {
 530               __condvar_dec_grefs (cond, g, private);
 531               /* If we timed out, we effectively cancel waiting.  Note that
 532                  we have decremented __g_refs before cancellation, so that a
 533                  deadlock between waiting for quiescence of our group in
 534                  __condvar_quiesce_and_switch_g1 and us trying to acquire
 535                  the lock during cancellation is not possible.  */
 536               __condvar_cancel_waiting (cond, seq, g, private);
 537               result = ETIMEDOUT;
 538               goto done;
 539             }
 540           else
 541             __condvar_dec_grefs (cond, g, private);
 542
 543           /* Reload signals.  See above for MO.  */
 544           signals = atomic_load_acquire (cond->__data.__g_signals + g);
 545         }
 546
 547     }
 548   /* Try to grab a signal.  Use acquire MO so that we see an up-to-date value
 549      of __g1_start below (see spinning above for a similar case).  In
 550      particular, if we steal from a more recent group, we will also see a
 551      more recent __g1_start below.  */
 552   while (!atomic_compare_exchange_weak_acquire (cond->__data.__g_signals + g,
 553                                                 &signals, signals - 2));
 554
 555   /* We consumed a signal but we could have consumed from a more recent group
 556      that aliased with ours due to being in the same group slot.  If this
 557      might be the case our group must be closed as visible through
 558      __g1_start.  */
 559   uint64_t g1_start = __condvar_load_g1_start_relaxed (cond);
 560   if (seq < (g1_start >> 1))
 561     {
 562       /* We potentially stole a signal from a more recent group but we do not
 563          know which group we really consumed from.
 564          We do not care about groups older than current G1 because they are
 565          closed; we could have stolen from these, but then we just add a
 566          spurious wake-up for the current groups.
 567          We will never steal a signal from current G2 that was really intended
 568          for G2 because G2 never receives signals (until it becomes G1).  We
 569          could have stolen a signal from G2 that was conservatively added by a
 570          previous waiter that also thought it stole a signal -- but given that
 571          that signal was added unnecessarily, it's not a problem if we steal
 572          it.
 573          Thus, the remaining case is that we could have stolen from the current
 574          G1, where "current" means the __g1_start value we observed.  However,
 575          if the current G1 does not have the same slot index as we do, we did
 576          not steal from it and do not need to undo that.  This is the reason
 577          for putting a bit with G2's index into__g1_start as well.  */
 578       if (((g1_start & 1) ^ 1) == g)
 579         {
 580           /* We have to conservatively undo our potential mistake of stealing
 581              a signal.  We can stop trying to do that when the current G1
 582              changes because other spinning waiters will notice this too and
 583              __condvar_quiesce_and_switch_g1 has checked that there are no
 584              futex waiters anymore before switching G1.
 585              Relaxed MO is fine for the __g1_start load because we need to
 586              merely be able to observe this fact and not have to observe
 587              something else as well.
 588              ??? Would it help to spin for a little while to see whether the
 589              current G1 gets closed?  This might be worthwhile if the group is
 590              small or close to being closed.  */
 591           unsigned int s = atomic_load_relaxed (cond->__data.__g_signals + g);
 592           while (__condvar_load_g1_start_relaxed (cond) == g1_start)
 593             {
 594               /* Try to add a signal.  We don't need to acquire the lock
 595                  because at worst we can cause a spurious wake-up.  If the
 596                  group is in the process of being closed (LSB is true), this
 597                  has an effect similar to us adding a signal.  */
 598               if (((s & 1) != 0)
 599                   || atomic_compare_exchange_weak_relaxed
 600                        (cond->__data.__g_signals + g, &s, s + 2))
 601                 {
 602                   /* If we added a signal, we also need to add a wake-up on
 603                      the futex.  We also need to do that if we skipped adding
 604                      a signal because the group is being closed because
 605                      while __condvar_quiesce_and_switch_g1 could have closed
 606                      the group, it might stil be waiting for futex waiters to
 607                      leave (and one of those waiters might be the one we stole
 608                      the signal from, which cause it to block using the
 609                      futex).  */
 610                   futex_wake (cond->__data.__g_signals + g, 1, private);
 611                   break;
 612                 }
 613               /* TODO Back off.  */
 614             }
 615         }
 616     }
 617
 618  done:
 619
 620   /* Confirm that we have been woken.  We do that before acquiring the mutex
 621      to allow for execution of pthread_cond_destroy while having acquired the
 622      mutex.  */
 623   __condvar_confirm_wakeup (cond, private);
 624
 625   /* Woken up; now re-acquire the mutex.  If this doesn't fail, return RESULT,
 626      which is set to ETIMEDOUT if a timeout occured, or zero otherwise.  */
 627   err = __pthread_mutex_cond_lock (mutex);
 628   /* XXX Abort on errors that are disallowed by POSIX?  */
 629   return (err != 0) ? err : result;
 630 }
 631
 632
 633 /* See __pthread_cond_wait_common.  */
 634 int
 635 __pthread_cond_wait (pthread_cond_t *cond, pthread_mutex_t *mutex)
 636 {
 637   /* clockid is unused when abstime is NULL. */
 638   return __pthread_cond_wait_common (cond, mutex, 0, NULL);
 639 }
 640
 641 /* See __pthread_cond_wait_common.  */
 642 int
 643 __pthread_cond_timedwait (pthread_cond_t *cond, pthread_mutex_t *mutex,
 644     const struct timespec *abstime)
 645 {
 646   /* Check parameter validity.  This should also tell the compiler that
 647      it can assume that abstime is not NULL.  */
 648   if (abstime->tv_nsec < 0 || abstime->tv_nsec >= 1000000000)
 649     return EINVAL;
 650
 651   /* Relaxed MO is suffice because clock ID bit is only modified
 652      in condition creation.  */
 653   unsigned int flags = atomic_load_relaxed (&cond->__data.__wrefs);
 654   clockid_t clockid = (flags & __PTHREAD_COND_CLOCK_MONOTONIC_MASK)
 655                     ? CLOCK_MONOTONIC : CLOCK_REALTIME;
 656   return __pthread_cond_wait_common (cond, mutex, clockid, abstime);
 657 }
 658
 659 /* See __pthread_cond_wait_common.  */
 660 int
 661 __pthread_cond_clockwait (pthread_cond_t *cond, pthread_mutex_t *mutex,
 662                           clockid_t clockid,
 663                           const struct timespec *abstime)
 664 {
 665   /* Check parameter validity.  This should also tell the compiler that
 666      it can assume that abstime is not NULL.  */
 667   if (abstime->tv_nsec < 0 || abstime->tv_nsec >= 1000000000)
 668     return EINVAL;
 669
 670   if (!futex_abstimed_supported_clockid (clockid))
 671     return EINVAL;
 672
 673   /* If we do not support waiting using CLOCK_MONOTONIC, return an error.  */
 674   if (clockid == CLOCK_MONOTONIC
 675       && !futex_supports_exact_relative_timeouts ())
 676     return EINVAL;
 677
 678   return __pthread_cond_wait_common (cond, mutex, clockid, abstime);
 679 }
 680
 681 versioned_symbol (libpthread, __pthread_cond_wait, pthread_cond_wait,
 682                   GLIBC_2_3_2);
 683 versioned_symbol (libpthread, __pthread_cond_timedwait, pthread_cond_timedwait,
 684                   GLIBC_2_3_2);
 685 weak_alias (__pthread_cond_clockwait, pthread_cond_clockwait);