(root)/
glibc-2.38/
nptl/
pthread_cond_wait.c
       1  /* Copyright (C) 2003-2023 Free Software Foundation, Inc.
       2     This file is part of the GNU C Library.
       3  
       4     The GNU C Library is free software; you can redistribute it and/or
       5     modify it under the terms of the GNU Lesser General Public
       6     License as published by the Free Software Foundation; either
       7     version 2.1 of the License, or (at your option) any later version.
       8  
       9     The GNU C Library is distributed in the hope that it will be useful,
      10     but WITHOUT ANY WARRANTY; without even the implied warranty of
      11     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.	 See the GNU
      12     Lesser General Public License for more details.
      13  
      14     You should have received a copy of the GNU Lesser General Public
      15     License along with the GNU C Library; if not, see
      16     <https://www.gnu.org/licenses/>.  */
      17  
      18  #include <endian.h>
      19  #include <errno.h>
      20  #include <sysdep.h>
      21  #include <futex-internal.h>
      22  #include <pthread.h>
      23  #include <pthreadP.h>
      24  #include <sys/time.h>
      25  #include <atomic.h>
      26  #include <stdint.h>
      27  #include <stdbool.h>
      28  
      29  #include <shlib-compat.h>
      30  #include <stap-probe.h>
      31  #include <time.h>
      32  
      33  #include "pthread_cond_common.c"
      34  
      35  
      36  struct _condvar_cleanup_buffer
      37  {
      38    uint64_t wseq;
      39    pthread_cond_t *cond;
      40    pthread_mutex_t *mutex;
      41    int private;
      42  };
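/* Illustrative note (not part of the implementation): the WSEQ member above
   stores the waiter's full sample of __wseq, which packs two pieces of
   information.  The least-significant bit is the index of the group slot
   that was G2 when the waiter registered, and the remaining bits are the
   waiter's position in the waiter sequence.  For example, a sampled value
   of 11 (binary 1011) denotes group slot 1 and sequence position
   11 >> 1 == 5, which is how __condvar_cleanup_waiting decodes it below.  */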
      43  
      44  
      45  /* Decrease the waiter reference count.  */
      46  static void
      47  __condvar_confirm_wakeup (pthread_cond_t *cond, int private)
      48  {
      49    /* If destruction is pending (i.e., the wake-request flag is nonzero) and we
      50       are the last waiter (prior value of __wrefs was 1 << 3), then wake any
      51       threads waiting in pthread_cond_destroy.  Release MO to synchronize with
      52       these threads.  Don't bother clearing the wake-up request flag.  */
      53    if ((atomic_fetch_add_release (&cond->__data.__wrefs, -8) >> 2) == 3)
      54      futex_wake (&cond->__data.__wrefs, INT_MAX, private);
      55  }
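/* Illustrative note (not part of the implementation): __wrefs keeps the
   waiter reference count in the bits above the three flag bits, so one
   waiter contributes a value of 8.  If the last remaining waiter leaves
   while pthread_cond_destroy has set the wake-request flag (bit 2), the
   value before the decrement is (1 << 3) | 4 == 12 (any clock or
   process-shared flag bits are discarded by the >> 2), so the prior value
   shifted right by 2 equals 3 and the futex_wake above runs.  Without a
   pending destroy the prior value would be 8, 8 >> 2 == 2, and no wake-up
   is issued.  */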
      56  
      57  
      58  /* Cancel waiting after having registered as a waiter previously.  SEQ is our
      59     position and G is our group index.
      60     The goal of cancellation is to make our group smaller if that is still
      61     possible.  If we are in a closed group, this is not possible anymore; in
      62     this case, we need to send a replacement signal for the one we effectively
      63     consumed because the signal should have gotten consumed by another waiter
      64     instead; we must not both cancel waiting and consume a signal.
      65  
      66     Must not be called while still holding a reference on the group.
      67  
      68     Returns true iff we consumed a signal.
      69  
       70     On some kinds of timeouts, we may be able to pretend that a signal we
      71     effectively consumed happened before the timeout (i.e., similarly to first
      72     spinning on signals before actually checking whether the timeout has
      73     passed already).  Doing this would allow us to skip sending a replacement
      74     signal, but this case might happen rarely because the end of the timeout
      75     must race with someone else sending a signal.  Therefore, we don't bother
      76     trying to optimize this.  */
      77  static void
      78  __condvar_cancel_waiting (pthread_cond_t *cond, uint64_t seq, unsigned int g,
      79  			  int private)
      80  {
      81    bool consumed_signal = false;
      82  
      83    /* No deadlock with group switching is possible here because we do
      84       not hold a reference on the group.  */
      85    __condvar_acquire_lock (cond, private);
      86  
      87    uint64_t g1_start = __condvar_load_g1_start_relaxed (cond) >> 1;
      88    if (g1_start > seq)
      89      {
      90        /* Our group is closed, so someone provided enough signals for it.
      91  	 Thus, we effectively consumed a signal.  */
      92        consumed_signal = true;
      93      }
      94    else
      95      {
      96        if (g1_start + __condvar_get_orig_size (cond) <= seq)
      97  	{
      98  	  /* We are in the current G2 and thus cannot have consumed a signal.
      99  	     Reduce its effective size or handle overflow.  Remember that in
      100  	     G2, the unsigned size is zero or an effectively negative value.  */
     101  	  if (cond->__data.__g_size[g] + __PTHREAD_COND_MAX_GROUP_SIZE > 0)
     102  	    {
     103  	      cond->__data.__g_size[g]--;
     104  	    }
     105  	  else
     106  	    {
     107  	      /* Cancellations would overflow the maximum group size.  Just
     108  		 wake up everyone spuriously to create a clean state.  This
     109  		 also means we do not consume a signal someone else sent.  */
     110  	      __condvar_release_lock (cond, private);
     111  	      __pthread_cond_broadcast (cond);
     112  	      return;
     113  	    }
     114  	}
     115        else
     116  	{
      117  	  /* We are in the current G1.  If the group's size is zero, someone put
     118  	     a signal in the group that nobody else but us can consume.  */
     119  	  if (cond->__data.__g_size[g] == 0)
     120  	    consumed_signal = true;
     121  	  else
     122  	    {
     123  	      /* Otherwise, we decrease the size of the group.  This is
     124  		 equivalent to atomically putting in a signal just for us and
     125  		 consuming it right away.  We do not consume a signal sent
     126  		 by someone else.  We also cannot have consumed a futex
     127  		 wake-up because if we were cancelled or timed out in a futex
     128  		 call, the futex will wake another waiter.  */
     129  	      cond->__data.__g_size[g]--;
     130  	    }
     131  	}
     132      }
     133  
     134    __condvar_release_lock (cond, private);
     135  
     136    if (consumed_signal)
     137      {
     138        /* We effectively consumed a signal even though we didn't want to.
     139  	 Therefore, we need to send a replacement signal.
     140  	 If we would want to optimize this, we could do what
     141  	 pthread_cond_signal does right in the critical section above.  */
     142        __pthread_cond_signal (cond);
     143      }
     144  }
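/* Illustrative note (not part of the implementation): a worked case of the
   first branch above.  Suppose our position SEQ is 10 and a concurrent
   broadcast has advanced __g1_start so that __g1_start >> 1 == 12.  Our
   group is then already closed, which means enough signals were provided
   for it and we effectively consumed one even though we are cancelling;
   the call to __pthread_cond_signal above therefore puts a replacement
   signal back so that another eligible waiter still gets woken.  */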
     145  
     146  /* Wake up any signalers that might be waiting.  */
     147  static void
     148  __condvar_dec_grefs (pthread_cond_t *cond, unsigned int g, int private)
     149  {
     150    /* Release MO to synchronize-with the acquire load in
     151       __condvar_quiesce_and_switch_g1.  */
     152    if (atomic_fetch_add_release (cond->__data.__g_refs + g, -2) == 3)
     153      {
     154        /* Clear the wake-up request flag before waking up.  We do not need more
     155  	 than relaxed MO and it doesn't matter if we apply this for an aliased
     156  	 group because we wake all futex waiters right after clearing the
     157  	 flag.  */
     158        atomic_fetch_and_relaxed (cond->__data.__g_refs + g, ~(unsigned int) 1);
     159        futex_wake (cond->__data.__g_refs + g, INT_MAX, private);
     160      }
     161  }
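/* Illustrative note (not part of the implementation): __g_refs stores the
   group reference count in the bits above the LSB, so one reference equals
   2 and the LSB is the wake-request flag.  A prior value of exactly 3 thus
   means one remaining reference plus a signaler waiting for the group to
   quiesce: after our fetch-add of -2 the count is zero, so we clear the
   flag and futex_wake the signaler blocked in
   __condvar_quiesce_and_switch_g1.  */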
     162  
     163  /* Clean-up for cancellation of waiters waiting for normal signals.  We cancel
     164     our registration as a waiter, confirm we have woken up, and re-acquire the
     165     mutex.  */
     166  static void
     167  __condvar_cleanup_waiting (void *arg)
     168  {
     169    struct _condvar_cleanup_buffer *cbuffer =
     170      (struct _condvar_cleanup_buffer *) arg;
     171    pthread_cond_t *cond = cbuffer->cond;
     172    unsigned g = cbuffer->wseq & 1;
     173  
     174    __condvar_dec_grefs (cond, g, cbuffer->private);
     175  
     176    __condvar_cancel_waiting (cond, cbuffer->wseq >> 1, g, cbuffer->private);
     177    /* FIXME With the current cancellation implementation, it is possible that
     178       a thread is cancelled after it has returned from a syscall.  This could
     179       result in a cancelled waiter consuming a futex wake-up that is then
     180       causing another waiter in the same group to not wake up.  To work around
     181       this issue until we have fixed cancellation, just add a futex wake-up
     182       conservatively.  */
     183    futex_wake (cond->__data.__g_signals + g, 1, cbuffer->private);
     184  
     185    __condvar_confirm_wakeup (cond, cbuffer->private);
     186  
     187    /* XXX If locking the mutex fails, should we just stop execution?  This
     188       might be better than silently ignoring the error.  */
     189    __pthread_mutex_cond_lock (cbuffer->mutex);
     190  }
     191  
     192  /* This condvar implementation guarantees that all calls to signal and
     193     broadcast and all of the three virtually atomic parts of each call to wait
     194     (i.e., (1) releasing the mutex and blocking, (2) unblocking, and (3) re-
     195     acquiring the mutex) happen in some total order that is consistent with the
     196     happens-before relations in the calling program.  However, this order does
     197     not necessarily result in additional happens-before relations being
     198     established (which aligns well with spurious wake-ups being allowed).
     199  
     200     All waiters acquire a certain position in a 64b waiter sequence (__wseq).
     201     This sequence determines which waiters are allowed to consume signals.
     202     A broadcast is equal to sending as many signals as are unblocked waiters.
     203     When a signal arrives, it samples the current value of __wseq with a
     204     relaxed-MO load (i.e., the position the next waiter would get).  (This is
     205     sufficient because it is consistent with happens-before; the caller can
     206     enforce stronger ordering constraints by calling signal while holding the
     207     mutex.)  Only waiters with a position less than the __wseq value observed
     208     by the signal are eligible to consume this signal.
     209  
      210     This would be straightforward to implement if waiters would just spin, but
     211     we need to let them block using futexes.  Futexes give no guarantee of
     212     waking in FIFO order, so we cannot reliably wake eligible waiters if we
     213     just use a single futex.  Also, futex words are 32b in size, but we need
     214     to distinguish more than 1<<32 states because we need to represent the
     215     order of wake-up (and thus which waiters are eligible to consume signals);
     216     blocking in a futex is not atomic with a waiter determining its position in
     217     the waiter sequence, so we need the futex word to reliably notify waiters
     218     that they should not attempt to block anymore because they have been
     219     already signaled in the meantime.  While an ABA issue on a 32b value will
     220     be rare, ignoring it when we are aware of it is not the right thing to do
     221     either.
     222  
     223     Therefore, we use a 64b counter to represent the waiter sequence (on
     224     architectures which only support 32b atomics, we use a few bits less).
     225     To deal with the blocking using futexes, we maintain two groups of waiters:
     226     * Group G1 consists of waiters that are all eligible to consume signals;
     227       incoming signals will always signal waiters in this group until all
     228       waiters in G1 have been signaled.
     229     * Group G2 consists of waiters that arrive when a G1 is present and still
     230       contains waiters that have not been signaled.  When all waiters in G1
     231       are signaled and a new signal arrives, the new signal will convert G2
     232       into the new G1 and create a new G2 for future waiters.
     233  
     234     We cannot allocate new memory because of process-shared condvars, so we
     235     have just two slots of groups that change their role between G1 and G2.
     236     Each has a separate futex word, a number of signals available for
     237     consumption, a size (number of waiters in the group that have not been
     238     signaled), and a reference count.
     239  
     240     The group reference count is used to maintain the number of waiters that
     241     are using the group's futex.  Before a group can change its role, the
     242     reference count must show that no waiters are using the futex anymore; this
     243     prevents ABA issues on the futex word.
     244  
     245     To represent which intervals in the waiter sequence the groups cover (and
     246     thus also which group slot contains G1 or G2), we use a 64b counter to
     247     designate the start position of G1 (inclusive), and a single bit in the
     248     waiter sequence counter to represent which group slot currently contains
     249     G2.  This allows us to switch group roles atomically wrt. waiters obtaining
     250     a position in the waiter sequence.  The G1 start position allows waiters to
     251     figure out whether they are in a group that has already been completely
      252     signaled (i.e., if the current G1 starts at a later position than the
     253     waiter's position).  Waiters cannot determine whether they are currently
      254     in G2 or G1 -- but they do not have to because all they are interested in
     255     is whether there are available signals, and they always start in G2 (whose
      256     group slot they know because of the bit in the waiter sequence).  Signalers
     257     will simply fill the right group until it is completely signaled and can
      258     be closed (they do not switch group roles until they really have to, to
     259     decrease the likelihood of having to wait for waiters still holding a
     260     reference on the now-closed G1).
     261  
     262     Signalers maintain the initial size of G1 to be able to determine where
     263     G2 starts (G2 is always open-ended until it becomes G1).  They track the
     264     remaining size of a group; when waiters cancel waiting (due to PThreads
     265     cancellation or timeouts), they will decrease this remaining size as well.
     266  
     267     To implement condvar destruction requirements (i.e., that
     268     pthread_cond_destroy can be called as soon as all waiters have been
     269     signaled), waiters increment a reference count before starting to wait and
     270     decrement it after they stopped waiting but right before they acquire the
     271     mutex associated with the condvar.
     272  
      273     pthread_cond_t thus consists of the following fields (bits that are used
      274     for flags and are not part of the primary value of each field are noted
      275     below; they exist to make some things atomic or because there was no
      276     space for them elsewhere in the data structure):
     277  
     278     __wseq: Waiter sequence counter
     279       * LSB is index of current G2.
      280       * Waiters fetch-add while having acquired the mutex associated with the
     281         condvar.  Signalers load it and fetch-xor it concurrently.
     282     __g1_start: Starting position of G1 (inclusive)
     283       * LSB is index of current G2.
     284       * Modified by signalers while having acquired the condvar-internal lock
     285         and observed concurrently by waiters.
     286     __g1_orig_size: Initial size of G1
     287       * The two least-significant bits represent the condvar-internal lock.
     288       * Only accessed while having acquired the condvar-internal lock.
     289     __wrefs: Waiter reference counter.
     290       * Bit 2 is true if waiters should run futex_wake when they remove the
     291         last reference.  pthread_cond_destroy uses this as futex word.
     292       * Bit 1 is the clock ID (0 == CLOCK_REALTIME, 1 == CLOCK_MONOTONIC).
     293       * Bit 0 is true iff this is a process-shared condvar.
     294       * Simple reference count used by both waiters and pthread_cond_destroy.
     295       (If the format of __wrefs is changed, update nptl_lock_constants.pysym
     296        and the pretty printers.)
     297     For each of the two groups, we have:
     298     __g_refs: Futex waiter reference count.
     299       * LSB is true if waiters should run futex_wake when they remove the
     300         last reference.
     301       * Reference count used by waiters concurrently with signalers that have
     302         acquired the condvar-internal lock.
     303     __g_signals: The number of signals that can still be consumed.
     304       * Used as a futex word by waiters.  Used concurrently by waiters and
     305         signalers.
     306       * LSB is true iff this group has been completely signaled (i.e., it is
     307         closed).
     308     __g_size: Waiters remaining in this group (i.e., which have not been
      309       signaled yet).
     310       * Accessed by signalers and waiters that cancel waiting (both do so only
      311         when having acquired the condvar-internal lock).
     312       * The size of G2 is always zero because it cannot be determined until
     313         the group becomes G1.
     314       * Although this is of unsigned type, we rely on using unsigned overflow
     315         rules to make this hold effectively negative values too (in
     316         particular, when waiters in G2 cancel waiting).
     317  
     318     A PTHREAD_COND_INITIALIZER condvar has all fields set to zero, which yields
     319     a condvar that has G2 starting at position 0 and a G1 that is closed.
     320  
     321     Because waiters do not claim ownership of a group right when obtaining a
     322     position in __wseq but only reference count the group when using futexes
     323     to block, it can happen that a group gets closed before a waiter can
     324     increment the reference count.  Therefore, waiters have to check whether
     325     their group is already closed using __g1_start.  They also have to perform
      326     this check while spinning to grab a signal from __g_signals.
     327     Note that for these checks, using relaxed MO to load __g1_start is
     328     sufficient because if a waiter can see a sufficiently large value, it could
      329     have also consumed a signal in the waiter's group.
     330  
     331     Waiters try to grab a signal from __g_signals without holding a reference
     332     count, which can lead to stealing a signal from a more recent group after
     333     their own group was already closed.  They cannot always detect whether they
      334     in fact did so because they do not know when they stole, but they can
     335     conservatively add a signal back to the group they stole from; if they
     336     did so unnecessarily, all that happens is a spurious wake-up.  To make this
      337     even less likely, __g1_start contains the index of the current G2 too,
      338     which allows waiters to check if there is aliasing on the group slots; if
     339     there wasn't, they didn't steal from the current G1, which means that the
     340     G1 they stole from must have been already closed and they do not need to
     341     fix anything.
     342  
     343     It is essential that the last field in pthread_cond_t is __g_signals[1]:
     344     The previous condvar used a pointer-sized field in pthread_cond_t, so a
     345     PTHREAD_COND_INITIALIZER from that condvar implementation might only
     346     initialize 4 bytes to zero instead of the 8 bytes we need (i.e., 44 bytes
     347     in total instead of the 48 we need).  __g_signals[1] is not accessed before
     348     the first group switch (G2 starts at index 0), which will set its value to
     349     zero after a harmless fetch-or whose return value is ignored.  This
     350     effectively completes initialization.
     351  
     352  
     353     Limitations:
     354     * This condvar isn't designed to allow for more than
     355       __PTHREAD_COND_MAX_GROUP_SIZE * (1 << 31) calls to __pthread_cond_wait.
     356     * More than __PTHREAD_COND_MAX_GROUP_SIZE concurrent waiters are not
     357       supported.
     358     * Beyond what is allowed as errors by POSIX or documented, we can also
     359       return the following errors:
     360       * EPERM if MUTEX is a recursive mutex and the caller doesn't own it.
     361       * EOWNERDEAD or ENOTRECOVERABLE when using robust mutexes.  Unlike
     362         for other errors, this can happen when we re-acquire the mutex; this
     363         isn't allowed by POSIX (which requires all errors to virtually happen
     364         before we release the mutex or change the condvar state), but there's
     365         nothing we can do really.
     366       * When using PTHREAD_MUTEX_PP_* mutexes, we can also return all errors
     367         returned by __pthread_tpp_change_priority.  We will already have
     368         released the mutex in such cases, so the caller cannot expect to own
     369         MUTEX.
     370  
     371     Other notes:
     372     * Instead of the normal mutex unlock / lock functions, we use
     373       __pthread_mutex_unlock_usercnt(m, 0) / __pthread_mutex_cond_lock(m)
     374       because those will not change the mutex-internal users count, so that it
     375       can be detected when a condvar is still associated with a particular
     376       mutex because there is a waiter blocked on this condvar using this mutex.
     377  */
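/* Illustrative caller-side sketch (not part of the implementation): the
   design above assumes the canonical POSIX usage pattern in which the mutex
   protects the predicate and waits are retried in a loop, since spurious
   wake-ups are permitted.  The names LOCK, COND, and READY below are
   hypothetical caller-side variables.

     pthread_mutex_lock (&lock);
     while (!ready)
       pthread_cond_wait (&cond, &lock);
     // The predicate holds here and the mutex is owned again.
     pthread_mutex_unlock (&lock);
*/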
     378  static __always_inline int
     379  __pthread_cond_wait_common (pthread_cond_t *cond, pthread_mutex_t *mutex,
     380      clockid_t clockid, const struct __timespec64 *abstime)
     381  {
     382    const int maxspin = 0;
     383    int err;
     384    int result = 0;
     385  
     386    LIBC_PROBE (cond_wait, 2, cond, mutex);
     387  
     388    /* clockid will already have been checked by
     389       __pthread_cond_clockwait or pthread_condattr_setclock, or we
     390       don't use it if abstime is NULL, so we don't need to check it
     391       here. */
     392  
     393    /* Acquire a position (SEQ) in the waiter sequence (WSEQ).  We use an
     394       atomic operation because signals and broadcasts may update the group
     395       switch without acquiring the mutex.  We do not need release MO here
     396       because we do not need to establish any happens-before relation with
     397       signalers (see __pthread_cond_signal); modification order alone
     398       establishes a total order of waiters/signals.  We do need acquire MO
     399       to synchronize with group reinitialization in
     400       __condvar_quiesce_and_switch_g1.  */
     401    uint64_t wseq = __condvar_fetch_add_wseq_acquire (cond, 2);
     402    /* Find our group's index.  We always go into what was G2 when we acquired
     403       our position.  */
     404    unsigned int g = wseq & 1;
     405    uint64_t seq = wseq >> 1;
     406  
     407    /* Increase the waiter reference count.  Relaxed MO is sufficient because
     408       we only need to synchronize when decrementing the reference count.  */
     409    unsigned int flags = atomic_fetch_add_relaxed (&cond->__data.__wrefs, 8);
     410    int private = __condvar_get_private (flags);
     411  
     412    /* Now that we are registered as a waiter, we can release the mutex.
     413       Waiting on the condvar must be atomic with releasing the mutex, so if
     414       the mutex is used to establish a happens-before relation with any
     415       signaler, the waiter must be visible to the latter; thus, we release the
     416       mutex after registering as waiter.
     417       If releasing the mutex fails, we just cancel our registration as a
     418       waiter and confirm that we have woken up.  */
     419    err = __pthread_mutex_unlock_usercnt (mutex, 0);
     420    if (__glibc_unlikely (err != 0))
     421      {
     422        __condvar_cancel_waiting (cond, seq, g, private);
     423        __condvar_confirm_wakeup (cond, private);
     424        return err;
     425      }
     426  
     427    /* Now wait until a signal is available in our group or it is closed.
     428       Acquire MO so that if we observe a value of zero written after group
     429       switching in __condvar_quiesce_and_switch_g1, we synchronize with that
     430       store and will see the prior update of __g1_start done while switching
     431       groups too.  */
     432    unsigned int signals = atomic_load_acquire (cond->__data.__g_signals + g);
     433  
     434    do
     435      {
     436        while (1)
     437  	{
     438  	  /* Spin-wait first.
     439  	     Note that spinning first without checking whether a timeout
     440  	     passed might lead to what looks like a spurious wake-up even
     441  	     though we should return ETIMEDOUT (e.g., if the caller provides
     442  	     an absolute timeout that is clearly in the past).  However,
     443  	     (1) spurious wake-ups are allowed, (2) it seems unlikely that a
     444  	     user will (ab)use pthread_cond_wait as a check for whether a
     445  	     point in time is in the past, and (3) spinning first without
     446  	     having to compare against the current time seems to be the right
     447  	     choice from a performance perspective for most use cases.  */
     448  	  unsigned int spin = maxspin;
     449  	  while (signals == 0 && spin > 0)
     450  	    {
     451  	      /* Check that we are not spinning on a group that's already
     452  		 closed.  */
     453  	      if (seq < (__condvar_load_g1_start_relaxed (cond) >> 1))
     454  		goto done;
     455  
     456  	      /* TODO Back off.  */
     457  
     458  	      /* Reload signals.  See above for MO.  */
     459  	      signals = atomic_load_acquire (cond->__data.__g_signals + g);
     460  	      spin--;
     461  	    }
     462  
     463  	  /* If our group will be closed as indicated by the flag on signals,
     464  	     don't bother grabbing a signal.  */
     465  	  if (signals & 1)
     466  	    goto done;
     467  
     468  	  /* If there is an available signal, don't block.  */
     469  	  if (signals != 0)
     470  	    break;
     471  
     472  	  /* No signals available after spinning, so prepare to block.
     473  	     We first acquire a group reference and use acquire MO for that so
     474  	     that we synchronize with the dummy read-modify-write in
     475  	     __condvar_quiesce_and_switch_g1 if we read from that.  In turn,
     476  	     in this case this will make us see the closed flag on __g_signals
     477  	     that designates a concurrent attempt to reuse the group's slot.
     478  	     We use acquire MO for the __g_signals check to make the
     479  	     __g1_start check work (see spinning above).
     480  	     Note that the group reference acquisition will not mask the
     481  	     release MO when decrementing the reference count because we use
     482  	     an atomic read-modify-write operation and thus extend the release
     483  	     sequence.  */
     484  	  atomic_fetch_add_acquire (cond->__data.__g_refs + g, 2);
     485  	  if (((atomic_load_acquire (cond->__data.__g_signals + g) & 1) != 0)
     486  	      || (seq < (__condvar_load_g1_start_relaxed (cond) >> 1)))
     487  	    {
     488  	      /* Our group is closed.  Wake up any signalers that might be
     489  		 waiting.  */
     490  	      __condvar_dec_grefs (cond, g, private);
     491  	      goto done;
     492  	    }
     493  
     494  	  // Now block.
     495  	  struct _pthread_cleanup_buffer buffer;
     496  	  struct _condvar_cleanup_buffer cbuffer;
     497  	  cbuffer.wseq = wseq;
     498  	  cbuffer.cond = cond;
     499  	  cbuffer.mutex = mutex;
     500  	  cbuffer.private = private;
     501  	  __pthread_cleanup_push (&buffer, __condvar_cleanup_waiting, &cbuffer);
     502  
     503  	  err = __futex_abstimed_wait_cancelable64 (
     504  	    cond->__data.__g_signals + g, 0, clockid, abstime, private);
     505  
     506  	  __pthread_cleanup_pop (&buffer, 0);
     507  
     508  	  if (__glibc_unlikely (err == ETIMEDOUT || err == EOVERFLOW))
     509  	    {
     510  	      __condvar_dec_grefs (cond, g, private);
     511  	      /* If we timed out, we effectively cancel waiting.  Note that
     512  		 we have decremented __g_refs before cancellation, so that a
     513  		 deadlock between waiting for quiescence of our group in
     514  		 __condvar_quiesce_and_switch_g1 and us trying to acquire
     515  		 the lock during cancellation is not possible.  */
     516  	      __condvar_cancel_waiting (cond, seq, g, private);
     517  	      result = err;
     518  	      goto done;
     519  	    }
     520  	  else
     521  	    __condvar_dec_grefs (cond, g, private);
     522  
     523  	  /* Reload signals.  See above for MO.  */
     524  	  signals = atomic_load_acquire (cond->__data.__g_signals + g);
     525  	}
     526  
     527      }
     528    /* Try to grab a signal.  Use acquire MO so that we see an up-to-date value
     529       of __g1_start below (see spinning above for a similar case).  In
     530       particular, if we steal from a more recent group, we will also see a
     531       more recent __g1_start below.  */
     532    while (!atomic_compare_exchange_weak_acquire (cond->__data.__g_signals + g,
     533  						&signals, signals - 2));
     534  
     535    /* We consumed a signal but we could have consumed from a more recent group
     536       that aliased with ours due to being in the same group slot.  If this
     537       might be the case our group must be closed as visible through
     538       __g1_start.  */
     539    uint64_t g1_start = __condvar_load_g1_start_relaxed (cond);
     540    if (seq < (g1_start >> 1))
     541      {
     542        /* We potentially stole a signal from a more recent group but we do not
     543  	 know which group we really consumed from.
     544  	 We do not care about groups older than current G1 because they are
     545  	 closed; we could have stolen from these, but then we just add a
     546  	 spurious wake-up for the current groups.
     547  	 We will never steal a signal from current G2 that was really intended
     548  	 for G2 because G2 never receives signals (until it becomes G1).  We
     549  	 could have stolen a signal from G2 that was conservatively added by a
     550  	 previous waiter that also thought it stole a signal -- but given that
     551  	 that signal was added unnecessarily, it's not a problem if we steal
     552  	 it.
     553  	 Thus, the remaining case is that we could have stolen from the current
     554  	 G1, where "current" means the __g1_start value we observed.  However,
     555  	 if the current G1 does not have the same slot index as we do, we did
     556  	 not steal from it and do not need to undo that.  This is the reason
      557  	 for putting a bit with G2's index into __g1_start as well.  */
     558        if (((g1_start & 1) ^ 1) == g)
     559  	{
     560  	  /* We have to conservatively undo our potential mistake of stealing
     561  	     a signal.  We can stop trying to do that when the current G1
     562  	     changes because other spinning waiters will notice this too and
     563  	     __condvar_quiesce_and_switch_g1 has checked that there are no
     564  	     futex waiters anymore before switching G1.
     565  	     Relaxed MO is fine for the __g1_start load because we need to
     566  	     merely be able to observe this fact and not have to observe
     567  	     something else as well.
     568  	     ??? Would it help to spin for a little while to see whether the
     569  	     current G1 gets closed?  This might be worthwhile if the group is
     570  	     small or close to being closed.  */
     571  	  unsigned int s = atomic_load_relaxed (cond->__data.__g_signals + g);
     572  	  while (__condvar_load_g1_start_relaxed (cond) == g1_start)
     573  	    {
     574  	      /* Try to add a signal.  We don't need to acquire the lock
     575  		 because at worst we can cause a spurious wake-up.  If the
     576  		 group is in the process of being closed (LSB is true), this
     577  		 has an effect similar to us adding a signal.  */
     578  	      if (((s & 1) != 0)
     579  		  || atomic_compare_exchange_weak_relaxed
     580  		       (cond->__data.__g_signals + g, &s, s + 2))
     581  		{
     582  		  /* If we added a signal, we also need to add a wake-up on
     583  		     the futex.  We also need to do that if we skipped adding
      584  		     a signal because the group is being closed, because
     585  		     while __condvar_quiesce_and_switch_g1 could have closed
     586  		     the group, it might still be waiting for futex waiters to
     587  		     leave (and one of those waiters might be the one we stole
      588  		     the signal from, which will cause it to block using the
     589  		     futex).  */
     590  		  futex_wake (cond->__data.__g_signals + g, 1, private);
     591  		  break;
     592  		}
     593  	      /* TODO Back off.  */
     594  	    }
     595  	}
     596      }
     597  
     598   done:
     599  
     600    /* Confirm that we have been woken.  We do that before acquiring the mutex
     601       to allow for execution of pthread_cond_destroy while having acquired the
     602       mutex.  */
     603    __condvar_confirm_wakeup (cond, private);
     604  
     605    /* Woken up; now re-acquire the mutex.  If this doesn't fail, return RESULT,
     606       which is set to ETIMEDOUT if a timeout occurred, or zero otherwise.  */
     607    err = __pthread_mutex_cond_lock (mutex);
     608    /* XXX Abort on errors that are disallowed by POSIX?  */
     609    return (err != 0) ? err : result;
     610  }
     611  
     612  
     613  /* See __pthread_cond_wait_common.  */
     614  int
     615  ___pthread_cond_wait (pthread_cond_t *cond, pthread_mutex_t *mutex)
     616  {
     617    /* clockid is unused when abstime is NULL. */
     618    return __pthread_cond_wait_common (cond, mutex, 0, NULL);
     619  }
     620  
     621  versioned_symbol (libc, ___pthread_cond_wait, pthread_cond_wait,
     622  		  GLIBC_2_3_2);
     623  libc_hidden_ver (___pthread_cond_wait, __pthread_cond_wait)
     624  #ifndef SHARED
     625  strong_alias (___pthread_cond_wait, __pthread_cond_wait)
     626  #endif
     627  
     628  /* See __pthread_cond_wait_common.  */
     629  int
     630  ___pthread_cond_timedwait64 (pthread_cond_t *cond, pthread_mutex_t *mutex,
     631  			     const struct __timespec64 *abstime)
     632  {
     633    /* Check parameter validity.  This should also tell the compiler that
     634       it can assume that abstime is not NULL.  */
     635    if (! valid_nanoseconds (abstime->tv_nsec))
     636      return EINVAL;
     637  
      638    /* Relaxed MO is sufficient because the clock ID bit is only modified
      639       at condvar creation.  */
     640    unsigned int flags = atomic_load_relaxed (&cond->__data.__wrefs);
     641    clockid_t clockid = (flags & __PTHREAD_COND_CLOCK_MONOTONIC_MASK)
     642                      ? CLOCK_MONOTONIC : CLOCK_REALTIME;
     643    return __pthread_cond_wait_common (cond, mutex, clockid, abstime);
     644  }
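/* Illustrative caller-side sketch (not part of the implementation): the
   absolute timeout passed to pthread_cond_timedwait must be measured
   against the clock selected above, i.e. CLOCK_REALTIME unless
   CLOCK_MONOTONIC was chosen via pthread_condattr_setclock.  The names
   LOCK, COND, and READY are hypothetical caller-side variables.

     struct timespec deadline;
     clock_gettime (CLOCK_REALTIME, &deadline);  // default condvar clock
     deadline.tv_sec += 5;                       // wait at most ~5 seconds
     pthread_mutex_lock (&lock);
     while (!ready)
       if (pthread_cond_timedwait (&cond, &lock, &deadline) == ETIMEDOUT)
         break;
     pthread_mutex_unlock (&lock);
*/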
     645  
     646  #if __TIMESIZE == 64
     647  strong_alias (___pthread_cond_timedwait64, ___pthread_cond_timedwait)
     648  #else
     649  strong_alias (___pthread_cond_timedwait64, __pthread_cond_timedwait64)
     650  libc_hidden_def (__pthread_cond_timedwait64)
     651  
     652  int
     653  ___pthread_cond_timedwait (pthread_cond_t *cond, pthread_mutex_t *mutex,
     654  			    const struct timespec *abstime)
     655  {
     656    struct __timespec64 ts64 = valid_timespec_to_timespec64 (*abstime);
     657  
     658    return __pthread_cond_timedwait64 (cond, mutex, &ts64);
     659  }
     660  #endif /* __TIMESIZE == 64 */
     661  versioned_symbol (libc, ___pthread_cond_timedwait,
     662  		  pthread_cond_timedwait, GLIBC_2_3_2);
     663  libc_hidden_ver (___pthread_cond_timedwait, __pthread_cond_timedwait)
     664  #ifndef SHARED
     665  strong_alias (___pthread_cond_timedwait, __pthread_cond_timedwait)
     666  #endif
     667  
     668  /* See __pthread_cond_wait_common.  */
     669  int
     670  ___pthread_cond_clockwait64 (pthread_cond_t *cond, pthread_mutex_t *mutex,
     671  			      clockid_t clockid,
     672  			      const struct __timespec64 *abstime)
     673  {
     674    /* Check parameter validity.  This should also tell the compiler that
     675       it can assume that abstime is not NULL.  */
     676    if (! valid_nanoseconds (abstime->tv_nsec))
     677      return EINVAL;
     678  
     679    if (!futex_abstimed_supported_clockid (clockid))
     680      return EINVAL;
     681  
     682    return __pthread_cond_wait_common (cond, mutex, clockid, abstime);
     683  }
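/* Illustrative caller-side sketch (not part of the implementation):
   pthread_cond_clockwait receives the clock per call instead of reading it
   from the condvar, so the deadline must be built against that same clock,
   and the clock must be one accepted by futex_abstimed_supported_clockid.
   The names LOCK, COND, and READY are hypothetical caller-side variables.

     struct timespec deadline;
     clock_gettime (CLOCK_MONOTONIC, &deadline);
     deadline.tv_sec += 2;
     pthread_mutex_lock (&lock);
     while (!ready)
       if (pthread_cond_clockwait (&cond, &lock, CLOCK_MONOTONIC, &deadline)
           == ETIMEDOUT)
         break;
     pthread_mutex_unlock (&lock);
*/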
     684  
     685  #if __TIMESIZE == 64
     686  strong_alias (___pthread_cond_clockwait64, ___pthread_cond_clockwait)
     687  #else
     688  strong_alias (___pthread_cond_clockwait64, __pthread_cond_clockwait64);
     689  libc_hidden_def (__pthread_cond_clockwait64)
     690  
     691  int
     692  ___pthread_cond_clockwait (pthread_cond_t *cond, pthread_mutex_t *mutex,
     693                            clockid_t clockid,
     694                            const struct timespec *abstime)
     695  {
     696    struct __timespec64 ts64 = valid_timespec_to_timespec64 (*abstime);
     697  
     698    return __pthread_cond_clockwait64 (cond, mutex, clockid, &ts64);
     699  }
     700  #endif /* __TIMESIZE == 64 */
     701  libc_hidden_ver (___pthread_cond_clockwait, __pthread_cond_clockwait)
     702  #ifndef SHARED
     703  strong_alias (___pthread_cond_clockwait, __pthread_cond_clockwait)
     704  #endif
     705  versioned_symbol (libc, ___pthread_cond_clockwait,
     706  		  pthread_cond_clockwait, GLIBC_2_34);
     707  #if OTHER_SHLIB_COMPAT (libpthread, GLIBC_2_30, GLIBC_2_34)
     708  compat_symbol (libpthread, ___pthread_cond_clockwait,
     709  	       pthread_cond_clockwait, GLIBC_2_30);
     710  #endif