1  /* Copyright (C) 2003-2023 Free Software Foundation, Inc.
       2     This file is part of the GNU C Library.
       3  
       4     The GNU C Library is free software; you can redistribute it and/or
       5     modify it under the terms of the GNU Lesser General Public
       6     License as published by the Free Software Foundation; either
       7     version 2.1 of the License, or (at your option) any later version.
       8  
       9     The GNU C Library is distributed in the hope that it will be useful,
      10     but WITHOUT ANY WARRANTY; without even the implied warranty of
      11     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.	 See the GNU
      12     Lesser General Public License for more details.
      13  
      14     You should have received a copy of the GNU Lesser General Public
      15     License along with the GNU C Library; if not, see
      16     <https://www.gnu.org/licenses/>.  */
      17  
      18  #include <errno.h>
      19  #include <sysdep.h>
      20  #include <futex-internal.h>
      21  #include <pthreadP.h>
      22  #include <shlib-compat.h>
      23  
      24  
      25  /* Wait on the barrier.
      26  
      27     In each round, we wait for a fixed number of threads to enter the barrier
      28     (COUNT).  Once that has happened, exactly these threads are allowed to
      29     leave the barrier.  Note that POSIX does not require that only COUNT
      30     threads can attempt to block using the barrier concurrently.
      31  
      32     We count the number of threads that have entered (IN).  Each thread
      33     increments IN when entering, thus getting a position in the sequence of
      34     threads that are or have been waiting (starting with 1, so the position
      35     is the number of threads that have entered so far including the current
      36     thread).
      37     CURRENT_ROUND designates the most recent thread whose round has been
      38     detected as complete.  When a thread detects that enough threads have
      39     entered to make a round complete, it finishes this round by effectively
      40     adding COUNT to CURRENT_ROUND atomically.  Threads that believe that their
      41     round is not complete yet wait until CURRENT_ROUND is not smaller than
      42     their position anymore.
      43  
      44     A barrier can be destroyed as soon as no threads are blocked on the
      45     barrier.  This is already the case if just one thread from the last round
      46     has stopped waiting and returned to the caller; the assumption is that
      47     all threads from the round are unblocked atomically, even though they may
      48     return at different times from the respective calls to
      49     pthread_barrier_wait).  Thus, a valid call to pthread_barrier_destroy can
      50     be concurrent with other threads still figuring out that their round has
      51     been completed.  Therefore, threads need to confirm that they have left
      52     the barrier by incrementing OUT, and pthread_barrier_destroy needs to wait
      53     until OUT equals IN.
      54  
      55     To avoid an ABA issue for futex_wait on CURRENT_ROUND and for archs with
      56     32b-only atomics, we additionally reset the barrier when IN reaches
      57     a threshold to avoid overflow.  We assume that the total number of threads
      58     is less than UINT_MAX/2, and set the threshold accordingly so that we can
      59     use a simple atomic_fetch_add on IN instead of a CAS when entering.  The
      60     threshold is always set to the end of a round, so all threads that have
      61     entered are either pre-reset threads or post-reset threads (i.e., have a
      62     position larger than the threshold).
      63     Pre-reset threads just run the algorithm explained above.  Post-reset
      64     threads wait until IN is reset to a pre-threshold value.
      65     When the last pre-reset thread leaves the barrier (i.e., OUT equals the
      66     threshold), it resets the barrier to its initial state.  Other (post-reset)
      67     threads wait for the reset to have finished by waiting until IN is less
      68     than the threshold and then restart by trying to enter the barrier again.
      69  
      70     We reuse the reset mechanism in pthread_barrier_destroy to get notified
      71     when all threads have left the barrier: We trigger an artificial reset and
      72     wait for the last pre-reset thread to finish reset, thus notifying the
      73     thread that is about to destroy the barrier.
      74  
      75     Blocking using futexes is straightforward: pre-reset threads wait for
      76     completion of their round using CURRENT_ROUND as futex word, and post-reset
      77     threads and pthread_barrier_destroy use IN as futex word.
      78  
      79     Further notes:
      80     * It is not simple to let some of the post-reset threads help with the
      81       reset because of the ABA issues that arise; therefore, we simply make
      82       the last thread to leave responsible for the reset.
      83     * POSIX leaves it unspecified whether a signal handler running in a thread
      84       that has been unblocked (because its round is complete) can stall all
      85       other threads and prevent them from returning from the barrier.  In this
      86       implementation, other threads will return.  However,
      87       pthread_barrier_destroy will of course wait for the signal handler thread
      88       to confirm that it left the barrier.
      89  
      90     TODO We should add spinning with back-off.  Once we do that, we could also
      91     try to avoid the futex_wake syscall when a round is detected as finished.
      92     If we do not spin, it is quite likely that at least some other threads will
      93     have called futex_wait already.  */
      94  int
      95  ___pthread_barrier_wait (pthread_barrier_t *barrier)
      96  {
      97    struct pthread_barrier *bar = (struct pthread_barrier *) barrier;
      98  
      99    /* How many threads entered so far, including ourself.  */
     100    unsigned int i;
     101  
     102   reset_restart:
     103    /* Try to enter the barrier.  We need acquire MO to (1) ensure that if we
     104       observe that our round can be completed (see below for our attempt to do
     105       so), all pre-barrier-entry effects of all threads in our round happen
     106       before us completing the round, and (2) to make our use of the barrier
     107       happen after a potential reset.  We need release MO to make sure that our
     108       pre-barrier-entry effects happen before threads in this round leaving the
     109       barrier.  */
     110    i = atomic_fetch_add_acq_rel (&bar->in, 1) + 1;
     111    /* These loads are after the fetch_add so that we're less likely to first
     112       pull in the cache line as shared.  */
     113    unsigned int count = bar->count;
     114    /* This is the number of threads that can enter before we need to reset.
     115       Always at the end of a round.  */
     116    unsigned int max_in_before_reset = BARRIER_IN_THRESHOLD
     117  				   - BARRIER_IN_THRESHOLD % count;
     118  
     119    if (i > max_in_before_reset)
     120      {
     121        /* We're in a reset round.  Just wait for a reset to finish; do not
     122  	 help finishing previous rounds because this could happen
     123  	 concurrently with a reset.  */
     124        while (i > max_in_before_reset)
     125  	{
     126  	  futex_wait_simple (&bar->in, i, bar->shared);
     127  	  /* Relaxed MO is fine here because we just need an indication for
     128  	     when we should retry to enter (which will use acquire MO, see
     129  	     above).  */
     130  	  i = atomic_load_relaxed (&bar->in);
     131  	}
     132        goto reset_restart;
     133      }
     134  
     135    /* Look at the current round.  At this point, we are just interested in
     136       whether we can complete rounds, based on the information we obtained
     137       through our acquire-MO load of IN.  Nonetheless, if we notice that
     138       our round has been completed using this load, we use the acquire-MO
     139       fence below to make sure that all pre-barrier-entry effects of all
     140       threads in our round happen before us leaving the barrier.  Therefore,
     141       relaxed MO is sufficient.  */
     142    unsigned cr = atomic_load_relaxed (&bar->current_round);
     143  
     144    /* Try to finish previous rounds and/or the current round.  We simply
     145       consider just our position here and do not try to do the work of threads
     146       that entered more recently.  */
     147    while (cr + count <= i)
     148      {
     149        /* Calculate the new current round based on how many threads entered.
     150  	 NEWCR must be larger than CR because CR+COUNT ends a round.  */
     151        unsigned int newcr = i - i % count;
     152        /* Try to complete previous and/or the current round.  We need release
     153  	 MO to propagate the happens-before that we observed through reading
     154  	 with acquire MO from IN to other threads.  If the CAS fails, it
     155  	 is like the relaxed-MO load of CURRENT_ROUND above.  */
     156        if (atomic_compare_exchange_weak_release (&bar->current_round, &cr,
     157  						newcr))
     158  	{
     159  	  /* Update CR with the modification we just did.  */
     160  	  cr = newcr;
     161  	  /* Wake threads belonging to the rounds we just finished.  We may
     162  	     wake more threads than necessary if more than COUNT threads try
     163  	     to block concurrently on the barrier, but this is not a typical
     164  	     use of barriers.
     165  	     Note that we can still access SHARED because we haven't yet
     166  	     confirmed to have left the barrier.  */
     167  	  futex_wake (&bar->current_round, INT_MAX, bar->shared);
     168  	  /* We did as much as we could based on our position.  If we advanced
     169  	     the current round to a round sufficient for us, do not wait for
     170  	     that to happen and skip the acquire fence (we already
     171  	     synchronize-with all other threads in our round through the
     172  	     initial acquire MO fetch_add of IN.  */
     173  	  if (i <= cr)
     174  	    goto ready_to_leave;
     175  	  else
     176  	    break;
     177  	}
     178      }
     179  
     180    /* Wait until the current round is more recent than the round we are in.  */
     181    while (i > cr)
     182      {
     183        /* Wait for the current round to finish.  */
     184        futex_wait_simple (&bar->current_round, cr, bar->shared);
     185        /* See the fence below.  */
     186        cr = atomic_load_relaxed (&bar->current_round);
     187      }
     188  
     189    /* Our round finished.  Use the acquire MO fence to synchronize-with the
     190       thread that finished the round, either through the initial load of
     191       CURRENT_ROUND above or a failed CAS in the loop above.  */
     192    atomic_thread_fence_acquire ();
     193  
     194    /* Now signal that we left.  */
     195    unsigned int o;
     196   ready_to_leave:
     197    /* We need release MO here so that our use of the barrier happens before
     198       reset or memory reuse after pthread_barrier_destroy.  */
     199    o = atomic_fetch_add_release (&bar->out, 1) + 1;
     200    if (o == max_in_before_reset)
     201      {
     202        /* Perform a reset if we are the last pre-reset thread leaving.   All
     203  	 other threads accessing the barrier are post-reset threads and are
     204  	 incrementing or spinning on IN.  Thus, resetting IN as the last step
     205  	 of reset ensures that the reset is not concurrent with actual use of
     206  	 the barrier.  We need the acquire MO fence so that the reset happens
     207  	 after use of the barrier by all earlier pre-reset threads.  */
     208        atomic_thread_fence_acquire ();
     209        atomic_store_relaxed (&bar->current_round, 0);
     210        atomic_store_relaxed (&bar->out, 0);
     211        /* When destroying the barrier, we wait for a reset to happen.  Thus,
     212  	 we must load SHARED now so that this happens before the barrier is
     213  	 destroyed.  */
     214        int shared = bar->shared;
     215        atomic_store_release (&bar->in, 0);
     216        futex_wake (&bar->in, INT_MAX, shared);
     217  
     218      }
     219  
     220    /* Return a special value for exactly one thread per round.  */
     221    return i % count == 0 ?  PTHREAD_BARRIER_SERIAL_THREAD : 0;
     222  }
     223  versioned_symbol (libc, ___pthread_barrier_wait, pthread_barrier_wait,
     224                    GLIBC_2_34);
     225  libc_hidden_ver (___pthread_barrier_wait, __pthread_barrier_wait)
     226  #ifndef SHARED
     227  strong_alias (___pthread_barrier_wait, __pthread_barrier_wait)
     228  #endif
     229  
     230  #if OTHER_SHLIB_COMPAT (libpthread, GLIBC_2_2, GLIBC_2_34)
     231  compat_symbol (libpthread, ___pthread_barrier_wait, pthread_barrier_wait,
     232                 GLIBC_2_2);
     233  #endif