1  /* Copyright (C) 2002-2023 Free Software Foundation, Inc.
       2     This file is part of the GNU C Library.
       3  
       4     The GNU C Library is free software; you can redistribute it and/or
       5     modify it under the terms of the GNU Lesser General Public
       6     License as published by the Free Software Foundation; either
       7     version 2.1 of the License, or (at your option) any later version.
       8  
       9     The GNU C Library is distributed in the hope that it will be useful,
      10     but WITHOUT ANY WARRANTY; without even the implied warranty of
      11     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
      12     Lesser General Public License for more details.
      13  
      14     You should have received a copy of the GNU Lesser General Public
      15     License along with the GNU C Library; if not, see
      16     <https://www.gnu.org/licenses/>.  */
      17  
      18  #include <ctype.h>
      19  #include <errno.h>
      20  #include <stdbool.h>
      21  #include <stdlib.h>
      22  #include <string.h>
      23  #include <stdint.h>
      24  #include "pthreadP.h"
      25  #include <hp-timing.h>
      26  #include <ldsodefs.h>
      27  #include <atomic.h>
      28  #include <libc-diag.h>
      29  #include <libc-internal.h>
      30  #include <resolv.h>
      31  #include <kernel-features.h>
      32  #include <default-sched.h>
      33  #include <futex-internal.h>
      34  #include <tls-setup.h>
      35  #include <rseq-internal.h>
      36  #include "libioP.h"
      37  #include <sys/single_threaded.h>
      38  #include <version.h>
      39  #include <clone_internal.h>
      40  #include <futex-internal.h>
      41  
      42  #include <shlib-compat.h>
      43  
      44  #include <stap-probe.h>
      45  
      46  
/* Globally enabled events.  This mask is combined (bitwise or) with the
   per-thread mask in the descriptor's eventbuf.eventmask whenever the
   code in this file decides whether a TD_CREATE or TD_DEATH event has
   to be reported.  */
extern td_thr_events_t __nptl_threads_events;
libc_hidden_proto (__nptl_threads_events)
td_thr_events_t __nptl_threads_events;
libc_hidden_data_def (__nptl_threads_events)

/* Pointer to descriptor with the last event.  Descriptors are chained
   through their nextevent member; start_thread pushes the exiting
   thread's descriptor here with a compare-and-exchange loop when it
   reports TD_DEATH.  */
extern struct pthread *__nptl_last_event;
libc_hidden_proto (__nptl_last_event)
struct pthread *__nptl_last_event;
libc_hidden_data_def (__nptl_last_event)

#ifdef SHARED
/* This variable is used to access _rtld_global from libthread_db.  If
   GDB loads libpthread before ld.so, it is not possible to resolve
   _rtld_global directly during libpthread initialization.  */
struct rtld_global *__nptl_rtld_global = &_rtld_global;
#endif

/* Version of the library, used in libthread_db to detect mismatches.
   VERSION is presumably supplied by <version.h>, included above.  */
const char __nptl_version[] = VERSION;
      68  
      69  /* This performs the initialization necessary when going from
      70     single-threaded to multi-threaded mode for the first time.  */
      71  static void
      72  late_init (void)
      73  {
      74    struct sigaction sa;
      75    __sigemptyset (&sa.sa_mask);
      76  
      77    /* Install the handle to change the threads' uid/gid.  Use
      78       SA_ONSTACK because the signal may be sent to threads that are
      79       running with custom stacks.  (This is less likely for
      80       SIGCANCEL.)  */
      81    sa.sa_sigaction = __nptl_setxid_sighandler;
      82    sa.sa_flags = SA_ONSTACK | SA_SIGINFO | SA_RESTART;
      83    (void) __libc_sigaction (SIGSETXID, &sa, NULL);
      84  
      85    /* The parent process might have left the signals blocked.  Just in
      86       case, unblock it.  We reuse the signal mask in the sigaction
      87       structure.  It is already cleared.  */
      88    __sigaddset (&sa.sa_mask, SIGCANCEL);
      89    __sigaddset (&sa.sa_mask, SIGSETXID);
      90    INTERNAL_SYSCALL_CALL (rt_sigprocmask, SIG_UNBLOCK, &sa.sa_mask,
      91  			 NULL, __NSIG_BYTES);
      92  }
      93  
      94  /* Code to allocate and deallocate a stack.  */
      95  #include "allocatestack.c"
      96  
      97  /* CONCURRENCY NOTES:
      98  
      99     Understanding who is the owner of the 'struct pthread' or 'PD'
     100     (refers to the value of the 'struct pthread *pd' function argument)
     101     is critically important in determining exactly which operations are
     102     allowed and which are not and when, particularly when it comes to the
     103     implementation of pthread_create, pthread_join, pthread_detach, and
     104     other functions which all operate on PD.
     105  
     106     The owner of PD is responsible for freeing the final resources
     107     associated with PD, and may examine the memory underlying PD at any
     108     point in time until it frees it back to the OS or to reuse by the
     109     runtime.
     110  
     111     The thread which calls pthread_create is called the creating thread.
     112     The creating thread begins as the owner of PD.
     113  
     114     During startup the new thread may examine PD in coordination with the
     115     owner thread (which may be itself).
     116  
     117     The four cases of ownership transfer are:
     118  
     119     (1) Ownership of PD is released to the process (all threads may use it)
     120         after the new thread starts in a joinable state
     121         i.e. pthread_create returns a usable pthread_t.
     122  
     123     (2) Ownership of PD is released to the new thread starting in a detached
     124         state.
     125  
     126     (3) Ownership of PD is dynamically released to a running thread via
     127         pthread_detach.
     128  
     129     (4) Ownership of PD is acquired by the thread which calls pthread_join.
     130  
     131     Implementation notes:
     132  
     133     The PD->stopped_start and thread_ran variables are used to determine
     134     exactly which of the four ownership states we are in and therefore
     135     what actions can be taken.  For example after (2) we cannot read or
     136     write from PD anymore since the thread may no longer exist and the
     137     memory may be unmapped.
     138  
     139     It is important to point out that PD->lock is being used both
     140     similar to a one-shot semaphore and subsequently as a mutex.  The
     141     lock is taken in the parent to force the child to wait, and then the
     142     child releases the lock.  However, this semaphore-like effect is used
     143     only for synchronizing the parent and child.  After startup the lock
     144     is used like a mutex to create a critical section during which a
     145     single owner modifies the thread parameters.
     146  
     147     The most complicated cases happen during thread startup:
     148  
     149     (a) If the created thread is in a detached (PTHREAD_CREATE_DETACHED),
     150         or joinable (default PTHREAD_CREATE_JOINABLE) state and
     151         STOPPED_START is true, then the creating thread has ownership of
     152         PD until the PD->lock is released by pthread_create.  If any
     153         errors occur we are in states (c) or (d) below.
     154  
     155     (b) If the created thread is in a detached state
       (PTHREAD_CREATE_DETACHED), and STOPPED_START is false, then the
     157         creating thread has ownership of PD until it invokes the OS
     158         kernel's thread creation routine.  If this routine returns
     159         without error, then the created thread owns PD; otherwise, see
     160         (c) or (d) below.
     161  
   (c) If either a joinable or detached thread setup failed and THREAD_RAN
       is true, then the creating thread releases ownership to the new thread,
       the created thread sees the failed setup through the PD->setup_failed
       member, releases the PD ownership, and exits.  The creating thread will
       be responsible for cleaning up the allocated resources.  THREAD_RAN is
       local to the creating thread and indicates whether thread creation or
       setup has failed.
     169  
   (d) If the thread creation failed and THREAD_RAN is false (meaning
       ARCH_CLONE has failed), then the creating thread retains ownership
       of PD and must clean up the allocated resources.  No waiting for the
       new thread is required because it never started.
     174  
     175     The nptl_db interface:
     176  
     177     The interface with nptl_db requires that we enqueue PD into a linked
     178     list and then call a function which the debugger will trap.  The PD
     179     will then be dequeued and control returned to the thread.  The caller
     180     at the time must have ownership of PD and such ownership remains
     181     after control returns to thread. The enqueued PD is removed from the
     182     linked list by the nptl_db callback td_thr_event_getmsg.  The debugger
     183     must ensure that the thread does not resume execution, otherwise
     184     ownership of PD may be lost and examining PD will not be possible.
     185  
     186     Note that the GNU Debugger as of (December 10th 2015) commit
     187     c2c2a31fdb228d41ce3db62b268efea04bd39c18 no longer uses
     188     td_thr_event_getmsg and several other related nptl_db interfaces. The
     189     principal reason for this is that nptl_db does not support non-stop
     190     mode where other threads can run concurrently and modify runtime
     191     structures currently in use by the debugger and the nptl_db
     192     interface.
     193  
     194     Axioms:
     195  
     196     * The create_thread function can never set stopped_start to false.
     197     * The created thread can read stopped_start but never write to it.
     198     * The variable thread_ran is set some time after the OS thread
     199       creation routine returns, how much time after the thread is created
     200       is unspecified, but it should be as quickly as possible.
     201  
     202  */
     203  
     204  /* CREATE THREAD NOTES:
     205  
     206     create_thread must initialize PD->stopped_start.  It should be true
     207     if the STOPPED_START parameter is true, or if create_thread needs the
     208     new thread to synchronize at startup for some other implementation
     209     reason.  If STOPPED_START will be true, then create_thread is obliged
     210     to lock PD->lock before starting the thread.  Then pthread_create
   unlocks PD->lock which synchronizes-with start_thread in the
     212     child thread which does an acquire/release of PD->lock as the last
     213     action before calling the user entry point.  The goal of all of this
     214     is to ensure that the required initial thread attributes are applied
     215     (by the creating thread) before the new thread runs user code.  Note
   that the functions pthread_getschedparam, pthread_setschedparam,
     217     pthread_setschedprio, __pthread_tpp_change_priority, and
     218     __pthread_current_priority reuse the same lock, PD->lock, for a
     219     similar purpose e.g. synchronizing the setting of similar thread
     220     attributes.  These functions are never called before the thread is
     221     created, so don't participate in startup synchronization, but given
     222     that the lock is present already and in the unlocked state, reusing
     223     it saves space.
     224  
     225     The return value is zero for success or an errno code for failure.
     226     If the return value is ENOMEM, that will be translated to EAGAIN,
     227     so create_thread need not do that.  On failure, *THREAD_RAN should
     228     be set to true iff the thread actually started up but before calling
     229     the user code (*PD->start_routine).  */
     230  
     231  static int _Noreturn start_thread (void *arg);
     232  
     233  static int create_thread (struct pthread *pd, const struct pthread_attr *attr,
     234  			  bool *stopped_start, void *stackaddr,
     235  			  size_t stacksize, bool *thread_ran)
     236  {
     237    /* Determine whether the newly created threads has to be started
     238       stopped since we have to set the scheduling parameters or set the
     239       affinity.  */
     240    bool need_setaffinity = (attr != NULL && attr->extension != NULL
     241  			   && attr->extension->cpuset != 0);
     242    if (attr != NULL
     243        && (__glibc_unlikely (need_setaffinity)
     244  	  || __glibc_unlikely ((attr->flags & ATTR_FLAG_NOTINHERITSCHED) != 0)))
     245      *stopped_start = true;
     246  
     247    pd->stopped_start = *stopped_start;
     248    if (__glibc_unlikely (*stopped_start))
     249      lll_lock (pd->lock, LLL_PRIVATE);
     250  
     251    /* We rely heavily on various flags the CLONE function understands:
     252  
     253       CLONE_VM, CLONE_FS, CLONE_FILES
     254  	These flags select semantics with shared address space and
     255  	file descriptors according to what POSIX requires.
     256  
     257       CLONE_SIGHAND, CLONE_THREAD
     258  	This flag selects the POSIX signal semantics and various
     259  	other kinds of sharing (itimers, POSIX timers, etc.).
     260  
     261       CLONE_SETTLS
     262  	The sixth parameter to CLONE determines the TLS area for the
     263  	new thread.
     264  
     265       CLONE_PARENT_SETTID
     266  	The kernels writes the thread ID of the newly created thread
     267  	into the location pointed to by the fifth parameters to CLONE.
     268  
     269  	Note that it would be semantically equivalent to use
     270  	CLONE_CHILD_SETTID but it is be more expensive in the kernel.
     271  
     272       CLONE_CHILD_CLEARTID
     273  	The kernels clears the thread ID of a thread that has called
     274  	sys_exit() in the location pointed to by the seventh parameter
     275  	to CLONE.
     276  
     277       The termination signal is chosen to be zero which means no signal
     278       is sent.  */
     279    const int clone_flags = (CLONE_VM | CLONE_FS | CLONE_FILES | CLONE_SYSVSEM
     280  			   | CLONE_SIGHAND | CLONE_THREAD
     281  			   | CLONE_SETTLS | CLONE_PARENT_SETTID
     282  			   | CLONE_CHILD_CLEARTID
     283  			   | 0);
     284  
     285    TLS_DEFINE_INIT_TP (tp, pd);
     286  
     287    struct clone_args args =
     288      {
     289        .flags = clone_flags,
     290        .pidfd = (uintptr_t) &pd->tid,
     291        .parent_tid = (uintptr_t) &pd->tid,
     292        .child_tid = (uintptr_t) &pd->tid,
     293        .stack = (uintptr_t) stackaddr,
     294        .stack_size = stacksize,
     295        .tls = (uintptr_t) tp,
     296      };
     297    int ret = __clone_internal (&args, &start_thread, pd);
     298    if (__glibc_unlikely (ret == -1))
     299      return errno;
     300  
     301    /* It's started now, so if we fail below, we'll have to let it clean itself
     302       up.  */
     303    *thread_ran = true;
     304  
     305    /* Now we have the possibility to set scheduling parameters etc.  */
     306    if (attr != NULL)
     307      {
     308        /* Set the affinity mask if necessary.  */
     309        if (need_setaffinity)
     310  	{
     311  	  assert (*stopped_start);
     312  
     313  	  int res = INTERNAL_SYSCALL_CALL (sched_setaffinity, pd->tid,
     314  					   attr->extension->cpusetsize,
     315  					   attr->extension->cpuset);
     316  	  if (__glibc_unlikely (INTERNAL_SYSCALL_ERROR_P (res)))
     317  	    return INTERNAL_SYSCALL_ERRNO (res);
     318  	}
     319  
     320        /* Set the scheduling parameters.  */
     321        if ((attr->flags & ATTR_FLAG_NOTINHERITSCHED) != 0)
     322  	{
     323  	  assert (*stopped_start);
     324  
     325  	  int res = INTERNAL_SYSCALL_CALL (sched_setscheduler, pd->tid,
     326  					   pd->schedpolicy, &pd->schedparam);
     327  	  if (__glibc_unlikely (INTERNAL_SYSCALL_ERROR_P (res)))
     328  	    return INTERNAL_SYSCALL_ERRNO (res);
     329  	}
     330      }
     331  
     332    return 0;
     333  }
     334  
/* Local function to start thread and handle cleanup.

   This is the function create_thread passes to __clone_internal, with
   the descriptor of the new thread as ARG.  It synchronizes with the
   creating thread if PD->stopped_start is set, performs the per-thread
   initialization, runs the user-supplied start routine and stores its
   result in the descriptor, and finally performs the exit-time cleanup.
   It never returns; it ends by looping on the exit system call.  */
static int _Noreturn
start_thread (void *arg)
{
  struct pthread *pd = arg;

  /* We are either in (a) or (b), and in either case we either own PD already
     (2) or are about to own PD (1), and so our only restriction would be that
     we can't free PD until we know we have ownership (see CONCURRENCY NOTES
     above).  */
  if (pd->stopped_start)
    {
      bool setup_failed = false;

      /* Get the lock the parent locked to force synchronization.  */
      lll_lock (pd->lock, LLL_PRIVATE);

      /* We have ownership of PD now.  For detached threads with a setup
	 failure we mark the thread as joinable so the creating thread can
	 synchronously join it and free any resources before returning to
	 the pthread_create caller.  */
      setup_failed = pd->setup_failed == 1;
      if (setup_failed)
	pd->joinid = NULL;

      /* And give it up right away.  */
      lll_unlock (pd->lock, LLL_PRIVATE);

      /* On a setup failure none of the initialization or cleanup below
	 is run; the thread just exits.  */
      if (setup_failed)
	goto out;
    }

  /* Initialize resolver state pointer.  */
  __resp = &pd->res;

  /* Initialize pointers to locale data.  */
  __ctype_init ();

  /* Register rseq TLS to the kernel.  A failed registration is fatal
     only if the descriptor flags asked for it (ATTR_FLAG_DO_RSEQ).  */
  {
    bool do_rseq = THREAD_GETMEM (pd, flags) & ATTR_FLAG_DO_RSEQ;
    if (!rseq_register_current_thread (pd, do_rseq) && do_rseq)
      __libc_fatal ("Fatal glibc error: rseq registration failed\n");
  }

#ifndef __ASSUME_SET_ROBUST_LIST
  if (__nptl_set_robust_list_avail)
#endif
    {
      /* This call should never fail because the initial call in init.c
	 succeeded.  */
      INTERNAL_SYSCALL_CALL (set_robust_list, &pd->robust_head,
			     sizeof (struct robust_list_head));
    }

  /* This is where the try/finally block should be created.  For
     compilers without that support we do use setjmp.  */
  struct pthread_unwind_buf unwind_buf;

  int not_first_call;
  DIAG_PUSH_NEEDS_COMMENT;
#if __GNUC_PREREQ (7, 0)
  /* This call results in a -Wstringop-overflow warning because struct
     pthread_unwind_buf is smaller than jmp_buf.  setjmp and longjmp
     do not use anything beyond the common prefix (they never access
     the saved signal mask), so that is a false positive.  */
  DIAG_IGNORE_NEEDS_COMMENT (11, "-Wstringop-overflow=");
#endif
  not_first_call = setjmp ((struct __jmp_buf_tag *) unwind_buf.cancel_jmp_buf);
  DIAG_POP_NEEDS_COMMENT;

  /* No previous handlers.  NB: This must be done after setjmp since the
     private space in the unwind jump buffer may overlap space used by
     setjmp to store extra architecture-specific information which is
     never used by the cancellation-specific __libc_unwind_longjmp.

     The private space is allowed to overlap because the unwinder never
     has to return through any of the jumped-to call frames, and thus
     only a minimum amount of saved data need be stored, and for example,
     need not include the process signal mask information. This is all
     an optimization to reduce stack usage when pushing cancellation
     handlers.  */
  unwind_buf.priv.data.prev = NULL;
  unwind_buf.priv.data.cleanup = NULL;

  /* Allow setxid from now onwards.  The futex word is reset to zero; if
     its previous value was -2, one waiter on the futex is woken.  */
  if (__glibc_unlikely (atomic_exchange_acquire (&pd->setxid_futex, 0) == -2))
    futex_wake (&pd->setxid_futex, 1, FUTEX_PRIVATE);

  /* NOT_FIRST_CALL is zero when setjmp returned directly.  A nonzero
     value means control came back here through the unwind buffer; the
     user code is then skipped and only the cleanup below is run.  */
  if (__glibc_likely (! not_first_call))
    {
      /* Store the new cleanup handler info.  */
      THREAD_SETMEM (pd, cleanup_jmp_buf, &unwind_buf);

      /* Install the signal mask stored in the descriptor.  */
      internal_signal_restore_set (&pd->sigmask);

      LIBC_PROBE (pthread_start, 3, (pthread_t) pd, pd->start_routine, pd->arg);

      /* Run the code the user provided.  */
      void *ret;
      if (pd->c11)
	{
	  /* The function pointer of the C11 thread start is cast to an
	     incorrect type on the __pthread_create_2_1 call, however it is
	     cast back to the correct one here so the call behavior is
	     well-defined (it is assumed that pointers to void are able to
	     represent all values of int).  */
	  int (*start)(void*) = (int (*) (void*)) pd->start_routine;
	  ret = (void*) (uintptr_t) start (pd->arg);
	}
      else
	ret = pd->start_routine (pd->arg);
      THREAD_SETMEM (pd, result, ret);
    }

  /* Call destructors for the thread_local TLS variables.  */
#ifndef SHARED
  if (&__call_tls_dtors != NULL)
#endif
    __call_tls_dtors ();

  /* Run the destructor for the thread-local data.  */
  __nptl_deallocate_tsd ();

  /* Clean up any state libc stored in thread-local variables.  */
  __libc_thread_freeres ();

  /* Report the death of the thread if this is wanted.  */
  if (__glibc_unlikely (pd->report_events))
    {
      /* See whether TD_DEATH is set in the global event mask or in the
	 event mask of this thread.  */
      const int idx = __td_eventword (TD_DEATH);
      const uint32_t mask = __td_eventmask (TD_DEATH);

      if ((mask & (__nptl_threads_events.event_bits[idx]
		   | pd->eventbuf.eventmask.event_bits[idx])) != 0)
	{
	  /* Yep, we have to signal the death.  Add the descriptor to
	     the list but only if it is not already on it.  */
	  if (pd->nextevent == NULL)
	    {
	      pd->eventbuf.eventnum = TD_DEATH;
	      pd->eventbuf.eventdata = pd;

	      /* Push PD at the head of the event list; retry while the
		 compare-and-exchange reports that the head changed.  */
	      do
		pd->nextevent = __nptl_last_event;
	      while (atomic_compare_and_exchange_bool_acq (&__nptl_last_event,
							   pd, pd->nextevent));
	    }

	  /* Now call the function which signals the event.  See
	     CONCURRENCY NOTES for the nptl_db interface comments.  */
	  __nptl_death_event ();
	}
    }

  /* The thread is exiting now.  Don't set this bit until after we've hit
     the event-reporting breakpoint, so that td_thr_get_info on us while at
     the breakpoint reports TD_THR_RUN state rather than TD_THR_ZOMBIE.  */
  atomic_fetch_or_relaxed (&pd->cancelhandling, EXITING_BITMASK);

  /* The fetch-and-add returns the previous count, so a value of one
     means the counter has just dropped to zero.  */
  if (__glibc_unlikely (atomic_fetch_add_relaxed (&__nptl_nthreads, -1) == 1))
    /* This was the last thread.  */
    exit (0);

  /* This prevents sending a signal from this thread to itself during
     its final stages.  This must come after the exit call above
     because atexit handlers must not run with signals blocked.

     Do not block SIGSETXID.  The setxid handshake below expects the
     signal to be delivered.  (SIGSETXID cannot run application code,
     nor does it use pthread_kill.)  Reuse the pd->sigmask space for
     computing the signal mask, to save stack space.  */
  internal_sigfillset (&pd->sigmask);
  internal_sigdelset (&pd->sigmask, SIGSETXID);
  INTERNAL_SYSCALL_CALL (rt_sigprocmask, SIG_BLOCK, &pd->sigmask, NULL,
			 __NSIG_BYTES);

  /* Tell __pthread_kill_internal that this thread is about to exit.
     If there is a __pthread_kill_internal in progress, this delays
     the thread exit until the signal has been queued by the kernel
     (so that the TID used to send it remains valid).  */
  __libc_lock_lock (pd->exit_lock);
  pd->exiting = true;
  __libc_lock_unlock (pd->exit_lock);

#ifndef __ASSUME_SET_ROBUST_LIST
  /* If this thread has any robust mutexes locked, handle them now.  */
# if __PTHREAD_MUTEX_HAVE_PREV
  void *robust = pd->robust_head.list;
# else
  __pthread_slist_t *robust = pd->robust_list.__next;
# endif
  /* We let the kernel do the notification if it is able to do so.
     If we have to do it here there for sure are no PI mutexes involved
     since the kernel support for them is even more recent.  */
  if (!__nptl_set_robust_list_avail
      && __builtin_expect (robust != (void *) &pd->robust_head, 0))
    {
      /* Walk the list until it wraps around to the list head.  Each
	 mutex is unlinked, marked with FUTEX_OWNER_DIED, and one waiter
	 on it is woken.  */
      do
	{
	  struct __pthread_mutex_s *this = (struct __pthread_mutex_s *)
	    ((char *) robust - offsetof (struct __pthread_mutex_s,
					 __list.__next));
	  robust = *((void **) robust);

# if __PTHREAD_MUTEX_HAVE_PREV
	  this->__list.__prev = NULL;
# endif
	  this->__list.__next = NULL;

	  atomic_fetch_or_acquire (&this->__lock, FUTEX_OWNER_DIED);
	  futex_wake ((unsigned int *) &this->__lock, 1,
		      /* XYZ */ FUTEX_SHARED);
	}
      while (robust != (void *) &pd->robust_head);
    }
#endif

  /* Only stacks that were not supplied by the user are handed to
     advise_stack_range.  */
  if (!pd->user_stack)
    advise_stack_range (pd->stackblock, pd->stackblock_size, (uintptr_t) pd,
			pd->guardsize);

  if (__glibc_unlikely (pd->cancelhandling & SETXID_BITMASK))
    {
      /* Some other thread might call any of the setXid functions and expect
	 us to reply.  In this case wait until we did that.  */
      do
	/* XXX This differs from the typical futex_wait_simple pattern in that
	   the futex_wait condition (setxid_futex) is different from the
	   condition used in the surrounding loop (cancelhandling).  We need
	   to check and document why this is correct.  */
	futex_wait_simple (&pd->setxid_futex, 0, FUTEX_PRIVATE);
      while (pd->cancelhandling & SETXID_BITMASK);

      /* Reset the value so that the stack can be reused.  */
      pd->setxid_futex = 0;
    }

  /* If the thread is detached free the TCB.  */
  if (IS_DETACHED (pd))
    /* Free the TCB.  */
    __nptl_free_tcb (pd);

out:
  /* We cannot call '_exit' here.  '_exit' will terminate the process.

     The 'exit' implementation in the kernel will signal when the
     process is really dead since 'clone' got passed the CLONE_CHILD_CLEARTID
     flag.  The 'tid' field in the TCB will be set to zero.

     rseq TLS is still registered at this point.  Rely on implicit
     unregistration performed by the kernel on thread teardown.  This is not a
     problem because the rseq TLS lives on the stack, and the stack outlives
     the thread.  If TCB allocation is ever changed, additional steps may be
     required, such as performing explicit rseq unregistration before
     reclaiming the rseq TLS area memory.  It is NOT sufficient to block
     signals because the kernel may write to the rseq area even without
     signals.

     The exit code is zero since in case all threads exit by calling
     'pthread_exit' the exit status must be 0 (zero).  */
  while (1)
    INTERNAL_SYSCALL_CALL (exit, 0);

  /* NOTREACHED */
}
     600  
     601  
     602  /* Return true iff obliged to report TD_CREATE events.  */
     603  static bool
     604  report_thread_creation (struct pthread *pd)
     605  {
     606    if (__glibc_unlikely (THREAD_GETMEM (THREAD_SELF, report_events)))
     607      {
     608        /* The parent thread is supposed to report events.
     609  	 Check whether the TD_CREATE event is needed, too.  */
     610        const size_t idx = __td_eventword (TD_CREATE);
     611        const uint32_t mask = __td_eventmask (TD_CREATE);
     612  
     613        return ((mask & (__nptl_threads_events.event_bits[idx]
     614  		       | pd->eventbuf.eventmask.event_bits[idx])) != 0);
     615      }
     616    return false;
     617  }
     618  
     619  
     620  int
     621  __pthread_create_2_1 (pthread_t *newthread, const pthread_attr_t *attr,
     622  		      void *(*start_routine) (void *), void *arg)
     623  {
     624    void *stackaddr = NULL;
     625    size_t stacksize = 0;
     626  
     627    /* Avoid a data race in the multi-threaded case, and call the
     628       deferred initialization only once.  */
     629    if (__libc_single_threaded_internal)
     630      {
     631        late_init ();
     632        __libc_single_threaded_internal = 0;
     633        /* __libc_single_threaded can be accessed through copy relocations, so
     634  	 it requires to update the external copy.  */
     635        __libc_single_threaded = 0;
     636      }
     637  
     638    const struct pthread_attr *iattr = (struct pthread_attr *) attr;
     639    union pthread_attr_transparent default_attr;
     640    bool destroy_default_attr = false;
     641    bool c11 = (attr == ATTR_C11_THREAD);
     642    if (iattr == NULL || c11)
     643      {
     644        int ret = __pthread_getattr_default_np (&default_attr.external);
     645        if (ret != 0)
     646  	return ret;
     647        destroy_default_attr = true;
     648        iattr = &default_attr.internal;
     649      }
     650  
     651    struct pthread *pd = NULL;
     652    int err = allocate_stack (iattr, &pd, &stackaddr, &stacksize);
     653    int retval = 0;
     654  
     655    if (__glibc_unlikely (err != 0))
     656      /* Something went wrong.  Maybe a parameter of the attributes is
     657         invalid or we could not allocate memory.  Note we have to
     658         translate error codes.  */
     659      {
     660        retval = err == ENOMEM ? EAGAIN : err;
     661        goto out;
     662      }
     663  
     664  
     665    /* Initialize the TCB.  All initializations with zero should be
     666       performed in 'get_cached_stack'.  This way we avoid doing this if
     667       the stack freshly allocated with 'mmap'.  */
     668  
     669  #if TLS_TCB_AT_TP
     670    /* Reference to the TCB itself.  */
     671    pd->header.self = pd;
     672  
     673    /* Self-reference for TLS.  */
     674    pd->header.tcb = pd;
     675  #endif
     676  
     677    /* Store the address of the start routine and the parameter.  Since
     678       we do not start the function directly the stillborn thread will
     679       get the information from its thread descriptor.  */
     680    pd->start_routine = start_routine;
     681    pd->arg = arg;
     682    pd->c11 = c11;
     683  
     684    /* Copy the thread attribute flags.  */
     685    struct pthread *self = THREAD_SELF;
     686    pd->flags = ((iattr->flags & ~(ATTR_FLAG_SCHED_SET | ATTR_FLAG_POLICY_SET))
     687  	       | (self->flags & (ATTR_FLAG_SCHED_SET | ATTR_FLAG_POLICY_SET)));
     688  
     689    /* Inherit rseq registration state.  Without seccomp filters, rseq
     690       registration will either always fail or always succeed.  */
     691    if ((int) THREAD_GETMEM_VOLATILE (self, rseq_area.cpu_id) >= 0)
     692      pd->flags |= ATTR_FLAG_DO_RSEQ;
     693  
     694    /* Initialize the field for the ID of the thread which is waiting
     695       for us.  This is a self-reference in case the thread is created
     696       detached.  */
     697    pd->joinid = iattr->flags & ATTR_FLAG_DETACHSTATE ? pd : NULL;
     698  
     699    /* The debug events are inherited from the parent.  */
     700    pd->eventbuf = self->eventbuf;
     701  
     702  
     703    /* Copy the parent's scheduling parameters.  The flags will say what
     704       is valid and what is not.  */
     705    pd->schedpolicy = self->schedpolicy;
     706    pd->schedparam = self->schedparam;
     707  
     708    /* Copy the stack guard canary.  */
     709  #ifdef THREAD_COPY_STACK_GUARD
     710    THREAD_COPY_STACK_GUARD (pd);
     711  #endif
     712  
     713    /* Copy the pointer guard value.  */
     714  #ifdef THREAD_COPY_POINTER_GUARD
     715    THREAD_COPY_POINTER_GUARD (pd);
     716  #endif
     717  
     718    /* Setup tcbhead.  */
     719    tls_setup_tcbhead (pd);
     720  
     721    /* Verify the sysinfo bits were copied in allocate_stack if needed.  */
     722  #ifdef NEED_DL_SYSINFO
     723    CHECK_THREAD_SYSINFO (pd);
     724  #endif
     725  
     726    /* Determine scheduling parameters for the thread.  */
     727    if (__builtin_expect ((iattr->flags & ATTR_FLAG_NOTINHERITSCHED) != 0, 0)
     728        && (iattr->flags & (ATTR_FLAG_SCHED_SET | ATTR_FLAG_POLICY_SET)) != 0)
     729      {
     730        /* Use the scheduling parameters the user provided.  */
     731        if (iattr->flags & ATTR_FLAG_POLICY_SET)
     732          {
     733            pd->schedpolicy = iattr->schedpolicy;
     734            pd->flags |= ATTR_FLAG_POLICY_SET;
     735          }
     736        if (iattr->flags & ATTR_FLAG_SCHED_SET)
     737          {
     738            /* The values were validated in pthread_attr_setschedparam.  */
     739            pd->schedparam = iattr->schedparam;
     740            pd->flags |= ATTR_FLAG_SCHED_SET;
     741          }
     742  
     743        if ((pd->flags & (ATTR_FLAG_SCHED_SET | ATTR_FLAG_POLICY_SET))
     744            != (ATTR_FLAG_SCHED_SET | ATTR_FLAG_POLICY_SET))
     745          collect_default_sched (pd);
     746      }
     747  
     748    if (__glibc_unlikely (__nptl_nthreads == 1))
     749      _IO_enable_locks ();
     750  
     751    /* Pass the descriptor to the caller.  */
     752    *newthread = (pthread_t) pd;
     753  
     754    LIBC_PROBE (pthread_create, 4, newthread, attr, start_routine, arg);
     755  
     756    /* One more thread.  We cannot have the thread do this itself, since it
     757       might exist but not have been scheduled yet by the time we've returned
     758       and need to check the value to behave correctly.  We must do it before
     759       creating the thread, in case it does get scheduled first and then
     760       might mistakenly think it was the only thread.  In the failure case,
     761       we momentarily store a false value; this doesn't matter because there
     762       is no kosher thing a signal handler interrupting us right here can do
     763       that cares whether the thread count is correct.  */
     764    atomic_fetch_add_relaxed (&__nptl_nthreads, 1);
     765  
     766    /* Our local value of stopped_start and thread_ran can be accessed at
     767       any time. The PD->stopped_start may only be accessed if we have
     768       ownership of PD (see CONCURRENCY NOTES above).  */
     769    bool stopped_start = false; bool thread_ran = false;
     770  
     771    /* Block all signals, so that the new thread starts out with
     772       signals disabled.  This avoids race conditions in the thread
     773       startup.  */
     774    internal_sigset_t original_sigmask;
     775    internal_signal_block_all (&original_sigmask);
     776  
     777    if (iattr->extension != NULL && iattr->extension->sigmask_set)
     778      /* Use the signal mask in the attribute.  The internal signals
     779         have already been filtered by the public
     780         pthread_attr_setsigmask_np interface.  */
     781      internal_sigset_from_sigset (&pd->sigmask, &iattr->extension->sigmask);
     782    else
     783      {
     784        /* Conceptually, the new thread needs to inherit the signal mask
     785  	 of this thread.  Therefore, it needs to restore the saved
     786  	 signal mask of this thread, so save it in the startup
     787  	 information.  */
     788        pd->sigmask = original_sigmask;
     789        /* Reset the cancellation signal mask in case this thread is
     790  	 running cancellation.  */
     791        internal_sigdelset (&pd->sigmask, SIGCANCEL);
     792      }
     793  
     794    /* Start the thread.  */
     795    if (__glibc_unlikely (report_thread_creation (pd)))
     796      {
     797        stopped_start = true;
     798  
     799        /* We always create the thread stopped at startup so we can
     800  	 notify the debugger.  */
     801        retval = create_thread (pd, iattr, &stopped_start, stackaddr,
     802  			      stacksize, &thread_ran);
     803        if (retval == 0)
     804  	{
     805  	  /* We retain ownership of PD until (a) (see CONCURRENCY NOTES
     806  	     above).  */
     807  
     808  	  /* Assert stopped_start is true in both our local copy and the
     809  	     PD copy.  */
     810  	  assert (stopped_start);
     811  	  assert (pd->stopped_start);
     812  
     813  	  /* Now fill in the information about the new thread in
     814  	     the newly created thread's data structure.  We cannot let
     815  	     the new thread do this since we don't know whether it was
     816  	     already scheduled when we send the event.  */
     817  	  pd->eventbuf.eventnum = TD_CREATE;
     818  	  pd->eventbuf.eventdata = pd;
     819  
     820  	  /* Enqueue the descriptor.  */
     821  	  do
     822  	    pd->nextevent = __nptl_last_event;
     823  	  while (atomic_compare_and_exchange_bool_acq (&__nptl_last_event,
     824  						       pd, pd->nextevent)
     825  		 != 0);
     826  
     827  	  /* Now call the function which signals the event.  See
     828  	     CONCURRENCY NOTES for the nptl_db interface comments.  */
     829  	  __nptl_create_event ();
     830  	}
     831      }
     832    else
     833      retval = create_thread (pd, iattr, &stopped_start, stackaddr,
     834  			    stacksize, &thread_ran);
     835  
     836    /* Return to the previous signal mask, after creating the new
     837       thread.  */
     838    internal_signal_restore_set (&original_sigmask);
     839  
     840    if (__glibc_unlikely (retval != 0))
     841      {
     842        if (thread_ran)
     843  	/* State (c) and we not have PD ownership (see CONCURRENCY NOTES
     844  	   above).  We can assert that STOPPED_START must have been true
     845  	   because thread creation didn't fail, but thread attribute setting
     846  	   did.  */
     847          {
     848  	  assert (stopped_start);
     849  	  /* Signal the created thread to release PD ownership and early
     850  	     exit so it could be joined.  */
     851  	  pd->setup_failed = 1;
     852  	  lll_unlock (pd->lock, LLL_PRIVATE);
     853  
     854  	  /* Similar to pthread_join, but since thread creation has failed at
     855  	     startup there is no need to handle all the steps.  */
     856  	  pid_t tid;
     857  	  while ((tid = atomic_load_acquire (&pd->tid)) != 0)
     858  	    __futex_abstimed_wait_cancelable64 ((unsigned int *) &pd->tid,
     859  						tid, 0, NULL, LLL_SHARED);
     860          }
     861  
     862        /* State (c) or (d) and we have ownership of PD (see CONCURRENCY
     863  	 NOTES above).  */
     864  
     865        /* Oops, we lied for a second.  */
     866        atomic_fetch_add_relaxed (&__nptl_nthreads, -1);
     867  
     868        /* Free the resources.  */
     869        __nptl_deallocate_stack (pd);
     870  
     871        /* We have to translate error codes.  */
     872        if (retval == ENOMEM)
     873  	retval = EAGAIN;
     874      }
     875    else
     876      {
     877        /* We don't know if we have PD ownership.  Once we check the local
     878           stopped_start we'll know if we're in state (a) or (b) (see
     879  	 CONCURRENCY NOTES above).  */
     880        if (stopped_start)
     881  	/* State (a), we own PD. The thread blocked on this lock either
     882  	   because we're doing TD_CREATE event reporting, or for some
     883  	   other reason that create_thread chose.  Now let it run
     884  	   free.  */
     885  	lll_unlock (pd->lock, LLL_PRIVATE);
     886  
     887        /* We now have for sure more than one thread.  The main thread might
     888  	 not yet have the flag set.  No need to set the global variable
     889  	 again if this is what we use.  */
     890        THREAD_SETMEM (THREAD_SELF, header.multiple_threads, 1);
     891      }
     892  
     893   out:
     894    if (destroy_default_attr)
     895      __pthread_attr_destroy (&default_attr.external);
     896  
     897    return retval;
     898  }
/* Symbol bindings for __pthread_create_2_1.  The function is published
   from libc under the name pthread_create at version GLIBC_2_34 and is
   also made available under the internal name __pthread_create.
   NOTE(review): the precise semantics of these macros are defined
   outside this chunk (see <shlib-compat.h> and the libc symbol
   headers) -- confirm there.  */
versioned_symbol (libc, __pthread_create_2_1, pthread_create, GLIBC_2_34);
libc_hidden_ver (__pthread_create_2_1, __pthread_create)
#ifndef SHARED
/* When SHARED is not defined, __pthread_create is additionally set up
   as a strong alias of the implementation.  */
strong_alias (__pthread_create_2_1, __pthread_create)
#endif

/* When compatibility with libpthread versions from GLIBC_2_1 up to (but
   presumably not including) GLIBC_2_34 is enabled, also bind
   pthread_create at version GLIBC_2_1 to the same implementation.  */
#if OTHER_SHLIB_COMPAT (libpthread, GLIBC_2_1, GLIBC_2_34)
compat_symbol (libpthread, __pthread_create_2_1, pthread_create, GLIBC_2_1);
#endif
     908  
     909  #if OTHER_SHLIB_COMPAT (libpthread, GLIBC_2_0, GLIBC_2_1)
     910  int
     911  __pthread_create_2_0 (pthread_t *newthread, const pthread_attr_t *attr,
     912  		      void *(*start_routine) (void *), void *arg)
     913  {
     914    /* The ATTR attribute is not really of type `pthread_attr_t *'.  It has
     915       the old size and access to the new members might crash the program.
     916       We convert the struct now.  */
     917    struct pthread_attr new_attr;
     918  
     919    if (attr != NULL)
     920      {
     921        struct pthread_attr *iattr = (struct pthread_attr *) attr;
     922        size_t ps = __getpagesize ();
     923  
     924        /* Copy values from the user-provided attributes.  */
     925        new_attr.schedparam = iattr->schedparam;
     926        new_attr.schedpolicy = iattr->schedpolicy;
     927        new_attr.flags = iattr->flags;
     928  
     929        /* Fill in default values for the fields not present in the old
     930  	 implementation.  */
     931        new_attr.guardsize = ps;
     932        new_attr.stackaddr = NULL;
     933        new_attr.stacksize = 0;
     934        new_attr.extension = NULL;
     935  
     936        /* We will pass this value on to the real implementation.  */
     937        attr = (pthread_attr_t *) &new_attr;
     938      }
     939  
     940    return __pthread_create_2_1 (newthread, attr, start_routine, arg);
     941  }
/* Bind the converting wrapper to pthread_create at version GLIBC_2_0 of
   libpthread.  NOTE(review): presumably this is what binaries linked
   against the original attribute layout resolve to -- confirm against
   the compat_symbol definition in <shlib-compat.h>.  */
compat_symbol (libpthread, __pthread_create_2_0, pthread_create,
	       GLIBC_2_0);
     944  #endif
     945  
     946  /* Information for libthread_db.  */
     947  
     948  #include "../nptl_db/db_info.c"
     949  
/* If pthread_create is present, libgcc_eh.a and libsupc++.a expect some
   other POSIX thread functions to be present as well.  Each line below
   names one such function.  NOTE(review): PTHREAD_STATIC_FN_REQUIRE is
   defined outside this chunk; presumably it makes a static link pull
   in the named symbol -- confirm at its definition.  */
/* Mutex operations.  */
PTHREAD_STATIC_FN_REQUIRE (__pthread_mutex_lock)
PTHREAD_STATIC_FN_REQUIRE (__pthread_mutex_trylock)
PTHREAD_STATIC_FN_REQUIRE (__pthread_mutex_unlock)

/* One-time initialization and cancellation.  */
PTHREAD_STATIC_FN_REQUIRE (__pthread_once)
PTHREAD_STATIC_FN_REQUIRE (__pthread_cancel)

/* Thread-specific data: key management and per-thread values.  */
PTHREAD_STATIC_FN_REQUIRE (__pthread_key_create)
PTHREAD_STATIC_FN_REQUIRE (__pthread_key_delete)
PTHREAD_STATIC_FN_REQUIRE (__pthread_setspecific)
PTHREAD_STATIC_FN_REQUIRE (__pthread_getspecific)