(root)/
glibc-2.38/
nptl/
allocatestack.c
       1  /* Copyright (C) 2002-2023 Free Software Foundation, Inc.
       2     This file is part of the GNU C Library.
       3  
       4     The GNU C Library is free software; you can redistribute it and/or
       5     modify it under the terms of the GNU Lesser General Public
       6     License as published by the Free Software Foundation; either
       7     version 2.1 of the License, or (at your option) any later version.
       8  
       9     The GNU C Library is distributed in the hope that it will be useful,
      10     but WITHOUT ANY WARRANTY; without even the implied warranty of
      11     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
      12     Lesser General Public License for more details.
      13  
      14     You should have received a copy of the GNU Lesser General Public
      15     License along with the GNU C Library; if not, see
      16     <https://www.gnu.org/licenses/>.  */
      17  
      18  #include <assert.h>
      19  #include <errno.h>
      20  #include <signal.h>
      21  #include <stdint.h>
      22  #include <string.h>
      23  #include <unistd.h>
      24  #include <sys/mman.h>
      25  #include <sys/param.h>
      26  #include <dl-sysdep.h>
      27  #include <dl-tls.h>
      28  #include <tls.h>
      29  #include <list.h>
      30  #include <lowlevellock.h>
      31  #include <futex-internal.h>
      32  #include <kernel-features.h>
      33  #include <nptl-stack.h>
      34  #include <libc-lock.h>
      35  #include <tls-internal.h>
      36  
      37  /* Default alignment of stack.  */
      38  #ifndef STACK_ALIGN
      39  # define STACK_ALIGN __alignof__ (long double)
      40  #endif
      41  
      42  /* Default value for the minimum stack size left after allocating the
      43     thread descriptor and guard.  */
      44  #ifndef MINIMAL_REST_STACK
      45  # define MINIMAL_REST_STACK	4096
      46  #endif
      47  
      48  
      49  /* Newer kernels have the MAP_STACK flag to indicate a mapping is used for
      50     a stack.  Use it when possible.  */
      51  #ifndef MAP_STACK
      52  # define MAP_STACK 0
      53  #endif
      54  
      55  /* Get a stack from the cache.  We have to match by size since some
      56     blocks might be too small or far too large.  */
      57  static struct pthread *
      58  get_cached_stack (size_t *sizep, void **memp)
      59  {
      60    size_t size = *sizep;
      61    struct pthread *result = NULL;
      62    list_t *entry;
      63  
      64    lll_lock (GL (dl_stack_cache_lock), LLL_PRIVATE);
      65  
      66    /* Search the cache for a matching entry.  We search for the
      67       smallest stack which has at least the required size.  Note that
      68       in normal situations the size of all allocated stacks is the
      69       same.  At the very least there are only a few different sizes.
      70       Therefore this loop will exit early most of the time with an
      71       exact match.  */
      72    list_for_each (entry, &GL (dl_stack_cache))
      73      {
      74        struct pthread *curr;
      75  
      76        curr = list_entry (entry, struct pthread, list);
      77        if (__nptl_stack_in_use (curr) && curr->stackblock_size >= size)
      78  	{
      79  	  if (curr->stackblock_size == size)
      80  	    {
      81  	      result = curr;
      82  	      break;
      83  	    }
      84  
      85  	  if (result == NULL
      86  	      || result->stackblock_size > curr->stackblock_size)
      87  	    result = curr;
      88  	}
      89      }
      90  
      91    if (__builtin_expect (result == NULL, 0)
      92      /* Make sure the size difference is not excessive; if it is,
      93  	 we do not use the block.  */
      94        || __builtin_expect (result->stackblock_size > 4 * size, 0))
      95      {
      96        /* Release the lock.  */
      97        lll_unlock (GL (dl_stack_cache_lock), LLL_PRIVATE);
      98  
      99        return NULL;
     100      }
     101  
     102    /* Don't allow setxid until cloned.  */
     103    result->setxid_futex = -1;
     104  
     105    /* Dequeue the entry.  */
     106    __nptl_stack_list_del (&result->list);
     107  
     108    /* And add to the list of stacks in use.  */
     109    __nptl_stack_list_add (&result->list, &GL (dl_stack_used));
     110  
     111    /* And decrease the cache size.  */
     112    GL (dl_stack_cache_actsize) -= result->stackblock_size;
     113  
     114    /* Release the lock early.  */
     115    lll_unlock (GL (dl_stack_cache_lock), LLL_PRIVATE);
     116  
     117    /* Report size and location of the stack to the caller.  */
     118    *sizep = result->stackblock_size;
     119    *memp = result->stackblock;
     120  
     121    /* Cancellation handling is back to the default.  */
     122    result->cancelhandling = 0;
     123    result->cleanup = NULL;
     124    result->setup_failed = 0;
     125  
     126    /* No pending event.  */
     127    result->nextevent = NULL;
     128  
     129    result->exiting = false;
     130    __libc_lock_init (result->exit_lock);
     131    memset (&result->tls_state, 0, sizeof result->tls_state);
     132  
     133    /* Clear the DTV.  */
     134    dtv_t *dtv = GET_DTV (TLS_TPADJ (result));
     135    for (size_t cnt = 0; cnt < dtv[-1].counter; ++cnt)
     136      free (dtv[1 + cnt].pointer.to_free);
     137    memset (dtv, '\0', (dtv[-1].counter + 1) * sizeof (dtv_t));
     138  
     139    /* Re-initialize the TLS.  */
     140    _dl_allocate_tls_init (TLS_TPADJ (result), true);
     141  
     142    return result;
     143  }
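          /* Illustrative sketch (not part of glibc): the search above is a
             best-fit scan that exits early on an exact match and rejects a
             candidate more than four times the requested size.  The same
             policy in isolation, with hypothetical names (struct block,
             block_pick) and without the locking done above:

             #include <stddef.h>

             struct block { size_t size; struct block *next; };

             static struct block *
             block_pick (struct block *head, size_t size)
             {
               struct block *best = NULL;
               for (struct block *b = head; b != NULL; b = b->next)
                 if (b->size >= size)
                   {
                     if (b->size == size)
                       return b;            // exact match: stop searching
                     if (best == NULL || best->size > b->size)
                       best = b;
                   }
               if (best != NULL && best->size > 4 * size)
                 return NULL;               // too wasteful; map a fresh block
               return best;
             }  */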
     144  
     145  /* Return the guard page position on the allocated stack.  */
     146  static inline char *
     147  __attribute ((always_inline))
     148  guard_position (void *mem, size_t size, size_t guardsize, struct pthread *pd,
     149  		size_t pagesize_m1)
     150  {
     151  #ifdef NEED_SEPARATE_REGISTER_STACK
     152    return mem + (((size - guardsize) / 2) & ~pagesize_m1);
     153  #elif _STACK_GROWS_DOWN
     154    return mem;
     155  #elif _STACK_GROWS_UP
     156    return (char *) (((uintptr_t) pd - guardsize) & ~pagesize_m1);
     157  #endif
     158  }
     159  
     160  /* Given a stack allocated with PROT_NONE, set up the required portions
     161     with the 'prot' flags according to the guard page position.  */
     162  static inline int
     163  setup_stack_prot (char *mem, size_t size, char *guard, size_t guardsize,
     164  		  const int prot)
     165  {
     166    char *guardend = guard + guardsize;
     167  #if _STACK_GROWS_DOWN && !defined(NEED_SEPARATE_REGISTER_STACK)
     168    /* As defined in guard_position, for architectures where the stack grows
     169       downward the guard page is always at the start of the allocated area.  */
     170    if (__mprotect (guardend, size - guardsize, prot) != 0)
     171      return errno;
     172  #else
     173    size_t mprots1 = (uintptr_t) guard - (uintptr_t) mem;
     174    if (__mprotect (mem, mprots1, prot) != 0)
     175      return errno;
     176    size_t mprots2 = ((uintptr_t) mem + size) - (uintptr_t) guardend;
     177    if (__mprotect (guardend, mprots2, prot) != 0)
     178      return errno;
     179  #endif
     180    return 0;
     181  }
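          /* Illustrative sketch (not part of glibc): guard_position and
             setup_stack_prot implement the usual trick of mapping the whole
             block PROT_NONE and then enabling access everywhere except the
             guard page.  Standalone, for the common _STACK_GROWS_DOWN layout
             (guard at the low end) and a page-multiple GUARD, the technique
             could look as follows; make_guarded_block is a made-up name:

             #include <stddef.h>
             #include <sys/mman.h>

             static void *
             make_guarded_block (size_t size, size_t guard)
             {
               void *mem = mmap (NULL, size, PROT_NONE,
                                 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
               if (mem == MAP_FAILED)
                 return NULL;
               // Open up everything above the guard; [mem, mem + guard) keeps
               // faulting and catches overflows of the stack above it.
               if (mprotect ((char *) mem + guard, size - guard,
                             PROT_READ | PROT_WRITE) != 0)
                 {
                   munmap (mem, size);
                   return NULL;
                 }
               return mem;
             }  */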
     182  
     183  /* Advise the kernel that the unused part of the stack can be reclaimed.  It
     184     releases everything except for the space used for the TCB itself.  */
     185  static __always_inline void
     186  advise_stack_range (void *mem, size_t size, uintptr_t pd, size_t guardsize)
     187  {
     188    uintptr_t sp = (uintptr_t) CURRENT_STACK_FRAME;
     189    size_t pagesize_m1 = __getpagesize () - 1;
     190  #if _STACK_GROWS_DOWN && !defined(NEED_SEPARATE_REGISTER_STACK)
     191    size_t freesize = (sp - (uintptr_t) mem) & ~pagesize_m1;
     192    assert (freesize < size);
     193    if (freesize > PTHREAD_STACK_MIN)
     194      __madvise (mem, freesize - PTHREAD_STACK_MIN, MADV_DONTNEED);
     195  #else
     196    /* Page-aligned start of memory to free (higher than or equal
     197       to the current sp plus the minimum stack size).  */
     198    uintptr_t freeblock = (sp + PTHREAD_STACK_MIN + pagesize_m1) & ~pagesize_m1;
     199    uintptr_t free_end = (pd - guardsize) & ~pagesize_m1;
     200    if (free_end > freeblock)
     201      {
     202        size_t freesize = free_end - freeblock;
     203        assert (freesize < size);
     204        __madvise ((void*) freeblock, freesize, MADV_DONTNEED);
     205      }
     206  #endif
     207  }
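          /* Illustrative sketch (not part of glibc): advise_stack_range uses
             MADV_DONTNEED so the kernel may reclaim pages the thread no longer
             needs while the mapping itself stays intact; on an anonymous
             private mapping the released range reads back as zero-filled pages
             the next time it is touched.  A minimal version of the pattern,
             with the hypothetical helper release_unused:

             #include <stddef.h>
             #include <sys/mman.h>

             static int
             release_unused (void *mem, size_t unused, size_t pagesize)
             {
               // madvise works at page granularity; round the length down.
               size_t len = unused & ~(pagesize - 1);
               if (len == 0)
                 return 0;
               return madvise (mem, len, MADV_DONTNEED);
             }  */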
     208  
     209  /* Returns a usable stack for a new thread either by allocating a
     210     new stack or reusing a cached stack of sufficient size.
     211     ATTR must be non-NULL and point to a valid pthread_attr.
     212     PDP must be non-NULL.  */
     213  static int
     214  allocate_stack (const struct pthread_attr *attr, struct pthread **pdp,
     215  		void **stack, size_t *stacksize)
     216  {
     217    struct pthread *pd;
     218    size_t size;
     219    size_t pagesize_m1 = __getpagesize () - 1;
     220    size_t tls_static_size_for_stack = __nptl_tls_static_size_for_stack ();
     221    size_t tls_static_align_m1 = GLRO (dl_tls_static_align) - 1;
     222  
     223    assert (powerof2 (pagesize_m1 + 1));
     224    assert (TCB_ALIGNMENT >= STACK_ALIGN);
     225  
     226    /* Get the stack size from the attribute if it is set.  Otherwise we
     227       use the default we determined at start time.  */
     228    if (attr->stacksize != 0)
     229      size = attr->stacksize;
     230    else
     231      {
     232        lll_lock (__default_pthread_attr_lock, LLL_PRIVATE);
     233        size = __default_pthread_attr.internal.stacksize;
     234        lll_unlock (__default_pthread_attr_lock, LLL_PRIVATE);
     235      }
     236  
     237    /* Get memory for the stack.  */
     238    if (__glibc_unlikely (attr->flags & ATTR_FLAG_STACKADDR))
     239      {
     240        uintptr_t adj;
     241        char *stackaddr = (char *) attr->stackaddr;
     242  
     243        /* Assume the same layout as the _STACK_GROWS_DOWN case, with struct
     244  	 pthread at the top of the stack block.  Later we adjust the guard
     245  	 location and stack address to match the _STACK_GROWS_UP case.  */
     246        if (_STACK_GROWS_UP)
     247  	stackaddr += attr->stacksize;
     248  
     249        /* If the user also specified the size of the stack make sure it
     250  	 is large enough.  */
     251        if (attr->stacksize != 0
     252  	  && attr->stacksize < (tls_static_size_for_stack
     253  				+ MINIMAL_REST_STACK))
     254  	return EINVAL;
     255  
     256        /* Adjust stack size for alignment of the TLS block.  */
     257  #if TLS_TCB_AT_TP
     258        adj = ((uintptr_t) stackaddr - TLS_TCB_SIZE)
     259  	    & tls_static_align_m1;
     260        assert (size > adj + TLS_TCB_SIZE);
     261  #elif TLS_DTV_AT_TP
     262        adj = ((uintptr_t) stackaddr - tls_static_size_for_stack)
     263  	    & tls_static_align_m1;
     264        assert (size > adj);
     265  #endif
     266  
     267        /* The user provided some memory.  Let's hope it matches the
     268  	 size...  We do not allocate guard pages if the user provided
     269  	 the stack.  It is the user's responsibility to do this if it
     270  	 is wanted.  */
     271  #if TLS_TCB_AT_TP
     272        pd = (struct pthread *) ((uintptr_t) stackaddr
     273  			       - TLS_TCB_SIZE - adj);
     274  #elif TLS_DTV_AT_TP
     275        pd = (struct pthread *) (((uintptr_t) stackaddr
     276  				- tls_static_size_for_stack - adj)
     277  			       - TLS_PRE_TCB_SIZE);
     278  #endif
     279  
     280      /* The user-provided stack memory needs to be cleared.  */
     281        memset (pd, '\0', sizeof (struct pthread));
     282  
     283        /* The first TSD block is included in the TCB.  */
     284        pd->specific[0] = pd->specific_1stblock;
     285  
     286        /* Remember the stack-related values.  */
     287        pd->stackblock = (char *) stackaddr - size;
     288        pd->stackblock_size = size;
     289  
     290        /* This is a user-provided stack.  It will not be queued in the
     291  	 stack cache nor will the memory (except the TLS memory) be freed.  */
     292        pd->user_stack = true;
     293  
     294        /* This is at least the second thread.  */
     295        pd->header.multiple_threads = 1;
     296  
     297  #ifdef NEED_DL_SYSINFO
     298        SETUP_THREAD_SYSINFO (pd);
     299  #endif
     300  
     301        /* Don't allow setxid until cloned.  */
     302        pd->setxid_futex = -1;
     303  
     304        /* Allocate the DTV for this thread.  */
     305        if (_dl_allocate_tls (TLS_TPADJ (pd)) == NULL)
     306  	{
     307  	  /* Something went wrong.  */
     308  	  assert (errno == ENOMEM);
     309  	  return errno;
     310  	}
     311  
     312  
     313        /* Prepare to modify global data.  */
     314        lll_lock (GL (dl_stack_cache_lock), LLL_PRIVATE);
     315  
     316        /* And add to the list of stacks in use.  */
     317        list_add (&pd->list, &GL (dl_stack_user));
     318  
     319        lll_unlock (GL (dl_stack_cache_lock), LLL_PRIVATE);
     320      }
     321    else
     322      {
     323        /* Allocate some anonymous memory.  If possible use the cache.  */
     324        size_t guardsize;
     325        size_t reported_guardsize;
     326        size_t reqsize;
     327        void *mem;
     328        const int prot = (PROT_READ | PROT_WRITE
     329  			| ((GL(dl_stack_flags) & PF_X) ? PROT_EXEC : 0));
     330  
     331        /* Adjust the stack size for alignment.  */
     332        size &= ~tls_static_align_m1;
     333        assert (size != 0);
     334  
     335      /* Make sure the size of the stack is enough for the guard and,
     336  	 if needed, the thread descriptor.  On some targets there is
     337  	 a minimum guard size requirement, ARCH_MIN_GUARD_SIZE, so
     338  	 internally enforce it (unless the guard was disabled), but
     339  	 report the original guard size for backward compatibility:
     340  	 before POSIX 2008 the guardsize was specified to be one page
     341  	 by default which is observable via pthread_attr_getguardsize
     342  	 and pthread_getattr_np.  */
     343        guardsize = (attr->guardsize + pagesize_m1) & ~pagesize_m1;
     344        reported_guardsize = guardsize;
     345        if (guardsize > 0 && guardsize < ARCH_MIN_GUARD_SIZE)
     346  	guardsize = ARCH_MIN_GUARD_SIZE;
     347        if (guardsize < attr->guardsize || size + guardsize < guardsize)
     348  	/* Arithmetic overflow.  */
     349  	return EINVAL;
     350        size += guardsize;
     351        if (__builtin_expect (size < ((guardsize + tls_static_size_for_stack
     352  				     + MINIMAL_REST_STACK + pagesize_m1)
     353  				    & ~pagesize_m1),
     354  			    0))
     355  	/* The stack is too small (or the guard too large).  */
     356  	return EINVAL;
     357  
     358        /* Try to get a stack from the cache.  */
     359        reqsize = size;
     360        pd = get_cached_stack (&size, &mem);
     361        if (pd == NULL)
     362  	{
     363  	  /* If a guard page is required, avoid committing memory by first
     364  	     allocating with PROT_NONE and then setting the required permission
     365  	     with mprotect, excluding the guard page.  */
     366  	  mem = __mmap (NULL, size, (guardsize == 0) ? prot : PROT_NONE,
     367  			MAP_PRIVATE | MAP_ANONYMOUS | MAP_STACK, -1, 0);
     368  
     369  	  if (__glibc_unlikely (mem == MAP_FAILED))
     370  	    return errno;
     371  
     372  	  /* Issue madvise if the tunable glibc.pthread.stack_hugetlb is
     373  	     set to 0, to disable huge pages for this mapping.  */
     374  	  if (__glibc_unlikely (__nptl_stack_hugetlb == 0)
     375  	      && __madvise (mem, size, MADV_NOHUGEPAGE) != 0)
     376  	    return errno;
     377  
     378  	  /* SIZE is guaranteed to be greater than zero.
     379  	     So we can never get a null pointer back from mmap.  */
     380  	  assert (mem != NULL);
     381  
     382  	  /* Place the thread descriptor at the end of the stack.  */
     383  #if TLS_TCB_AT_TP
     384  	  pd = (struct pthread *) ((((uintptr_t) mem + size)
     385  				    - TLS_TCB_SIZE)
     386  				   & ~tls_static_align_m1);
     387  #elif TLS_DTV_AT_TP
     388  	  pd = (struct pthread *) ((((uintptr_t) mem + size
     389  				    - tls_static_size_for_stack)
     390  				    & ~tls_static_align_m1)
     391  				   - TLS_PRE_TCB_SIZE);
     392  #endif
     393  
     394  	  /* Now mprotect the required region excluding the guard area.  */
     395  	  if (__glibc_likely (guardsize > 0))
     396  	    {
     397  	      char *guard = guard_position (mem, size, guardsize, pd,
     398  					    pagesize_m1);
     399  	      if (setup_stack_prot (mem, size, guard, guardsize, prot) != 0)
     400  		{
     401  		  __munmap (mem, size);
     402  		  return errno;
     403  		}
     404  	    }
     405  
     406  	  /* Remember the stack-related values.  */
     407  	  pd->stackblock = mem;
     408  	  pd->stackblock_size = size;
     409  	  /* Record the guard size of the fresh allocation to avoid a
     410  	     redundant mprotect in the guard resize code below.  */
     411  	  pd->guardsize = guardsize;
     412  
     413  	  /* We allocated the first block of the thread-specific data array.
     414  	     This address will not change for the lifetime of this
     415  	     descriptor.  */
     416  	  pd->specific[0] = pd->specific_1stblock;
     417  
     418  	  /* This is at least the second thread.  */
     419  	  pd->header.multiple_threads = 1;
     420  
     421  #ifdef NEED_DL_SYSINFO
     422  	  SETUP_THREAD_SYSINFO (pd);
     423  #endif
     424  
     425  	  /* Don't allow setxid until cloned.  */
     426  	  pd->setxid_futex = -1;
     427  
     428  	  /* Allocate the DTV for this thread.  */
     429  	  if (_dl_allocate_tls (TLS_TPADJ (pd)) == NULL)
     430  	    {
     431  	      /* Something went wrong.  */
     432  	      assert (errno == ENOMEM);
     433  
     434  	      /* Free the stack memory we just allocated.  */
     435  	      (void) __munmap (mem, size);
     436  
     437  	      return errno;
     438  	    }
     439  
     440  
     441  	  /* Prepare to modify global data.  */
     442  	  lll_lock (GL (dl_stack_cache_lock), LLL_PRIVATE);
     443  
     444  	  /* And add to the list of stacks in use.  */
     445  	  __nptl_stack_list_add (&pd->list, &GL (dl_stack_used));
     446  
     447  	  lll_unlock (GL (dl_stack_cache_lock), LLL_PRIVATE);
     448  
     449  
     450  	  /* There might have been a race.  Another thread might have
     451  	     caused the stacks to get exec permission while this new
     452  	     stack was prepared.  Detect if this was possible and
     453  	     change the permission if necessary.  */
     454  	  if (__builtin_expect ((GL(dl_stack_flags) & PF_X) != 0
     455  				&& (prot & PROT_EXEC) == 0, 0))
     456  	    {
     457  	      int err = __nptl_change_stack_perm (pd);
     458  	      if (err != 0)
     459  		{
     460  		  /* Free the stack memory we just allocated.  */
     461  		  (void) __munmap (mem, size);
     462  
     463  		  return err;
     464  		}
     465  	    }
     466  
     467  
     468  	  /* Note that all of the stack and the thread descriptor are
     469  	     zeroed.  This means we do not have to initialize fields
     470  	     whose initial value is zero.  This is specifically true for
     471  	     the 'tid' field, which is always set back to zero once the
     472  	     stack is no longer used, and for the 'guardsize' field,
     473  	     which will be read next.  */
     474  	}
     475  
     476        /* Create or resize the guard area if necessary.  */
     477        if (__glibc_unlikely (guardsize > pd->guardsize))
     478  	{
     479  	  char *guard = guard_position (mem, size, guardsize, pd,
     480  					pagesize_m1);
     481  	  if (__mprotect (guard, guardsize, PROT_NONE) != 0)
     482  	    {
     483  	    mprot_error:
     484  	      lll_lock (GL (dl_stack_cache_lock), LLL_PRIVATE);
     485  
     486  	      /* Remove the thread from the list.  */
     487  	      __nptl_stack_list_del (&pd->list);
     488  
     489  	      lll_unlock (GL (dl_stack_cache_lock), LLL_PRIVATE);
     490  
     491  	      /* Get rid of the TLS block we allocated.  */
     492  	      _dl_deallocate_tls (TLS_TPADJ (pd), false);
     493  
     494  	      /* Free the stack memory regardless of whether the size
     495  		 of the cache is over the limit or not.  If this piece
     496  		 of memory caused problems we had better not use it
     497  		 anymore.  Possible errors are ignored; there is
     498  		 nothing we could do about them anyway.  */
     499  	      (void) __munmap (mem, size);
     500  
     501  	      return errno;
     502  	    }
     503  
     504  	  pd->guardsize = guardsize;
     505  	}
     506        else if (__builtin_expect (pd->guardsize - guardsize > size - reqsize,
     507  				 0))
     508  	{
     509  	  /* The old guard area is too large.  */
     510  
     511  #ifdef NEED_SEPARATE_REGISTER_STACK
     512  	  char *guard = mem + (((size - guardsize) / 2) & ~pagesize_m1);
     513  	  char *oldguard = mem + (((size - pd->guardsize) / 2) & ~pagesize_m1);
     514  
     515  	  if (oldguard < guard
     516  	      && __mprotect (oldguard, guard - oldguard, prot) != 0)
     517  	    goto mprot_error;
     518  
     519  	  if (__mprotect (guard + guardsize,
     520  			oldguard + pd->guardsize - guard - guardsize,
     521  			prot) != 0)
     522  	    goto mprot_error;
     523  #elif _STACK_GROWS_DOWN
     524  	  if (__mprotect ((char *) mem + guardsize, pd->guardsize - guardsize,
     525  			prot) != 0)
     526  	    goto mprot_error;
     527  #elif _STACK_GROWS_UP
     528           char *new_guard = (char *)(((uintptr_t) pd - guardsize)
     529                                      & ~pagesize_m1);
     530           char *old_guard = (char *)(((uintptr_t) pd - pd->guardsize)
     531                                      & ~pagesize_m1);
     532           /* The guard size difference might be > 0, but once rounded
     533              to the nearest page the size difference might be zero.  */
     534           if (new_guard > old_guard
     535               && __mprotect (old_guard, new_guard - old_guard, prot) != 0)
     536  	    goto mprot_error;
     537  #endif
     538  
     539  	  pd->guardsize = guardsize;
     540  	}
     541        /* pthread_getattr_np() needs to report the guard size
     542  	 requested in the attribute, regardless of how large the
     543  	 guard size actually in use is.  */
     544        pd->reported_guardsize = reported_guardsize;
     545      }
     546  
     547    /* Initialize the lock.  We have to do this unconditionally since the
     548       stillborn thread could be canceled while the lock is taken.  */
     549    pd->lock = LLL_LOCK_INITIALIZER;
     550  
     551    /* The robust mutex lists also need to be initialized
     552       unconditionally because the cleanup for the previous stack owner
     553       might have happened in the kernel.  */
     554    pd->robust_head.futex_offset = (offsetof (pthread_mutex_t, __data.__lock)
     555  				  - offsetof (pthread_mutex_t,
     556  					      __data.__list.__next));
     557    pd->robust_head.list_op_pending = NULL;
     558  #if __PTHREAD_MUTEX_HAVE_PREV
     559    pd->robust_prev = &pd->robust_head;
     560  #endif
     561    pd->robust_head.list = &pd->robust_head;
     562  
     563    /* We place the thread descriptor at the end of the stack.  */
     564    *pdp = pd;
     565  
     566    void *stacktop;
     567  
     568  #if TLS_TCB_AT_TP
     569    /* The stack begins before the TCB and the static TLS block.  */
     570    stacktop = ((char *) (pd + 1) - tls_static_size_for_stack);
     571  #elif TLS_DTV_AT_TP
     572    stacktop = (char *) (pd - 1);
     573  #endif
     574  
     575    *stacksize = stacktop - pd->stackblock;
     576    *stack = pd->stackblock;
     577  
     578    return 0;
     579  }
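          /* Illustrative sketch (not part of glibc): from the application
             side, the attribute fields consumed by allocate_stack are set
             through the standard pthread_attr_* interfaces, and the guard
             size reported back through the GNU extension pthread_getattr_np
             is the requested one (pd->reported_guardsize above), not the
             possibly larger guard enforced internally.  Error checking is
             elided for brevity:

             #define _GNU_SOURCE
             #include <pthread.h>
             #include <stdio.h>

             static void *
             worker (void *arg)
             {
               pthread_attr_t self;
               size_t stack = 0, guard = 0;
               void *addr = NULL;

               pthread_getattr_np (pthread_self (), &self);
               pthread_attr_getstack (&self, &addr, &stack);
               pthread_attr_getguardsize (&self, &guard);
               printf ("stack %zu bytes at %p, guard %zu bytes\n",
                       stack, addr, guard);
               pthread_attr_destroy (&self);
               return arg;
             }

             int
             main (void)
             {
               pthread_attr_t attr;
               pthread_t th;

               pthread_attr_init (&attr);
               pthread_attr_setstacksize (&attr, 1 << 20);   // 1 MiB stack
               pthread_attr_setguardsize (&attr, 1 << 16);   // 64 KiB guard
               pthread_create (&th, &attr, worker, NULL);
               pthread_join (th, NULL);
               pthread_attr_destroy (&attr);
               return 0;
             }  */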