1  /* POSIX spawn interface.  Linux version.
       2     Copyright (C) 2016-2023 Free Software Foundation, Inc.
       3     This file is part of the GNU C Library.
       4  
       5     The GNU C Library is free software; you can redistribute it and/or
       6     modify it under the terms of the GNU Lesser General Public
       7     License as published by the Free Software Foundation; either
       8     version 2.1 of the License, or (at your option) any later version.
       9  
      10     The GNU C Library is distributed in the hope that it will be useful,
      11     but WITHOUT ANY WARRANTY; without even the implied warranty of
      12     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
      13     Lesser General Public License for more details.
      14  
      15     You should have received a copy of the GNU Lesser General Public
      16     License along with the GNU C Library; if not, see
      17     <https://www.gnu.org/licenses/>.  */
      18  
      19  #include <internal-signals.h>
      20  #include <ldsodefs.h>
      21  #include <local-setxid.h>
      22  #include <not-cancel.h>
      23  #include <paths.h>
      24  #include <shlib-compat.h>
      25  #include <spawn.h>
      26  #include <spawn_int.h>
      27  #include <sysdep.h>
      28  #include <sys/resource.h>
      29  #include <clone_internal.h>
      30  
/* The Linux implementation of posix_spawn{p} uses the clone syscall directly
   with the CLONE_VM and CLONE_VFORK flags and an allocated stack.  The new
   stack and start function solve most of the vfork limitations (possible
   parent clobbering due to stack spilling).  The remaining issues are:

   1. No signal handler may run in the child context, to avoid corrupting
      the parent's state.
   2. The parent must ensure that the child's stack is freed.
   3. The child must synchronize with the parent to enforce 2. and to
      report possible exec failures.

   The first issue is solved by blocking all signals in the child, even
   the NPTL-internal ones (SIGCANCEL and SIGSETXID).  The second and
   third issues are handled by allocating the stack in the parent, and by
   using a field in struct posix_spawn_args where the child can write an
   error code.  CLONE_VFORK ensures that the parent does not run until the
   child has either exec'ed successfully or exited.  */
      48  
      49  
      50  /* The Unix standard contains a long explanation of the way to signal
      51     an error after the fork() was successful.  Since no new wait status
      52     was wanted there is no way to signal an error using one of the
      53     available methods.  The committee chose to signal an error by a
      54     normal program exit with the exit code 127.  */
      55  #define SPAWN_ERROR	127
      56  
      57  
/* Arguments handed from __spawnix (parent) to __spawni_child (child).
   The object is a local variable of __spawnix; since the child is created
   with CLONE_VM it reads the fields directly and reports failures back
   through ERR.  */
struct posix_spawn_args
{
  /* Signal mask saved by internal_signal_block_all in __spawnix.  The
     child restores it before exec unless POSIX_SPAWN_SETSIGMASK is set;
     the parent restores it before returning.  */
  internal_sigset_t oldmask;
  /* FILE argument of the spawn call, passed as first argument to EXEC.  */
  const char *file;
  /* Function used to execute FILE (__spawni selects __execvpex or
     __execve).  */
  int (*exec) (const char *, char *const *, char *const *);
  /* File actions to perform in the child; may be a null pointer.  */
  const posix_spawn_file_actions_t *fa;
  /* Spawn attributes; never null, __spawnix substitutes a zeroed object
     when the caller passes a null pointer.  */
  const posix_spawnattr_t *restrict attr;
  /* Argument vector for the new program.  */
  char *const *argv;
  /* Number of entries in ARGV as counted by __spawnix, which includes
     the terminating null pointer.  */
  ptrdiff_t argc;
  /* Environment for the new program.  */
  char *const *envp;
  /* SPAWN_XFLAGS_* bits from the internal caller.  */
  int xflags;
  /* True when the child was created through __clone3, false when the
     __clone_internal_fallback path was used.  */
  bool use_clone3;
  /* Set to 0 by the parent; the child stores a positive error code here
     before calling _exit when any setup step or the exec fails.  */
  int err;
};
      72  
      73  /* Older version requires that shell script without shebang definition
      74     to be called explicitly using /bin/sh (_PATH_BSHELL).  */
      75  static void
      76  maybe_script_execute (struct posix_spawn_args *args)
      77  {
      78    if (SHLIB_COMPAT (libc, GLIBC_2_2, GLIBC_2_15)
      79        && (args->xflags & SPAWN_XFLAGS_TRY_SHELL) && errno == ENOEXEC)
      80      {
      81        char *const *argv = args->argv;
      82        ptrdiff_t argc = args->argc;
      83  
      84        /* Construct an argument list for the shell.  */
      85        char *new_argv[argc + 2];
      86        new_argv[0] = (char *) _PATH_BSHELL;
      87        new_argv[1] = (char *) args->file;
      88        if (argc > 1)
      89  	memcpy (new_argv + 2, argv + 1, argc * sizeof (char *));
      90        else
      91  	new_argv[2] = NULL;
      92  
      93        /* Execute the shell.  */
      94        args->exec (new_argv[0], new_argv, args->envp);
      95      }
      96  }
      97  
/* Function used in the clone call to set up the signal dispositions, the
   posix_spawn attributes, the file actions and the signal mask, and then
   to execute the new program.  It runs on its own stack (provided by the
   posix_spawn call).

   ARGUMENTS points to the struct posix_spawn_args prepared by __spawnix.
   The function does not return to its caller: either the exec call
   succeeds, or the error code is stored in args->err and the child
   terminates with _exit (SPAWN_ERROR).  */
static int
__spawni_child (void *arguments)
{
  struct posix_spawn_args *args = arguments;
  const posix_spawnattr_t *restrict attr = args->attr;
  const posix_spawn_file_actions_t *file_actions = args->fa;

  /* The child must ensure that no signal handler is enabled because it
     shares memory with parent, so all signal dispositions must be either
     SIG_DFL or SIG_IGN.  If clone3/CLONE_CLEAR_SIGHAND is used, there is
     only the need to set the defined signals POSIX_SPAWN_SETSIGDEF to
     SIG_DFL; otherwise, the code iterates over all signals.  */
  struct sigaction sa;
  memset (&sa, '\0', sizeof (sa));

  /* Query the currently blocked signals without changing the mask (the
     new-set argument is null).  */
  sigset_t hset;
  __sigprocmask (SIG_BLOCK, 0, &hset);
  for (int sig = 1; sig < _NSIG; ++sig)
    {
      if ((attr->__flags & POSIX_SPAWN_SETSIGDEF)
	  && __sigismember (&attr->__sd, sig))
	{
	  /* The caller explicitly asked for the default disposition.  */
	  sa.sa_handler = SIG_DFL;
	}
      else if (!args->use_clone3 && __sigismember (&hset, sig))
	{
	  /* Fallback path without clone3: handle the blocked signals
	     one by one.  */
	  if (is_internal_signal (sig))
	    sa.sa_handler = SIG_IGN;
	  else
	    {
	      /* Read the current disposition; nothing to do if it is
		 already SIG_IGN or SIG_DFL, otherwise reset the handler
		 to SIG_DFL.  */
	      __libc_sigaction (sig, 0, &sa);
	      if (sa.sa_handler == SIG_IGN || sa.sa_handler == SIG_DFL)
		continue;
	      sa.sa_handler = SIG_DFL;
	    }
	}
      else
	continue;

      /* Install the disposition chosen above.  The result is not
	 checked.  */
      __libc_sigaction (sig, &sa, 0);
    }

#ifdef _POSIX_PRIORITY_SCHEDULING
  /* Set the scheduling algorithm and parameters.  If only
     POSIX_SPAWN_SETSCHEDPARAM is set the parameters alone are changed;
     if POSIX_SPAWN_SETSCHEDULER is set both policy and parameters are.  */
  if ((attr->__flags & (POSIX_SPAWN_SETSCHEDPARAM | POSIX_SPAWN_SETSCHEDULER))
      == POSIX_SPAWN_SETSCHEDPARAM)
    {
      if (__sched_setparam (0, &attr->__sp) == -1)
	goto fail;
    }
  else if ((attr->__flags & POSIX_SPAWN_SETSCHEDULER) != 0)
    {
      if (__sched_setscheduler (0, attr->__policy, &attr->__sp) == -1)
	goto fail;
    }
#endif

  /* Create a new session if requested.  */
  if ((attr->__flags & POSIX_SPAWN_SETSID) != 0
      && __setsid () < 0)
    goto fail;

  /* Set the process group ID.  */
  if ((attr->__flags & POSIX_SPAWN_SETPGROUP) != 0
      && __setpgid (0, attr->__pgrp) != 0)
    goto fail;

  /* Set the effective user and group IDs to the real ones.  */
  if ((attr->__flags & POSIX_SPAWN_RESETIDS) != 0
      && (local_seteuid (__getuid ()) != 0
	  || local_setegid (__getgid ()) != 0))
    goto fail;

  /* Execute the file actions in the order they were recorded; the first
     failing action aborts the spawn.  */
  if (file_actions != 0)
    {
      int cnt;
      /* RLIMIT_NOFILE is fetched lazily, at most once, and only when a
	 close action fails.  */
      struct rlimit64 fdlimit;
      bool have_fdlimit = false;

      for (cnt = 0; cnt < file_actions->__used; ++cnt)
	{
	  struct __spawn_action *action = &file_actions->__actions[cnt];

	  switch (action->tag)
	    {
	    case spawn_do_close:
	      if (__close_nocancel (action->action.close_action.fd) != 0)
		{
		  if (!have_fdlimit)
		    {
		      __getrlimit64 (RLIMIT_NOFILE, &fdlimit);
		      have_fdlimit = true;
		    }

		  /* Signal errors only for file descriptors out of range;
		     a failure to close an in-range descriptor is
		     ignored.  */
		  if (action->action.close_action.fd < 0
		      || action->action.close_action.fd >= fdlimit.rlim_cur)
		    goto fail;
		}
	      break;

	    case spawn_do_open:
	      {
		/* POSIX states that if fildes was already an open file
		   descriptor, it shall be closed before the new file is
		   opened.  This avoids potential issues when posix_spawn
		   plus an addopen action is called with the process
		   already at the maximum number of open file descriptors,
		   and also for multiple actions on single-open special
		   paths (like /dev/watchdog).  The result of this close is
		   deliberately not checked.  */
		__close_nocancel (action->action.open_action.fd);

		int ret = __open_nocancel (action->action.open_action.path,
					   action->action.
					   open_action.oflag | O_LARGEFILE,
					   action->action.open_action.mode);

		if (ret == -1)
		  goto fail;

		int new_fd = ret;

		/* Make sure the desired file descriptor is used: move the
		   freshly opened descriptor to the requested number and
		   close the temporary one.  */
		if (ret != action->action.open_action.fd)
		  {
		    if (__dup2 (new_fd, action->action.open_action.fd)
			!= action->action.open_action.fd)
		      goto fail;

		    if (__close_nocancel (new_fd) != 0)
		      goto fail;
		  }
	      }
	      break;

	    case spawn_do_dup2:
	      /* Austin Group issue #411 requires adddup2 action with source
		 and destination being equal to remove close-on-exec flag.  */
	      if (action->action.dup2_action.fd
		  == action->action.dup2_action.newfd)
		{
		  int fd = action->action.dup2_action.newfd;
		  int flags = __fcntl (fd, F_GETFD, 0);
		  if (flags == -1)
		    goto fail;
		  if (__fcntl (fd, F_SETFD, flags & ~FD_CLOEXEC) == -1)
		    goto fail;
		}
	      else if (__dup2 (action->action.dup2_action.fd,
			       action->action.dup2_action.newfd)
		       != action->action.dup2_action.newfd)
		goto fail;
	      break;

	    case spawn_do_chdir:
	      if (__chdir (action->action.chdir_action.path) != 0)
		goto fail;
	      break;

	    case spawn_do_fchdir:
	      if (__fchdir (action->action.fchdir_action.fd) != 0)
		goto fail;
	      break;

	    case spawn_do_closefrom:
	      {
		/* Try the close_range syscall first and use
		   __closefrom_fallback only if it fails.  */
		int lowfd = action->action.closefrom_action.from;
	        int r = INLINE_SYSCALL_CALL (close_range, lowfd, ~0U, 0);
		if (r != 0 && !__closefrom_fallback (lowfd, false))
		  goto fail;
	      } break;

	    case spawn_do_tcsetpgrp:
	      {
		/* Check if it is possible to avoid an extra syscall: when
		   a non-zero process group was requested through
		   POSIX_SPAWN_SETPGROUP it is already known, otherwise
		   ask the kernel.  */
		pid_t pgrp = (attr->__flags & POSIX_SPAWN_SETPGROUP) != 0
			       && attr->__pgrp != 0
			     ? attr->__pgrp : __getpgid (0);
		if (__tcsetpgrp (action->action.setpgrp_action.fd, pgrp) != 0)
		  goto fail;
	      }
	      /* Last case of the switch: control leaves the switch here
		 without an explicit break.  */
	    }
	}
    }

  /* Set the initial signal mask of the child if POSIX_SPAWN_SETSIGMASK
     is set, otherwise restore the previous one (the mask the parent had
     before it blocked all signals).  */
  if (attr->__flags & POSIX_SPAWN_SETSIGMASK)
    __sigprocmask (SIG_SETMASK, &attr->__ss, NULL);
  else
    internal_sigprocmask (SIG_SETMASK, &args->oldmask, NULL);

  /* Execute the new program.  Control continues below only if this
     fails.  */
  args->exec (args->file, args->argv, args->envp);

  /* This is compatibility function required to enable posix_spawn run
     script without shebang definition for older posix_spawn versions
     (2.15).  */
  maybe_script_execute (args);

fail:
  /* errno should have an appropriate non-zero value; otherwise,
     there's a bug in glibc or the kernel.  For lack of an error code
     (EINTERNALBUG) describing that, use ECHILD.  Another option would
     be to set args->err to some negative sentinel and have the parent
     abort(), but that seems needlessly harsh.  */
  args->err = errno ? : ECHILD;
  _exit (SPAWN_ERROR);
}
     308  
/* Spawn a new process executing FILE with the attributes described in
   *ATTRP.  Before running the process perform the actions described in
   FILE_ACTIONS.

   PID, if not null, receives the child's process ID on success.  ARGV and
   ENVP are passed to EXEC, the function used to execute FILE in the
   child.  XFLAGS carries the internal SPAWN_XFLAGS_* bits.

   Returns 0 on success, otherwise an error code: E2BIG if ARGV has too
   many entries, the errno value of a failed mmap or clone, or the error
   reported by the child through args.err.  */
static int
__spawnix (pid_t * pid, const char *file,
	   const posix_spawn_file_actions_t * file_actions,
	   const posix_spawnattr_t * attrp, char *const argv[],
	   char *const envp[], int xflags,
	   int (*exec) (const char *, char *const *, char *const *))
{
  pid_t new_pid;
  struct posix_spawn_args args;
  int ec;

  /* To avoid imposing hard limits on posix_spawn{p} the total number of
     arguments is first calculated to allocate a mmap to hold all possible
     values.  */
  ptrdiff_t argc = 0;
  /* Linux allows at most max (0x7FFFFFFF, 1/4 stack size) arguments
     to be used in a execve call.  We limit to INT_MAX minus one due the
     compatibility code that may execute a shell script (maybe_script_execute)
     where it will construct another argument list with an additional
     argument.  */
  ptrdiff_t limit = INT_MAX - 1;
  /* Note that the post-increment also counts the terminating null
     pointer, so ARGC ends up as the number of arguments plus one.  */
  while (argv[argc++] != NULL)
    if (argc == limit)
      {
	errno = E2BIG;
	return errno;
      }

  /* The child's stack is executable only if the process stack flags
     (dl_stack_flags) request it.  */
  int prot = (PROT_READ | PROT_WRITE
	     | ((GL (dl_stack_flags) & PF_X) ? PROT_EXEC : 0));

  /* Add a slack area for child's stack.  */
  size_t argv_size = (argc * sizeof (void *)) + 512;
  /* We need at least a few pages in case the compiler's stack checking is
     enabled.  In some configs, it is known to use at least 24KiB.  We use
     32KiB to be "safe" from anything the compiler might do.  Besides, the
     extra pages won't actually be allocated unless they get used.
     It also acts the slack for spawn_closefrom (including MIPS64 getdents64
     where it might use about 1k extra stack space).  */
  argv_size += (32 * 1024);
  size_t stack_size = ALIGN_UP (argv_size, GLRO(dl_pagesize));
  void *stack = __mmap (NULL, stack_size, prot,
			MAP_PRIVATE | MAP_ANONYMOUS | MAP_STACK, -1, 0);
  if (__glibc_unlikely (stack == MAP_FAILED))
    return errno;

  /* Disable cancellation for the calling thread; the previous state is
     saved in STATE and restored before returning.  */
  int state;
  __pthread_setcancelstate (PTHREAD_CANCEL_DISABLE, &state);

  /* Child must set args.err to something non-negative - we rely on
     the parent and child sharing VM.  */
  args.err = 0;
  args.file = file;
  args.exec = exec;
  args.fa = file_actions;
  /* Use a zeroed attribute object when the caller did not supply one, so
     the child never has to check for a null pointer.  */
  args.attr = attrp ? attrp : &(const posix_spawnattr_t) { 0 };
  args.argv = argv;
  args.argc = argc;
  args.envp = envp;
  args.xflags = xflags;

  /* Block all signals; the previous mask is saved in args.oldmask and
     restored below (and, unless POSIX_SPAWN_SETSIGMASK is set, by the
     child before exec).  */
  internal_signal_block_all (&args.oldmask);

  /* The clone flags used will create a new child that will run in the same
     memory space (CLONE_VM) and the execution of calling thread will be
     suspended until the child calls execve or _exit.

     Also since the calling thread execution will be suspended, there is no
     need for CLONE_SETTLS.  Although parent and child share the same TLS
     namespace, there will be no concurrent access for TLS variables (errno
     for instance).  */
  struct clone_args clone_args =
    {
      /* Unsupported flags like CLONE_CLEAR_SIGHAND will be cleared up by
	 __clone_internal_fallback.  */
      .flags = CLONE_CLEAR_SIGHAND | CLONE_VM | CLONE_VFORK,
      .exit_signal = SIGCHLD,
      .stack = (uintptr_t) stack,
      .stack_size = stack_size,
    };
#ifdef HAVE_CLONE3_WRAPPER
  /* use_clone3 must be set before the clone call because the child reads
     it to decide how to reset the signal dispositions.  */
  args.use_clone3 = true;
  new_pid = __clone3 (&clone_args, sizeof (clone_args), __spawni_child,
		      &args);
  /* clone3 was added in 5.3 and CLONE_CLEAR_SIGHAND in 5.5.  When the
     wrapper is not available the block below runs unconditionally.  */
  if (new_pid == -1 && (errno == ENOSYS || errno == EINVAL))
#endif
    {
      args.use_clone3 = false;
      new_pid = __clone_internal_fallback (&clone_args, __spawni_child,
					   &args);
    }

  /* It needs to collect the case where the auxiliary process was created
     but failed to execute the file (due either any preparation step or
     for execve itself).  */
  if (new_pid > 0)
    {
      /* Also, it handles the unlikely case where the auxiliary process was
	 terminated before calling execve as if it was successfully.  The
	 args.err is set to 0 as default and changed to a positive value
	 only in case of failure, so in case of premature termination
	 due a signal args.err will remain zeroed and it will be up to
	 caller to actually collect it.  */
      ec = args.err;
      if (ec > 0)
	/* There still an unlikely case where the child is cancelled after
	   setting args.err, due to a positive error value.  Also there is
	   possible pid reuse race (where the kernel allocated the same pid
	   to an unrelated process).  Unfortunately due synchronization
	   issues where the kernel might not have the process collected
	   the waitpid below can not use WNOHANG.  */
	__waitpid (new_pid, NULL, 0);
    }
  else
    /* The clone call itself failed; report its errno.  */
    ec = errno;

  /* The child has exec'ed or exited by now, so its stack can be
     released.  */
  __munmap (stack, stack_size);

  /* Report the child's PID only on success.  */
  if ((ec == 0) && (pid != NULL))
    *pid = new_pid;

  internal_signal_restore_set (&args.oldmask);

  __pthread_setcancelstate (state, NULL);

  return ec;
}
     440  
     441  /* Spawn a new process executing PATH with the attributes describes in *ATTRP.
     442     Before running the process perform the actions described in FILE-ACTIONS. */
     443  int
     444  __spawni (pid_t * pid, const char *file,
     445  	  const posix_spawn_file_actions_t * acts,
     446  	  const posix_spawnattr_t * attrp, char *const argv[],
     447  	  char *const envp[], int xflags)
     448  {
     449    /* It uses __execvpex to avoid run ENOEXEC in non compatibility mode (it
     450       will be handled by maybe_script_execute).  */
     451    return __spawnix (pid, file, acts, attrp, argv, envp, xflags,
     452  		    xflags & SPAWN_XFLAGS_USE_PATH ? __execvpex :__execve);
     453  }