strace-6.5/src/filter_seccomp.c
/*
 * Copyright (c) 2018 Chen Jingpiao <chenjingpiao@gmail.com>
 * Copyright (c) 2019 Paul Chaignon <paul.chaignon@gmail.com>
 * Copyright (c) 2018-2023 The strace developers.
 * All rights reserved.
 *
 * SPDX-License-Identifier: LGPL-2.1-or-later
 */

#include "defs.h"

#include "ptrace.h"
#include <signal.h>
#include <sys/prctl.h>
#include <sys/wait.h>
#include <linux/filter.h>

#include "filter_seccomp.h"
#include "number_set.h"
#include "scno.h"

bool seccomp_filtering;
bool seccomp_before_sysentry;

#include <linux/seccomp.h>

#ifndef BPF_MAXINSNS
# define BPF_MAXINSNS 4096
#endif

#define JMP_PLACEHOLDER_NEXT  ((unsigned char) -1)
#define JMP_PLACEHOLDER_TRACE ((unsigned char) -2)
#define JMP_PLACEHOLDER_ALLOW ((unsigned char) -3)

#define SET_BPF(filter, code, jt, jf, k) \
	(*(filter) = (struct sock_filter) { code, jt, jf, k })

#define SET_BPF_STMT(filter, code, k) \
	SET_BPF(filter, code, 0, 0, k)

#define SET_BPF_JUMP(filter, code, k, jt, jf) \
	SET_BPF(filter, BPF_JMP | code, jt, jf, k)
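
/*
 * For illustration (not used by the code): SET_BPF_JUMP already ORs in
 * BPF_JMP, so
 *
 *	SET_BPF_JUMP(&filter[pos], BPF_JEQ | BPF_K, __NR_gettid, 1, 0);
 *
 * stores { BPF_JMP | BPF_JEQ | BPF_K, jt = 1, jf = 0, k = __NR_gettid }
 * at filter[pos], i.e. "if (A == __NR_gettid) skip one instruction,
 * else fall through".
 */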

typedef unsigned short (*filter_generator_t)(struct sock_filter *,
					     bool *overflow);
static unsigned short linear_filter_generator(struct sock_filter *,
					      bool *overflow);
static unsigned short binary_match_filter_generator(struct sock_filter *,
						    bool *overflow);
static filter_generator_t filter_generators[] = {
	linear_filter_generator,
	binary_match_filter_generator,
};

/*
 * Keep some margin in seccomp_filter as programs larger than allowed may
 * be constructed before we discard them.
 */
static struct sock_filter
filters[ARRAY_SIZE(filter_generators)][2 * BPF_MAXINSNS];
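
/*
 * The .len field doubles as a sentinel: USHRT_MAX means no generated
 * program has been accepted yet.  A generator's output only replaces the
 * current program when it is strictly shorter (see
 * check_seccomp_filter_properties).
 */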
static struct sock_fprog bpf_prog = {
	.len = USHRT_MAX,
	.filter = NULL,
};

#ifdef HAVE_FORK

static void ATTRIBUTE_NORETURN
check_seccomp_order_do_child(void)
{
	static const struct sock_filter filter[] = {
		/* return (nr == __NR_gettid) ? RET_TRACE : RET_ALLOW; */
		BPF_STMT(BPF_LD | BPF_W | BPF_ABS,
			 offsetof(struct seccomp_data, nr)),
		BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, __NR_gettid, 0, 1),
		BPF_STMT(BPF_RET | BPF_K, SECCOMP_RET_TRACE),
		BPF_STMT(BPF_RET | BPF_K, SECCOMP_RET_ALLOW)
	};
	static const struct sock_fprog prog = {
		.len = ARRAY_SIZE(filter),
		.filter = (struct sock_filter *) filter
	};

	/* Get everything ready before PTRACE_TRACEME.  */
	if (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0) < 0)
		perror_func_msg_and_die("prctl(PR_SET_NO_NEW_PRIVS)");
	if (prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog) < 0)
		perror_func_msg_and_die("prctl(PR_SET_SECCOMP)");
	int pid = getpid();

	if (ptrace(PTRACE_TRACEME, 0L, 0L, 0L) < 0) {
		/* Exit with a nonzero exit status.  */
		perror_func_msg_and_die("PTRACE_TRACEME");
	}

	GCOV_DUMP;

	kill(pid, SIGSTOP);
	syscall(__NR_gettid);
	_exit(0);
}

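/*
 * The tracer expects the following sequence of stops from the child above:
 *   #0  the initial SIGSTOP;
 *   #1, #2  the seccomp stop and the syscall-entry stop for gettid, in
 *           either order; which one comes first is exactly what is probed;
 *   #3  the syscall-exit stop for gettid;
 *   #4  the syscall-entry stop for exit_group;
 *   #5  the exit of the child with status 0.
 * Any deviation leaves seccomp_filtering disabled.
 */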
static int
check_seccomp_order_tracer(int pid)
{
	for (unsigned int step = 0; ; ++step) {
		int status;

		for (;;) {
			long rc = waitpid(pid, &status, 0);
			if (rc < 0 && errno == EINTR)
				continue;
			if (rc == pid)
				break;
			/* Cannot happen.  */
			perror_func_msg("#%d: unexpected wait result %ld",
					step, rc);
			return pid;
		}

		if (WIFEXITED(status)) {
			/* The tracee is no more.  */
			pid = 0;

			int exitstatus = WEXITSTATUS(status);
			if (step == 5 && exitstatus == 0) {
				seccomp_filtering = true;
			} else {
				error_func_msg("#%d: unexpected exit status %u",
					       step, exitstatus);
			}
			break;
		}

		if (WIFSIGNALED(status)) {
			/* The tracee is no more.  */
			pid = 0;

			error_func_msg("#%d: unexpected signal %u",
				       step, WTERMSIG(status));
			break;
		}

		if (!WIFSTOPPED(status)) {
			/* Cannot happen.  */
			error_func_msg("#%d: unexpected wait status %#x",
				       step, status);
			break;
		}

		unsigned int event = (unsigned int) status >> 16;

		switch (WSTOPSIG(status)) {
		case SIGSTOP:
			if (step != 0) {
				error_func_msg("#%d: unexpected signal stop",
					       step);
				return pid;
			}
			if (ptrace(PTRACE_SETOPTIONS, pid, 0L,
				   PTRACE_O_TRACESYSGOOD|
				   PTRACE_O_TRACESECCOMP) < 0) {
				perror_func_msg("PTRACE_SETOPTIONS");
				return pid;
			}
			break;

		case SIGTRAP:
			if (event != PTRACE_EVENT_SECCOMP) {
				error_func_msg("#%d: unexpected trap %#x",
					       step, event);
				return pid;
			}

			switch (step) {
			case 1: /* Seccomp stop before entering gettid.  */
				seccomp_before_sysentry = true;
				break;
			case 2: /* Seccomp stop after entering gettid.  */
				if (!seccomp_before_sysentry)
					break;
				ATTRIBUTE_FALLTHROUGH;
			default:
				error_func_msg("#%d: unexpected seccomp stop",
					       step);
				return pid;
			}
			break;

		case SIGTRAP | 0x80:
			switch (step) {
			case 3: /* Exiting gettid.  */
			case 4: /* Entering exit_group.  */
				break;
			case 1: /* Entering gettid before seccomp stop.  */
				seccomp_before_sysentry = false;
				break;
			case 2: /* Entering gettid after seccomp stop.  */
				if (seccomp_before_sysentry)
					break;
				ATTRIBUTE_FALLTHROUGH;
			default:
				error_func_msg("#%d: unexpected syscall stop",
					       step);
				return pid;
			}
			break;

		default:
			error_func_msg("#%d: unexpected stop signal %#x",
				       step, WSTOPSIG(status));
			return pid;
		}

		if (ptrace(PTRACE_SYSCALL, pid, 0L, 0L) < 0) {
			/* Cannot happen.  */
			perror_func_msg("#%d: PTRACE_SYSCALL", step);
			break;
		}
	}

	return pid;
}
#endif /* HAVE_FORK */

static void
check_seccomp_order(void)
{
	seccomp_filtering = false;

	/* NOMMU systems provide no fork, which the test requires.  */
#ifdef HAVE_FORK
	int pid = fork();
	if (pid < 0) {
		perror_func_msg("fork");
		return;
	}

	if (pid == 0)
		check_seccomp_order_do_child();

	pid = check_seccomp_order_tracer(pid);
	if (pid) {
		kill(pid, SIGKILL);
		for (;;) {
			long rc = waitpid(pid, NULL, 0);
			if (rc < 0 && errno == EINTR)
				continue;
			break;
		}
	}
#endif /* HAVE_FORK */
}

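/*
 * Tell whether syscall number scno of personality p must be reported by the
 * seccomp filter: either the user asked for it via the trace set, or it
 * belongs to a class strace always has to observe, e.g. memory-mapping
 * changes when stack traces are enabled, or comm changes when PID comm
 * decoding is on.
 */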
static bool
traced_by_seccomp(unsigned int scno, unsigned int p)
{
	unsigned int always_trace_flags =
		TRACE_INDIRECT_SUBCALL | TRACE_SECCOMP_DEFAULT |
		(stack_trace_enabled ? MEMORY_MAPPING_CHANGE : 0) |
		(is_number_in_set(DECODE_PID_COMM, decode_pid_set) ?
		 COMM_CHANGE : 0);
	return sysent_vec[p][scno].sys_flags & always_trace_flags ||
		is_number_in_set_array(scno, trace_set, p);
}

static void
replace_jmp_placeholders(unsigned char *jmp_offset, unsigned char jmp_next,
			 unsigned char jmp_trace, unsigned char jmp_allow)
{
	switch (*jmp_offset) {
	case JMP_PLACEHOLDER_NEXT:
		*jmp_offset = jmp_next;
		break;
	case JMP_PLACEHOLDER_TRACE:
		*jmp_offset = jmp_trace;
		break;
	case JMP_PLACEHOLDER_ALLOW:
		*jmp_offset = jmp_allow;
		break;
	default:
		break;
	}
}

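/*
 * Emit a check for the syscall number range [lower, upper).  For
 * illustration, the range [321, 324) produces two instructions:
 *
 *	JUMP(BPF_JGE, jt=0, jf=1, k=321)	skip next insn if nr < 321
 *	JUMP(BPF_JGE, jt=0, jf=TRACE, k=324)	goto RET_TRACE if nr < 324
 *
 * so the (yet unresolved) RET_TRACE placeholder is taken exactly when
 * 321 <= nr < 324, and execution otherwise continues past the pair to the
 * next range check.
 */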
static unsigned short
bpf_syscalls_cmp(struct sock_filter *filter,
		 unsigned int lower, unsigned int upper)
{
	if (lower + 1 == upper) {
		/* if (nr == lower) return RET_TRACE; */
		SET_BPF_JUMP(filter, BPF_JEQ | BPF_K, lower,
			     JMP_PLACEHOLDER_TRACE, 0);
		return 1;
	} else {
		/* if (nr >= lower && nr < upper) return RET_TRACE; */
		SET_BPF_JUMP(filter, BPF_JGE | BPF_K, lower, 0, 1);
		SET_BPF_JUMP(filter + 1, BPF_JGE | BPF_K, upper, 0,
			     JMP_PLACEHOLDER_TRACE);
		return 2;
	}
}

static unsigned short
linear_filter_generator(struct sock_filter *filter, bool *overflow)
{
	/*
	 * Generated program looks like:
	 * if (arch == AUDIT_ARCH_A && nr >= flag) {
	 *	if (nr == 59)
	 *		return SECCOMP_RET_TRACE;
	 *	if (nr >= 321 && nr <= 323)
	 *		return SECCOMP_RET_TRACE;
	 *	...
	 *	return SECCOMP_RET_ALLOW;
	 * }
	 * if (arch == AUDIT_ARCH_A) {
	 *	...
	 * }
	 * if (arch == AUDIT_ARCH_B) {
	 *	...
	 * }
	 * return SECCOMP_RET_TRACE;
	 */
	unsigned short pos = 0;

#if SUPPORTED_PERSONALITIES > 1
	SET_BPF_STMT(&filter[pos++], BPF_LD | BPF_W | BPF_ABS,
		     offsetof(struct seccomp_data, arch));
#endif

	/*
	 * Personalities are iterated in reverse order in the BPF program so
	 * that the x86 case is naturally handled.  On x86, the first and third
	 * personalities have the same arch identifier.  The third can be
	 * distinguished based on its associated syscall flag, so we check it
	 * first.  The only drawback here is that the first personality is more
	 * common, which may make the BPF program slower to match syscalls on
	 * average.
	 */
	for (int p = SUPPORTED_PERSONALITIES - 1; p >= 0; --p) {
		unsigned int lower = UINT_MAX;
		unsigned short start = pos, end;

#if SUPPORTED_PERSONALITIES > 1
		/* if (arch != audit_arch_vec[p].arch) goto next; */
		SET_BPF_JUMP(&filter[pos++], BPF_JEQ | BPF_K,
			     audit_arch_vec[p].arch, 0, JMP_PLACEHOLDER_NEXT);
#endif
		SET_BPF_STMT(&filter[pos++], BPF_LD | BPF_W | BPF_ABS,
			     offsetof(struct seccomp_data, nr));

#if SUPPORTED_PERSONALITIES > 1
		if (audit_arch_vec[p].flag) {
			/* if (nr < audit_arch_vec[p].flag) goto next; */
			SET_BPF_JUMP(&filter[pos++], BPF_JGE | BPF_K,
				     audit_arch_vec[p].flag, 2, 0);
			SET_BPF_STMT(&filter[pos++], BPF_LD | BPF_W | BPF_ABS,
				     offsetof(struct seccomp_data, arch));
			SET_BPF_JUMP(&filter[pos++], BPF_JA,
				     JMP_PLACEHOLDER_NEXT, 0, 0);
		}
#endif

		for (unsigned int i = 0; i < nsyscall_vec[p]; ++i) {
			if (traced_by_seccomp(i, p)) {
				if (lower == UINT_MAX)
					lower = i;
				continue;
			}
			if (lower == UINT_MAX)
				continue;
			pos += bpf_syscalls_cmp(filter + pos,
						lower | audit_arch_vec[p].flag,
						i | audit_arch_vec[p].flag);
			lower = UINT_MAX;
		}
		if (lower != UINT_MAX)
			pos += bpf_syscalls_cmp(filter + pos,
						lower | audit_arch_vec[p].flag,
						nsyscall_vec[p]
						| audit_arch_vec[p].flag);
		end = pos;

		/* if (nr >= max_nr) return RET_TRACE; */
		SET_BPF_JUMP(&filter[pos++], BPF_JGE | BPF_K,
			     nsyscall_vec[p] | audit_arch_vec[p].flag, 1, 0);

		SET_BPF_STMT(&filter[pos++], BPF_RET | BPF_K,
			     SECCOMP_RET_ALLOW);
		SET_BPF_STMT(&filter[pos++], BPF_RET | BPF_K,
			     SECCOMP_RET_TRACE);

		/*
		 * Within generated BPF programs, the origin and destination of
		 * jumps are always in the same personality section.  The
		 * largest jump is therefore the jump from the first
		 * instruction of the section to the last, to skip the
		 * personality and try to compare .arch to the next
		 * personality.
		 * If we have a personality section with more than 255
		 * instructions, the jump offset will overflow.  Such a
		 * program is unlikely to occur in practice, so we simply
		 * disable seccomp filtering in such a case.
		 */
		if (pos - start > UCHAR_MAX) {
			*overflow = true;
			return pos;
		}

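		/*
		 * At this point the last three instructions of the section
		 * are the JGE guard at pos - 3, RET_ALLOW at pos - 2, and
		 * RET_TRACE at pos - 1.  A conditional jump at index i lands
		 * on the instruction at i + 1 + offset, hence the offsets
		 * below: e.g. an offset of pos - i - 2 reaches RET_TRACE at
		 * index pos - 1.
		 */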
		for (unsigned int i = start; i < end; ++i) {
			if (BPF_CLASS(filter[i].code) != BPF_JMP)
				continue;
			unsigned char jmp_next = pos - i - 1;
			unsigned char jmp_trace = pos - i - 2;
			unsigned char jmp_allow = pos - i - 3;
			replace_jmp_placeholders(&filter[i].jt, jmp_next,
						 jmp_trace, jmp_allow);
			replace_jmp_placeholders(&filter[i].jf, jmp_next,
						 jmp_trace, jmp_allow);
			if (BPF_OP(filter[i].code) == BPF_JA)
				filter[i].k = (unsigned int) jmp_next;
		}
	}

#if SUPPORTED_PERSONALITIES > 1
	/* Jumps conditioned on .arch default to this RET_TRACE. */
	SET_BPF_STMT(&filter[pos++], BPF_RET | BPF_K, SECCOMP_RET_TRACE);
#endif

	return pos;
}

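/*
 * Emit a match against one 32-syscall bitmask.  The generated code assumes
 * A == nr / 32 and X == 1 << (nr % 32), as set up by
 * binary_match_filter_generator below.  Worked example (illustrative): for
 * nr == 59 the word index is 59 / 32 == 1 and the bit is 1 << (59 % 32)
 * == 1 << 27, so the JSET test fires exactly when bit 27 of the bitmask
 * for word 1 is set.
 */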
static unsigned short
bpf_syscalls_match(struct sock_filter *filter, unsigned int bitarray,
		   unsigned int bitarray_idx)
{
	if (!bitarray) {
		/* return RET_ALLOW; */
		SET_BPF_JUMP(filter, BPF_JEQ | BPF_K, bitarray_idx,
			     JMP_PLACEHOLDER_ALLOW, 0);
		return 1;
	}
	if (bitarray == UINT_MAX) {
		/* return RET_TRACE; */
		SET_BPF_JUMP(filter, BPF_JEQ | BPF_K, bitarray_idx,
			     JMP_PLACEHOLDER_TRACE, 0);
		return 1;
	}
	/*
	 * if (A == nr / 32)
	 *   return (X & bitarray) ? RET_TRACE : RET_ALLOW;
	 */
	SET_BPF_JUMP(filter, BPF_JEQ | BPF_K, bitarray_idx,
		     0, 2);
	SET_BPF_STMT(filter + 1, BPF_MISC | BPF_TXA, 0);
	SET_BPF_JUMP(filter + 2, BPF_JSET | BPF_K, bitarray,
		     JMP_PLACEHOLDER_TRACE, JMP_PLACEHOLDER_ALLOW);
	return 3;
}

static unsigned short
binary_match_filter_generator(struct sock_filter *filter, bool *overflow)
{
	unsigned short pos = 0;

#if SUPPORTED_PERSONALITIES > 1
	SET_BPF_STMT(&filter[pos++], BPF_LD | BPF_W | BPF_ABS,
		     offsetof(struct seccomp_data, arch));
#endif

	/*
	 * Personalities are iterated in reverse order in the BPF program so
	 * that the x86 case is naturally handled.  On x86, the first and
	 * third personalities have the same arch identifier.  The third can
	 * be distinguished based on its associated bit mask, so we check it
	 * first.  The only drawback here is that the first personality is
	 * more common, which may make the BPF program slower to match
	 * syscalls on average.
	 */
	for (int p = SUPPORTED_PERSONALITIES - 1;
		 p >= 0 && pos <= BPF_MAXINSNS;
		 --p) {
		unsigned short start = pos, end;
		unsigned int bitarray = 0;
		unsigned int i;

#if SUPPORTED_PERSONALITIES > 1
		SET_BPF_JUMP(&filter[pos++], BPF_JEQ | BPF_K,
			     audit_arch_vec[p].arch, 0, JMP_PLACEHOLDER_NEXT);
#endif
		SET_BPF_STMT(&filter[pos++], BPF_LD | BPF_W | BPF_ABS,
			     offsetof(struct seccomp_data, nr));

#if SUPPORTED_PERSONALITIES > 1
		if (audit_arch_vec[p].flag) {
			SET_BPF_JUMP(&filter[pos++], BPF_JGE | BPF_K,
				     audit_arch_vec[p].flag, 2, 0);
			SET_BPF_STMT(&filter[pos++], BPF_LD | BPF_W | BPF_ABS,
				     offsetof(struct seccomp_data, arch));
			SET_BPF_JUMP(&filter[pos++], BPF_JA,
				     JMP_PLACEHOLDER_NEXT, 0, 0);

			/* nr = nr & ~mask */
			SET_BPF_STMT(&filter[pos++], BPF_ALU | BPF_AND | BPF_K,
				     ~audit_arch_vec[p].flag);
		}
#endif

		/* X = 1 << (nr % 32) = 1 << (nr & 0x1F); */
		SET_BPF_STMT(&filter[pos++], BPF_ALU | BPF_AND | BPF_K, 0x1F);
		SET_BPF_STMT(&filter[pos++], BPF_MISC | BPF_TAX, 0);
		SET_BPF_STMT(&filter[pos++], BPF_LD | BPF_IMM, 1);
		SET_BPF_STMT(&filter[pos++], BPF_ALU | BPF_LSH | BPF_X, 0);
		SET_BPF_STMT(&filter[pos++], BPF_MISC | BPF_TAX, 0);

		/* A = nr / 32 = nr >> 5; */
		SET_BPF_STMT(&filter[pos++], BPF_LD | BPF_W | BPF_ABS,
			     offsetof(struct seccomp_data, nr));
		if (audit_arch_vec[p].flag) {
			/* nr = nr & ~mask */
			SET_BPF_STMT(&filter[pos++], BPF_ALU | BPF_AND | BPF_K,
				     ~audit_arch_vec[p].flag);
		}
		SET_BPF_STMT(&filter[pos++], BPF_ALU | BPF_RSH | BPF_K, 5);

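		/*
		 * Register state at this point, e.g. (illustrative) for
		 * nr == 59: X == 1 << 27, the bit within the 32-bit word,
		 * and A == 1, the index of that word in the bit array.
		 */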
		for (i = 0; i < nsyscall_vec[p] && pos <= BPF_MAXINSNS; ++i) {
			if (traced_by_seccomp(i, p))
				bitarray |= (1 << i % 32);
			if (i % 32 == 31) {
				pos += bpf_syscalls_match(filter + pos,
							  bitarray, i / 32);
				bitarray = 0;
			}
		}
		if (i % 32 != 0)
			pos += bpf_syscalls_match(filter + pos, bitarray,
						  i / 32);

		end = pos;

		SET_BPF_STMT(&filter[pos++], BPF_RET | BPF_K,
			     SECCOMP_RET_ALLOW);
		SET_BPF_STMT(&filter[pos++], BPF_RET | BPF_K,
			     SECCOMP_RET_TRACE);

		if (pos - start > UCHAR_MAX) {
			*overflow = true;
			return pos;
		}

		for (unsigned int i = start; i < end; ++i) {
			if (BPF_CLASS(filter[i].code) != BPF_JMP)
				continue;
			unsigned char jmp_next = pos - i - 1;
			unsigned char jmp_trace = pos - i - 2;
			unsigned char jmp_allow = pos - i - 3;
			replace_jmp_placeholders(&filter[i].jt, jmp_next,
						 jmp_trace, jmp_allow);
			replace_jmp_placeholders(&filter[i].jf, jmp_next,
						 jmp_trace, jmp_allow);
			if (BPF_OP(filter[i].code) == BPF_JA)
				filter[i].k = (unsigned int) jmp_next;
		}
	}

#if SUPPORTED_PERSONALITIES > 1
	SET_BPF_STMT(&filter[pos++], BPF_RET | BPF_K, SECCOMP_RET_TRACE);
#endif

	return pos;
}

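/*
 * Both generators are run and the shortest successfully generated program
 * wins: the linear generator scales with the number of contiguous ranges
 * of traced syscalls, while the binary-match generator emits a roughly
 * constant number of instructions per 32-syscall word, so either can come
 * out ahead depending on the trace set.
 */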
static void
check_seccomp_filter_properties(void)
{
	int rc = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, NULL, 0, 0);
	seccomp_filtering = rc < 0 && errno != EINVAL;
	if (!seccomp_filtering) {
		debug_func_perror_msg("prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER)");
		return;
	}

	for (unsigned int i = 0; i < ARRAY_SIZE(filter_generators); ++i) {
		bool overflow = false;
		unsigned short len = filter_generators[i](filters[i],
							  &overflow);
		if (len < bpf_prog.len && !overflow) {
			bpf_prog.len = len;
			bpf_prog.filter = filters[i];
		}
	}
	if (bpf_prog.len == USHRT_MAX) {
		debug_msg("seccomp filter disabled due to jump offset "
			  "overflow");
		seccomp_filtering = false;
	} else if (bpf_prog.len > BPF_MAXINSNS) {
		debug_msg("seccomp filter disabled due to BPF program "
			  "being oversized (%u > %d)", bpf_prog.len,
			  BPF_MAXINSNS);
		seccomp_filtering = false;
	}

	if (seccomp_filtering)
		check_seccomp_order();
}

static void
dump_seccomp_bpf(void)
{
	const struct sock_filter *filter = bpf_prog.filter;
	for (unsigned int i = 0; i < bpf_prog.len; ++i) {
		switch (filter[i].code) {
		case BPF_LD | BPF_W | BPF_ABS:
			switch (filter[i].k) {
			case offsetof(struct seccomp_data, arch):
				error_msg("STMT(BPF_LDWABS, data->arch)");
				break;
			case offsetof(struct seccomp_data, nr):
				error_msg("STMT(BPF_LDWABS, data->nr)");
				break;
			default:
				error_msg("STMT(BPF_LDWABS, 0x%x)",
					  filter[i].k);
			}
			break;
		case BPF_LD | BPF_W | BPF_IMM:
			error_msg("STMT(BPF_LDWIMM, 0x%x)", filter[i].k);
			break;
		case BPF_RET | BPF_K:
			switch (filter[i].k) {
			case SECCOMP_RET_TRACE:
				error_msg("STMT(BPF_RET, SECCOMP_RET_TRACE)");
				break;
			case SECCOMP_RET_ALLOW:
				error_msg("STMT(BPF_RET, SECCOMP_RET_ALLOW)");
				break;
			default:
				error_msg("STMT(BPF_RET, 0x%x)", filter[i].k);
			}
			break;
		case BPF_JMP | BPF_JEQ | BPF_K:
			error_msg("JUMP(BPF_JEQ, %u, %u, %u)",
				  filter[i].jt, filter[i].jf,
				  filter[i].k);
			break;
		case BPF_JMP | BPF_JGE | BPF_K:
			error_msg("JUMP(BPF_JGE, %u, %u, %u)",
				  filter[i].jt, filter[i].jf,
				  filter[i].k);
			break;
		case BPF_JMP | BPF_JSET | BPF_K:
			error_msg("JUMP(BPF_JSET, %u, %u, 0x%x)",
				  filter[i].jt, filter[i].jf,
				  filter[i].k);
			break;
		case BPF_JMP | BPF_JA:
			error_msg("JUMP(BPF_JA, %u)", filter[i].k);
			break;
		case BPF_ALU | BPF_RSH | BPF_K:
			error_msg("STMT(BPF_RSH, %u)", filter[i].k);
			break;
		case BPF_ALU | BPF_LSH | BPF_X:
			error_msg("STMT(BPF_LSH, X)");
			break;
		case BPF_ALU | BPF_AND | BPF_K:
			error_msg("STMT(BPF_AND, 0x%x)", filter[i].k);
			break;
		case BPF_MISC | BPF_TAX:
			error_msg("STMT(BPF_TAX)");
			break;
		case BPF_MISC | BPF_TXA:
			error_msg("STMT(BPF_TXA)");
			break;
		default:
			error_msg("STMT(0x%x, %u, %u, 0x%x)", filter[i].code,
				  filter[i].jt, filter[i].jf, filter[i].k);
		}
	}
}

void
init_seccomp_filter(void)
{
	if (debug_flag)
		dump_seccomp_bpf();

	if (prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &bpf_prog) == 0)
		return;

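	/*
	 * Without CAP_SYS_ADMIN the kernel refuses SECCOMP_MODE_FILTER with
	 * EACCES unless no_new_privs is set, so set it and try once more.
	 */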
	if (errno == EACCES) {
		if (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0) < 0)
			perror_func_msg_and_die("prctl(PR_SET_NO_NEW_PRIVS)");
		if (prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &bpf_prog) == 0)
			return;
	}

	perror_func_msg_and_die("prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER)");
}

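/*
 * Choose how to restart the tracee: PTRACE_SYSCALL only when the current
 * syscall is one the seccomp filter reports, so that its syscall-exit stop
 * is delivered; otherwise PTRACE_CONT lets the tracee run freely until the
 * next seccomp stop.
 */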
int
seccomp_filter_restart_operator(const struct tcb *tcp)
{
	if (exiting(tcp) && tcp->scno < nsyscall_vec[current_personality]
	    && traced_by_seccomp(tcp->scno, current_personality))
		return PTRACE_SYSCALL;
	return PTRACE_CONT;
}

void
check_seccomp_filter(void)
{
	/* Let's avoid enabling seccomp if all syscalls are traced. */
	seccomp_filtering = !is_complete_set_array(trace_set, nsyscall_vec,
						   SUPPORTED_PERSONALITIES);
	if (!seccomp_filtering) {
		error_msg("Seccomp filter is requested "
			  "but there are no syscalls to filter.  "
			  "See -e trace to filter syscalls.");
		return;
	}

	check_seccomp_filter_properties();

	if (!seccomp_filtering)
		error_msg("seccomp filter is requested but unavailable");
}