/* vim: set expandtab ts=4 sw=4: */
/*
* You may redistribute this program and/or modify it under the terms of
* the GNU General Public License as published by the Free Software Foundation,
* either version 3 of the License, or (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <https://www.gnu.org/licenses/>.
*/
// sigaction() siginfo_t SIG_UNBLOCK
#define _POSIX_C_SOURCE 199309L
#include "util/Seccomp_impl.h"
#include "util/Bits.h"
#include "util/ArchInfo.h"
#include "util/Defined.h"
// getpriority()
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
/**
* A unique number which is returned as errno by getpriority(), a syscall we never use.
* Seccomp_isWorking() uses it to detect that the filter has been properly installed.
*/
#define IS_WORKING_ERRNO 3333
/**
* Accessing the SIGSYS siginfo depends on the fields being defined by the libc.
* Older libc do not yet include the needed definitions and accessor macros.
* Work around that by falling back to si_value.sival_int which works on some
* but not all architectures.
*/
#if defined(si_syscall)
#   define GET_SYSCALL_NUM(si) ((si)->si_syscall)
// The libc exposes the SIGSYS fields, the reported syscall number can be trusted.
#   define SYSCALL_NUM_IS_RELIABLE 1
#else
#pragma message "your libc doesn't define SIGSYS signal info! \
info about syscall number in case of SECCOMP crash can be invalid"
#   define GET_SYSCALL_NUM(si) ((si)->si_value.sival_int)
// Fallback accessor in use, the reported syscall number may be wrong.
#   define SYSCALL_NUM_IS_RELIABLE 0
#endif

/**
 * SIGSYS handler (registered by installFilter()): print the number of the
 * syscall which was refused by the filter, then fail via Assert_failure().
 *
 * @param sig the signal number, not used.
 * @param si the signal info from which the syscall number is read.
 * @param threadContext third argument of an SA_SIGINFO handler, not used.
 */
static void catchViolation(int sig, siginfo_t* si, void* threadContext)
{
    printf("Attempted banned syscall number [%d] see doc/Seccomp.md for more information\n",
           GET_SYSCALL_NUM(si));

    // Warn only when the number above came from the si_value fallback, that is when
    // the libc does NOT define si_syscall. The previous check had this inverted.
    if (!SYSCALL_NUM_IS_RELIABLE) {
        printf("Your libc doesn't define SIGSYS signal info. "
               "Above information about syscall number can be invalid.\n");
    }

    Assert_failure("Disallowed Syscall");
}
/**
 * One entry of the symbolic filter program which compile() turns into BPF.
 * An entry is either a label marker (label != 0) or an instruction (label == 0).
 */
struct Filter {
// Non-zero marks this entry as a jump target, it emits no instruction.
int label;
// Label to jump to when the comparison is true, 0 if none. Cleared by compile().
int jt;
// Label to jump to when the comparison is false, 0 if none. Cleared by compile().
int jf;
// The BPF instruction, its jt/jf offsets are filled in by compile().
struct sock_filter sf;
};
/**
 * Resolve the symbolic labels of a Filter program and produce the BPF program.
 *
 * Every label entry is matched against the jt/jf label ids of the instructions
 * which precede it; each match is replaced by the relative jump distance (the
 * number of instructions between the jump and the label). Label entries
 * themselves are dropped from the output.
 *
 * @param input the symbolic program, its jt/jf label ids are zeroed as they are resolved.
 * @param inputLen number of entries in input.
 * @param alloc allocator used for the returned program and its instruction array.
 * @return the program, with len set to the number of non-label entries.
 */
static struct sock_fprog* compile(struct Filter* input, int inputLen, struct Allocator* alloc)
{
    // sock_filter.jt/.jf are 8 bits wide and sock_fprog.len is 16 bits wide,
    // anything bigger must fail loudly rather than be truncated into a wrong filter.
    enum { MAX_JUMP = 0xff, MAX_PROGRAM_LEN = 0xffff };

    // compute gotos
    int totalOut = 0;
    for (int i = inputLen-1; i >= 0; i--) {
        struct Filter* a = &input[i];
        if (a->label == 0) {
            // check for unresolved gotos...
            // All labels located after this entry have been processed already, so a
            // remaining label id means a missing label or a backward jump.
            Assert_true(a->jt == 0 && a->jf == 0);
            totalOut++;
            continue;
        }
        // Number of real instructions between the candidate jump and this label.
        int diff = 0;
        for (int j = i-1; j >= 0; j--) {
            struct Filter* b = &input[j];
            if (b->label != 0) { continue; }
            if (b->jt == a->label) {
                Assert_true(diff <= MAX_JUMP);
                b->sf.jt = diff;
                b->jt = 0;
            }
            if (b->jf == a->label) {
                Assert_true(diff <= MAX_JUMP);
                b->sf.jf = diff;
                b->jf = 0;
            }
            diff++;
        }
    }
    Assert_true(totalOut <= MAX_PROGRAM_LEN);

    // copy into output filter array...
    struct sock_filter* sf = Allocator_calloc(alloc, sizeof(struct sock_filter), totalOut);
    int outI = 0;
    for (int i = 0; i < inputLen; i++) {
        if (input[i].label == 0) {
            Bits_memcpy(&sf[outI++], &input[i].sf, sizeof(struct sock_filter));
        }
        Assert_true(outI <= totalOut);
        Assert_true(i != inputLen-1 || outI == totalOut);
    }

    struct sock_fprog* out = Allocator_malloc(alloc, sizeof(struct sock_fprog));
    out->len = (unsigned short) totalOut;
    out->filter = sf;
    return out;
}
// Filter return values: the action is in the high 16 bits.
// NOTE(review): these look like copies of the kernel's SECCOMP_RET_TRAP / SECCOMP_RET_ERRNO /
// SECCOMP_RET_ALLOW, only RET_ERRNO is referenced in the visible part of this file - confirm.
#define RET_TRAP 0x00030000u
// Action 0x00050000 combined with the low 16 bits of x.
#define RET_ERRNO(x) (0x00050000u | ((x) & 0x0000ffffu))
#define RET_SUCCESS 0x7fff0000u
/**
 * Build the seccomp filter program.
 *
 * The filter first compares the architecture with ArchInfo_getAuditArch(), then
 * loads the syscall number and jumps to `success` (SECCOMP_RET_ALLOW) for each
 * listed syscall. A few syscalls are allowed only with specific arguments:
 * ioctl (listed requests only), bind (address length of a sockaddr_nl only) and
 * rt_sigprocmask (SIG_UNBLOCK only). getpriority yields errno IS_WORKING_ERRNO
 * so that Seccomp_isWorking() can probe the filter. Everything else returns
 * SECCOMP_RET_TRAP.
 *
 * @param alloc allocator handed to compile() for the resulting program.
 * @return (through Er_ret) the program produced by compile().
 */
static Er_DEFUN(struct sock_fprog* mkFilter(struct Allocator* alloc))
{
    // Adding exceptions to the syscall filter:
    //
    // echo '#include <sys/syscall.h>' | gcc -E -dM - | grep 'define __NR_' | sort
    // for the full list of system calls with syscall numbers (different per ABI)
    //
    // If gdb traps out it will look like this:
    //
    //     Program received signal SIGSYS, Bad system call.
    //     [Switching to Thread 0x7ffff7fdd740 (LWP 14673)]
    //     0x00007ffff74d1caa in mmap64 () at ../sysdeps/unix/syscall-template.S:81
    //     81 ../sysdeps/unix/syscall-template.S: No such file or directory.
    //
    // %eax should contain the system call number (on different ABIs YMMV)
    //
    //     (gdb) print $eax
    //     $1 = 9
    //     (gdb)
    //
    // Consult your syscall table from the above gcc command...
    //
    //     #define __NR_mmap 9
    //
    // Then add:
    //
    //     IFEQ(__NR_mmap, success),
    //
    // And add a comment documenting where you needed that syscall :)

    #define STMT(code, val) { .sf = BPF_STMT(code, val) }

    #define JMPK(type, not, input, label) { \
        .sf = BPF_JUMP(BPF_JMP+(type)+BPF_K, (input), 0, 0), \
        .jt = (!(not) ? (label) : 0), \
        .jf = ((not) ? (label) : 0) \
    }

    // Create a label for jumps, the label must be represented by a non-zero integer.
    #define LABEL(lbl) { .label = (lbl) }

    // Load offset into the register
    #define LOAD(offset) STMT(BPF_LD+BPF_W+BPF_ABS, (offset))

    // Offset of the low 32 bits of syscall argument idx.
    // seccomp_data.args[] holds 64 bit values but LOAD reads a 32 bit word, on a
    // big-endian machine the low half lives 4 bytes into the entry. Reading at the start
    // of the entry there would compare the (zero) high half and always trap.
    #if defined(__BYTE_ORDER__) && defined(__ORDER_BIG_ENDIAN__) && \
        (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
        #define ARG_LO_OFFSET(idx) \
            (offsetof(struct seccomp_data, args[(idx)]) + sizeof(uint32_t))
    #else
        #define ARG_LO_OFFSET(idx) offsetof(struct seccomp_data, args[(idx)])
    #endif

    // Load the low 32 bits of syscall argument idx into the register
    #define LOAD_ARG(idx) LOAD(ARG_LO_OFFSET(idx))

    // Return constant value
    #define RET(val) STMT(BPF_RET+BPF_K, (val))

    // If-equal if the currently loaded value equals input, jump to label.
    #define IFEQ(input, label) JMPK(BPF_JEQ, 0, (input), (label))

    // If-not-equal if the currently loaded value is not equal to input, jump to label.
    #define IFNE(input, label) JMPK(BPF_JEQ, 1, (input), (label))

    // If-greater-than
    #define IFGT(input, label) JMPK(BPF_JGT, 0, (input), (label))

    // If-greater-than-or-equal-to
    #define IFGE(input, label) JMPK(BPF_JGE, 0, (input), (label))

    // If-less-than
    #define IFLT(input, label) JMPK(BPF_JGE, 1, (input), (label))

    // If-less-than-or-equal-to
    #define IFLE(input, label) JMPK(BPF_JGT, 1, (input), (label))

    // labels are integers so they must be predefined
    int success = 1;
    int fail = 2;
    int unmaskOnly = 3;
    int isworking = 4;
    int ioctl_setip = 5;
    int bind_netlink = 6;

    uint32_t auditArch = ArchInfo_getAuditArch();

    struct Filter seccompFilter[] = {

        LOAD(offsetof(struct seccomp_data, arch)),
        IFNE(auditArch, fail),

        // Get the syscall num.
        LOAD(offsetof(struct seccomp_data, nr)),

        // rust/threading
        #ifdef __NR_futex
            IFEQ(__NR_futex, success),
        #endif

        // udp
        #ifdef __NR_sendmsg
            IFEQ(__NR_sendmsg, success),
        #endif
        #ifdef __NR_recvmsg
            IFEQ(__NR_recvmsg, success),
        #endif

        // ETHInterface
        #ifdef __NR_sendto
            IFEQ(__NR_sendto, success),
        #endif
        #ifdef __NR_recvfrom
            IFEQ(__NR_recvfrom, success),
        #endif

        #ifdef __NR_socketcall
            // 32-bit: recvmsg is a socketcall
            IFEQ(__NR_socketcall, success),
        #endif

        // libuv
        IFEQ(__NR_epoll_ctl, success),
        #ifdef __NR_epoll_wait
            IFEQ(__NR_epoll_wait, success),
        #endif
        #ifdef __NR_epoll_pwait
            IFEQ(__NR_epoll_pwait, success),
        #endif

        // gettimeofday is required on some architectures
        #ifdef __NR_gettimeofday
            IFEQ(__NR_gettimeofday, success),
        #endif

        // TUN (and logging)
        IFEQ(__NR_write, success),
        IFEQ(__NR_read, success),
        // readv and writev are used by some libc (musl)
        #ifdef __NR_readv
            IFEQ(__NR_readv, success),
        #endif
        #ifdef __NR_writev
            IFEQ(__NR_writev, success),
        #endif

        // modern librt reads a read-only mapped section of kernel space which contains the time
        // older versions need system calls for getting the time.
        // i686 glibc-2.18's time() uses __NR_time
        // Raspberry Pi and BeagleBone Black don't provide __NR_time
        // 32-bit systems with 64-bit time_t use __NR_clock_gettime64
        #ifdef __NR_clock_gettime64
            IFEQ(__NR_clock_gettime64, success),
        #endif
        #ifdef __NR_clock_gettime
            IFEQ(__NR_clock_gettime, success),
        #endif
        #ifdef __NR_time
            IFEQ(__NR_time, success),
        #endif

        // NetPlatform_linux.c send recv
        #ifdef __NR_send
            IFEQ(__NR_send, success),
        #endif
        #ifdef __NR_recv
            IFEQ(__NR_recv, success),
        #endif

        // malloc()
        IFEQ(__NR_brk, success),

        // abort()
        IFEQ(__NR_gettid, success),
        IFEQ(__NR_tgkill, success),
        IFEQ(__NR_rt_sigprocmask, unmaskOnly),

        // exit()
        IFEQ(__NR_exit_group, success),

        // Seccomp_isWorking()
        IFEQ(__NR_getpriority, isworking),

        // Security_checkPermissions() -> canOpenFiles()
        IFEQ(__NR_dup, success),
        IFEQ(__NR_close, success),

        // Security_checkPermissions() -> getMaxMem()
        // x86/ARM use ugetrlimit and mmap2
        // ARM does not even have __NR_getrlimit or __NR_mmap defined
        // and AMD64 does not have __NR_ugetrlimit or __NR_mmap2 defined
        #ifdef __NR_getrlimit
            IFEQ(__NR_getrlimit, success),
        #endif
        #ifdef __NR_ugetrlimit
            IFEQ(__NR_ugetrlimit, success),
        #endif
        #ifdef __NR_mmap
            IFEQ(__NR_mmap, success),
        #endif
        #ifdef __NR_mmap2
            IFEQ(__NR_mmap2, success),
        #endif
        IFEQ(__NR_munmap, success),

        // printf()
        IFEQ(__NR_fstat, success),
        #ifdef __NR_fstat64
            IFEQ(__NR_fstat64, success),
        #endif

        // for setting IP addresses
        // socketForIfName()
        // and ETHInterface_listDevices
        #ifdef __NR_socket
            IFEQ(__NR_socket, success),
        #endif
        IFEQ(__NR_ioctl, ioctl_setip),

        // Security_checkPermissions
        IFEQ(__NR_getuid, success),
        // Security_nofiles
        IFEQ(__NR_setrlimit, success),

        // for ETHInterface_listDevices (netlink)
        #ifdef __NR_bind
            IFEQ(__NR_bind, bind_netlink),
        #endif
        #ifdef __NR_getsockname
            IFEQ(__NR_getsockname, success),
        #endif

        // musl free() calls madvise()
        #ifdef __NR_madvise
            IFEQ(__NR_madvise, success),
        #endif

        // accept() for PipeServer
        #ifdef __NR_accept4
            IFEQ(__NR_accept4, success),
        #endif

        // NOTE(review): rt_sigprocmask is already routed to unmaskOnly further up, so as
        // the filter is written this entry is never reached. Confirm whether Android was
        // meant to bypass the unmaskOnly restriction before reordering anything.
        #ifdef Cjdns_android
            #ifdef __NR_rt_sigprocmask
                IFEQ(__NR_rt_sigprocmask, success),
            #endif
        #endif

        // rust/wg
        #ifdef __NR_getrandom
            IFEQ(__NR_getrandom, success),
        #endif

        // https://github.com/cjdelisle/boringtun/blob/master/src/crypto/x25519/mod.rs#L22
        #if defined(__ARM_EABI__) && defined(__NR_fcntl64)
            IFEQ(__NR_fcntl64, success),
        #endif

        // 2024-01-09 by Caleb's advice
        // it is used by Seccomp_test
        #ifdef __NR_sigaltstack
            IFEQ(__NR_sigaltstack, success),
        #endif

        RET(SECCOMP_RET_TRAP),

        // ioctl: only the requests listed here are allowed (second syscall argument).
        LABEL(ioctl_setip),
        LOAD_ARG(1),
        IFEQ(SIOCGIFINDEX, success),
        IFEQ(SIOCGIFFLAGS, success),
        IFEQ(SIOCSIFFLAGS, success),
        IFEQ(SIOCSIFADDR, success),
        IFEQ(SIOCSIFNETMASK, success),
        IFEQ(SIOCSIFMTU, success),
        RET(SECCOMP_RET_TRAP),

        LABEL(bind_netlink),
        LOAD_ARG(2),
        // Filter NETLINK by size of address.
        // Most importantly INET and INET6
        // are different.
        IFEQ(sizeof(struct sockaddr_nl), success),
        RET(SECCOMP_RET_TRAP),

        // We allow sigprocmask to *unmask* signals but we don't allow it to mask them.
        LABEL(unmaskOnly),
        LOAD_ARG(0),
        IFEQ(SIG_UNBLOCK, success),
        RET(SECCOMP_RET_TRAP),

        LABEL(isworking),
        RET(RET_ERRNO(IS_WORKING_ERRNO)),

        LABEL(fail),
        RET(SECCOMP_RET_TRAP),

        LABEL(success),
        RET(SECCOMP_RET_ALLOW),
    };

    Er_ret(compile(seccompFilter, sizeof(seccompFilter)/sizeof(seccompFilter[0]), alloc));
}
static Er_DEFUN(void installFilter(
struct sock_fprog* filter, struct Log* logger, struct Allocator* alloc))
{
struct sigaction sa = { .sa_sigaction = catchViolation, .sa_flags = SA_SIGINFO };
if (sigaction(SIGSYS, &sa, NULL)) {
Log_warn(logger, "sigaction(SIGSYS) -> [%s]\n", strerror(errno));
}
if (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0) == -1) {
// don't worry about it.
Log_warn(logger, "prctl(PR_SET_NO_NEW_PRIVS) -> [%s]\n", strerror(errno));
}
if (prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, filter) == -1) {
Er_raise(alloc, "prctl(PR_SET_SECCOMP) -> [%s]\n", strerror(errno));
}
Er_ret();
}
/**
 * Build and install the seccomp filter, then verify that it is in effect.
 *
 * @param tempAlloc allocator for the filter program and for any raised error.
 * @param logger receives warnings emitted while installing the filter.
 * Raises an error if installing fails or if Seccomp_isWorking() reports false afterward.
 */
Er_DEFUN(void Seccomp_dropPermissions(struct Allocator* tempAlloc, struct Log* logger))
{
    struct sock_fprog* program = Er(mkFilter(tempAlloc));
    Er(installFilter(program, logger, tempAlloc));

    int filterActive = Seccomp_isWorking();
    if (filterActive == 0) {
        Er_raise(tempAlloc, "Seccomp filter not installed properly, Seccomp_isWorking() -> false");
    }

    Er_ret();
}
/**
 * Probe whether the seccomp filter is active by calling getpriority() and
 * looking for the IS_WORKING_ERRNO marker.
 *
 * @return 1 if the marker came back, 0 otherwise.
 */
int Seccomp_isWorking(void)
{
    errno = 0;
    // If seccomp is not working, this will fail setting errno to EINVAL
    long result = getpriority(1000, 1);
    int savedErrno = errno;

    // Inside of the kernel, it seems to check whether the errno return is sane
    // and if it is not, it treats it as a return value, IS_WORKING_ERRNO (3333) is very unique
    // so both forms are accepted in case this changes.
    if (result == -1 && savedErrno == IS_WORKING_ERRNO) {
        // marker delivered as errno
        return 1;
    }
    if (result == -IS_WORKING_ERRNO && savedErrno == 0) {
        // marker delivered as the (negated) return value
        return 1;
    }
    return 0;
}
/**
 * Report whether seccomp support is compiled in.
 *
 * @return always 1 in this implementation.
 */
int Seccomp_exists(void)
{
    const int available = 1;
    return available;
}