/* vim: set expandtab ts=4 sw=4: */ /* * You may redistribute this program and/or modify it under the terms of * the GNU General Public License as published by the Free Software Foundation, * either version 3 of the License, or (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program. If not, see . */ // sigaction() siginfo_t SIG_UNBLOCK #define _POSIX_C_SOURCE 199309L #include "util/Seccomp.h" #include "util/Bits.h" #include "util/ArchInfo.h" // getpriority() #include #include #include #include #include #include #include #include #include #include #include #include #include static void catchViolation(int sig, siginfo_t* si, void* threadContext) { printf("Attempted banned syscall number [%d] see doc/Seccomp.md for more information\n", si->si_value.sival_int); Assert_failure("Disallowed Syscall"); } struct Filter { int label; int jt; int jf; struct sock_filter sf; }; static struct sock_fprog* compile(struct Filter* input, int inputLen, struct Allocator* alloc) { // compute gotos int totalOut = 0; for (int i = inputLen-1; i >= 0; i--) { struct Filter* a = &input[i]; if (a->label == 0) { // check for unresolved gotos... Assert_true(a->jt == 0 && a->jf == 0); totalOut++; continue; } int diff = 0; for (int j = i-1; j >= 0; j--) { struct Filter* b = &input[j]; if (b->label != 0) { continue; } if (b->jt == a->label) { b->sf.jt = diff; b->jt = 0; } if (b->jf == a->label) { b->sf.jf = diff; b->jf = 0; } diff++; } } // copy into output filter array... struct sock_filter* sf = Allocator_calloc(alloc, sizeof(struct sock_filter), totalOut); int outI = 0; for (int i = 0; i < inputLen; i++) { if (input[i].label == 0) { Bits_memcpyConst(&sf[outI++], &input[i].sf, sizeof(struct sock_filter)); } Assert_true(outI <= totalOut); Assert_true(i != inputLen-1 || outI == totalOut); } struct sock_fprog* out = Allocator_malloc(alloc, sizeof(struct sock_fprog)); out->len = (unsigned short) totalOut; out->filter = sf; return out; } #define RET_TRAP 0x00030000u #define RET_ERRNO(x) (0x00050000u | ((x) & 0x0000ffffu)) #define RET_SUCCESS 0x7fff0000u static struct sock_fprog* mkFilter(struct Allocator* alloc, struct Except* eh) { // Adding exceptions to the syscall filter: // // echo '#include ' | gcc -E -dM - | grep 'define __NR_' | sort // for the full list of system calls with syscall numbers (different per ABI) // // If gdb traps out it will look like this: // // Program received signal SIGSYS, Bad system call. // [Switching to Thread 0x7ffff7fdd740 (LWP 14673)] // 0x00007ffff74d1caa in mmap64 () at ../sysdeps/unix/syscall-template.S:81 // 81 ../sysdeps/unix/syscall-template.S: No such file or directory. // // %eax should contain the system call number (on different ABIs YMMV) // // (gdb) print $eax // $1 = 9 // (gdb) // // Consult your syscall table from the above gcc command... // // #define __NR_mmap 9 // // Then add: // // IFEQ(__NR_mmap, success), // // And add a comment documenting where you needed that syscall :) #define STMT(code, val) { .sf = BPF_STMT(code, val) } #define JMPK(type, not, input, label) { \ .sf = BPF_JUMP(BPF_JMP+(type)+BPF_K, (input), 0, 0), \ .jt = (!(not) ? (label) : 0), \ .jf = ((not) ? (label) : 0) \ } // Create a label for jumps, the label must be represented by a non-zero integer. #define LABEL(lbl) { .label = (lbl) } // Load offset into the register #define LOAD(offset) STMT(BPF_LD+BPF_W+BPF_ABS, (offset)) // Return constant value #define RET(val) STMT(BPF_RET+BPF_K, (val)) // If-equal if the currently loaded value equals input, jump to label. #define IFEQ(input, label) JMPK(BPF_JEQ, 0, (input), (label)) // If-not-equal if the currently loaded value is not equal to input, jump to label. #define IFNE(input, label) JMPK(BPF_JEQ, 1, (input), (label)) // If-greater-than #define IFGT(input, label) JMPK(BPF_JGT, 0, (input), (label)) // If-greater-than-or-equal-to #define IFGE(input, label) JMPK(BPF_JGE, 0, (input), (label)) // If-less-than #define IFLT(input, label) JMPK(BPF_JGE, 1, (input), (label)) // If-less-than-or-equal-to #define IFLE(input, label) JMPK(BPF_JGT, 1, (input), (label)) // labels are integers so they must be predefined int success = 1; int fail = 2; int unmaskOnly = 3; int isworking = 4; int socket_setip = 5; int ioctl_setip = 6; uint32_t auditArch = ArchInfo_getAuditArch(); struct Filter seccompFilter[] = { LOAD(offsetof(struct seccomp_data, arch)), IFNE(auditArch, fail), // Get the syscall num. LOAD(offsetof(struct seccomp_data, nr)), // udp #ifdef __NR_sendmsg IFEQ(__NR_sendmsg, success), #endif #ifdef __NR_recvmsg IFEQ(__NR_recvmsg, success), #endif // ETHInterface #ifdef __NR_sendto IFEQ(__NR_sendto, success), #endif #ifdef __NR_recvfrom IFEQ(__NR_recvfrom, success), #endif #ifdef __NR_socketcall // 32-bit: recvmsg is a socketcall IFEQ(__NR_socketcall, success), #endif // libuv IFEQ(__NR_epoll_ctl, success), IFEQ(__NR_epoll_wait, success), // TUN (and logging) IFEQ(__NR_write, success), IFEQ(__NR_read, success), // modern librt reads a read-only mapped section of kernel space which contains the time // older versions need system calls for getting the time. // i686 glibc-2.18's time() uses __NR_time // Raspberry Pi and BeagleBone Black don't provide __NR_time IFEQ(__NR_clock_gettime, success), #ifdef __NR_time IFEQ(__NR_time, success), #endif // malloc() IFEQ(__NR_brk, success), // abort() IFEQ(__NR_gettid, success), IFEQ(__NR_tgkill, success), IFEQ(__NR_rt_sigprocmask, unmaskOnly), // exit() IFEQ(__NR_exit_group, success), // Seccomp_isWorking() IFEQ(__NR_getpriority, isworking), // Securiy_checkPermissions() -> canOpenFiles() IFEQ(__NR_dup, success), IFEQ(__NR_close, success), // Security_checkPermissions() -> getMaxMem() // x86/ARM use ugetrlimit and mmap2 // ARM does not even have __NR_getrlimit or __NR_mmap defined // and AMD64 does not have __NR_ugetrlimit or __NR_mmap2 defined #ifdef __NR_getrlimit IFEQ(__NR_getrlimit, success), #endif #ifdef __NR_ugetrlimit IFEQ(__NR_ugetrlimit, success), #endif #ifdef __NR_mmap IFEQ(__NR_mmap, success), #endif #ifdef __NR_mmap2 IFEQ(__NR_mmap2, success), #endif IFEQ(__NR_munmap, success), // printf() IFEQ(__NR_fstat, success), // for setting IP addresses... // socketForIfName() #ifdef __NR_socket IFEQ(__NR_socket, socket_setip), #endif IFEQ(__NR_ioctl, ioctl_setip), RET(SECCOMP_RET_TRAP), LABEL(socket_setip), LOAD(offsetof(struct seccomp_data, args[1])), IFEQ(SOCK_DGRAM, success), RET(SECCOMP_RET_TRAP), LABEL(ioctl_setip), LOAD(offsetof(struct seccomp_data, args[1])), IFEQ(SIOCGIFINDEX, success), IFEQ(SIOCGIFFLAGS, success), IFEQ(SIOCSIFFLAGS, success), IFEQ(SIOCSIFADDR, success), IFEQ(SIOCSIFNETMASK, success), IFEQ(SIOCSIFMTU, success), RET(SECCOMP_RET_TRAP), // We allow sigprocmask to *unmask* signals but we don't allow it to mask them. LABEL(unmaskOnly), LOAD(offsetof(struct seccomp_data, args[0])), IFEQ(SIG_UNBLOCK, success), RET(SECCOMP_RET_TRAP), LABEL(isworking), RET(RET_ERRNO(9000)), LABEL(fail), RET(SECCOMP_RET_TRAP), LABEL(success), RET(SECCOMP_RET_ALLOW), }; return compile(seccompFilter, sizeof(seccompFilter)/sizeof(seccompFilter[0]), alloc); } static void installFilter(struct sock_fprog* filter, struct Log* logger, struct Except* eh) { struct sigaction sa = { .sa_sigaction = catchViolation, .sa_flags = SA_SIGINFO }; if (sigaction(SIGSYS, &sa, NULL)) { Log_warn(logger, "sigaction(SIGSYS) -> [%s]\n", strerror(errno)); } if (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0) == -1) { // don't worry about it. Log_warn(logger, "prctl(PR_SET_NO_NEW_PRIVS) -> [%s]\n", strerror(errno)); } if (prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, filter) == -1) { Except_throw(eh, "prctl(PR_SET_SECCOMP) -> [%s]\n", strerror(errno)); } } void Seccomp_dropPermissions(struct Allocator* tempAlloc, struct Log* logger, struct Except* eh) { struct sock_fprog* filter = mkFilter(tempAlloc, eh); installFilter(filter, logger, eh); if (!Seccomp_isWorking()) { Except_throw(eh, "Seccomp filter not installed properly, Seccomp_isWorking() -> false"); } } int Seccomp_isWorking() { errno = 0; // If seccomp is not working, this will fail setting errno to EINVAL long ret = getpriority(1000, 1); // Inside of the kernel, it seems to check whether the errno return is sane // and if it is not, it treates it as a return value, 9000 is very unique so // we'll check for either case just in case this changes. return (ret == -1 && errno == 9000) || (ret == -9000 && errno == 0); } int Seccomp_exists() { return 1; }