Seccomp_linux.c 14 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467
  1. /* vim: set expandtab ts=4 sw=4: */
  2. /*
  3. * You may redistribute this program and/or modify it under the terms of
  4. * the GNU General Public License as published by the Free Software Foundation,
  5. * either version 3 of the License, or (at your option) any later version.
  6. *
  7. * This program is distributed in the hope that it will be useful,
  8. * but WITHOUT ANY WARRANTY; without even the implied warranty of
  9. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  10. * GNU General Public License for more details.
  11. *
  12. * You should have received a copy of the GNU General Public License
  13. * along with this program. If not, see <https://www.gnu.org/licenses/>.
  14. */
  15. // sigaction() siginfo_t SIG_UNBLOCK
  16. #define _POSIX_C_SOURCE 199309L
  17. #include "util/Seccomp_impl.h"
  18. #include "util/Bits.h"
  19. #include "util/ArchInfo.h"
  20. #include "util/Defined.h"
  21. // getpriority()
  22. #include <sys/resource.h>
  23. #include <signal.h>
  24. #include <sys/prctl.h>
  25. #include <errno.h>
  26. #include <linux/filter.h>
  27. #include <linux/seccomp.h>
  28. #include <linux/audit.h>
  29. #include <linux/netlink.h>
  30. #include <sys/syscall.h>
  31. #include <sys/socket.h>
  32. #include <sys/ioctl.h>
  33. #include <stddef.h>
  34. #include <stdio.h>
  35. #include <string.h>
  36. /**
  37. * A unique number which is returned as errno by getpriority(), a syscall we never use
  38. * this will be used by Seccomp_isWorking() to detect that the filter has been properly installed.
  39. */
  40. #define IS_WORKING_ERRNO 3333
  41. /**
  42. * Accessing the SIGSYS siginfo depends on the fields being defined by the libc.
  43. * Older libc do not yet include the needed definitions and accessor macros.
  44. * Work around that by falling back to si_value.sival_int which works on some
  45. * but not all architectures.
  46. */
  47. #if defined(si_syscall)
  48. # define GET_SYSCALL_NUM(si) ((si)->si_syscall)
  49. #else
  50. #pragma message "your libc doesn't define SIGSYS signal info! \
  51. info about syscall number in case of SECCOMP crash can be invalid"
  52. # define GET_SYSCALL_NUM(si) ((si)->si_value.sival_int)
  53. #endif
  54. static void catchViolation(int sig, siginfo_t* si, void* threadContext)
  55. {
  56. printf("Attempted banned syscall number [%d] see doc/Seccomp.md for more information\n",
  57. GET_SYSCALL_NUM(si));
  58. if (Defined(si_syscall)) {
  59. printf("Your libc doesn't define SIGSYS signal info. "
  60. "Above information about syscall number can be invalid.\n");
  61. }
  62. Assert_failure("Disallowed Syscall");
  63. }
  64. struct Filter {
  65. int label;
  66. int jt;
  67. int jf;
  68. struct sock_filter sf;
  69. };
  70. static struct sock_fprog* compile(struct Filter* input, int inputLen, struct Allocator* alloc)
  71. {
  72. // compute gotos
  73. int totalOut = 0;
  74. for (int i = inputLen-1; i >= 0; i--) {
  75. struct Filter* a = &input[i];
  76. if (a->label == 0) {
  77. // check for unresolved gotos...
  78. Assert_true(a->jt == 0 && a->jf == 0);
  79. totalOut++;
  80. continue;
  81. }
  82. int diff = 0;
  83. for (int j = i-1; j >= 0; j--) {
  84. struct Filter* b = &input[j];
  85. if (b->label != 0) { continue; }
  86. if (b->jt == a->label) {
  87. b->sf.jt = diff;
  88. b->jt = 0;
  89. }
  90. if (b->jf == a->label) {
  91. b->sf.jf = diff;
  92. b->jf = 0;
  93. }
  94. diff++;
  95. }
  96. }
  97. // copy into output filter array...
  98. struct sock_filter* sf = Allocator_calloc(alloc, sizeof(struct sock_filter), totalOut);
  99. int outI = 0;
  100. for (int i = 0; i < inputLen; i++) {
  101. if (input[i].label == 0) {
  102. Bits_memcpy(&sf[outI++], &input[i].sf, sizeof(struct sock_filter));
  103. }
  104. Assert_true(outI <= totalOut);
  105. Assert_true(i != inputLen-1 || outI == totalOut);
  106. }
  107. struct sock_fprog* out = Allocator_malloc(alloc, sizeof(struct sock_fprog));
  108. out->len = (unsigned short) totalOut;
  109. out->filter = sf;
  110. return out;
  111. }
  112. #define RET_TRAP 0x00030000u
  113. #define RET_ERRNO(x) (0x00050000u | ((x) & 0x0000ffffu))
  114. #define RET_SUCCESS 0x7fff0000u
  115. static Er_DEFUN(struct sock_fprog* mkFilter(struct Allocator* alloc))
  116. {
  117. // Adding exceptions to the syscall filter:
  118. //
  119. // echo '#include <sys/syscall.h>' | gcc -E -dM - | grep 'define __NR_' | sort
  120. // for the full list of system calls with syscall numbers (different per ABI)
  121. //
  122. // If gdb traps out it will look like this:
  123. //
  124. // Program received signal SIGSYS, Bad system call.
  125. // [Switching to Thread 0x7ffff7fdd740 (LWP 14673)]
  126. // 0x00007ffff74d1caa in mmap64 () at ../sysdeps/unix/syscall-template.S:81
  127. // 81 ../sysdeps/unix/syscall-template.S: No such file or directory.
  128. //
  129. // %eax should contain the system call number (on different ABIs YMMV)
  130. //
  131. // (gdb) print $eax
  132. // $1 = 9
  133. // (gdb)
  134. //
  135. // Consult your syscall table from the above gcc command...
  136. //
  137. // #define __NR_mmap 9
  138. //
  139. // Then add:
  140. //
  141. // IFEQ(__NR_mmap, success),
  142. //
  143. // And add a comment documenting where you needed that syscall :)
  144. #define STMT(code, val) { .sf = BPF_STMT(code, val) }
  145. #define JMPK(type, not, input, label) { \
  146. .sf = BPF_JUMP(BPF_JMP+(type)+BPF_K, (input), 0, 0), \
  147. .jt = (!(not) ? (label) : 0), \
  148. .jf = ((not) ? (label) : 0) \
  149. }
  150. // Create a label for jumps, the label must be represented by a non-zero integer.
  151. #define LABEL(lbl) { .label = (lbl) }
  152. // Load offset into the register
  153. #define LOAD(offset) STMT(BPF_LD+BPF_W+BPF_ABS, (offset))
  154. // Return constant value
  155. #define RET(val) STMT(BPF_RET+BPF_K, (val))
  156. // If-equal if the currently loaded value equals input, jump to label.
  157. #define IFEQ(input, label) JMPK(BPF_JEQ, 0, (input), (label))
  158. // If-not-equal if the currently loaded value is not equal to input, jump to label.
  159. #define IFNE(input, label) JMPK(BPF_JEQ, 1, (input), (label))
  160. // If-greater-than
  161. #define IFGT(input, label) JMPK(BPF_JGT, 0, (input), (label))
  162. // If-greater-than-or-equal-to
  163. #define IFGE(input, label) JMPK(BPF_JGE, 0, (input), (label))
  164. // If-less-than
  165. #define IFLT(input, label) JMPK(BPF_JGE, 1, (input), (label))
  166. // If-less-than-or-equal-to
  167. #define IFLE(input, label) JMPK(BPF_JGT, 1, (input), (label))
  168. // labels are integers so they must be predefined
  169. int success = 1;
  170. int fail = 2;
  171. int unmaskOnly = 3;
  172. int isworking = 4;
  173. int ioctl_setip = 5;
  174. int bind_netlink = 6;
  175. uint32_t auditArch = ArchInfo_getAuditArch();
  176. struct Filter seccompFilter[] = {
  177. LOAD(offsetof(struct seccomp_data, arch)),
  178. IFNE(auditArch, fail),
  179. // Get the syscall num.
  180. LOAD(offsetof(struct seccomp_data, nr)),
  181. // rust/threading
  182. #ifdef __NR_futex
  183. IFEQ(__NR_futex, success),
  184. #endif
  185. // udp
  186. #ifdef __NR_sendmsg
  187. IFEQ(__NR_sendmsg, success),
  188. #endif
  189. #ifdef __NR_recvmsg
  190. IFEQ(__NR_recvmsg, success),
  191. #endif
  192. // ETHInterface
  193. #ifdef __NR_sendto
  194. IFEQ(__NR_sendto, success),
  195. #endif
  196. #ifdef __NR_recvfrom
  197. IFEQ(__NR_recvfrom, success),
  198. #endif
  199. #ifdef __NR_socketcall
  200. // 32-bit: recvmsg is a socketcall
  201. IFEQ(__NR_socketcall, success),
  202. #endif
  203. // libuv
  204. IFEQ(__NR_epoll_ctl, success),
  205. #ifdef __NR_epoll_wait
  206. IFEQ(__NR_epoll_wait, success),
  207. #endif
  208. #ifdef __NR_epoll_pwait
  209. IFEQ(__NR_epoll_pwait, success),
  210. #endif
  211. // gettimeofday is required on some architectures
  212. #ifdef __NR_gettimeofday
  213. IFEQ(__NR_gettimeofday, success),
  214. #endif
  215. // TUN (and logging)
  216. IFEQ(__NR_write, success),
  217. IFEQ(__NR_read, success),
  218. // readv and writev are used by some libc (musl)
  219. #ifdef __NR_readv
  220. IFEQ(__NR_readv, success),
  221. #endif
  222. #ifdef __NR_writev
  223. IFEQ(__NR_writev, success),
  224. #endif
  225. // modern librt reads a read-only mapped section of kernel space which contains the time
  226. // older versions need system calls for getting the time.
  227. // i686 glibc-2.18's time() uses __NR_time
  228. // Raspberry Pi and BeagleBone Black don't provide __NR_time
  229. // 32-bit systems with 64-bit time_t use __NR_clock_gettime64
  230. #ifdef __NR_clock_gettime64
  231. IFEQ(__NR_clock_gettime64, success),
  232. #endif
  233. #ifdef __NR_clock_gettime
  234. IFEQ(__NR_clock_gettime, success),
  235. #endif
  236. #ifdef __NR_time
  237. IFEQ(__NR_time, success),
  238. #endif
  239. // NetPlatform_linux.c send recv
  240. #ifdef __NR_send
  241. IFEQ(__NR_send, success),
  242. #endif
  243. #ifdef __NR_recv
  244. IFEQ(__NR_recv, success),
  245. #endif
  246. // malloc()
  247. IFEQ(__NR_brk, success),
  248. // abort()
  249. IFEQ(__NR_gettid, success),
  250. IFEQ(__NR_tgkill, success),
  251. IFEQ(__NR_rt_sigprocmask, unmaskOnly),
  252. // exit()
  253. IFEQ(__NR_exit_group, success),
  254. // Seccomp_isWorking()
  255. IFEQ(__NR_getpriority, isworking),
  256. // Securiy_checkPermissions() -> canOpenFiles()
  257. IFEQ(__NR_dup, success),
  258. IFEQ(__NR_close, success),
  259. // Security_checkPermissions() -> getMaxMem()
  260. // x86/ARM use ugetrlimit and mmap2
  261. // ARM does not even have __NR_getrlimit or __NR_mmap defined
  262. // and AMD64 does not have __NR_ugetrlimit or __NR_mmap2 defined
  263. #ifdef __NR_getrlimit
  264. IFEQ(__NR_getrlimit, success),
  265. #endif
  266. #ifdef __NR_ugetrlimit
  267. IFEQ(__NR_ugetrlimit, success),
  268. #endif
  269. #ifdef __NR_mmap
  270. IFEQ(__NR_mmap, success),
  271. #endif
  272. #ifdef __NR_mmap2
  273. IFEQ(__NR_mmap2, success),
  274. #endif
  275. IFEQ(__NR_munmap, success),
  276. // printf()
  277. IFEQ(__NR_fstat, success),
  278. #ifdef __NR_fstat64
  279. IFEQ(__NR_fstat64, success),
  280. #endif
  281. // for setting IP addresses
  282. // socketForIfName()
  283. // and ETHInterface_listDevices
  284. #ifdef __NR_socket
  285. IFEQ(__NR_socket, success),
  286. #endif
  287. IFEQ(__NR_ioctl, ioctl_setip),
  288. // Security_checkPermissions
  289. IFEQ(__NR_getuid, success),
  290. // Security_nofiles
  291. IFEQ(__NR_setrlimit, success),
  292. // for ETHInterface_listDevices (netlinkk)
  293. #ifdef __NR_bind
  294. IFEQ(__NR_bind, bind_netlink),
  295. #endif
  296. #ifdef __NR_getsockname
  297. IFEQ(__NR_getsockname, success),
  298. #endif
  299. // musl free() calls madvise()
  300. #ifdef __NR_madvise
  301. IFEQ(__NR_madvise, success),
  302. #endif
  303. // accept() for PipeServer
  304. #ifdef __NR_accept4
  305. IFEQ(__NR_accept4, success),
  306. #endif
  307. #ifdef Cjdns_android
  308. #ifdef __NR_rt_sigprocmask
  309. IFEQ(__NR_rt_sigprocmask, success),
  310. #endif
  311. #endif
  312. // rust/wg
  313. #ifdef __NR_getrandom
  314. IFEQ(__NR_getrandom, success),
  315. #endif
  316. // https://github.com/cjdelisle/boringtun/blob/master/src/crypto/x25519/mod.rs#L22
  317. #if defined(__ARM_EABI__) && defined(__NR_fcntl64)
  318. IFEQ(__NR_fcntl64, success),
  319. #endif
  320. // 2024-01-09 by Caleb's advice
  321. // it is used by Seccomp_test
  322. #ifdef __NR_sigaltstack
  323. IFEQ(__NR_sigaltstack, success),
  324. #endif
  325. RET(SECCOMP_RET_TRAP),
  326. LABEL(ioctl_setip),
  327. LOAD(offsetof(struct seccomp_data, args[1])),
  328. IFEQ(SIOCGIFINDEX, success),
  329. IFEQ(SIOCGIFFLAGS, success),
  330. IFEQ(SIOCSIFFLAGS, success),
  331. IFEQ(SIOCSIFADDR, success),
  332. IFEQ(SIOCSIFNETMASK, success),
  333. IFEQ(SIOCSIFMTU, success),
  334. RET(SECCOMP_RET_TRAP),
  335. LABEL(bind_netlink),
  336. LOAD(offsetof(struct seccomp_data, args[2])),
  337. // Filter NETLINK by size of address.
  338. // Most importantly INET and INET6
  339. // are differnt.
  340. IFEQ(sizeof(struct sockaddr_nl), success),
  341. RET(SECCOMP_RET_TRAP),
  342. // We allow sigprocmask to *unmask* signals but we don't allow it to mask them.
  343. LABEL(unmaskOnly),
  344. LOAD(offsetof(struct seccomp_data, args[0])),
  345. IFEQ(SIG_UNBLOCK, success),
  346. RET(SECCOMP_RET_TRAP),
  347. LABEL(isworking),
  348. RET(RET_ERRNO(IS_WORKING_ERRNO)),
  349. LABEL(fail),
  350. RET(SECCOMP_RET_TRAP),
  351. LABEL(success),
  352. RET(SECCOMP_RET_ALLOW),
  353. };
  354. Er_ret(compile(seccompFilter, sizeof(seccompFilter)/sizeof(seccompFilter[0]), alloc));
  355. }
  356. static Er_DEFUN(void installFilter(
  357. struct sock_fprog* filter, struct Log* logger, struct Allocator* alloc))
  358. {
  359. struct sigaction sa = { .sa_sigaction = catchViolation, .sa_flags = SA_SIGINFO };
  360. if (sigaction(SIGSYS, &sa, NULL)) {
  361. Log_warn(logger, "sigaction(SIGSYS) -> [%s]\n", strerror(errno));
  362. }
  363. if (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0) == -1) {
  364. // don't worry about it.
  365. Log_warn(logger, "prctl(PR_SET_NO_NEW_PRIVS) -> [%s]\n", strerror(errno));
  366. }
  367. if (prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, filter) == -1) {
  368. Er_raise(alloc, "prctl(PR_SET_SECCOMP) -> [%s]\n", strerror(errno));
  369. }
  370. Er_ret();
  371. }
  372. Er_DEFUN(void Seccomp_dropPermissions(struct Allocator* tempAlloc, struct Log* logger))
  373. {
  374. struct sock_fprog* filter = Er(mkFilter(tempAlloc));
  375. Er(installFilter(filter, logger, tempAlloc));
  376. if (!Seccomp_isWorking()) {
  377. Er_raise(tempAlloc, "Seccomp filter not installed properly, Seccomp_isWorking() -> false");
  378. }
  379. Er_ret();
  380. }
  381. int Seccomp_isWorking(void)
  382. {
  383. errno = 0;
  384. // If seccomp is not working, this will fail setting errno to EINVAL
  385. long ret = getpriority(1000, 1);
  386. int err = errno;
  387. // Inside of the kernel, it seems to check whether the errno return is sane
  388. // and if it is not, it treates it as a return value, IS_WORKING_ERRNO (3333) is very unique so
  389. // we'll check for either case just in case this changes.
  390. return (ret == -1 && err == IS_WORKING_ERRNO) || (ret == -IS_WORKING_ERRNO && err == 0);
  391. }
  392. int Seccomp_exists(void)
  393. {
  394. return 1;
  395. }