Seccomp.c 10 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346
  1. /* vim: set expandtab ts=4 sw=4: */
  2. /*
  3. * You may redistribute this program and/or modify it under the terms of
  4. * the GNU General Public License as published by the Free Software Foundation,
  5. * either version 3 of the License, or (at your option) any later version.
  6. *
  7. * This program is distributed in the hope that it will be useful,
  8. * but WITHOUT ANY WARRANTY; without even the implied warranty of
  9. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  10. * GNU General Public License for more details.
  11. *
  12. * You should have received a copy of the GNU General Public License
  13. * along with this program. If not, see <http://www.gnu.org/licenses/>.
  14. */
  15. // sigaction() siginfo_t SIG_UNBLOCK
  16. #define _POSIX_C_SOURCE 199309L
  17. #include "util/Seccomp.h"
  18. #include "util/Bits.h"
  19. #include "util/ArchInfo.h"
  20. // getpriority()
  21. #include <sys/resource.h>
  22. #include <signal.h>
  23. #include <sys/prctl.h>
  24. #include <errno.h>
  25. #include <linux/filter.h>
  26. #include <linux/seccomp.h>
  27. #include <linux/audit.h>
  28. #include <sys/syscall.h>
  29. #include <sys/socket.h>
  30. #include <sys/ioctl.h>
  31. #include <stddef.h>
  32. #include <stdio.h>
  33. #include <string.h>
  34. static void catchViolation(int sig, siginfo_t* si, void* threadContext)
  35. {
  36. printf("Attempted banned syscall number [%d] see doc/Seccomp.md for more information\n",
  37. si->si_value.sival_int);
  38. Assert_failure("Disallowed Syscall");
  39. }
  /**
   * One entry of the symbolic filter program which compile() turns into a sock_fprog.
   * An entry is either a label (label != 0, emits nothing) or an instruction (label == 0).
   */
  struct Filter {
      /** Non-zero marks this entry as a jump target; zero means the entry is an instruction. */
      int label;

      /** Label to branch to when the comparison holds, 0 if none or already resolved. */
      int jt;

      /** Label to branch to when the comparison does not hold, 0 if none or already resolved. */
      int jf;

      /** The BPF instruction copied to the output for non-label entries. */
      struct sock_filter sf;
  };
  46. static struct sock_fprog* compile(struct Filter* input, int inputLen, struct Allocator* alloc)
  47. {
  48. // compute gotos
  49. int totalOut = 0;
  50. for (int i = inputLen-1; i >= 0; i--) {
  51. struct Filter* a = &input[i];
  52. if (a->label == 0) {
  53. // check for unresolved gotos...
  54. Assert_true(a->jt == 0 && a->jf == 0);
  55. totalOut++;
  56. continue;
  57. }
  58. int diff = 0;
  59. for (int j = i-1; j >= 0; j--) {
  60. struct Filter* b = &input[j];
  61. if (b->label != 0) { continue; }
  62. if (b->jt == a->label) {
  63. b->sf.jt = diff;
  64. b->jt = 0;
  65. }
  66. if (b->jf == a->label) {
  67. b->sf.jf = diff;
  68. b->jf = 0;
  69. }
  70. diff++;
  71. }
  72. }
  73. // copy into output filter array...
  74. struct sock_filter* sf = Allocator_calloc(alloc, sizeof(struct sock_filter), totalOut);
  75. int outI = 0;
  76. for (int i = 0; i < inputLen; i++) {
  77. if (input[i].label == 0) {
  78. Bits_memcpyConst(&sf[outI++], &input[i].sf, sizeof(struct sock_filter));
  79. }
  80. Assert_true(outI <= totalOut);
  81. Assert_true(i != inputLen-1 || outI == totalOut);
  82. }
  83. struct sock_fprog* out = Allocator_malloc(alloc, sizeof(struct sock_fprog));
  84. out->len = (unsigned short) totalOut;
  85. out->filter = sf;
  86. return out;
  87. }
  // Return values for the filter program.
  // RET_ERRNO(x) combines the 0x00050000 code with the low 16 bits of x.
  #define RET_TRAP     0x00030000u
  #define RET_ERRNO(x) (((x) & 0x0000ffffu) | 0x00050000u)
  #define RET_SUCCESS  0x7fff0000u
  91. static struct sock_fprog* mkFilter(struct Allocator* alloc, struct Except* eh)
  92. {
  93. // Adding exceptions to the syscall filter:
  94. //
  95. // echo '#include <sys/syscall.h>' | gcc -E -dM - | grep 'define __NR_' | sort
  96. // for the full list of system calls with syscall numbers (different per ABI)
  97. //
  98. // If gdb traps out it will look like this:
  99. //
  100. // Program received signal SIGSYS, Bad system call.
  101. // [Switching to Thread 0x7ffff7fdd740 (LWP 14673)]
  102. // 0x00007ffff74d1caa in mmap64 () at ../sysdeps/unix/syscall-template.S:81
  103. // 81 ../sysdeps/unix/syscall-template.S: No such file or directory.
  104. //
  105. // %eax should contain the system call number (on different ABIs YMMV)
  106. //
  107. // (gdb) print $eax
  108. // $1 = 9
  109. // (gdb)
  110. //
  111. // Consult your syscall table from the above gcc command...
  112. //
  113. // #define __NR_mmap 9
  114. //
  115. // Then add:
  116. //
  117. // IFEQ(__NR_mmap, success),
  118. //
  119. // And add a comment documenting where you needed that syscall :)
  120. #define STMT(code, val) { .sf = BPF_STMT(code, val) }
  121. #define JMPK(type, not, input, label) { \
  122. .sf = BPF_JUMP(BPF_JMP+(type)+BPF_K, (input), 0, 0), \
  123. .jt = (!(not) ? (label) : 0), \
  124. .jf = ((not) ? (label) : 0) \
  125. }
  126. // Create a label for jumps, the label must be represented by a non-zero integer.
  127. #define LABEL(lbl) { .label = (lbl) }
  128. // Load offset into the register
  129. #define LOAD(offset) STMT(BPF_LD+BPF_W+BPF_ABS, (offset))
  130. // Return constant value
  131. #define RET(val) STMT(BPF_RET+BPF_K, (val))
  132. // If-equal if the currently loaded value equals input, jump to label.
  133. #define IFEQ(input, label) JMPK(BPF_JEQ, 0, (input), (label))
  134. // If-not-equal if the currently loaded value is not equal to input, jump to label.
  135. #define IFNE(input, label) JMPK(BPF_JEQ, 1, (input), (label))
  136. // If-greater-than
  137. #define IFGT(input, label) JMPK(BPF_JGT, 0, (input), (label))
  138. // If-greater-than-or-equal-to
  139. #define IFGE(input, label) JMPK(BPF_JGE, 0, (input), (label))
  140. // If-less-than
  141. #define IFLT(input, label) JMPK(BPF_JGE, 1, (input), (label))
  142. // If-less-than-or-equal-to
  143. #define IFLE(input, label) JMPK(BPF_JGT, 1, (input), (label))
  144. // labels are integers so they must be predefined
  145. int success = 1;
  146. int fail = 2;
  147. int unmaskOnly = 3;
  148. int isworking = 4;
  149. int socket_setip = 5;
  150. int ioctl_setip = 6;
  151. uint32_t auditArch = ArchInfo_getAuditArch();
  152. struct Filter seccompFilter[] = {
  153. LOAD(offsetof(struct seccomp_data, arch)),
  154. IFNE(auditArch, fail),
  155. // Get the syscall num.
  156. LOAD(offsetof(struct seccomp_data, nr)),
  157. // udp
  158. #ifdef __NR_sendmsg
  159. IFEQ(__NR_sendmsg, success),
  160. #endif
  161. #ifdef __NR_recvmsg
  162. IFEQ(__NR_recvmsg, success),
  163. #endif
  164. // ETHInterface
  165. #ifdef __NR_sendto
  166. IFEQ(__NR_sendto, success),
  167. #endif
  168. #ifdef __NR_recvfrom
  169. IFEQ(__NR_recvfrom, success),
  170. #endif
  171. #ifdef __NR_socketcall
  172. // 32-bit: recvmsg is a socketcall
  173. IFEQ(__NR_socketcall, success),
  174. #endif
  175. // libuv
  176. IFEQ(__NR_epoll_ctl, success),
  177. IFEQ(__NR_epoll_wait, success),
  178. // TUN (and logging)
  179. IFEQ(__NR_write, success),
  180. IFEQ(__NR_read, success),
  181. // modern librt reads a read-only mapped section of kernel space which contains the time
  182. // older versions need system calls for getting the time.
  183. // i686 glibc-2.18's time() uses __NR_time
  184. // Raspberry Pi and BeagleBone Black don't provide __NR_time
  185. IFEQ(__NR_clock_gettime, success),
  186. #ifdef __NR_time
  187. IFEQ(__NR_time, success),
  188. #endif
  189. // malloc()
  190. IFEQ(__NR_brk, success),
  191. // abort()
  192. IFEQ(__NR_gettid, success),
  193. IFEQ(__NR_tgkill, success),
  194. IFEQ(__NR_rt_sigprocmask, unmaskOnly),
  195. // exit()
  196. IFEQ(__NR_exit_group, success),
  197. // Seccomp_isWorking()
  198. IFEQ(__NR_getpriority, isworking),
  199. // Securiy_checkPermissions() -> canOpenFiles()
  200. IFEQ(__NR_dup, success),
  201. IFEQ(__NR_close, success),
  202. // Security_checkPermissions() -> getMaxMem()
  203. // x86/ARM use ugetrlimit and mmap2
  204. // ARM does not even have __NR_getrlimit or __NR_mmap defined
  205. // and AMD64 does not have __NR_ugetrlimit or __NR_mmap2 defined
  206. #ifdef __NR_getrlimit
  207. IFEQ(__NR_getrlimit, success),
  208. #endif
  209. #ifdef __NR_ugetrlimit
  210. IFEQ(__NR_ugetrlimit, success),
  211. #endif
  212. #ifdef __NR_mmap
  213. IFEQ(__NR_mmap, success),
  214. #endif
  215. #ifdef __NR_mmap2
  216. IFEQ(__NR_mmap2, success),
  217. #endif
  218. IFEQ(__NR_munmap, success),
  219. // printf()
  220. IFEQ(__NR_fstat, success),
  221. // for setting IP addresses...
  222. // socketForIfName()
  223. #ifdef __NR_socket
  224. IFEQ(__NR_socket, socket_setip),
  225. #endif
  226. IFEQ(__NR_ioctl, ioctl_setip),
  227. RET(SECCOMP_RET_TRAP),
  228. LABEL(socket_setip),
  229. LOAD(offsetof(struct seccomp_data, args[1])),
  230. IFEQ(SOCK_DGRAM, success),
  231. RET(SECCOMP_RET_TRAP),
  232. LABEL(ioctl_setip),
  233. LOAD(offsetof(struct seccomp_data, args[1])),
  234. IFEQ(SIOCGIFINDEX, success),
  235. IFEQ(SIOCGIFFLAGS, success),
  236. IFEQ(SIOCSIFFLAGS, success),
  237. IFEQ(SIOCSIFADDR, success),
  238. IFEQ(SIOCSIFNETMASK, success),
  239. IFEQ(SIOCSIFMTU, success),
  240. RET(SECCOMP_RET_TRAP),
  241. // We allow sigprocmask to *unmask* signals but we don't allow it to mask them.
  242. LABEL(unmaskOnly),
  243. LOAD(offsetof(struct seccomp_data, args[0])),
  244. IFEQ(SIG_UNBLOCK, success),
  245. RET(SECCOMP_RET_TRAP),
  246. LABEL(isworking),
  247. RET(RET_ERRNO(9000)),
  248. LABEL(fail),
  249. RET(SECCOMP_RET_TRAP),
  250. LABEL(success),
  251. RET(SECCOMP_RET_ALLOW),
  252. };
  253. return compile(seccompFilter, sizeof(seccompFilter)/sizeof(seccompFilter[0]), alloc);
  254. }
  255. static void installFilter(struct sock_fprog* filter, struct Log* logger, struct Except* eh)
  256. {
  257. struct sigaction sa = { .sa_sigaction = catchViolation, .sa_flags = SA_SIGINFO };
  258. if (sigaction(SIGSYS, &sa, NULL)) {
  259. Log_warn(logger, "sigaction(SIGSYS) -> [%s]\n", strerror(errno));
  260. }
  261. if (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0) == -1) {
  262. // don't worry about it.
  263. Log_warn(logger, "prctl(PR_SET_NO_NEW_PRIVS) -> [%s]\n", strerror(errno));
  264. }
  265. if (prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, filter) == -1) {
  266. Except_throw(eh, "prctl(PR_SET_SECCOMP) -> [%s]\n", strerror(errno));
  267. }
  268. }
  /**
   * Build and install the seccomp filter, then verify that it is really in effect.
   *
   * @param tempAlloc allocator used for the filter program.
   * @param logger receives warnings emitted while installing.
   * @param eh used to throw if installation fails or Seccomp_isWorking() reports false.
   */
  void Seccomp_dropPermissions(struct Allocator* tempAlloc, struct Log* logger, struct Except* eh)
  {
      installFilter(mkFilter(tempAlloc, eh), logger, eh);

      if (Seccomp_isWorking()) {
          return;
      }
      Except_throw(eh, "Seccomp filter not installed properly, Seccomp_isWorking() -> false");
  }
  /**
   * Probe whether the seccomp filter is active by making a getpriority() call which the
   * filter answers with a recognizable errno instead of letting it through.
   *
   * @return 1 if the answer came from the filter, 0 otherwise.
   */
  int Seccomp_isWorking()
  {
      enum {
          // The errno which the filter returns for getpriority().
          PROBE_ERRNO = 9000,
          // Kernels which cap the errno of a SECCOMP_RET_ERRNO result hand back this maximum
          // instead of PROBE_ERRNO, no real system call fails with it.
          CAPPED_ERRNO = 4095
      };

      errno = 0;
      // If seccomp is not working, this will fail setting errno to EINVAL
      long ret = getpriority(1000, 1);
      // Take a copy so that nothing below can disturb the value we are about to examine.
      int err = errno;

      // Depending on the kernel the out-of-range errno is passed through as errno, capped to
      // the maximum errno or treated as the return value, 9000 is very unique so every one of
      // these cases is accepted.
      if (ret == -1) {
          return err == PROBE_ERRNO || err == CAPPED_ERRNO;
      }
      return ret == -PROBE_ERRNO && err == 0;
  }
  /**
   * @return always 1 in this implementation.
   *         NOTE(review): presumably tells callers that seccomp support is compiled in, confirm
   *         against the header and any alternative implementations.
   */
  int Seccomp_exists()
  {
      enum { AVAILABLE = 1 };
      return AVAILABLE;
  }