cgroups-bpf.c 13 KB


  1. /*
  2. * Copyright (C) 2021 Daniel Golle <daniel@makrotopia.org>
  3. *
  4. * This program is free software; you can redistribute it and/or modify
  5. * it under the terms of the GNU Lesser General Public License version 2.1
  6. * as published by the Free Software Foundation
  7. *
  8. * This program is distributed in the hope that it will be useful,
  9. * but WITHOUT ANY WARRANTY; without even the implied warranty of
  10. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  11. * GNU General Public License for more details.
  12. *
  13. * somehow emulate devices.allow/devices.deny using eBPF
  14. *
  15. * OCI run-time spec defines the syntax for allowing/denying access
  16. * to devices according to the definition of cgroup-v1 in the Kernel
  17. * as described in Documentation/admin-guide/cgroup-v1.
  18. */
  #include <assert.h>
  #include <errno.h>
  #include <stdbool.h>
  #include <stdint.h>
  #include <stdio.h>
  #include <stdlib.h>
  #include <string.h>
  #include <unistd.h>

  #include <linux/bpf.h>
  #ifdef __GLIBC__
  #include <sys/cdefs.h>
  #else
  #include <sys/reg.h>
  #endif
  #include <sys/syscall.h>

  #include <libubox/blobmsg.h>
  #include <libubox/blobmsg_json.h>
  #include <libubox/list.h>

  #include "cgroups.h"
  #include "cgroups-bpf.h"
  #include "log.h"
  /*
   * Module state: written by parseOCIlinuxcgroups_devices() and consumed by
   * attach_cgroups_ebpf(). Objects with static storage duration start out
   * zeroed, so no explicit initializers are needed for the first two.
   */
  /* generated eBPF instructions; NULL as long as no program has been built */
  static struct bpf_insn *program;
  /* number of instructions stored in 'program' */
  static int bpf_total_insn;
  /* license string passed to the kernel when the program is loaded */
  static const char *license = "GPL";
  /*
   * Issue the raw bpf system call.
   *
   * cmd:  BPF command number
   * attr: command-specific attribute union
   * size: number of bytes of 'attr' handed to the kernel
   *
   * Returns the result of syscall() narrowed to int.
   */
  static int
  syscall_bpf (int cmd, union bpf_attr *attr, unsigned int size)
  {
  	long rc = syscall (__NR_bpf, cmd, attr, size);

  	return (int) rc;
  }
  /* from crun/src/libcrun/ebpf.c */
  /*
   * Instruction builders: each macro expands to a compound literal of type
   * struct bpf_insn with all five fields (code, dst_reg, src_reg, off, imm)
   * set explicitly.
   */

  /* ALU-class operation OP on register DST with immediate operand IMM */
  #define BPF_ALU32_IMM(OP, DST, IMM) \
  	((struct bpf_insn){ \
  		.code = BPF_ALU | BPF_OP (OP) | BPF_K, \
  		.dst_reg = (DST), \
  		.src_reg = 0, \
  		.off = 0, \
  		.imm = (IMM) })

  /* load SIZE-wide value from memory at SRC + OFF into DST */
  #define BPF_LDX_MEM(SIZE, DST, SRC, OFF) \
  	((struct bpf_insn){ \
  		.code = BPF_LDX | BPF_SIZE (SIZE) | BPF_MEM, \
  		.dst_reg = (DST), \
  		.src_reg = (SRC), \
  		.off = (OFF), \
  		.imm = 0 })

  /* ALU64-class move of register SRC into register DST */
  #define BPF_MOV64_REG(DST, SRC) \
  	((struct bpf_insn){ \
  		.code = BPF_ALU64 | BPF_MOV | BPF_X, \
  		.dst_reg = (DST), \
  		.src_reg = (SRC), \
  		.off = 0, \
  		.imm = 0 })

  /* unconditional jump by OFF instructions */
  #define BPF_JMP_A(OFF) \
  	((struct bpf_insn){ \
  		.code = BPF_JMP | BPF_JA, \
  		.dst_reg = 0, \
  		.src_reg = 0, \
  		.off = (OFF), \
  		.imm = 0 })

  /* jump by OFF instructions if comparison OP of DST against IMM holds */
  #define BPF_JMP_IMM(OP, DST, IMM, OFF) \
  	((struct bpf_insn){ \
  		.code = BPF_JMP | BPF_OP (OP) | BPF_K, \
  		.dst_reg = (DST), \
  		.src_reg = 0, \
  		.off = (OFF), \
  		.imm = (IMM) })

  /* jump by OFF instructions if comparison OP of DST against SRC holds */
  #define BPF_JMP_REG(OP, DST, SRC, OFF) \
  	((struct bpf_insn){ \
  		.code = BPF_JMP | BPF_OP (OP) | BPF_X, \
  		.dst_reg = (DST), \
  		.src_reg = (SRC), \
  		.off = (OFF), \
  		.imm = 0 })

  /* ALU64-class move of immediate IMM into register DST */
  #define BPF_MOV64_IMM(DST, IMM) \
  	((struct bpf_insn){ \
  		.code = BPF_ALU64 | BPF_MOV | BPF_K, \
  		.dst_reg = (DST), \
  		.src_reg = 0, \
  		.off = 0, \
  		.imm = (IMM) })

  /* ALU-class move of register SRC into register DST */
  #define BPF_MOV32_REG(DST, SRC) \
  	((struct bpf_insn){ \
  		.code = BPF_ALU | BPF_MOV | BPF_X, \
  		.dst_reg = (DST), \
  		.src_reg = (SRC), \
  		.off = 0, \
  		.imm = 0 })

  /* leave the program */
  #define BPF_EXIT_INSN() \
  	((struct bpf_insn){ \
  		.code = BPF_JMP | BPF_EXIT, \
  		.dst_reg = 0, \
  		.src_reg = 0, \
  		.off = 0, \
  		.imm = 0 })
  61. /* taken from systemd. */
  62. static const struct bpf_insn pre_insn[] = {
  63. /* type -> R2. */
  64. BPF_LDX_MEM (BPF_W, BPF_REG_2, BPF_REG_1, 0),
  65. BPF_ALU32_IMM (BPF_AND, BPF_REG_2, 0xFFFF),
  66. /* access -> R3. */
  67. BPF_LDX_MEM (BPF_W, BPF_REG_3, BPF_REG_1, 0),
  68. BPF_ALU32_IMM (BPF_RSH, BPF_REG_3, 16),
  69. /* major -> R4. */
  70. BPF_LDX_MEM (BPF_W, BPF_REG_4, BPF_REG_1, 4),
  71. /* minor -> R5. */
  72. BPF_LDX_MEM (BPF_W, BPF_REG_5, BPF_REG_1, 8),
  73. };
  /*
   * Indices into the attribute table filled by blobmsg_parse() for one
   * entry of the OCI devices list; the last enumerator is the table size.
   */
  enum {
  	OCI_LINUX_CGROUPS_DEVICES_ALLOW = 0,
  	OCI_LINUX_CGROUPS_DEVICES_TYPE = 1,
  	OCI_LINUX_CGROUPS_DEVICES_MAJOR = 2,
  	OCI_LINUX_CGROUPS_DEVICES_MINOR = 3,
  	OCI_LINUX_CGROUPS_DEVICES_ACCESS = 4,
  	__OCI_LINUX_CGROUPS_DEVICES_MAX = 5,
  };
  82. static const struct blobmsg_policy oci_linux_cgroups_devices_policy[] = {
  83. [OCI_LINUX_CGROUPS_DEVICES_ALLOW] = { "allow", BLOBMSG_TYPE_BOOL },
  84. [OCI_LINUX_CGROUPS_DEVICES_TYPE] = { "type", BLOBMSG_TYPE_STRING },
  85. [OCI_LINUX_CGROUPS_DEVICES_MAJOR] = { "major", BLOBMSG_CAST_INT64 },
  86. [OCI_LINUX_CGROUPS_DEVICES_MINOR] = { "minor", BLOBMSG_CAST_INT64 },
  87. [OCI_LINUX_CGROUPS_DEVICES_ACCESS] = { "access", BLOBMSG_TYPE_STRING },
  88. };
  /*
   * The cgroup-v1 devices controller has a (default) behaviour plus a list
   * of exceptions. The data types below are modelled after the legacy
   * kernel code.
   */

  /* device type mask matching both block and character devices */
  #define DEVCG_DEV_ALL \
  	(BPF_DEVCG_DEV_BLOCK | BPF_DEVCG_DEV_CHAR)

  /* access mask with the read, write and mknod bits all set */
  #define DEVCG_ACC_ALL \
  	(BPF_DEVCG_ACC_READ | BPF_DEVCG_ACC_WRITE | BPF_DEVCG_ACC_MKNOD)

  /* verdict applied when no exception matches */
  enum devcg_behavior {
  	DEVCG_DEFAULT_NONE = 0,
  	DEVCG_DEFAULT_ALLOW = 1,
  	DEVCG_DEFAULT_DENY = 2,
  };
  100. struct dev_exception_item {
  101. uint32_t major, minor;
  102. short type;
  103. short access;
  104. struct list_head list;
  105. bool allow;
  106. };
  107. /*
  108. * add a bunch of default rules
  109. */
  110. static int add_default_exceptions(struct list_head *exceptions)
  111. {
  112. int i, ret = 0;
  113. struct dev_exception_item *cur;
  114. /* from crun/src/libcrun/cgroup.c */
  115. const struct dev_exception_item defrules[] = {
  116. /* always allow mknod */
  117. { .allow = true, .type = BPF_DEVCG_DEV_CHAR, .major = ~0, .minor = ~0, .access = BPF_DEVCG_ACC_MKNOD },
  118. { .allow = true, .type = BPF_DEVCG_DEV_BLOCK, .major = ~0, .minor = ~0, .access = BPF_DEVCG_ACC_MKNOD },
  119. /* /dev/null */
  120. { .allow = true, .type = BPF_DEVCG_DEV_CHAR, .major = 1, .minor = 3, .access = DEVCG_ACC_ALL },
  121. /* /dev/random */
  122. { .allow = true, .type = BPF_DEVCG_DEV_CHAR, .major = 1, .minor = 8, .access = DEVCG_ACC_ALL },
  123. /* /dev/full */
  124. { .allow = true, .type = BPF_DEVCG_DEV_CHAR, .major = 1, .minor = 7, .access = DEVCG_ACC_ALL },
  125. /* /dev/tty */
  126. { .allow = true, .type = BPF_DEVCG_DEV_CHAR, .major = 5, .minor = 0, .access = DEVCG_ACC_ALL },
  127. /* /dev/zero */
  128. { .allow = true, .type = BPF_DEVCG_DEV_CHAR, .major = 1, .minor = 5, .access = DEVCG_ACC_ALL },
  129. /* /dev/urandom */
  130. { .allow = true, .type = BPF_DEVCG_DEV_CHAR, .major = 1, .minor = 9, .access = DEVCG_ACC_ALL },
  131. /* /dev/console */
  132. { .allow = true, .type = BPF_DEVCG_DEV_CHAR, .major = 5, .minor = 1, .access = DEVCG_ACC_ALL },
  133. /* /dev/pts/[0-255] */
  134. { .allow = true, .type = BPF_DEVCG_DEV_CHAR, .major = 136, .minor = ~0, .access = DEVCG_ACC_ALL },
  135. /* /dev/ptmx */
  136. { .allow = true, .type = BPF_DEVCG_DEV_CHAR, .major = 5, .minor = 2, .access = DEVCG_ACC_ALL },
  137. /* /dev/net/tun */
  138. { .allow = true, .type = BPF_DEVCG_DEV_CHAR, .major = 10, .minor = 200, .access = DEVCG_ACC_ALL },
  139. };
  140. for (i = 0; i < (sizeof(defrules) / sizeof(struct dev_exception_item)); ++i) {
  141. cur = malloc(sizeof(struct dev_exception_item));
  142. if (!cur) {
  143. ret = ENOMEM;
  144. break;
  145. }
  146. /* add defaults to list in reverse order (last item will be first in list) */
  147. memcpy(cur, &defrules[i], sizeof(struct dev_exception_item));
  148. list_add(&cur->list, exceptions);
  149. }
  150. return ret;
  151. }
  152. /*
  153. * free all exceptions in the list
  154. */
  155. static void flush_exceptions(struct list_head *freelist)
  156. {
  157. struct dev_exception_item *dl, *dln;
  158. if (!list_empty(freelist))
  159. list_for_each_entry_safe(dl, dln, freelist, list) {
  160. list_del(&dl->list);
  161. free(dl);
  162. }
  163. }
  164. /*
  165. * parse OCI cgroups devices and translate into cgroups-v2 eBPF program
  166. */
  167. int parseOCIlinuxcgroups_devices(struct blob_attr *msg)
  168. {
  169. struct blob_attr *tb[__OCI_LINUX_CGROUPS_DEVICES_MAX];
  170. struct blob_attr *cur;
  171. int rem, ret = 0;
  172. int bpf_type, bpf_access;
  173. unsigned char acidx;
  174. bool allow = false,
  175. has_access = false,
  176. has_type = false,
  177. has_major = false,
  178. has_minor = false;
  179. int total_ins = 0,
  180. cur_ins = 0,
  181. pre_insn_len = sizeof(pre_insn) / sizeof(struct bpf_insn),
  182. next_ins;
  183. char *access, *devtype;
  184. uint32_t devmajor, devminor;
  185. struct dev_exception_item *dl;
  186. struct list_head exceptions;
  187. enum devcg_behavior behavior = DEVCG_DEFAULT_ALLOW;
  188. INIT_LIST_HEAD(&exceptions);
  189. /* parse according to OCI spec */
  190. blobmsg_for_each_attr(cur, msg, rem) {
  191. blobmsg_parse(oci_linux_cgroups_devices_policy, __OCI_LINUX_CGROUPS_DEVICES_MAX,
  192. tb, blobmsg_data(cur), blobmsg_len(cur));
  193. if (!tb[OCI_LINUX_CGROUPS_DEVICES_ALLOW]) {
  194. ret = EINVAL;
  195. goto out;
  196. }
  197. allow = blobmsg_get_bool(tb[OCI_LINUX_CGROUPS_DEVICES_ALLOW]);
  198. bpf_access = 0;
  199. if (tb[OCI_LINUX_CGROUPS_DEVICES_ACCESS]) {
  200. access = blobmsg_get_string(tb[OCI_LINUX_CGROUPS_DEVICES_ACCESS]);
  201. if ((strlen(access) > 3) || (strlen(access) == 0)) {
  202. ret = EINVAL;
  203. goto out;
  204. }
  205. for (acidx = 0; acidx < strlen(access); ++acidx) {
  206. switch (access[acidx]) {
  207. case 'r':
  208. bpf_access |= BPF_DEVCG_ACC_READ;
  209. break;
  210. case 'w':
  211. bpf_access |= BPF_DEVCG_ACC_WRITE;
  212. break;
  213. case 'm':
  214. bpf_access |= BPF_DEVCG_ACC_MKNOD;
  215. break;
  216. default:
  217. ret = EINVAL;
  218. goto out;
  219. }
  220. }
  221. }
  222. if (!bpf_access)
  223. bpf_access = DEVCG_ACC_ALL;
  224. bpf_type = 0;
  225. if (tb[OCI_LINUX_CGROUPS_DEVICES_TYPE]) {
  226. devtype = blobmsg_get_string(tb[OCI_LINUX_CGROUPS_DEVICES_TYPE]);
  227. switch (devtype[0]) {
  228. case 'c':
  229. bpf_type = BPF_DEVCG_DEV_CHAR;
  230. break;
  231. case 'b':
  232. bpf_type = BPF_DEVCG_DEV_BLOCK;
  233. break;
  234. case 'a':
  235. bpf_type = DEVCG_DEV_ALL;
  236. break;
  237. default:
  238. ret = EINVAL;
  239. goto out;
  240. }
  241. }
  242. if (!bpf_type)
  243. bpf_type = DEVCG_DEV_ALL;
  244. if (tb[OCI_LINUX_CGROUPS_DEVICES_MAJOR])
  245. devmajor = blobmsg_cast_u64(tb[OCI_LINUX_CGROUPS_DEVICES_MAJOR]);
  246. else
  247. devmajor = ~0;
  248. if (tb[OCI_LINUX_CGROUPS_DEVICES_MINOR])
  249. devminor = blobmsg_cast_u64(tb[OCI_LINUX_CGROUPS_DEVICES_MINOR]);
  250. else
  251. devminor = ~0;
  252. if (bpf_type == DEVCG_DEV_ALL) {
  253. /* wildcard => change default policy and flush all existing rules */
  254. flush_exceptions(&exceptions);
  255. behavior = allow?DEVCG_DEFAULT_ALLOW:DEVCG_DEFAULT_DENY;
  256. } else {
  257. /* allocate and populate record for exception */
  258. dl = malloc(sizeof(struct dev_exception_item));
  259. if (!dl) {
  260. ret = ENOSPC;
  261. break;
  262. }
  263. dl->allow = allow;
  264. dl->type = bpf_type;
  265. dl->access = bpf_access;
  266. dl->major = devmajor;
  267. dl->minor = devminor;
  268. /* push to exceptions list, last goes first */
  269. list_add(&dl->list, &exceptions);
  270. }
  271. }
  272. if (ret)
  273. goto out;
  274. /* add default rules */
  275. ret = add_default_exceptions(&exceptions);
  276. if (ret)
  277. goto out;
  278. /* calculate number of instructions to allocate */
  279. list_for_each_entry(dl, &exceptions, list) {
  280. has_access = dl->access != DEVCG_ACC_ALL;
  281. has_type = dl->type != DEVCG_DEV_ALL;
  282. has_major = dl->major != ~0;
  283. has_minor = dl->minor != ~0;
  284. total_ins += (has_type ? 1 : 0) + (has_access ? 3 : 0) + (has_major ? 1 : 0) + (has_minor ? 1 : 0) + 2;
  285. }
  286. /* acccount for loader instructions */
  287. total_ins += pre_insn_len;
  288. /* final accept/deny block */
  289. total_ins += 2;
  290. /* allocate memory for eBPF program */
  291. program = calloc(total_ins, sizeof(struct bpf_insn));
  292. if (!program) {
  293. ret = ENOMEM;
  294. goto out;
  295. }
  296. /* copy program loader instructions */
  297. memcpy(program, &pre_insn, sizeof(pre_insn));
  298. cur_ins = pre_insn_len;
  299. /* generate eBPF program */
  300. list_for_each_entry(dl, &exceptions, list) {
  301. has_access = dl->access != DEVCG_ACC_ALL;
  302. has_type = dl->type != DEVCG_DEV_ALL;
  303. has_major = dl->major != ~0;
  304. has_minor = dl->minor != ~0;
  305. next_ins = (has_type ? 1 : 0) + (has_access ? 3 : 0) + (has_major ? 1 : 0) + (has_minor ? 1 : 0) + 1;
  306. if (has_type) {
  307. program[cur_ins++] = BPF_JMP_IMM(BPF_JNE, BPF_REG_2, dl->type, next_ins);
  308. --next_ins;
  309. }
  310. if (has_access) {
  311. program[cur_ins++] = BPF_MOV32_REG(BPF_REG_1, BPF_REG_3);
  312. program[cur_ins++] = BPF_ALU32_IMM(BPF_AND, BPF_REG_1, dl->access);
  313. program[cur_ins++] = BPF_JMP_REG(BPF_JNE, BPF_REG_1, BPF_REG_3, next_ins - 2);
  314. next_ins -= 3;
  315. }
  316. if (has_major) {
  317. program[cur_ins++] = BPF_JMP_IMM(BPF_JNE, BPF_REG_4, dl->major, next_ins);
  318. --next_ins;
  319. }
  320. if (has_minor) {
  321. program[cur_ins++] = BPF_JMP_IMM(BPF_JNE, BPF_REG_5, dl->minor, next_ins);
  322. --next_ins;
  323. }
  324. program[cur_ins++] = BPF_MOV64_IMM(BPF_REG_0, dl->allow ? 1 : 0);
  325. program[cur_ins++] = BPF_EXIT_INSN();
  326. }
  327. /* default behavior */
  328. program[cur_ins++] = BPF_MOV64_IMM(BPF_REG_0, (behavior == DEVCG_DEFAULT_ALLOW)?1:0);
  329. program[cur_ins++] = BPF_EXIT_INSN();
  330. if (debug) {
  331. fprintf(stderr, "cgroup devices:\na > devices.%s\n",
  332. (behavior == DEVCG_DEFAULT_ALLOW)?"allow":"deny");
  333. list_for_each_entry(dl, &exceptions, list)
  334. fprintf(stderr, "%c %d:%d %s%s%s > devices.%s\n",
  335. (dl->type == DEVCG_DEV_ALL)?'a':
  336. (dl->type == BPF_DEVCG_DEV_CHAR)?'c':'b',
  337. (dl->major == ~0)?-1:dl->major,
  338. (dl->minor == ~0)?-1:dl->minor,
  339. (dl->access & BPF_DEVCG_ACC_READ)?"r":"",
  340. (dl->access & BPF_DEVCG_ACC_WRITE)?"w":"",
  341. (dl->access & BPF_DEVCG_ACC_MKNOD)?"m":"",
  342. (dl->allow)?"allow":"deny");
  343. fprintf(stderr, "generated cgroup-devices eBPF program:\n");
  344. fprintf(stderr, " [idx]\tcode\t dest\t src\t off\t imm\n");
  345. for (cur_ins=0; cur_ins<total_ins; cur_ins++)
  346. fprintf(stderr, " [%03d]\t%02hhx\t%3hhu\t%3hhu\t%04hx\t%d\n", cur_ins,
  347. program[cur_ins].code,
  348. program[cur_ins].dst_reg,
  349. program[cur_ins].src_reg,
  350. program[cur_ins].off,
  351. program[cur_ins].imm);
  352. }
  353. assert(cur_ins == total_ins);
  354. bpf_total_insn = total_ins;
  355. ret = 0;
  356. out:
  357. flush_exceptions(&exceptions);
  358. return ret;
  359. }
  360. /*
  361. * attach eBPF program to cgroup
  362. */
  363. int attach_cgroups_ebpf(int cgroup_dirfd) {
  364. int prog_fd;
  365. #if ( __WORDSIZE == 64 )
  366. uint64_t program_ptr = (uint64_t)program;
  367. uint64_t license_ptr = (uint64_t)license;
  368. #elif ( __WORDSIZE == 32 )
  369. uint32_t program_ptr = (uint32_t)program;
  370. uint32_t license_ptr = (uint32_t)license;
  371. #else
  372. #error
  373. #endif
  374. union bpf_attr load_attr = {
  375. .prog_type = BPF_PROG_TYPE_CGROUP_DEVICE,
  376. .license = license_ptr,
  377. .insns = program_ptr,
  378. .insn_cnt = bpf_total_insn,
  379. };
  380. if (!program)
  381. return 0;
  382. prog_fd = syscall_bpf(BPF_PROG_LOAD, &load_attr, sizeof(load_attr));
  383. if (prog_fd < 0)
  384. return EIO;
  385. union bpf_attr attach_attr = {
  386. .attach_type = BPF_CGROUP_DEVICE,
  387. .target_fd = cgroup_dirfd,
  388. .attach_bpf_fd = prog_fd,
  389. };
  390. return syscall_bpf(BPF_PROG_ATTACH, &attach_attr, sizeof (attach_attr));
  391. }