2
0

cgroups.c 25 KB


  1. /*
  2. * Copyright (C) 2020 Daniel Golle <daniel@makrotopia.org>
  3. *
  4. * This program is free software; you can redistribute it and/or modify
  5. * it under the terms of the GNU Lesser General Public License version 2.1
  6. * as published by the Free Software Foundation
  7. *
  8. * This program is distributed in the hope that it will be useful,
  9. * but WITHOUT ANY WARRANTY; without even the implied warranty of
  10. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  11. * GNU General Public License for more details.
  12. *
  13. * reads unified cgroup config as proposed in
  14. * https://github.com/opencontainers/runtime-spec/pull/1040
  15. * attempt conversion from cgroup1 -> cgroup2
  16. * https://github.com/containers/crun/blob/0.14.1/crun.1.md#cgroup-v2
  17. *
  18. * ToDo:
  19. * - convert cgroup1 net_prio and net_cls to eBPF program
  20. * - rdma (anyone?) intelrdt (anyone?)
  21. */
  22. #define _GNU_SOURCE
  23. #include <errno.h>
  24. #include <fcntl.h>
  25. #include <stdlib.h>
  26. #include <stdio.h>
  27. #include <string.h>
  28. #include <sys/stat.h>
  29. #include <sys/mman.h>
  30. #include <unistd.h>
  31. #include <libgen.h>
  32. #include <inttypes.h>
  33. #include <libubox/avl.h>
  34. #include <libubox/avl-cmp.h>
  35. #include <libubox/blobmsg.h>
  36. #include <libubox/list.h>
  37. #include <libubox/utils.h>
  38. #include "log.h"
  39. #include "cgroups.h"
  40. #include "cgroups-bpf.h"
/*
 * Root of the cgroup hierarchy. cgroups_apply() derives the offset at which
 * it starts walking cgroup_path from the length of this string, so the
 * trailing slash is significant.
 */
#define CGROUP_ROOT "/sys/fs/cgroup/"
/* Largest blockIO weight accepted; larger values are rejected with ERANGE. */
#define CGROUP_IO_WEIGHT_MAX 10000
/*
 * One pending cgroup2 assignment. The AVL key is a heap-allocated copy of
 * the attribute file name; val is a heap-allocated copy of the text that
 * cgroups_apply() writes into that file.
 */
struct cgval {
	struct avl_node avl;	/* tree linkage, avl.key holds the file name */
	char *val;		/* value to write */
};
  47. struct avl_tree cgvals;
  48. static char *cgroup_path;
  49. static bool initialized;
  50. void cgroups_prepare(void) {
  51. initialized = false;
  52. }
  53. void cgroups_init(const char *p) {
  54. avl_init(&cgvals, avl_strcmp, false, NULL);
  55. cgroup_path = strdup(p);
  56. initialized = true;
  57. }
  58. static void cgroups_set(const char *key, const char *val)
  59. {
  60. struct cgval *valp;
  61. valp = avl_find_element(&cgvals, key, valp, avl);
  62. if (!valp) {
  63. valp = malloc(sizeof(struct cgval));
  64. if (!valp)
  65. exit(ENOMEM);
  66. valp->avl.key = strdup(key);
  67. avl_insert(&cgvals, &valp->avl);
  68. } else {
  69. DEBUG("overwriting previous cgroup2 assignment %s=\"%s\"!\n", key, valp->val);
  70. free(valp->val);
  71. }
  72. valp->val = strdup(val);
  73. }
  74. void cgroups_free(void)
  75. {
  76. struct cgval *valp, *tmp;
  77. if (initialized) {
  78. avl_remove_all_elements(&cgvals, valp, avl, tmp) {
  79. free((void *)(valp->avl.key));
  80. free(valp->val);
  81. free(valp);
  82. }
  83. free(cgroup_path);
  84. }
  85. }
  86. void cgroups_apply(pid_t pid)
  87. {
  88. struct cgval *valp;
  89. char *cdir, *ent;
  90. int fd;
  91. size_t maxlen = strlen("cgroup.subtree_control");
  92. bool cpuset = false,
  93. cpu = false,
  94. hugetlb = false,
  95. io = false,
  96. memory = false,
  97. pids = false,
  98. rdma = false;
  99. char subtree_control[64] = { 0 };
  100. DEBUG("using cgroup path %s\n", cgroup_path);
  101. mkdir_p(cgroup_path, 0700);
  102. /* find which controllers need to be enabled */
  103. avl_for_each_element(&cgvals, valp, avl) {
  104. ent = (char *)valp->avl.key;
  105. if (strlen(ent) > maxlen)
  106. maxlen = strlen(ent);
  107. if (!strncmp("cpuset.", ent, 7))
  108. cpuset = true;
  109. else if (!strncmp("cpu.", ent, 4))
  110. cpu = true;
  111. else if (!strncmp("hugetlb.", ent, 8))
  112. hugetlb = true;
  113. else if (!strncmp("io.", ent, 3))
  114. io = true;
  115. else if (!strncmp("memory.", ent, 7))
  116. memory = true;
  117. else if (!strncmp("pids.", ent, 5))
  118. pids = true;
  119. else if (!strncmp("rdma.", ent, 5))
  120. rdma = true;
  121. }
  122. maxlen += strlen(cgroup_path) + 2;
  123. if (cpuset)
  124. strcat(subtree_control, "+cpuset ");
  125. if (cpu)
  126. strcat(subtree_control, "+cpu ");
  127. if (hugetlb)
  128. strcat(subtree_control, "+hugetlb ");
  129. if (io)
  130. strcat(subtree_control, "+io ");
  131. if (memory)
  132. strcat(subtree_control, "+memory ");
  133. if (pids)
  134. strcat(subtree_control, "+pids ");
  135. if (rdma)
  136. strcat(subtree_control, "+rdma ");
  137. /* remove trailing space */
  138. ent = strchr(subtree_control, '\0') - 1;
  139. *ent = '\0';
  140. ent = malloc(maxlen);
  141. if (!ent)
  142. exit(ENOMEM);
  143. DEBUG("recursively applying cgroup.subtree_control = \"%s\"\n", subtree_control);
  144. cdir = &cgroup_path[strlen(CGROUP_ROOT) - 2];
  145. while ((cdir = strchr(cdir + 1, '/'))) {
  146. *cdir = '\0';
  147. snprintf(ent, maxlen, "%s/cgroup.subtree_control", cgroup_path);
  148. DEBUG(" * %s\n", ent);
  149. if ((fd = open(ent, O_WRONLY)) < 0) {
  150. ERROR("can't open %s: %m\n", ent);
  151. continue;
  152. }
  153. if (write(fd, subtree_control, strlen(subtree_control)) == -1) {
  154. ERROR("can't write to %s: %m\n", ent);
  155. close(fd);
  156. continue;
  157. }
  158. close(fd);
  159. *cdir = '/';
  160. }
  161. avl_for_each_element(&cgvals, valp, avl) {
  162. DEBUG("applying cgroup2 %s=\"%s\"\n", (char *)valp->avl.key, valp->val);
  163. snprintf(ent, maxlen, "%s/%s", cgroup_path, (char *)valp->avl.key);
  164. fd = open(ent, O_WRONLY);
  165. if (fd < 0) {
  166. ERROR("can't open %s: %m\n", ent);
  167. continue;
  168. }
  169. if (dprintf(fd, "%s", valp->val) < 0) {
  170. ERROR("can't write to %s: %m\n", ent);
  171. };
  172. close(fd);
  173. }
  174. int dirfd = open(cgroup_path, O_DIRECTORY);
  175. if (dirfd < 0) {
  176. ERROR("can't open %s: %m\n", cgroup_path);
  177. } else {
  178. attach_cgroups_ebpf(dirfd);
  179. close(dirfd);
  180. }
  181. snprintf(ent, maxlen, "%s/%s", cgroup_path, "cgroup.procs");
  182. fd = open(ent, O_WRONLY);
  183. if (fd < 0) {
  184. ERROR("can't open %s: %m\n", cgroup_path);
  185. } else {
  186. dprintf(fd, "%d", pid);
  187. close(fd);
  188. }
  189. free(ent);
  190. }
  191. enum {
  192. OCI_LINUX_CGROUPS_BLOCKIO_WEIGHTDEVICE_MAJOR,
  193. OCI_LINUX_CGROUPS_BLOCKIO_WEIGHTDEVICE_MINOR,
  194. OCI_LINUX_CGROUPS_BLOCKIO_WEIGHTDEVICE_WEIGHT,
  195. OCI_LINUX_CGROUPS_BLOCKIO_WEIGHTDEVICE_LEAFWEIGHT,
  196. __OCI_LINUX_CGROUPS_BLOCKIO_WEIGHTDEVICE_MAX,
  197. };
  198. static const struct blobmsg_policy oci_linux_cgroups_blockio_weightdevice_policy[] = {
  199. [OCI_LINUX_CGROUPS_BLOCKIO_WEIGHTDEVICE_MAJOR] = { "major", BLOBMSG_CAST_INT64 },
  200. [OCI_LINUX_CGROUPS_BLOCKIO_WEIGHTDEVICE_MINOR] = { "minor", BLOBMSG_CAST_INT64 },
  201. [OCI_LINUX_CGROUPS_BLOCKIO_WEIGHTDEVICE_WEIGHT] = { "weight", BLOBMSG_TYPE_INT32 },
  202. [OCI_LINUX_CGROUPS_BLOCKIO_WEIGHTDEVICE_LEAFWEIGHT] = { "leafWeight", BLOBMSG_TYPE_INT32 },
  203. };
  204. enum {
  205. OCI_LINUX_CGROUPS_BLOCKIO_THROTTLEDEVICE_MAJOR,
  206. OCI_LINUX_CGROUPS_BLOCKIO_THROTTLEDEVICE_MINOR,
  207. OCI_LINUX_CGROUPS_BLOCKIO_THROTTLEDEVICE_RATE,
  208. __OCI_LINUX_CGROUPS_BLOCKIO_THROTTLEDEVICE_MAX,
  209. };
  210. static const struct blobmsg_policy oci_linux_cgroups_blockio_throttledevice_policy[] = {
  211. [OCI_LINUX_CGROUPS_BLOCKIO_THROTTLEDEVICE_MAJOR] = { "major", BLOBMSG_CAST_INT64 },
  212. [OCI_LINUX_CGROUPS_BLOCKIO_THROTTLEDEVICE_MINOR] = { "minor", BLOBMSG_CAST_INT64 },
  213. [OCI_LINUX_CGROUPS_BLOCKIO_THROTTLEDEVICE_RATE] = { "rate", BLOBMSG_CAST_INT64 },
  214. };
  215. enum {
  216. OCI_LINUX_CGROUPS_BLOCKIO_WEIGHT,
  217. OCI_LINUX_CGROUPS_BLOCKIO_LEAFWEIGHT,
  218. OCI_LINUX_CGROUPS_BLOCKIO_WEIGHTDEVICE,
  219. OCI_LINUX_CGROUPS_BLOCKIO_THROTTLEREADBPSDEVICE,
  220. OCI_LINUX_CGROUPS_BLOCKIO_THROTTLEWRITEBPSDEVICE,
  221. OCI_LINUX_CGROUPS_BLOCKIO_THROTTLEREADIOPSDEVICE,
  222. OCI_LINUX_CGROUPS_BLOCKIO_THROTTLEWRITEIOPSDEVICE,
  223. __OCI_LINUX_CGROUPS_BLOCKIO_MAX,
  224. };
  225. static const struct blobmsg_policy oci_linux_cgroups_blockio_policy[] = {
  226. [OCI_LINUX_CGROUPS_BLOCKIO_WEIGHT] = { "weight", BLOBMSG_TYPE_INT32 },
  227. [OCI_LINUX_CGROUPS_BLOCKIO_LEAFWEIGHT] = { "leafWeight", BLOBMSG_TYPE_INT32 },
  228. [OCI_LINUX_CGROUPS_BLOCKIO_WEIGHTDEVICE] = { "weightDevice", BLOBMSG_TYPE_ARRAY },
  229. [OCI_LINUX_CGROUPS_BLOCKIO_THROTTLEREADBPSDEVICE] = { "throttleReadBpsDevice", BLOBMSG_TYPE_ARRAY },
  230. [OCI_LINUX_CGROUPS_BLOCKIO_THROTTLEWRITEBPSDEVICE] = { "throttleWriteBpsDevice", BLOBMSG_TYPE_ARRAY },
  231. [OCI_LINUX_CGROUPS_BLOCKIO_THROTTLEREADIOPSDEVICE] = { "throttleReadIOPSDevice", BLOBMSG_TYPE_ARRAY },
  232. [OCI_LINUX_CGROUPS_BLOCKIO_THROTTLEWRITEIOPSDEVICE] = { "throttleWriteIOPSDevice", BLOBMSG_TYPE_ARRAY },
  233. };
/* Device number pair; used as the AVL key of struct iomax_line. */
struct posix_dev {
	uint64_t major;
	uint64_t minor;
};
/*
 * Limits collected for one device, later formatted as one line of "io.max".
 * get_iomax_line() initialises each limit to (uint64_t)-1, which the
 * formatter treats as "not set".
 */
struct iomax_line {
	struct avl_node avl;	/* tree linkage, avl.key points at dev */
	struct posix_dev dev;	/* device the limits apply to */
	uint64_t rbps;		/* emitted as "rbps=" */
	uint64_t wbps;		/* emitted as "wbps=" */
	uint64_t riops;		/* emitted as "riops=" */
	uint64_t wiops;		/* emitted as "wiops=" */
};
  246. static int avl_devcmp(const void *k1, const void *k2, void *ptr)
  247. {
  248. struct posix_dev *d1 = (struct posix_dev *)k1, *d2 = (struct posix_dev *)k2;
  249. if (d1->major < d2->major)
  250. return -1;
  251. if (d1->major > d2->major)
  252. return 1;
  253. if (d1->minor < d2->minor)
  254. return -1;
  255. if (d1->minor > d2->minor)
  256. return 1;
  257. return 0;
  258. }
  259. static struct iomax_line *get_iomax_line(struct avl_tree *iomax, uint64_t major, uint64_t minor)
  260. {
  261. struct iomax_line *l;
  262. struct posix_dev d;
  263. d.major = major;
  264. d.minor = minor;
  265. l = avl_find_element(iomax, &d, l, avl);
  266. if (!l) {
  267. l = malloc(sizeof(struct iomax_line));
  268. if (!l)
  269. exit(ENOMEM);
  270. l->dev.major = d.major;
  271. l->dev.minor = d.minor;
  272. l->avl.key = &l->dev;
  273. l->rbps = -1;
  274. l->wbps = -1;
  275. l->riops = -1;
  276. l->wiops = -1;
  277. avl_insert(iomax, &l->avl);
  278. }
  279. return l;
  280. }
  281. static int parseOCIlinuxcgroups_legacy_blockio(struct blob_attr *msg)
  282. {
  283. struct blob_attr *tb[__OCI_LINUX_CGROUPS_BLOCKIO_MAX],
  284. *tbwd[__OCI_LINUX_CGROUPS_BLOCKIO_WEIGHTDEVICE_MAX],
  285. *tbtd[__OCI_LINUX_CGROUPS_BLOCKIO_THROTTLEDEVICE_MAX],
  286. *cur;
  287. int rem;
  288. int weight = -1, leafweight = -1;
  289. size_t numweightstrs = 0, numiomaxstrs = 0, strtotlen = 1;
  290. char **weightstrs = NULL, **iomaxstrs = NULL, **curstr;
  291. char *weightstr, *iomaxstr;
  292. struct avl_tree iomax;
  293. struct iomax_line *curiomax, *tmp;
  294. blobmsg_parse(oci_linux_cgroups_blockio_policy, __OCI_LINUX_CGROUPS_BLOCKIO_MAX, tb, blobmsg_data(msg), blobmsg_len(msg));
  295. if (tb[OCI_LINUX_CGROUPS_BLOCKIO_WEIGHT]) {
  296. weight = blobmsg_get_u32(tb[OCI_LINUX_CGROUPS_BLOCKIO_WEIGHT]);
  297. ++numweightstrs;
  298. }
  299. if (weight > CGROUP_IO_WEIGHT_MAX)
  300. return ERANGE;
  301. if (tb[OCI_LINUX_CGROUPS_BLOCKIO_LEAFWEIGHT])
  302. leafweight = blobmsg_get_u32(tb[OCI_LINUX_CGROUPS_BLOCKIO_WEIGHT]);
  303. if (leafweight > CGROUP_IO_WEIGHT_MAX)
  304. return ERANGE;
  305. blobmsg_for_each_attr(cur, tb[OCI_LINUX_CGROUPS_BLOCKIO_WEIGHTDEVICE], rem)
  306. ++numweightstrs;
  307. weightstrs = calloc(numweightstrs + 1, sizeof(char *));
  308. if (!weightstrs)
  309. exit(ENOMEM);
  310. numweightstrs = 0;
  311. if (weight > -1)
  312. if (asprintf(&weightstrs[numweightstrs++], "default %d", weight) < 0)
  313. return ENOMEM;
  314. blobmsg_for_each_attr(cur, tb[OCI_LINUX_CGROUPS_BLOCKIO_WEIGHTDEVICE], rem) {
  315. uint64_t major, minor;
  316. int devweight = weight, devleafweight = leafweight;
  317. blobmsg_parse(oci_linux_cgroups_blockio_weightdevice_policy, __OCI_LINUX_CGROUPS_BLOCKIO_WEIGHTDEVICE_MAX, tbwd, blobmsg_data(cur), blobmsg_len(cur));
  318. if (!tbwd[OCI_LINUX_CGROUPS_BLOCKIO_WEIGHTDEVICE_MAJOR] ||
  319. !tbwd[OCI_LINUX_CGROUPS_BLOCKIO_WEIGHTDEVICE_MINOR])
  320. return ENODATA;
  321. if (!tbwd[OCI_LINUX_CGROUPS_BLOCKIO_WEIGHTDEVICE_WEIGHT] &&
  322. !tbwd[OCI_LINUX_CGROUPS_BLOCKIO_WEIGHTDEVICE_LEAFWEIGHT])
  323. return ENODATA;
  324. if (tbwd[OCI_LINUX_CGROUPS_BLOCKIO_WEIGHTDEVICE_WEIGHT])
  325. devweight = blobmsg_get_u32(tbwd[OCI_LINUX_CGROUPS_BLOCKIO_WEIGHTDEVICE_WEIGHT]);
  326. if (devweight > CGROUP_IO_WEIGHT_MAX)
  327. return ERANGE;
  328. if (tbwd[OCI_LINUX_CGROUPS_BLOCKIO_WEIGHTDEVICE_LEAFWEIGHT])
  329. devleafweight = blobmsg_get_u32(tbwd[OCI_LINUX_CGROUPS_BLOCKIO_WEIGHTDEVICE_LEAFWEIGHT]);
  330. if (devleafweight > CGROUP_IO_WEIGHT_MAX)
  331. return ERANGE;
  332. if (tbwd[OCI_LINUX_CGROUPS_BLOCKIO_WEIGHTDEVICE_LEAFWEIGHT])
  333. return ENOTSUP;
  334. major = blobmsg_cast_u64(tbwd[OCI_LINUX_CGROUPS_BLOCKIO_WEIGHTDEVICE_MAJOR]);
  335. minor = blobmsg_cast_u64(tbwd[OCI_LINUX_CGROUPS_BLOCKIO_WEIGHTDEVICE_MINOR]);
  336. if (asprintf(&weightstrs[numweightstrs++], "%" PRIu64 ":%" PRIu64 " %u", major, minor, devweight) < 0)
  337. return ENOMEM;
  338. }
  339. if (numweightstrs) {
  340. curstr = weightstrs;
  341. while (*curstr)
  342. strtotlen += strlen(*(curstr++)) + 1;
  343. weightstr = calloc(strtotlen, sizeof(char));
  344. if (!weightstr)
  345. exit(ENOMEM);
  346. curstr = weightstrs;
  347. while (*curstr) {
  348. strcat(weightstr, *curstr);
  349. strcat(weightstr, "\n");
  350. free(*(curstr++));
  351. }
  352. cgroups_set("io.bfq.weight", weightstr);
  353. free(weightstr);
  354. };
  355. free(weightstrs);
  356. avl_init(&iomax, avl_devcmp, false, NULL);
  357. blobmsg_for_each_attr(cur, tb[OCI_LINUX_CGROUPS_BLOCKIO_THROTTLEREADBPSDEVICE], rem) {
  358. struct iomax_line *l;
  359. blobmsg_parse(oci_linux_cgroups_blockio_throttledevice_policy, __OCI_LINUX_CGROUPS_BLOCKIO_THROTTLEDEVICE_MAX, tbtd, blobmsg_data(cur), blobmsg_len(cur));
  360. if (!tbtd[OCI_LINUX_CGROUPS_BLOCKIO_THROTTLEDEVICE_MAJOR] ||
  361. !tbtd[OCI_LINUX_CGROUPS_BLOCKIO_THROTTLEDEVICE_MINOR] ||
  362. !tbtd[OCI_LINUX_CGROUPS_BLOCKIO_THROTTLEDEVICE_RATE])
  363. return ENODATA;
  364. l = get_iomax_line(&iomax,
  365. blobmsg_cast_u64(tbtd[OCI_LINUX_CGROUPS_BLOCKIO_THROTTLEDEVICE_MAJOR]),
  366. blobmsg_cast_u64(tbtd[OCI_LINUX_CGROUPS_BLOCKIO_THROTTLEDEVICE_MINOR]));
  367. l->rbps = blobmsg_cast_u64(tbtd[OCI_LINUX_CGROUPS_BLOCKIO_THROTTLEDEVICE_RATE]);
  368. }
  369. blobmsg_for_each_attr(cur, tb[OCI_LINUX_CGROUPS_BLOCKIO_THROTTLEWRITEBPSDEVICE], rem) {
  370. struct iomax_line *l;
  371. blobmsg_parse(oci_linux_cgroups_blockio_throttledevice_policy, __OCI_LINUX_CGROUPS_BLOCKIO_THROTTLEDEVICE_MAX, tbtd, blobmsg_data(cur), blobmsg_len(cur));
  372. if (!tbtd[OCI_LINUX_CGROUPS_BLOCKIO_THROTTLEDEVICE_MAJOR] ||
  373. !tbtd[OCI_LINUX_CGROUPS_BLOCKIO_THROTTLEDEVICE_MINOR] ||
  374. !tbtd[OCI_LINUX_CGROUPS_BLOCKIO_THROTTLEDEVICE_RATE])
  375. return ENODATA;
  376. l = get_iomax_line(&iomax,
  377. blobmsg_cast_u64(tbtd[OCI_LINUX_CGROUPS_BLOCKIO_THROTTLEDEVICE_MAJOR]),
  378. blobmsg_cast_u64(tbtd[OCI_LINUX_CGROUPS_BLOCKIO_THROTTLEDEVICE_MINOR]));
  379. l->wbps = blobmsg_cast_u64(tbtd[OCI_LINUX_CGROUPS_BLOCKIO_THROTTLEDEVICE_RATE]);
  380. }
  381. blobmsg_for_each_attr(cur, tb[OCI_LINUX_CGROUPS_BLOCKIO_THROTTLEREADIOPSDEVICE], rem) {
  382. struct iomax_line *l;
  383. blobmsg_parse(oci_linux_cgroups_blockio_throttledevice_policy, __OCI_LINUX_CGROUPS_BLOCKIO_THROTTLEDEVICE_MAX, tbtd, blobmsg_data(cur), blobmsg_len(cur));
  384. if (!tbtd[OCI_LINUX_CGROUPS_BLOCKIO_THROTTLEDEVICE_MAJOR] ||
  385. !tbtd[OCI_LINUX_CGROUPS_BLOCKIO_THROTTLEDEVICE_MINOR] ||
  386. !tbtd[OCI_LINUX_CGROUPS_BLOCKIO_THROTTLEDEVICE_RATE])
  387. return ENODATA;
  388. l = get_iomax_line(&iomax,
  389. blobmsg_cast_u64(tbtd[OCI_LINUX_CGROUPS_BLOCKIO_THROTTLEDEVICE_MAJOR]),
  390. blobmsg_cast_u64(tbtd[OCI_LINUX_CGROUPS_BLOCKIO_THROTTLEDEVICE_MINOR]));
  391. l->riops = blobmsg_cast_u64(tbtd[OCI_LINUX_CGROUPS_BLOCKIO_THROTTLEDEVICE_RATE]);
  392. }
  393. blobmsg_for_each_attr(cur, tb[OCI_LINUX_CGROUPS_BLOCKIO_THROTTLEWRITEIOPSDEVICE], rem) {
  394. struct iomax_line *l;
  395. blobmsg_parse(oci_linux_cgroups_blockio_throttledevice_policy, __OCI_LINUX_CGROUPS_BLOCKIO_THROTTLEDEVICE_MAX, tbtd, blobmsg_data(cur), blobmsg_len(cur));
  396. if (!tbtd[OCI_LINUX_CGROUPS_BLOCKIO_THROTTLEDEVICE_MAJOR] ||
  397. !tbtd[OCI_LINUX_CGROUPS_BLOCKIO_THROTTLEDEVICE_MINOR] ||
  398. !tbtd[OCI_LINUX_CGROUPS_BLOCKIO_THROTTLEDEVICE_RATE])
  399. return ENODATA;
  400. l = get_iomax_line(&iomax,
  401. blobmsg_cast_u64(tbtd[OCI_LINUX_CGROUPS_BLOCKIO_THROTTLEDEVICE_MAJOR]),
  402. blobmsg_cast_u64(tbtd[OCI_LINUX_CGROUPS_BLOCKIO_THROTTLEDEVICE_MINOR]));
  403. l->wiops = blobmsg_cast_u64(tbtd[OCI_LINUX_CGROUPS_BLOCKIO_THROTTLEDEVICE_RATE]);
  404. }
  405. avl_for_each_element(&iomax, curiomax, avl)
  406. ++numiomaxstrs;
  407. if (!numiomaxstrs)
  408. return 0;
  409. iomaxstrs = calloc(numiomaxstrs + 1, sizeof(char *));
  410. if (!iomaxstrs)
  411. exit(ENOMEM);
  412. numiomaxstrs = 0;
  413. avl_for_each_element(&iomax, curiomax, avl) {
  414. char iomaxlstr[160];
  415. char lstr[32];
  416. sprintf(iomaxlstr, "%" PRIu64 ":%" PRIu64 " ", curiomax->dev.major, curiomax->dev.minor);
  417. if (curiomax->rbps != -1) {
  418. sprintf(lstr, "rbps=%" PRIu64 " ", curiomax->rbps);
  419. strcat(iomaxlstr, lstr);
  420. }
  421. if (curiomax->wbps != -1) {
  422. sprintf(lstr, "wbps=%" PRIu64 " ", curiomax->wbps);
  423. strcat(iomaxlstr, lstr);
  424. }
  425. if (curiomax->riops != -1) {
  426. sprintf(lstr, "riops=%" PRIu64 " ", curiomax->riops);
  427. strcat(iomaxlstr, lstr);
  428. }
  429. if (curiomax->wiops != -1) {
  430. sprintf(lstr, "wiops=%" PRIu64 " ", curiomax->wiops);
  431. strcat(iomaxlstr, lstr);
  432. }
  433. iomaxstrs[numiomaxstrs++] = strdup(iomaxlstr);
  434. }
  435. avl_for_each_element_safe(&iomax, curiomax, avl, tmp) {
  436. avl_delete(&iomax, &curiomax->avl);
  437. free(curiomax);
  438. }
  439. strtotlen = 1; /* 1 accounts for \0 at end of string */
  440. if (numiomaxstrs) {
  441. curstr = iomaxstrs;
  442. while (*curstr)
  443. strtotlen += strlen(*(curstr++)) + 1; /* +1 accounts for \n at end of line */
  444. iomaxstr = calloc(strtotlen, sizeof(char));
  445. if (!iomaxstr)
  446. exit(ENOMEM);
  447. curstr = iomaxstrs;
  448. while (*curstr) {
  449. strcat(iomaxstr, *curstr);
  450. strcat(iomaxstr, "\n");
  451. free(*(curstr++));
  452. }
  453. cgroups_set("io.max", iomaxstr);
  454. free(iomaxstr);
  455. };
  456. free(iomaxstrs);
  457. return 0;
  458. }
  459. enum {
  460. OCI_LINUX_CGROUPS_CPU_SHARES,
  461. OCI_LINUX_CGROUPS_CPU_PERIOD,
  462. OCI_LINUX_CGROUPS_CPU_QUOTA,
  463. OCI_LINUX_CGROUPS_CPU_REALTIMERUNTIME,
  464. OCI_LINUX_CGROUPS_CPU_REALTIMEPERIOD,
  465. OCI_LINUX_CGROUPS_CPU_CPUS,
  466. OCI_LINUX_CGROUPS_CPU_MEMS,
  467. __OCI_LINUX_CGROUPS_CPU_MAX,
  468. };
  469. static const struct blobmsg_policy oci_linux_cgroups_cpu_policy[] = {
  470. [OCI_LINUX_CGROUPS_CPU_SHARES] = { "shares", BLOBMSG_CAST_INT64 },
  471. [OCI_LINUX_CGROUPS_CPU_PERIOD] = { "period", BLOBMSG_CAST_INT64 },
  472. [OCI_LINUX_CGROUPS_CPU_QUOTA] = { "quota", BLOBMSG_CAST_INT64 }, /* signed int64! */
  473. [OCI_LINUX_CGROUPS_CPU_REALTIMEPERIOD] = { "realtimePeriod", BLOBMSG_CAST_INT64 },
  474. [OCI_LINUX_CGROUPS_CPU_REALTIMERUNTIME] = { "realtimeRuntime", BLOBMSG_CAST_INT64 },
  475. [OCI_LINUX_CGROUPS_CPU_CPUS] = { "cpus", BLOBMSG_TYPE_STRING },
  476. [OCI_LINUX_CGROUPS_CPU_MEMS] = { "mems", BLOBMSG_TYPE_STRING },
  477. };
  478. static int parseOCIlinuxcgroups_legacy_cpu(struct blob_attr *msg)
  479. {
  480. struct blob_attr *tb[__OCI_LINUX_CGROUPS_CPU_MAX];
  481. uint64_t shares, period = 0;
  482. int64_t quota = -2; /* unset */
  483. char tmp[32] = { 0 };
  484. blobmsg_parse(oci_linux_cgroups_cpu_policy, __OCI_LINUX_CGROUPS_CPU_MAX, tb, blobmsg_data(msg), blobmsg_len(msg));
  485. if (tb[OCI_LINUX_CGROUPS_CPU_REALTIMEPERIOD] ||
  486. tb[OCI_LINUX_CGROUPS_CPU_REALTIMERUNTIME])
  487. return ENOTSUP; /* no equivalent in cgroup2 */
  488. if (tb[OCI_LINUX_CGROUPS_CPU_SHARES]) {
  489. shares = blobmsg_cast_u64(tb[OCI_LINUX_CGROUPS_CPU_SHARES]);
  490. if ((shares < 2) || (shares > 262144))
  491. return ERANGE;
  492. snprintf(tmp, sizeof(tmp), "%" PRIu64, (((uint64_t)1) + ((shares - 2) * 9999) / 262142));
  493. cgroups_set("cpu.weight", tmp);
  494. tmp[0] = '\0';
  495. }
  496. if (tb[OCI_LINUX_CGROUPS_CPU_QUOTA])
  497. quota = blobmsg_cast_s64(tb[OCI_LINUX_CGROUPS_CPU_QUOTA]);
  498. if (tb[OCI_LINUX_CGROUPS_CPU_PERIOD])
  499. period = blobmsg_cast_u64(tb[OCI_LINUX_CGROUPS_CPU_PERIOD]);
  500. if (period) {
  501. if (quota >= 0)
  502. snprintf(tmp, sizeof(tmp), "%" PRId64 " %" PRIu64 , quota, period);
  503. else
  504. snprintf(tmp, sizeof(tmp), "max %" PRIu64, period); /* assume default */
  505. } else if (quota >= 0) {
  506. snprintf(tmp, sizeof(tmp), "%" PRId64, quota);
  507. } else if (quota == -1) {
  508. strcpy(tmp, "max");
  509. }
  510. if (tmp[0])
  511. cgroups_set("cpu.max", tmp);
  512. if (tb[OCI_LINUX_CGROUPS_CPU_CPUS])
  513. cgroups_set("cpuset.cpus", blobmsg_get_string(tb[OCI_LINUX_CGROUPS_CPU_CPUS]));
  514. if (tb[OCI_LINUX_CGROUPS_CPU_MEMS])
  515. cgroups_set("cpuset.mems", blobmsg_get_string(tb[OCI_LINUX_CGROUPS_CPU_MEMS]));
  516. return 0;
  517. }
  518. enum {
  519. OCI_LINUX_CGROUPS_MEMORY_LIMIT,
  520. OCI_LINUX_CGROUPS_MEMORY_RESERVATION,
  521. OCI_LINUX_CGROUPS_MEMORY_SWAP,
  522. OCI_LINUX_CGROUPS_MEMORY_KERNEL,
  523. OCI_LINUX_CGROUPS_MEMORY_KERNELTCP,
  524. OCI_LINUX_CGROUPS_MEMORY_SWAPPINESS,
  525. OCI_LINUX_CGROUPS_MEMORY_DISABLEOOMKILLER,
  526. OCI_LINUX_CGROUPS_MEMORY_USEHIERARCHY,
  527. __OCI_LINUX_CGROUPS_MEMORY_MAX,
  528. };
  529. static const struct blobmsg_policy oci_linux_cgroups_memory_policy[] = {
  530. [OCI_LINUX_CGROUPS_MEMORY_LIMIT] = { "limit", BLOBMSG_CAST_INT64 }, /* signed int64! */
  531. [OCI_LINUX_CGROUPS_MEMORY_RESERVATION] = { "reservation", BLOBMSG_CAST_INT64 }, /* signed int64! */
  532. [OCI_LINUX_CGROUPS_MEMORY_SWAP] = { "swap", BLOBMSG_CAST_INT64 }, /* signed int64! */
  533. [OCI_LINUX_CGROUPS_MEMORY_KERNEL] = { "kernel", BLOBMSG_CAST_INT64 }, /* signed int64! ignored */
  534. [OCI_LINUX_CGROUPS_MEMORY_KERNELTCP] = { "kernelTCP", BLOBMSG_CAST_INT64 }, /* signed int64! ignored */
  535. [OCI_LINUX_CGROUPS_MEMORY_SWAPPINESS] = { "swappiness", BLOBMSG_CAST_INT64 },
  536. [OCI_LINUX_CGROUPS_MEMORY_DISABLEOOMKILLER] = { "disableOOMKiller", BLOBMSG_TYPE_BOOL },
  537. [OCI_LINUX_CGROUPS_MEMORY_USEHIERARCHY] { "useHierarchy", BLOBMSG_TYPE_BOOL },
  538. };
  539. static int parseOCIlinuxcgroups_legacy_memory(struct blob_attr *msg)
  540. {
  541. struct blob_attr *tb[__OCI_LINUX_CGROUPS_MEMORY_MAX];
  542. char tmp[32] = { 0 };
  543. int64_t limit = -1, swap, reservation;
  544. blobmsg_parse(oci_linux_cgroups_memory_policy, __OCI_LINUX_CGROUPS_MEMORY_MAX, tb, blobmsg_data(msg), blobmsg_len(msg));
  545. /*
  546. * not all properties of the OCI memory section can be mapped to cgroup2
  547. * kernel memory accounting is always enabled and included in the set
  548. * memory limit, hence these options can be ignored
  549. * disableOOMKiller could be emulated using oom_score_adj + seccomp eBPF
  550. * preventing self-upgrade (but allow downgrade)
  551. *
  552. * see also https://github.com/opencontainers/runtime-spec/issues/1005
  553. */
  554. if (tb[OCI_LINUX_CGROUPS_MEMORY_SWAPPINESS] ||
  555. tb[OCI_LINUX_CGROUPS_MEMORY_DISABLEOOMKILLER] ||
  556. tb[OCI_LINUX_CGROUPS_MEMORY_USEHIERARCHY])
  557. return ENOTSUP;
  558. if (tb[OCI_LINUX_CGROUPS_MEMORY_LIMIT]) {
  559. limit = blobmsg_cast_s64(tb[OCI_LINUX_CGROUPS_MEMORY_LIMIT]);
  560. if (limit == -1)
  561. strcpy(tmp, "max");
  562. else
  563. snprintf(tmp, sizeof(tmp), "%" PRId64, limit);
  564. cgroups_set("memory.max", tmp);
  565. }
  566. if (tb[OCI_LINUX_CGROUPS_MEMORY_RESERVATION]) {
  567. reservation = blobmsg_cast_s64(tb[OCI_LINUX_CGROUPS_MEMORY_RESERVATION]);
  568. if (reservation == -1)
  569. strcpy(tmp, "max");
  570. else
  571. snprintf(tmp, sizeof(tmp), "%" PRId64, reservation);
  572. cgroups_set("memory.low", tmp);
  573. }
  574. /* OCI 'swap' acounts for memory+swap */
  575. if (tb[OCI_LINUX_CGROUPS_MEMORY_SWAP]) {
  576. swap = blobmsg_cast_s64(tb[OCI_LINUX_CGROUPS_MEMORY_SWAP]);
  577. if (swap == -1)
  578. strcpy(tmp, "max");
  579. else if (limit == -1 || (limit < swap))
  580. snprintf(tmp, sizeof(tmp), "%" PRId64, swap);
  581. else
  582. snprintf(tmp, sizeof(tmp), "%" PRId64, limit - swap);
  583. cgroups_set("memory.swap_max", tmp);
  584. }
  585. return 0;
  586. }
  587. enum {
  588. OCI_LINUX_CGROUPS_PIDS_LIMIT,
  589. __OCI_LINUX_CGROUPS_PIDS_MAX,
  590. };
  591. static const struct blobmsg_policy oci_linux_cgroups_pids_policy[] = {
  592. [OCI_LINUX_CGROUPS_PIDS_LIMIT] = { "limit", BLOBMSG_CAST_INT64 },
  593. };
  594. static int parseOCIlinuxcgroups_legacy_pids(struct blob_attr *msg)
  595. {
  596. struct blob_attr *tb[__OCI_LINUX_CGROUPS_MEMORY_MAX];
  597. char tmp[32] = { 0 };
  598. blobmsg_parse(oci_linux_cgroups_pids_policy, __OCI_LINUX_CGROUPS_PIDS_MAX, tb, blobmsg_data(msg), blobmsg_len(msg));
  599. if (!tb[OCI_LINUX_CGROUPS_PIDS_LIMIT])
  600. return EINVAL;
  601. snprintf(tmp, sizeof(tmp), "%" PRIu64, blobmsg_cast_u64(tb[OCI_LINUX_CGROUPS_PIDS_LIMIT]));
  602. cgroups_set("pids.max", tmp);
  603. return 0;
  604. }
  605. static int parseOCIlinuxcgroups_unified(struct blob_attr *msg)
  606. {
  607. struct blob_attr *cur;
  608. int rem;
  609. blobmsg_for_each_attr(cur, msg, rem) {
  610. if (blobmsg_type(cur) != BLOBMSG_TYPE_STRING)
  611. return EINVAL;
  612. /* restrict keys */
  613. if (strchr(blobmsg_name(cur), '/') ||
  614. !strcmp(blobmsg_name(cur), "cgroup.subtree_control") ||
  615. !strcmp(blobmsg_name(cur), "cgroup.procs") ||
  616. !strcmp(blobmsg_name(cur), "cgroup.threads") ||
  617. !strcmp(blobmsg_name(cur), "cgroup.freeze"))
  618. return EINVAL;
  619. cgroups_set(blobmsg_name(cur), blobmsg_get_string(cur));
  620. }
  621. return 0;
  622. }
  623. enum {
  624. OCI_LINUX_CGROUPS_BLOCKIO,
  625. OCI_LINUX_CGROUPS_CPU,
  626. OCI_LINUX_CGROUPS_DEVICES,
  627. OCI_LINUX_CGROUPS_HUGEPAGELIMITS,
  628. OCI_LINUX_CGROUPS_INTELRDT,
  629. OCI_LINUX_CGROUPS_MEMORY,
  630. OCI_LINUX_CGROUPS_NETWORK,
  631. OCI_LINUX_CGROUPS_PIDS,
  632. OCI_LINUX_CGROUPS_RDMA,
  633. OCI_LINUX_CGROUPS_UNIFIED,
  634. __OCI_LINUX_CGROUPS_MAX,
  635. };
  636. static const struct blobmsg_policy oci_linux_cgroups_policy[] = {
  637. [OCI_LINUX_CGROUPS_BLOCKIO] = { "blockIO", BLOBMSG_TYPE_TABLE },
  638. [OCI_LINUX_CGROUPS_CPU] = { "cpu", BLOBMSG_TYPE_TABLE },
  639. [OCI_LINUX_CGROUPS_DEVICES] = { "devices", BLOBMSG_TYPE_ARRAY },
  640. [OCI_LINUX_CGROUPS_HUGEPAGELIMITS] = { "hugepageLimits", BLOBMSG_TYPE_ARRAY },
  641. [OCI_LINUX_CGROUPS_INTELRDT] = { "intelRdt", BLOBMSG_TYPE_TABLE },
  642. [OCI_LINUX_CGROUPS_MEMORY] = { "memory", BLOBMSG_TYPE_TABLE },
  643. [OCI_LINUX_CGROUPS_NETWORK] = { "network", BLOBMSG_TYPE_TABLE },
  644. [OCI_LINUX_CGROUPS_PIDS] = { "pids", BLOBMSG_TYPE_TABLE },
  645. [OCI_LINUX_CGROUPS_RDMA] = { "rdma", BLOBMSG_TYPE_TABLE },
  646. [OCI_LINUX_CGROUPS_UNIFIED] = { "unified", BLOBMSG_TYPE_TABLE },
  647. };
  648. int parseOCIlinuxcgroups(struct blob_attr *msg)
  649. {
  650. struct blob_attr *tb[__OCI_LINUX_CGROUPS_MAX];
  651. int ret;
  652. blobmsg_parse(oci_linux_cgroups_policy, __OCI_LINUX_CGROUPS_MAX, tb, blobmsg_data(msg), blobmsg_len(msg));
  653. if (tb[OCI_LINUX_CGROUPS_HUGEPAGELIMITS] ||
  654. tb[OCI_LINUX_CGROUPS_INTELRDT] ||
  655. tb[OCI_LINUX_CGROUPS_NETWORK] ||
  656. tb[OCI_LINUX_CGROUPS_RDMA])
  657. return ENOTSUP;
  658. if (tb[OCI_LINUX_CGROUPS_BLOCKIO]) {
  659. ret = parseOCIlinuxcgroups_legacy_blockio(tb[OCI_LINUX_CGROUPS_BLOCKIO]);
  660. if (ret)
  661. return ret;
  662. }
  663. if (tb[OCI_LINUX_CGROUPS_CPU]) {
  664. ret = parseOCIlinuxcgroups_legacy_cpu(tb[OCI_LINUX_CGROUPS_CPU]);
  665. if (ret)
  666. return ret;
  667. }
  668. if (tb[OCI_LINUX_CGROUPS_DEVICES]) {
  669. ret = parseOCIlinuxcgroups_devices(tb[OCI_LINUX_CGROUPS_DEVICES]);
  670. if (ret)
  671. return ret;
  672. }
  673. if (tb[OCI_LINUX_CGROUPS_MEMORY]) {
  674. ret = parseOCIlinuxcgroups_legacy_memory(tb[OCI_LINUX_CGROUPS_MEMORY]);
  675. if (ret)
  676. return ret;
  677. }
  678. if (tb[OCI_LINUX_CGROUPS_PIDS]) {
  679. ret = parseOCIlinuxcgroups_legacy_pids(tb[OCI_LINUX_CGROUPS_PIDS]);
  680. if (ret)
  681. return ret;
  682. }
  683. if (tb[OCI_LINUX_CGROUPS_UNIFIED]) {
  684. ret = parseOCIlinuxcgroups_unified(tb[OCI_LINUX_CGROUPS_UNIFIED]);
  685. if (ret)
  686. return ret;
  687. }
  688. return 0;
  689. }