cgroups.c 24 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826
  1. /*
  2. * Copyright (C) 2020 Daniel Golle <daniel@makrotopia.org>
  3. *
  4. * This program is free software; you can redistribute it and/or modify
  5. * it under the terms of the GNU Lesser General Public License version 2.1
  6. * as published by the Free Software Foundation
  7. *
  8. * This program is distributed in the hope that it will be useful,
  9. * but WITHOUT ANY WARRANTY; without even the implied warranty of
  10. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  11. * GNU General Public License for more details.
  12. *
  13. * reads unified cgroup config as proposed in
  14. * https://github.com/opencontainers/runtime-spec/pull/1040
  15. * attempt conversion from cgroup1 -> cgroup2
  16. * https://github.com/containers/crun/blob/0.14.1/crun.1.md#cgroup-v2
  17. *
  18. * ToDo:
  19. * - convert cgroup1 devices to eBPF program
  20. * - convert cgroup1 net_prio and net_cls to eBPF program
  21. * - rdma (anyone?) intelrdt (anyone?)
  22. */
  23. #define _GNU_SOURCE
  24. #include <assert.h>
  25. #include <errno.h>
  26. #include <fcntl.h>
  27. #include <stdlib.h>
  28. #include <stdio.h>
  29. #include <string.h>
  30. #include <sys/stat.h>
  31. #include <sys/mman.h>
  32. #include <unistd.h>
  33. #include <libgen.h>
  34. #include <inttypes.h>
  35. #include <libubox/avl.h>
  36. #include <libubox/avl-cmp.h>
  37. #include <libubox/blobmsg.h>
  38. #include <libubox/list.h>
  39. #include "fs.h"
  40. #include "log.h"
  41. #include "cgroups.h"
  42. #define CGROUP_ROOT "/sys/fs/cgroup/"
  43. #define CGROUP_IO_WEIGHT_MAX 10000
  44. struct cgval {
  45. struct avl_node avl;
  46. char *val;
  47. };
  48. struct avl_tree cgvals;
  49. static char *cgroup_path;
  50. static bool initialized;
  51. void cgroups_prepare(void) {
  52. initialized = false;
  53. }
  54. void cgroups_init(const char *p) {
  55. avl_init(&cgvals, avl_strcmp, false, NULL);
  56. cgroup_path = strdup(p);
  57. initialized = true;
  58. }
  59. static void cgroups_set(const char *key, const char *val)
  60. {
  61. struct cgval *valp;
  62. valp = avl_find_element(&cgvals, key, valp, avl);
  63. if (!valp) {
  64. valp = malloc(sizeof(struct cgval));
  65. assert(valp != NULL);
  66. valp->avl.key = strdup(key);
  67. avl_insert(&cgvals, &valp->avl);
  68. } else {
  69. DEBUG("overwriting previous cgroup2 assignment %s=\"%s\"!\n", key, valp->val);
  70. free(valp->val);
  71. }
  72. valp->val = strdup(val);
  73. }
  74. void cgroups_free(void)
  75. {
  76. struct cgval *valp, *tmp;
  77. if (initialized) {
  78. avl_remove_all_elements(&cgvals, valp, avl, tmp) {
  79. free((void *)(valp->avl.key));
  80. free(valp->val);
  81. free(valp);
  82. }
  83. free(cgroup_path);
  84. }
  85. }
  86. void cgroups_apply(pid_t pid)
  87. {
  88. struct cgval *valp;
  89. char *cdir, *ent;
  90. int fd;
  91. size_t maxlen = strlen("cgroup.subtree_control");
  92. bool cpuset = false,
  93. cpu = false,
  94. hugetlb = false,
  95. io = false,
  96. memory = false,
  97. pids = false,
  98. rdma = false;
  99. char subtree_control[64] = { 0 };
  100. DEBUG("using cgroup path %s\n", cgroup_path);
  101. mkdir_p(cgroup_path, 0700);
  102. /* find which controllers need to be enabled */
  103. avl_for_each_element(&cgvals, valp, avl) {
  104. ent = (char *)valp->avl.key;
  105. if (strlen(ent) > maxlen)
  106. maxlen = strlen(ent);
  107. if (!strncmp("cpuset.", ent, 7))
  108. cpuset = true;
  109. else if (!strncmp("cpu.", ent, 4))
  110. cpu = true;
  111. else if (!strncmp("hugetlb.", ent, 8))
  112. hugetlb = true;
  113. else if (!strncmp("io.", ent, 3))
  114. io = true;
  115. else if (!strncmp("memory.", ent, 7))
  116. memory = true;
  117. else if (!strncmp("pids.", ent, 5))
  118. pids = true;
  119. else if (!strncmp("rdma.", ent, 5))
  120. pids = true;
  121. }
  122. maxlen += strlen(cgroup_path) + 2;
  123. if (cpuset)
  124. strcat(subtree_control, "+cpuset ");
  125. if (cpu)
  126. strcat(subtree_control, "+cpu ");
  127. if (hugetlb)
  128. strcat(subtree_control, "+hugetlb ");
  129. if (io)
  130. strcat(subtree_control, "+io ");
  131. if (memory)
  132. strcat(subtree_control, "+memory ");
  133. if (pids)
  134. strcat(subtree_control, "+pids ");
  135. if (rdma)
  136. strcat(subtree_control, "+rdma ");
  137. /* remove trailing space */
  138. ent = strchr(subtree_control, '\0') - 1;
  139. *ent = '\0';
  140. ent = malloc(maxlen);
  141. assert(ent != 0);
  142. DEBUG("recursively applying cgroup.subtree_control = \"%s\"\n", subtree_control);
  143. cdir = &cgroup_path[strlen(CGROUP_ROOT) - 2];
  144. while ((cdir = strchr(cdir + 1, '/'))) {
  145. *cdir = '\0';
  146. snprintf(ent, maxlen, "%s/cgroup.subtree_control", cgroup_path);
  147. DEBUG(" * %s\n", ent);
  148. fd = open(ent, O_WRONLY);
  149. assert(fd != -1);
  150. write(fd, subtree_control, strlen(subtree_control));
  151. close(fd);
  152. *cdir = '/';
  153. }
  154. avl_for_each_element(&cgvals, valp, avl) {
  155. DEBUG("applying cgroup2 %s=\"%s\"\n", (char *)valp->avl.key, valp->val);
  156. snprintf(ent, maxlen, "%s/%s", cgroup_path, (char *)valp->avl.key);
  157. fd = open(ent, O_WRONLY);
  158. if (fd == -1) {
  159. ERROR("can't open %s: %m\n", ent);
  160. continue;
  161. }
  162. if (dprintf(fd, "%s", valp->val) < 0) {
  163. ERROR("can't write to %s: %m\n", ent);
  164. };
  165. close(fd);
  166. }
  167. snprintf(ent, maxlen, "%s/%s", cgroup_path, "cgroup.procs");
  168. fd = open(ent, O_WRONLY);
  169. assert(fd != -1);
  170. dprintf(fd, "%d", pid);
  171. close(fd);
  172. free(ent);
  173. }
  174. enum {
  175. OCI_LINUX_CGROUPS_BLOCKIO_WEIGHTDEVICE_MAJOR,
  176. OCI_LINUX_CGROUPS_BLOCKIO_WEIGHTDEVICE_MINOR,
  177. OCI_LINUX_CGROUPS_BLOCKIO_WEIGHTDEVICE_WEIGHT,
  178. OCI_LINUX_CGROUPS_BLOCKIO_WEIGHTDEVICE_LEAFWEIGHT,
  179. __OCI_LINUX_CGROUPS_BLOCKIO_WEIGHTDEVICE_MAX,
  180. };
  181. static const struct blobmsg_policy oci_linux_cgroups_blockio_weightdevice_policy[] = {
  182. [OCI_LINUX_CGROUPS_BLOCKIO_WEIGHTDEVICE_MAJOR] = { "major", BLOBMSG_CAST_INT64 },
  183. [OCI_LINUX_CGROUPS_BLOCKIO_WEIGHTDEVICE_MINOR] = { "minor", BLOBMSG_CAST_INT64 },
  184. [OCI_LINUX_CGROUPS_BLOCKIO_WEIGHTDEVICE_WEIGHT] = { "weight", BLOBMSG_TYPE_INT32 },
  185. [OCI_LINUX_CGROUPS_BLOCKIO_WEIGHTDEVICE_LEAFWEIGHT] = { "leafWeight", BLOBMSG_TYPE_INT32 },
  186. };
  187. enum {
  188. OCI_LINUX_CGROUPS_BLOCKIO_THROTTLEDEVICE_MAJOR,
  189. OCI_LINUX_CGROUPS_BLOCKIO_THROTTLEDEVICE_MINOR,
  190. OCI_LINUX_CGROUPS_BLOCKIO_THROTTLEDEVICE_RATE,
  191. __OCI_LINUX_CGROUPS_BLOCKIO_THROTTLEDEVICE_MAX,
  192. };
  193. static const struct blobmsg_policy oci_linux_cgroups_blockio_throttledevice_policy[] = {
  194. [OCI_LINUX_CGROUPS_BLOCKIO_THROTTLEDEVICE_MAJOR] = { "major", BLOBMSG_CAST_INT64 },
  195. [OCI_LINUX_CGROUPS_BLOCKIO_THROTTLEDEVICE_MINOR] = { "minor", BLOBMSG_CAST_INT64 },
  196. [OCI_LINUX_CGROUPS_BLOCKIO_THROTTLEDEVICE_RATE] = { "rate", BLOBMSG_CAST_INT64 },
  197. };
  198. enum {
  199. OCI_LINUX_CGROUPS_BLOCKIO_WEIGHT,
  200. OCI_LINUX_CGROUPS_BLOCKIO_LEAFWEIGHT,
  201. OCI_LINUX_CGROUPS_BLOCKIO_WEIGHTDEVICE,
  202. OCI_LINUX_CGROUPS_BLOCKIO_THROTTLEREADBPSDEVICE,
  203. OCI_LINUX_CGROUPS_BLOCKIO_THROTTLEWRITEBPSDEVICE,
  204. OCI_LINUX_CGROUPS_BLOCKIO_THROTTLEREADIOPSDEVICE,
  205. OCI_LINUX_CGROUPS_BLOCKIO_THROTTLEWRITEIOPSDEVICE,
  206. __OCI_LINUX_CGROUPS_BLOCKIO_MAX,
  207. };
  208. static const struct blobmsg_policy oci_linux_cgroups_blockio_policy[] = {
  209. [OCI_LINUX_CGROUPS_BLOCKIO_WEIGHT] = { "weight", BLOBMSG_TYPE_INT32 },
  210. [OCI_LINUX_CGROUPS_BLOCKIO_LEAFWEIGHT] = { "leafWeight", BLOBMSG_TYPE_INT32 },
  211. [OCI_LINUX_CGROUPS_BLOCKIO_WEIGHTDEVICE] = { "weightDevice", BLOBMSG_TYPE_ARRAY },
  212. [OCI_LINUX_CGROUPS_BLOCKIO_THROTTLEREADBPSDEVICE] = { "throttleReadBpsDevice", BLOBMSG_TYPE_ARRAY },
  213. [OCI_LINUX_CGROUPS_BLOCKIO_THROTTLEWRITEBPSDEVICE] = { "throttleWriteBpsDevice", BLOBMSG_TYPE_ARRAY },
  214. [OCI_LINUX_CGROUPS_BLOCKIO_THROTTLEREADIOPSDEVICE] = { "throttleReadIOPSDevice", BLOBMSG_TYPE_ARRAY },
  215. [OCI_LINUX_CGROUPS_BLOCKIO_THROTTLEWRITEIOPSDEVICE] = { "throttleWriteIOPSDevice", BLOBMSG_TYPE_ARRAY },
  216. };
  /* Device number pair; serves as the AVL key of struct iomax_line. */
  struct posix_dev {
      uint64_t major;   /* compared first by avl_devcmp() */
      uint64_t minor;   /* tie-breaker when the majors are equal */
  };
  221. struct iomax_line {
  222. struct avl_node avl;
  223. struct posix_dev dev;
  224. uint64_t rbps;
  225. uint64_t wbps;
  226. uint64_t riops;
  227. uint64_t wiops;
  228. };
  229. static int avl_devcmp(const void *k1, const void *k2, void *ptr)
  230. {
  231. struct posix_dev *d1 = (struct posix_dev *)k1, *d2 = (struct posix_dev *)k2;
  232. if (d1->major < d2->major)
  233. return -1;
  234. if (d1->major > d2->major)
  235. return 1;
  236. if (d1->minor < d2->minor)
  237. return -1;
  238. if (d1->minor > d2->minor)
  239. return 1;
  240. return 0;
  241. }
  242. static struct iomax_line *get_iomax_line(struct avl_tree *iomax, uint64_t major, uint64_t minor)
  243. {
  244. struct iomax_line *l;
  245. struct posix_dev d;
  246. d.major = major;
  247. d.minor = minor;
  248. l = avl_find_element(iomax, &d, l, avl);
  249. if (!l) {
  250. l = malloc(sizeof(struct iomax_line));
  251. assert(l != NULL);
  252. l->dev.major = d.major;
  253. l->dev.minor = d.minor;
  254. l->avl.key = &l->dev;
  255. l->rbps = -1;
  256. l->wbps = -1;
  257. l->riops = -1;
  258. l->wiops = -1;
  259. avl_insert(iomax, &l->avl);
  260. }
  261. return l;
  262. }
  263. static int parseOCIlinuxcgroups_legacy_blockio(struct blob_attr *msg)
  264. {
  265. struct blob_attr *tb[__OCI_LINUX_CGROUPS_BLOCKIO_MAX],
  266. *tbwd[__OCI_LINUX_CGROUPS_BLOCKIO_WEIGHTDEVICE_MAX],
  267. *tbtd[__OCI_LINUX_CGROUPS_BLOCKIO_THROTTLEDEVICE_MAX],
  268. *cur;
  269. int rem;
  270. int weight = -1, leafweight = -1;
  271. size_t numweightstrs = 0, numiomaxstrs = 0, strtotlen = 1;
  272. char **weightstrs = NULL, **iomaxstrs = NULL, **curstr;
  273. char *weightstr, *iomaxstr;
  274. struct avl_tree iomax;
  275. struct iomax_line *curiomax, *tmp;
  276. blobmsg_parse(oci_linux_cgroups_blockio_policy, __OCI_LINUX_CGROUPS_BLOCKIO_MAX, tb, blobmsg_data(msg), blobmsg_len(msg));
  277. if (tb[OCI_LINUX_CGROUPS_BLOCKIO_WEIGHT]) {
  278. weight = blobmsg_get_u32(tb[OCI_LINUX_CGROUPS_BLOCKIO_WEIGHT]);
  279. ++numweightstrs;
  280. }
  281. if (weight > CGROUP_IO_WEIGHT_MAX)
  282. return ERANGE;
  283. if (tb[OCI_LINUX_CGROUPS_BLOCKIO_LEAFWEIGHT])
  284. leafweight = blobmsg_get_u32(tb[OCI_LINUX_CGROUPS_BLOCKIO_WEIGHT]);
  285. if (leafweight > CGROUP_IO_WEIGHT_MAX)
  286. return ERANGE;
  287. blobmsg_for_each_attr(cur, tb[OCI_LINUX_CGROUPS_BLOCKIO_WEIGHTDEVICE], rem)
  288. ++numweightstrs;
  289. weightstrs = calloc(numweightstrs + 1, sizeof(char *));
  290. assert(weightstrs != 0);
  291. numweightstrs = 0;
  292. if (weight > -1)
  293. asprintf(&weightstrs[numweightstrs++], "default %d", weight);
  294. blobmsg_for_each_attr(cur, tb[OCI_LINUX_CGROUPS_BLOCKIO_WEIGHTDEVICE], rem) {
  295. uint64_t major, minor;
  296. int devweight = weight, devleafweight = leafweight;
  297. blobmsg_parse(oci_linux_cgroups_blockio_weightdevice_policy, __OCI_LINUX_CGROUPS_BLOCKIO_WEIGHTDEVICE_MAX, tbwd, blobmsg_data(cur), blobmsg_len(cur));
  298. if (!tbwd[OCI_LINUX_CGROUPS_BLOCKIO_WEIGHTDEVICE_MAJOR] ||
  299. !tbwd[OCI_LINUX_CGROUPS_BLOCKIO_WEIGHTDEVICE_MINOR])
  300. return ENODATA;
  301. if (!tbwd[OCI_LINUX_CGROUPS_BLOCKIO_WEIGHTDEVICE_WEIGHT] &&
  302. !tbwd[OCI_LINUX_CGROUPS_BLOCKIO_WEIGHTDEVICE_LEAFWEIGHT])
  303. return ENODATA;
  304. if (tbwd[OCI_LINUX_CGROUPS_BLOCKIO_WEIGHTDEVICE_WEIGHT])
  305. devweight = blobmsg_get_u32(tbwd[OCI_LINUX_CGROUPS_BLOCKIO_WEIGHTDEVICE_WEIGHT]);
  306. if (devweight > CGROUP_IO_WEIGHT_MAX)
  307. return ERANGE;
  308. if (tbwd[OCI_LINUX_CGROUPS_BLOCKIO_WEIGHTDEVICE_LEAFWEIGHT])
  309. devleafweight = blobmsg_get_u32(tbwd[OCI_LINUX_CGROUPS_BLOCKIO_WEIGHTDEVICE_LEAFWEIGHT]);
  310. if (devleafweight > CGROUP_IO_WEIGHT_MAX)
  311. return ERANGE;
  312. if (tbwd[OCI_LINUX_CGROUPS_BLOCKIO_WEIGHTDEVICE_LEAFWEIGHT])
  313. return ENOTSUP;
  314. major = blobmsg_cast_u64(tbwd[OCI_LINUX_CGROUPS_BLOCKIO_WEIGHTDEVICE_MAJOR]);
  315. minor = blobmsg_cast_u64(tbwd[OCI_LINUX_CGROUPS_BLOCKIO_WEIGHTDEVICE_MINOR]);
  316. asprintf(&weightstrs[numweightstrs++], "%" PRIu64 ":%" PRIu64 " %u", major, minor, devweight);
  317. }
  318. if (numweightstrs) {
  319. curstr = weightstrs;
  320. while (*curstr)
  321. strtotlen += strlen(*(curstr++)) + 1;
  322. weightstr = calloc(strtotlen, sizeof(char));
  323. assert(weightstr != 0);
  324. curstr = weightstrs;
  325. while (*curstr) {
  326. strcat(weightstr, *curstr);
  327. strcat(weightstr, "\n");
  328. free(*(curstr++));
  329. }
  330. cgroups_set("io.bfq.weight", weightstr);
  331. free(weightstr);
  332. };
  333. free(weightstrs);
  334. avl_init(&iomax, avl_devcmp, false, NULL);
  335. blobmsg_for_each_attr(cur, tb[OCI_LINUX_CGROUPS_BLOCKIO_THROTTLEREADBPSDEVICE], rem) {
  336. struct iomax_line *l;
  337. blobmsg_parse(oci_linux_cgroups_blockio_throttledevice_policy, __OCI_LINUX_CGROUPS_BLOCKIO_THROTTLEDEVICE_MAX, tbtd, blobmsg_data(cur), blobmsg_len(cur));
  338. if (!tbtd[OCI_LINUX_CGROUPS_BLOCKIO_THROTTLEDEVICE_MAJOR] ||
  339. !tbtd[OCI_LINUX_CGROUPS_BLOCKIO_THROTTLEDEVICE_MINOR] ||
  340. !tbtd[OCI_LINUX_CGROUPS_BLOCKIO_THROTTLEDEVICE_RATE])
  341. return ENODATA;
  342. l = get_iomax_line(&iomax,
  343. blobmsg_cast_u64(tbtd[OCI_LINUX_CGROUPS_BLOCKIO_THROTTLEDEVICE_MAJOR]),
  344. blobmsg_cast_u64(tbtd[OCI_LINUX_CGROUPS_BLOCKIO_THROTTLEDEVICE_MINOR]));
  345. l->rbps = blobmsg_cast_u64(tbtd[OCI_LINUX_CGROUPS_BLOCKIO_THROTTLEDEVICE_RATE]);
  346. }
  347. blobmsg_for_each_attr(cur, tb[OCI_LINUX_CGROUPS_BLOCKIO_THROTTLEWRITEBPSDEVICE], rem) {
  348. struct iomax_line *l;
  349. blobmsg_parse(oci_linux_cgroups_blockio_throttledevice_policy, __OCI_LINUX_CGROUPS_BLOCKIO_THROTTLEDEVICE_MAX, tbtd, blobmsg_data(cur), blobmsg_len(cur));
  350. if (!tbtd[OCI_LINUX_CGROUPS_BLOCKIO_THROTTLEDEVICE_MAJOR] ||
  351. !tbtd[OCI_LINUX_CGROUPS_BLOCKIO_THROTTLEDEVICE_MINOR] ||
  352. !tbtd[OCI_LINUX_CGROUPS_BLOCKIO_THROTTLEDEVICE_RATE])
  353. return ENODATA;
  354. l = get_iomax_line(&iomax,
  355. blobmsg_cast_u64(tbtd[OCI_LINUX_CGROUPS_BLOCKIO_THROTTLEDEVICE_MAJOR]),
  356. blobmsg_cast_u64(tbtd[OCI_LINUX_CGROUPS_BLOCKIO_THROTTLEDEVICE_MINOR]));
  357. l->wbps = blobmsg_cast_u64(tbtd[OCI_LINUX_CGROUPS_BLOCKIO_THROTTLEDEVICE_RATE]);
  358. }
  359. blobmsg_for_each_attr(cur, tb[OCI_LINUX_CGROUPS_BLOCKIO_THROTTLEREADIOPSDEVICE], rem) {
  360. struct iomax_line *l;
  361. blobmsg_parse(oci_linux_cgroups_blockio_throttledevice_policy, __OCI_LINUX_CGROUPS_BLOCKIO_THROTTLEDEVICE_MAX, tbtd, blobmsg_data(cur), blobmsg_len(cur));
  362. if (!tbtd[OCI_LINUX_CGROUPS_BLOCKIO_THROTTLEDEVICE_MAJOR] ||
  363. !tbtd[OCI_LINUX_CGROUPS_BLOCKIO_THROTTLEDEVICE_MINOR] ||
  364. !tbtd[OCI_LINUX_CGROUPS_BLOCKIO_THROTTLEDEVICE_RATE])
  365. return ENODATA;
  366. l = get_iomax_line(&iomax,
  367. blobmsg_cast_u64(tbtd[OCI_LINUX_CGROUPS_BLOCKIO_THROTTLEDEVICE_MAJOR]),
  368. blobmsg_cast_u64(tbtd[OCI_LINUX_CGROUPS_BLOCKIO_THROTTLEDEVICE_MINOR]));
  369. l->riops = blobmsg_cast_u64(tbtd[OCI_LINUX_CGROUPS_BLOCKIO_THROTTLEDEVICE_RATE]);
  370. }
  371. blobmsg_for_each_attr(cur, tb[OCI_LINUX_CGROUPS_BLOCKIO_THROTTLEWRITEIOPSDEVICE], rem) {
  372. struct iomax_line *l;
  373. blobmsg_parse(oci_linux_cgroups_blockio_throttledevice_policy, __OCI_LINUX_CGROUPS_BLOCKIO_THROTTLEDEVICE_MAX, tbtd, blobmsg_data(cur), blobmsg_len(cur));
  374. if (!tbtd[OCI_LINUX_CGROUPS_BLOCKIO_THROTTLEDEVICE_MAJOR] ||
  375. !tbtd[OCI_LINUX_CGROUPS_BLOCKIO_THROTTLEDEVICE_MINOR] ||
  376. !tbtd[OCI_LINUX_CGROUPS_BLOCKIO_THROTTLEDEVICE_RATE])
  377. return ENODATA;
  378. l = get_iomax_line(&iomax,
  379. blobmsg_cast_u64(tbtd[OCI_LINUX_CGROUPS_BLOCKIO_THROTTLEDEVICE_MAJOR]),
  380. blobmsg_cast_u64(tbtd[OCI_LINUX_CGROUPS_BLOCKIO_THROTTLEDEVICE_MINOR]));
  381. l->wiops = blobmsg_cast_u64(tbtd[OCI_LINUX_CGROUPS_BLOCKIO_THROTTLEDEVICE_RATE]);
  382. }
  383. avl_for_each_element(&iomax, curiomax, avl)
  384. ++numiomaxstrs;
  385. if (!numiomaxstrs)
  386. return 0;
  387. iomaxstrs = calloc(numiomaxstrs + 1, sizeof(char *));
  388. assert(iomaxstrs != 0);
  389. numiomaxstrs = 0;
  390. avl_for_each_element(&iomax, curiomax, avl) {
  391. char iomaxlstr[160];
  392. char lstr[32];
  393. sprintf(iomaxlstr, "%" PRIu64 ":%" PRIu64 " ", curiomax->dev.major, curiomax->dev.minor);
  394. if (curiomax->rbps != -1) {
  395. sprintf(lstr, "rbps=%" PRIu64 " ", curiomax->rbps);
  396. strcat(iomaxlstr, lstr);
  397. }
  398. if (curiomax->wbps != -1) {
  399. sprintf(lstr, "wbps=%" PRIu64 " ", curiomax->wbps);
  400. strcat(iomaxlstr, lstr);
  401. }
  402. if (curiomax->riops != -1) {
  403. sprintf(lstr, "riops=%" PRIu64 " ", curiomax->riops);
  404. strcat(iomaxlstr, lstr);
  405. }
  406. if (curiomax->wiops != -1) {
  407. sprintf(lstr, "wiops=%" PRIu64 " ", curiomax->wiops);
  408. strcat(iomaxlstr, lstr);
  409. }
  410. iomaxstrs[numiomaxstrs++] = strdup(iomaxlstr);
  411. }
  412. avl_for_each_element_safe(&iomax, curiomax, avl, tmp) {
  413. avl_delete(&iomax, &curiomax->avl);
  414. free(curiomax);
  415. }
  416. strtotlen = 1; /* 1 accounts for \0 at end of string */
  417. if (numiomaxstrs) {
  418. curstr = iomaxstrs;
  419. while (*curstr)
  420. strtotlen += strlen(*(curstr++)) + 1; /* +1 accounts for \n at end of line */
  421. iomaxstr = calloc(strtotlen, sizeof(char));
  422. assert(iomaxstr != 0);
  423. curstr = iomaxstrs;
  424. while (*curstr) {
  425. strcat(iomaxstr, *curstr);
  426. strcat(iomaxstr, "\n");
  427. free(*(curstr++));
  428. }
  429. cgroups_set("io.max", iomaxstr);
  430. free(iomaxstr);
  431. };
  432. free(iomaxstrs);
  433. return 0;
  434. }
  435. enum {
  436. OCI_LINUX_CGROUPS_CPU_SHARES,
  437. OCI_LINUX_CGROUPS_CPU_PERIOD,
  438. OCI_LINUX_CGROUPS_CPU_QUOTA,
  439. OCI_LINUX_CGROUPS_CPU_REALTIMERUNTIME,
  440. OCI_LINUX_CGROUPS_CPU_REALTIMEPERIOD,
  441. OCI_LINUX_CGROUPS_CPU_CPUS,
  442. OCI_LINUX_CGROUPS_CPU_MEMS,
  443. __OCI_LINUX_CGROUPS_CPU_MAX,
  444. };
  445. static const struct blobmsg_policy oci_linux_cgroups_cpu_policy[] = {
  446. [OCI_LINUX_CGROUPS_CPU_SHARES] = { "shares", BLOBMSG_CAST_INT64 },
  447. [OCI_LINUX_CGROUPS_CPU_PERIOD] = { "period", BLOBMSG_CAST_INT64 },
  448. [OCI_LINUX_CGROUPS_CPU_QUOTA] = { "quota", BLOBMSG_CAST_INT64 }, /* signed int64! */
  449. [OCI_LINUX_CGROUPS_CPU_REALTIMEPERIOD] = { "realtimePeriod", BLOBMSG_CAST_INT64 },
  450. [OCI_LINUX_CGROUPS_CPU_REALTIMERUNTIME] = { "realtimeRuntime", BLOBMSG_CAST_INT64 },
  451. [OCI_LINUX_CGROUPS_CPU_CPUS] = { "cpus", BLOBMSG_TYPE_STRING },
  452. [OCI_LINUX_CGROUPS_CPU_MEMS] = { "mems", BLOBMSG_TYPE_STRING },
  453. };
  454. static int parseOCIlinuxcgroups_legacy_cpu(struct blob_attr *msg)
  455. {
  456. struct blob_attr *tb[__OCI_LINUX_CGROUPS_CPU_MAX];
  457. uint64_t shares, period = 0;
  458. int64_t quota = -2; /* unset */
  459. char tmp[32] = { 0 };
  460. blobmsg_parse(oci_linux_cgroups_cpu_policy, __OCI_LINUX_CGROUPS_CPU_MAX, tb, blobmsg_data(msg), blobmsg_len(msg));
  461. if (tb[OCI_LINUX_CGROUPS_CPU_REALTIMEPERIOD] ||
  462. tb[OCI_LINUX_CGROUPS_CPU_REALTIMERUNTIME])
  463. return ENOTSUP; /* no equivalent in cgroup2 */
  464. if (tb[OCI_LINUX_CGROUPS_CPU_SHARES]) {
  465. shares = blobmsg_cast_u64(tb[OCI_LINUX_CGROUPS_CPU_SHARES]);
  466. if ((shares < 2) || (shares > 262144))
  467. return ERANGE;
  468. snprintf(tmp, sizeof(tmp), "%" PRIu64, (((uint64_t)1) + ((shares - 2) * 9999) / 262142));
  469. cgroups_set("cpu.weight", tmp);
  470. tmp[0] = '\0';
  471. }
  472. if (tb[OCI_LINUX_CGROUPS_CPU_QUOTA])
  473. quota = blobmsg_cast_s64(tb[OCI_LINUX_CGROUPS_CPU_QUOTA]);
  474. if (tb[OCI_LINUX_CGROUPS_CPU_PERIOD])
  475. period = blobmsg_cast_u64(tb[OCI_LINUX_CGROUPS_CPU_PERIOD]);
  476. if (period) {
  477. if (quota >= 0)
  478. snprintf(tmp, sizeof(tmp), "%" PRId64 " %" PRIu64 , quota, period);
  479. else
  480. snprintf(tmp, sizeof(tmp), "max %" PRIu64, period); /* assume default */
  481. } else if (quota >= 0) {
  482. snprintf(tmp, sizeof(tmp), "%" PRId64, quota);
  483. } else if (quota == -1) {
  484. strcpy(tmp, "max");
  485. }
  486. if (tmp[0])
  487. cgroups_set("cpu.max", tmp);
  488. if (tb[OCI_LINUX_CGROUPS_CPU_CPUS])
  489. cgroups_set("cpuset.cpus", blobmsg_get_string(tb[OCI_LINUX_CGROUPS_CPU_CPUS]));
  490. if (tb[OCI_LINUX_CGROUPS_CPU_MEMS])
  491. cgroups_set("cpuset.mems", blobmsg_get_string(tb[OCI_LINUX_CGROUPS_CPU_MEMS]));
  492. return 0;
  493. }
  494. enum {
  495. OCI_LINUX_CGROUPS_MEMORY_LIMIT,
  496. OCI_LINUX_CGROUPS_MEMORY_RESERVATION,
  497. OCI_LINUX_CGROUPS_MEMORY_SWAP,
  498. OCI_LINUX_CGROUPS_MEMORY_KERNEL,
  499. OCI_LINUX_CGROUPS_MEMORY_KERNELTCP,
  500. OCI_LINUX_CGROUPS_MEMORY_SWAPPINESS,
  501. OCI_LINUX_CGROUPS_MEMORY_DISABLEOOMKILLER,
  502. OCI_LINUX_CGROUPS_MEMORY_USEHIERARCHY,
  503. __OCI_LINUX_CGROUPS_MEMORY_MAX,
  504. };
  505. static const struct blobmsg_policy oci_linux_cgroups_memory_policy[] = {
  506. [OCI_LINUX_CGROUPS_MEMORY_LIMIT] = { "limit", BLOBMSG_CAST_INT64 }, /* signed int64! */
  507. [OCI_LINUX_CGROUPS_MEMORY_RESERVATION] = { "reservation", BLOBMSG_CAST_INT64 }, /* signed int64! */
  508. [OCI_LINUX_CGROUPS_MEMORY_SWAP] = { "swap", BLOBMSG_CAST_INT64 }, /* signed int64! */
  509. [OCI_LINUX_CGROUPS_MEMORY_KERNEL] = { "kernel", BLOBMSG_CAST_INT64 }, /* signed int64! ignored */
  510. [OCI_LINUX_CGROUPS_MEMORY_KERNELTCP] = { "kernelTCP", BLOBMSG_CAST_INT64 }, /* signed int64! ignored */
  511. [OCI_LINUX_CGROUPS_MEMORY_SWAPPINESS] = { "swappiness", BLOBMSG_CAST_INT64 },
  512. [OCI_LINUX_CGROUPS_MEMORY_DISABLEOOMKILLER] = { "disableOOMKiller", BLOBMSG_TYPE_BOOL },
  513. [OCI_LINUX_CGROUPS_MEMORY_USEHIERARCHY] { "useHierarchy", BLOBMSG_TYPE_BOOL },
  514. };
  515. static int parseOCIlinuxcgroups_legacy_memory(struct blob_attr *msg)
  516. {
  517. struct blob_attr *tb[__OCI_LINUX_CGROUPS_MEMORY_MAX];
  518. char tmp[32] = { 0 };
  519. int64_t limit, swap, reservation;
  520. blobmsg_parse(oci_linux_cgroups_memory_policy, __OCI_LINUX_CGROUPS_MEMORY_MAX, tb, blobmsg_data(msg), blobmsg_len(msg));
  521. /*
  522. * not all properties of the OCI memory section can be mapped to cgroup2
  523. * kernel memory accounting is always enabled and included in the set
  524. * memory limit, hence these options can be ignored
  525. * disableOOMKiller could be emulated using oom_score_adj + seccomp eBPF
  526. * preventing self-upgrade (but allow downgrade)
  527. *
  528. * see also https://github.com/opencontainers/runtime-spec/issues/1005
  529. */
  530. if (tb[OCI_LINUX_CGROUPS_MEMORY_SWAPPINESS] ||
  531. tb[OCI_LINUX_CGROUPS_MEMORY_DISABLEOOMKILLER] ||
  532. tb[OCI_LINUX_CGROUPS_MEMORY_USEHIERARCHY])
  533. return ENOTSUP;
  534. if (tb[OCI_LINUX_CGROUPS_MEMORY_LIMIT]) {
  535. limit = blobmsg_cast_s64(tb[OCI_LINUX_CGROUPS_MEMORY_LIMIT]);
  536. if (limit == -1)
  537. strcpy(tmp, "max");
  538. else
  539. snprintf(tmp, sizeof(tmp), "%" PRId64, limit);
  540. cgroups_set("memory.max", tmp);
  541. }
  542. if (tb[OCI_LINUX_CGROUPS_MEMORY_RESERVATION]) {
  543. reservation = blobmsg_cast_s64(tb[OCI_LINUX_CGROUPS_MEMORY_RESERVATION]);
  544. if (reservation == -1)
  545. strcpy(tmp, "max");
  546. else
  547. snprintf(tmp, sizeof(tmp), "%" PRId64, reservation);
  548. cgroups_set("memory.low", tmp);
  549. }
  550. /* OCI 'swap' acounts for memory+swap */
  551. if (tb[OCI_LINUX_CGROUPS_MEMORY_SWAP]) {
  552. swap = blobmsg_cast_s64(tb[OCI_LINUX_CGROUPS_MEMORY_SWAP]);
  553. if (swap == -1)
  554. strcpy(tmp, "max");
  555. else if (limit == -1 || (limit < swap))
  556. snprintf(tmp, sizeof(tmp), "%" PRId64, swap);
  557. else
  558. snprintf(tmp, sizeof(tmp), "%" PRId64, limit - swap);
  559. cgroups_set("memory.swap_max", tmp);
  560. }
  561. return 0;
  562. }
  563. enum {
  564. OCI_LINUX_CGROUPS_PIDS_LIMIT,
  565. __OCI_LINUX_CGROUPS_PIDS_MAX,
  566. };
  567. static const struct blobmsg_policy oci_linux_cgroups_pids_policy[] = {
  568. [OCI_LINUX_CGROUPS_PIDS_LIMIT] = { "limit", BLOBMSG_CAST_INT64 },
  569. };
  570. static int parseOCIlinuxcgroups_legacy_pids(struct blob_attr *msg)
  571. {
  572. struct blob_attr *tb[__OCI_LINUX_CGROUPS_MEMORY_MAX];
  573. char tmp[32] = { 0 };
  574. blobmsg_parse(oci_linux_cgroups_pids_policy, __OCI_LINUX_CGROUPS_PIDS_MAX, tb, blobmsg_data(msg), blobmsg_len(msg));
  575. if (!tb[OCI_LINUX_CGROUPS_PIDS_LIMIT])
  576. return EINVAL;
  577. snprintf(tmp, sizeof(tmp), "%" PRIu64, blobmsg_cast_u64(tb[OCI_LINUX_CGROUPS_PIDS_LIMIT]));
  578. cgroups_set("pids.max", tmp);
  579. return 0;
  580. }
  581. static int parseOCIlinuxcgroups_unified(struct blob_attr *msg)
  582. {
  583. struct blob_attr *cur;
  584. int rem;
  585. blobmsg_for_each_attr(cur, msg, rem) {
  586. if (blobmsg_type(cur) != BLOBMSG_TYPE_STRING)
  587. return EINVAL;
  588. /* restrict keys */
  589. if (strchr(blobmsg_name(cur), '/') ||
  590. !strcmp(blobmsg_name(cur), "cgroup.subtree_control") ||
  591. !strcmp(blobmsg_name(cur), "cgroup.procs") ||
  592. !strcmp(blobmsg_name(cur), "cgroup.threads") ||
  593. !strcmp(blobmsg_name(cur), "cgroup.freeze"))
  594. return EINVAL;
  595. cgroups_set(blobmsg_name(cur), blobmsg_get_string(cur));
  596. }
  597. return 0;
  598. }
  599. enum {
  600. OCI_LINUX_CGROUPS_BLOCKIO,
  601. OCI_LINUX_CGROUPS_CPU,
  602. OCI_LINUX_CGROUPS_DEVICES,
  603. OCI_LINUX_CGROUPS_HUGEPAGELIMITS,
  604. OCI_LINUX_CGROUPS_INTELRDT,
  605. OCI_LINUX_CGROUPS_MEMORY,
  606. OCI_LINUX_CGROUPS_NETWORK,
  607. OCI_LINUX_CGROUPS_PIDS,
  608. OCI_LINUX_CGROUPS_RDMA,
  609. OCI_LINUX_CGROUPS_UNIFIED,
  610. __OCI_LINUX_CGROUPS_MAX,
  611. };
  612. static const struct blobmsg_policy oci_linux_cgroups_policy[] = {
  613. [OCI_LINUX_CGROUPS_BLOCKIO] = { "blockIO", BLOBMSG_TYPE_TABLE },
  614. [OCI_LINUX_CGROUPS_CPU] = { "cpu", BLOBMSG_TYPE_TABLE },
  615. [OCI_LINUX_CGROUPS_DEVICES] = { "devices", BLOBMSG_TYPE_ARRAY },
  616. [OCI_LINUX_CGROUPS_HUGEPAGELIMITS] = { "hugepageLimits", BLOBMSG_TYPE_ARRAY },
  617. [OCI_LINUX_CGROUPS_INTELRDT] = { "intelRdt", BLOBMSG_TYPE_TABLE },
  618. [OCI_LINUX_CGROUPS_MEMORY] = { "memory", BLOBMSG_TYPE_TABLE },
  619. [OCI_LINUX_CGROUPS_NETWORK] = { "network", BLOBMSG_TYPE_TABLE },
  620. [OCI_LINUX_CGROUPS_PIDS] = { "pids", BLOBMSG_TYPE_TABLE },
  621. [OCI_LINUX_CGROUPS_RDMA] = { "rdma", BLOBMSG_TYPE_TABLE },
  622. [OCI_LINUX_CGROUPS_UNIFIED] = { "unified", BLOBMSG_TYPE_TABLE },
  623. };
  624. int parseOCIlinuxcgroups(struct blob_attr *msg)
  625. {
  626. struct blob_attr *tb[__OCI_LINUX_CGROUPS_MAX];
  627. int ret;
  628. blobmsg_parse(oci_linux_cgroups_policy, __OCI_LINUX_CGROUPS_MAX, tb, blobmsg_data(msg), blobmsg_len(msg));
  629. if (tb[OCI_LINUX_CGROUPS_DEVICES] ||
  630. tb[OCI_LINUX_CGROUPS_HUGEPAGELIMITS] ||
  631. tb[OCI_LINUX_CGROUPS_INTELRDT] ||
  632. tb[OCI_LINUX_CGROUPS_NETWORK] ||
  633. tb[OCI_LINUX_CGROUPS_RDMA])
  634. return ENOTSUP;
  635. if (tb[OCI_LINUX_CGROUPS_BLOCKIO]) {
  636. ret = parseOCIlinuxcgroups_legacy_blockio(tb[OCI_LINUX_CGROUPS_BLOCKIO]);
  637. if (ret)
  638. return ret;
  639. }
  640. if (tb[OCI_LINUX_CGROUPS_CPU]) {
  641. ret = parseOCIlinuxcgroups_legacy_cpu(tb[OCI_LINUX_CGROUPS_CPU]);
  642. if (ret)
  643. return ret;
  644. }
  645. if (tb[OCI_LINUX_CGROUPS_MEMORY]) {
  646. ret = parseOCIlinuxcgroups_legacy_memory(tb[OCI_LINUX_CGROUPS_MEMORY]);
  647. if (ret)
  648. return ret;
  649. }
  650. if (tb[OCI_LINUX_CGROUPS_PIDS]) {
  651. ret = parseOCIlinuxcgroups_legacy_pids(tb[OCI_LINUX_CGROUPS_PIDS]);
  652. if (ret)
  653. return ret;
  654. }
  655. if (tb[OCI_LINUX_CGROUPS_UNIFIED]) {
  656. ret = parseOCIlinuxcgroups_unified(tb[OCI_LINUX_CGROUPS_UNIFIED]);
  657. if (ret)
  658. return ret;
  659. }
  660. return 0;
  661. }