// run-child-proc.cc
  1. #include <cstdlib>
  2. #include <cstring>
  3. #include <cstdio>
  4. #include <sys/types.h>
  5. #include <sys/stat.h>
  6. #include <sys/ioctl.h>
  7. #include <sys/un.h>
  8. #include <sys/socket.h>
  9. #include <fcntl.h>
  10. #include <unistd.h>
  11. #include <termios.h>
  12. #include "service.h"
  13. #include "proc-service.h"
  14. #ifdef SUPPORT_CGROUPS
  15. extern std::string cgroups_path;
  16. extern bool have_cgroups_path;
  17. #endif
  // Relocate an open descriptor so that it ends up as descriptor number `dest`.
  //
  //   fd   - descriptor to move; -1 (typically a failed open()) is passed straight through
  //          as a failure.
  //   dest - descriptor number to occupy; it must not currently be in use for anything that
  //          needs to be kept, since dup2() replaces it.
  //
  // Returns 0 on success (including the case where fd already equals dest, when nothing is
  // done) and -1 on failure. On success with fd != dest the original descriptor is closed;
  // when dup2() fails the original descriptor is left open.
  static int move_fd(int fd, int dest)
  {
      if (fd == -1) {
          return -1;
      }

      if (fd != dest) {
          // Duplicate onto the wanted number, then drop the old one.
          if (dup2(fd, dest) == -1) {
              return -1;
          }
          close(fd);
      }

      return 0;
  }
  // Vacate a descriptor number that is needed for some reserved purpose by moving the open
  // file it refers to onto a different descriptor.
  //
  //   fd     - in/out: the descriptor to move; updated to the new descriptor on success and
  //            left untouched on failure.
  //   min_fd - lowest descriptor number acceptable for the replacement.
  //
  // The replacement is obtained with F_DUPFD_CLOEXEC, so it has the close-on-exec flag set.
  // Returns the new descriptor, or -1 if the duplication failed (the original stays open).
  static int move_reserved_fd(int *fd, int min_fd)
  {
      const int original = *fd;
      const int duplicate = fcntl(original, F_DUPFD_CLOEXEC, min_fd);
      if (duplicate == -1) {
          return -1;
      }

      close(original);
      *fd = duplicate;
      return duplicate;
  }
  41. void base_process_service::run_child_proc(run_proc_params params) noexcept
  42. {
  43. // Child process. Must not risk throwing any uncaught exception from here until exit().
  44. const char * const *args = params.args;
  45. const char *working_dir = params.working_dir;
  46. const char *logfile = params.logfile;
  47. bool on_console = params.on_console;
  48. int wpipefd = params.wpipefd;
  49. int csfd = params.csfd;
  50. int notify_fd = params.notify_fd;
  51. int force_notify_fd = params.force_notify_fd;
  52. const char *notify_var = params.notify_var;
  53. uid_t uid = params.uid;
  54. gid_t gid = params.gid;
  55. const std::vector<service_rlimits> &rlimits = params.rlimits;
  56. // If the console already has a session leader, presumably it is us. On the other hand
  57. // if it has no session leader, and we don't create one, then control inputs such as
  58. // ^C will have no effect.
  59. bool do_set_ctty = (tcgetsid(0) == -1);
  60. // Copy signal mask, but unmask signals that we masked on startup. For the moment, we'll
  61. // also block all signals, since apparently dup() can be interrupted (!!! really, POSIX??).
  62. sigset_t sigwait_set;
  63. sigset_t sigall_set;
  64. sigfillset(&sigall_set);
  65. sigprocmask(SIG_SETMASK, &sigall_set, &sigwait_set);
  66. sigdelset(&sigwait_set, SIGCHLD);
  67. sigdelset(&sigwait_set, SIGINT);
  68. sigdelset(&sigwait_set, SIGTERM);
  69. sigdelset(&sigwait_set, SIGQUIT);
  70. constexpr int bufsz = 11 + ((CHAR_BIT * sizeof(pid_t) + 2) / 3) + 1;
  71. // "LISTEN_PID=" - 11 characters; the expression above gives a conservative estimate
  72. // on the maxiumum number of bytes required for LISTEN=nnn, including nul terminator,
  73. // where nnn is a pid_t in decimal (i.e. one decimal digit is worth just over 3 bits).
  74. char nbuf[bufsz];
  75. // "DINIT_CS_FD=" - 12 bytes. (we -1 from sizeof(int) in account of sign bit).
  76. constexpr int csenvbufsz = 12 + ((CHAR_BIT * sizeof(int) - 1 + 2) / 3) + 1;
  77. char csenvbuf[csenvbufsz];
  78. run_proc_err err;
  79. err.stage = exec_stage::ARRANGE_FDS;
  80. int minfd = (socket_fd == -1) ? 3 : 4;
  81. if (force_notify_fd != -1) {
  82. // Move wpipefd/csfd/socket_fd to another fd if necessary:
  83. if (wpipefd == force_notify_fd) {
  84. if (move_reserved_fd(&wpipefd, minfd) == -1) {
  85. goto failure_out;
  86. }
  87. }
  88. if (csfd == force_notify_fd) {
  89. if (move_reserved_fd(&csfd, minfd) == -1) {
  90. goto failure_out;
  91. }
  92. }
  93. if (socket_fd == force_notify_fd) {
  94. // Note that we might move this again later
  95. if (move_reserved_fd(&socket_fd, 0) == -1) {
  96. goto failure_out;
  97. }
  98. }
  99. // allocate the forced notification fd:
  100. if (notify_fd != force_notify_fd) {
  101. if (dup2(notify_fd, force_notify_fd) == -1) {
  102. goto failure_out;
  103. }
  104. close(notify_fd);
  105. notify_fd = force_notify_fd;
  106. }
  107. }
  108. // Make sure we have the fds for stdin/out/err (and pre-opened socket) available:
  109. if (wpipefd < minfd) {
  110. wpipefd = fcntl(wpipefd, F_DUPFD_CLOEXEC, minfd);
  111. if (wpipefd == -1) goto failure_out;
  112. }
  113. if (csfd != -1 && csfd < minfd) {
  114. csfd = fcntl(csfd, F_DUPFD, minfd);
  115. if (csfd == -1) goto failure_out;
  116. }
  117. if (notify_fd < minfd && notify_fd != force_notify_fd) {
  118. notify_fd = fcntl(notify_fd, F_DUPFD, minfd);
  119. if (notify_fd == -1) goto failure_out;
  120. }
  121. // Read environment from file
  122. if (params.env_file != nullptr && *params.env_file != 0) {
  123. err.stage = exec_stage::READ_ENV_FILE;
  124. try {
  125. read_env_file(params.env_file);
  126. }
  127. catch (std::system_error &sys_err) {
  128. errno = sys_err.code().value();
  129. }
  130. catch (std::bad_alloc &alloc_err) {
  131. errno = ENOMEM; goto failure_out;
  132. }
  133. }
  134. // Set up notify-fd variable:
  135. if (notify_var != nullptr && *notify_var != 0) {
  136. err.stage = exec_stage::SET_NOTIFYFD_VAR;
  137. // We need to do an allocation: the variable name length, '=', and space for the value,
  138. // and nul terminator:
  139. int notify_var_len = strlen(notify_var);
  140. int req_sz = notify_var_len + ((CHAR_BIT * sizeof(int) - 1 + 2) / 3) + 1;
  141. char * var_str = (char *) malloc(req_sz);
  142. if (var_str == nullptr) goto failure_out;
  143. snprintf(var_str, req_sz, "%s=%d", notify_var, notify_fd);
  144. if (putenv(var_str)) goto failure_out;
  145. }
  146. // Set up Systemd-style socket activation:
  147. if (socket_fd != -1) {
  148. err.stage = exec_stage::SETUP_ACTIVATION_SOCKET;
  149. // If we passing a pre-opened socket, it has to be fd number 3. (Thanks, Systemd).
  150. if (dup2(socket_fd, 3) == -1) goto failure_out;
  151. if (socket_fd != 3) close(socket_fd);
  152. if (putenv(const_cast<char *>("LISTEN_FDS=1"))) goto failure_out;
  153. snprintf(nbuf, bufsz, "LISTEN_PID=%jd", static_cast<intmax_t>(getpid()));
  154. if (putenv(nbuf)) goto failure_out;
  155. }
  156. if (csfd != -1) {
  157. err.stage = exec_stage::SETUP_CONTROL_SOCKET;
  158. snprintf(csenvbuf, csenvbufsz, "DINIT_CS_FD=%d", csfd);
  159. if (putenv(csenvbuf)) goto failure_out;
  160. }
  161. if (working_dir != nullptr && *working_dir != 0) {
  162. err.stage = exec_stage::CHDIR;
  163. if (chdir(working_dir) == -1) {
  164. goto failure_out;
  165. }
  166. }
  167. if (! on_console) {
  168. // Re-set stdin, stdout, stderr
  169. for (int i = 0; i < 3; i++) {
  170. if (i != force_notify_fd) close(i);
  171. }
  172. err.stage = exec_stage::SETUP_STDINOUTERR;
  173. if (notify_fd == 0 || move_fd(open("/dev/null", O_RDONLY), 0) == 0) {
  174. // stdin = 0. That's what we should have; proceed with opening stdout and stderr. We have to
  175. // take care not to clobber the notify_fd.
  176. if (notify_fd != 1) {
  177. if (move_fd(open(logfile, O_WRONLY | O_CREAT | O_APPEND, S_IRUSR | S_IWUSR), 1) != 0) {
  178. goto failure_out;
  179. }
  180. if (notify_fd != 2 && dup2(1, 2) != 2) {
  181. goto failure_out;
  182. }
  183. }
  184. else if (move_fd(open(logfile, O_WRONLY | O_CREAT | O_APPEND, S_IRUSR | S_IWUSR), 2) != 0) {
  185. goto failure_out;
  186. }
  187. }
  188. else goto failure_out;
  189. // We have the option of creating a session and process group, or just a new process
  190. // group. If we just create a new process group, the child process cannot make itself
  191. // a session leader if it wants to do that (eg getty/login will generally want this).
  192. // If we do neither, and we are running with a controlling terminal, a ^C or similar
  193. // will also affect the child process (which probably isn't so bad, though since we
  194. // will handle the shutdown ourselves it's not necessary). Creating a new session
  195. // (and a new process group as part of that) seems like a safe bet, and has the
  196. // advantage of letting us signal the process as part of a process group.
  197. setsid();
  198. }
  199. else {
  200. // "run on console" - run as a foreground job on the terminal/console device
  201. // if do_set_ctty is false, we are the session leader; we are probably running
  202. // as a user process. Don't create a new session leader in that case, and run
  203. // as part of the parent session. Otherwise, the new session cannot claim the
  204. // terminal as a controlling terminal (it is already claimed), meaning that it
  205. // will not see control signals from ^C etc.
  206. if (do_set_ctty) {
  207. // Disable suspend (^Z) (and on some systems, delayed suspend / ^Y)
  208. signal(SIGTSTP, SIG_IGN);
  209. // Become session leader
  210. setsid();
  211. ioctl(0, TIOCSCTTY, 0);
  212. }
  213. setpgid(0,0);
  214. if (params.in_foreground) {
  215. tcsetpgrp(0, getpgrp());
  216. }
  217. }
  218. // Resource limits
  219. err.stage = exec_stage::SET_RLIMITS;
  220. for (auto &limit : rlimits) {
  221. rlimit setlimits;
  222. if (!limit.hard_set || !limit.soft_set) {
  223. // if either hard or soft limit is not set, use current:
  224. if (getrlimit(limit.resource_id, &setlimits) != 0) goto failure_out;
  225. }
  226. if (limit.hard_set) setlimits.rlim_max = limit.limits.rlim_max;
  227. if (limit.soft_set) setlimits.rlim_cur = limit.limits.rlim_cur;
  228. if (setrlimit(limit.resource_id, &setlimits) != 0) goto failure_out;
  229. }
  230. #if SUPPORT_CGROUPS
  231. if (params.run_in_cgroup != nullptr && *params.run_in_cgroup != 0) {
  232. err.stage = exec_stage::ENTER_CGROUP;
  233. int sys_fs_cgroup_fd = open("/sys/fs/cgroup", O_RDONLY | O_DIRECTORY | O_PATH);
  234. if (sys_fs_cgroup_fd == -1) goto failure_out;
  235. const char *run_cgroup_path = params.run_in_cgroup;
  236. if (run_cgroup_path[0] != '/') {
  237. // A relative cgroup path must be resolved against our own path (cgroups_path)
  238. if (!have_cgroups_path) {
  239. errno = ENOENT;
  240. goto failure_out;
  241. }
  242. if (!cgroups_path.empty()) {
  243. int cgrp_root_path = openat(sys_fs_cgroup_fd, cgroups_path.c_str(), O_RDONLY | O_DIRECTORY | O_PATH);
  244. if (cgrp_root_path == -1) goto failure_out;
  245. close(sys_fs_cgroup_fd);
  246. sys_fs_cgroup_fd = cgrp_root_path;
  247. }
  248. }
  249. else {
  250. ++run_cgroup_path; // skip leading slash
  251. }
  252. int cgroup_dir_fd = openat(sys_fs_cgroup_fd, run_cgroup_path, O_RDONLY | O_DIRECTORY | O_PATH);
  253. if (cgroup_dir_fd == -1) goto failure_out;
  254. close(sys_fs_cgroup_fd);
  255. int cgroup_procs_fd = openat(cgroup_dir_fd, "cgroup.procs", O_WRONLY);
  256. if (cgroup_procs_fd == -1) goto failure_out;
  257. close(cgroup_dir_fd);
  258. // We need to write our own pid into the cgroup.procs file
  259. char pidbuf[std::numeric_limits<pid_t>::digits10 + 3];
  260. // +1 for most significant digit, +1 for '\n', +1 for nul terminator
  261. int num_chars;
  262. if (sizeof(pid_t) <= sizeof(unsigned)) {
  263. num_chars = sprintf(pidbuf, "%u\n", (unsigned)getpid());
  264. }
  265. else if (sizeof(pid_t) <= sizeof(unsigned long)) {
  266. num_chars = sprintf(pidbuf, "%lu\n", (unsigned long)getpid());
  267. }
  268. else {
  269. static_assert(sizeof(pid_t) <= sizeof(unsigned long long));
  270. num_chars = sprintf(pidbuf, "%llu\n", (unsigned long long)getpid());
  271. }
  272. if (write(cgroup_procs_fd, pidbuf, num_chars) == -1) goto failure_out;
  273. close(cgroup_procs_fd);
  274. }
  275. #endif
  276. if (uid != uid_t(-1)) {
  277. err.stage = exec_stage::SET_UIDGID;
  278. // We must set group first (i.e. before we drop privileges)
  279. if (setregid(gid, gid) != 0) goto failure_out;
  280. if (setreuid(uid, uid) != 0) goto failure_out;
  281. }
  282. sigprocmask(SIG_SETMASK, &sigwait_set, nullptr);
  283. err.stage = exec_stage::DO_EXEC;
  284. execvp(args[0], const_cast<char **>(args));
  285. // If we got here, the exec failed:
  286. failure_out:
  287. err.st_errno = errno;
  288. write(wpipefd, &err, sizeof(err));
  289. _exit(0);
  290. }