proc-service.cc 22 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646
  1. #include <cstring>
  2. #include <type_traits>
  3. #include <sys/un.h>
  4. #include <sys/socket.h>
  5. #include "dinit.h"
  6. #include "dinit-socket.h"
  7. #include "dinit-util.h"
  8. #include "dinit-log.h"
  9. #include "proc-service.h"
  10. /*
  11. * Most of the implementation for process-based services (process, scripted, bgprocess) is here.
  12. *
  13. * See proc-service.h header for interface details.
  14. */
  15. // Strings describing the execution stages (failure points).
  16. const char * const exec_stage_descriptions[static_cast<int>(exec_stage::DO_EXEC) + 1] = {
  17. "arranging file descriptors", // ARRANGE_FDS
  18. "reading environment file", // READ_ENV_FILE
  19. "setting environment variable", // SET_NOTIFYFD_VAR
  20. "setting up activation socket", // SETUP_ACTIVATION_SOCKET
  21. "setting up control socket", // SETUP_CONTROL_SOCKET
  22. "changing directory", // CHDIR
  23. "setting up standard input/output descriptors", // SETUP_STDINOUTERR
  24. "setting resource limits", // SET_RLIMITS
  25. "setting user/group ID", // SET_UIDGID
  26. "executing command" // DO_EXEC
  27. };
  28. // Given a string and a list of pairs of (start,end) indices for each argument in that string,
  29. // store a null terminator for the argument. Return a `char *` vector containing the beginning
  30. // of each argument and a trailing nullptr. (The returned array is invalidated if the string is
  31. // later modified).
  32. std::vector<const char *> separate_args(std::string &s,
  33. const std::list<std::pair<unsigned,unsigned>> &arg_indices)
  34. {
  35. std::vector<const char *> r;
  36. r.reserve(arg_indices.size() + 1);
  37. // First store nul terminator for each part:
  38. for (auto index_pair : arg_indices) {
  39. if (index_pair.second < s.length()) {
  40. s[index_pair.second] = 0;
  41. }
  42. }
  43. // Now we can get the C string (c_str) and store offsets into it:
  44. const char * cstr = s.c_str();
  45. for (auto index_pair : arg_indices) {
  46. r.push_back(cstr + index_pair.first);
  47. }
  48. r.push_back(nullptr);
  49. return r;
  50. }
  51. void process_service::exec_succeeded() noexcept
  52. {
  53. // This could be a smooth recovery (state already STARTED). Even more, the process
  54. // might be stopped (and killed via a signal) during smooth recovery. We don't to
  55. // process startup again in either case, so we check for state STARTING:
  56. if (get_state() == service_state_t::STARTING) {
  57. if (force_notification_fd != -1 || !notification_var.empty()) {
  58. // Wait for readiness notification:
  59. readiness_watcher.set_enabled(event_loop, true);
  60. }
  61. else {
  62. started();
  63. }
  64. }
  65. else if (get_state() == service_state_t::STOPPING) {
  66. // stopping, but smooth recovery was in process. That's now over so we can
  67. // commence normal stop. Note that if pid == -1 the process already stopped(!),
  68. // that's handled below.
  69. if (pid != -1 && stop_check_dependents()) {
  70. bring_down();
  71. }
  72. }
  73. }
  74. void scripted_service::exec_succeeded() noexcept
  75. {
  76. // For a scripted service, this means nothing other than that the start/stop
  77. // script will now begin.
  78. }
  79. rearm exec_status_pipe_watcher::fd_event(eventloop_t &loop, int fd, int flags) noexcept
  80. {
  81. base_process_service *sr = service;
  82. sr->waiting_for_execstat = false;
  83. run_proc_err exec_status;
  84. int r = read(get_watched_fd(), &exec_status, sizeof(exec_status));
  85. deregister(loop);
  86. close(get_watched_fd());
  87. if (r > 0) {
  88. // We read an errno code; exec() failed, and the service startup failed.
  89. if (sr->pid != -1) {
  90. sr->child_listener.deregister(event_loop, sr->pid);
  91. sr->reserved_child_watch = false;
  92. if (sr->stop_timer_armed) {
  93. sr->restart_timer.stop_timer(loop);
  94. sr->stop_timer_armed = false;
  95. }
  96. }
  97. sr->pid = -1;
  98. sr->exec_failed(exec_status);
  99. }
  100. else {
  101. sr->exec_succeeded();
  102. if (sr->pid == -1) {
  103. // Somehow the process managed to complete before we even saw the exec() status.
  104. sr->handle_exit_status(sr->exit_status);
  105. }
  106. }
  107. sr->services->process_queues();
  108. return rearm::REMOVED;
  109. }
  110. rearm ready_notify_watcher::fd_event(eventloop_t &, int fd, int flags) noexcept
  111. {
  112. char buf[128];
  113. if (service->get_state() == service_state_t::STARTING) {
  114. // can we actually read anything from the notification pipe?
  115. int r = bp_sys::read(fd, buf, sizeof(buf));
  116. if (r > 0) {
  117. service->started();
  118. }
  119. else if (r == 0 || errno != EAGAIN) {
  120. service->failed_to_start(false, false);
  121. service->set_state(service_state_t::STOPPING);
  122. service->bring_down();
  123. }
  124. }
  125. else {
  126. // Just keep consuming data from the pipe:
  127. int r = bp_sys::read(fd, buf, sizeof(buf));
  128. if (r == 0) {
  129. // Process closed write end or terminated
  130. close(fd);
  131. service->notification_fd = -1;
  132. return rearm::DISARM;
  133. }
  134. }
  135. service->services->process_queues();
  136. return rearm::REARM;
  137. }
  138. dasynq::rearm service_child_watcher::status_change(eventloop_t &loop, pid_t child, int status) noexcept
  139. {
  140. base_process_service *sr = service;
  141. sr->pid = -1;
  142. sr->exit_status = bp_sys::exit_status(status);
  143. // Ok, for a process service, any process death which we didn't rig ourselves is a bit... unexpected.
  144. // Probably, the child died because we asked it to (sr->service_state == STOPPING). But even if we
  145. // didn't, there's not much we can do.
  146. if (sr->waiting_for_execstat) {
  147. // We still don't have an exec() status from the forked child, wait for that
  148. // before doing any further processing.
  149. return dasynq::rearm::NOOP; // hold watch reservation
  150. }
  151. // Must stop watch now since handle_exit_status might result in re-launch:
  152. // (stop_watch instead of deregister, so that we hold watch reservation).
  153. stop_watch(loop);
  154. if (sr->stop_timer_armed) {
  155. sr->restart_timer.stop_timer(loop);
  156. sr->stop_timer_armed = false;
  157. }
  158. sr->handle_exit_status(bp_sys::exit_status(status));
  159. return dasynq::rearm::NOOP;
  160. }
  161. void process_service::handle_exit_status(bp_sys::exit_status exit_status) noexcept
  162. {
  163. bool did_exit = exit_status.did_exit();
  164. bool was_signalled = exit_status.was_signalled();
  165. auto service_state = get_state();
  166. if (notification_fd != -1) {
  167. readiness_watcher.deregister(event_loop);
  168. bp_sys::close(notification_fd);
  169. notification_fd = -1;
  170. }
  171. if (!exit_status.did_exit_clean() && service_state != service_state_t::STOPPING) {
  172. if (did_exit) {
  173. log(loglevel_t::ERROR, "Service ", get_name(), " process terminated with exit code ",
  174. exit_status.get_exit_status());
  175. }
  176. else if (was_signalled) {
  177. log(loglevel_t::ERROR, "Service ", get_name(), " terminated due to signal ",
  178. exit_status.get_term_sig());
  179. }
  180. }
  181. #if USE_UTMPX
  182. if (*inittab_id || *inittab_line) {
  183. clear_utmp_entry(inittab_id, inittab_line);
  184. }
  185. #endif
  186. if (service_state == service_state_t::STARTING) {
  187. // If state is STARTING, we must be waiting for readiness notification; the process has
  188. // terminated before becoming ready.
  189. stop_reason = stopped_reason_t::FAILED;
  190. failed_to_start();
  191. }
  192. else if (service_state == service_state_t::STOPPING) {
  193. // We won't log a non-zero exit status or termination due to signal here -
  194. // we assume that the process died because we signalled it.
  195. if (stop_timer_armed) {
  196. restart_timer.stop_timer(event_loop);
  197. }
  198. stopped();
  199. }
  200. else if (smooth_recovery && service_state == service_state_t::STARTED
  201. && get_target_state() == service_state_t::STARTED) {
  202. do_smooth_recovery();
  203. return;
  204. }
  205. else {
  206. stop_reason = stopped_reason_t::TERMINATED;
  207. emergency_stop();
  208. }
  209. services->process_queues();
  210. }
  211. void process_service::exec_failed(run_proc_err errcode) noexcept
  212. {
  213. log(loglevel_t::ERROR, get_name(), ": execution failed - ",
  214. exec_stage_descriptions[static_cast<int>(errcode.stage)], ": ", strerror(errcode.st_errno));
  215. if (notification_fd != -1) {
  216. readiness_watcher.deregister(event_loop);
  217. bp_sys::close(notification_fd);
  218. notification_fd = -1;
  219. }
  220. if (get_state() == service_state_t::STARTING) {
  221. stop_reason = stopped_reason_t::EXECFAILED;
  222. failed_to_start();
  223. }
  224. else {
  225. // Process service in smooth recovery:
  226. stop_reason = stopped_reason_t::TERMINATED;
  227. emergency_stop();
  228. }
  229. }
  230. void bgproc_service::handle_exit_status(bp_sys::exit_status exit_status) noexcept
  231. {
  232. // For bgproc services, receiving exit status can mean one of two things:
  233. // 1. We were launching the process, and it finished (possibly after forking). If it did fork
  234. // we want to obtain the process id of the process that we should now monitor, the actual
  235. // daemon.
  236. // 2. The above has already happened, and we are monitoring the daemon process, which has now
  237. // terminated for some reason.
  238. begin:
  239. bool did_exit = exit_status.did_exit();
  240. bool was_signalled = exit_status.was_signalled();
  241. auto service_state = get_state();
  242. if (!exit_status.did_exit_clean() && service_state != service_state_t::STOPPING) {
  243. if (did_exit) {
  244. log(loglevel_t::ERROR, "Service ", get_name(), " process terminated with exit code ",
  245. exit_status.get_exit_status());
  246. }
  247. else if (was_signalled) {
  248. log(loglevel_t::ERROR, "Service ", get_name(), " terminated due to signal ",
  249. exit_status.get_term_sig());
  250. }
  251. }
  252. // This may be a "smooth recovery" where we are restarting the process while leaving the
  253. // service in the STARTED state. This must be the case if 'restarting' is set while the state
  254. // is currently STARTED.
  255. if (restarting && service_state == service_state_t::STARTED) {
  256. restarting = false;
  257. bool need_stop = false;
  258. if ((did_exit && exit_status.get_exit_status() != 0) || was_signalled) {
  259. need_stop = true;
  260. }
  261. else {
  262. // We need to re-read the PID, since it has now changed.
  263. if (pid_file.length() != 0) {
  264. auto pid_result = read_pid_file(&exit_status);
  265. switch (pid_result) {
  266. case pid_result_t::FAILED:
  267. // Failed startup: no auto-restart.
  268. need_stop = true;
  269. break;
  270. case pid_result_t::TERMINATED:
  271. goto begin;
  272. case pid_result_t::OK:
  273. break;
  274. }
  275. }
  276. }
  277. if (need_stop) {
  278. // Failed startup: no auto-restart.
  279. stop_reason = stopped_reason_t::TERMINATED;
  280. emergency_stop();
  281. services->process_queues();
  282. }
  283. return;
  284. }
  285. if (service_state == service_state_t::STARTING) {
  286. // POSIX requires that if the process exited clearly with a status code of 0,
  287. // the exit status value will be 0:
  288. if (exit_status.did_exit_clean()) {
  289. auto pid_result = read_pid_file(&exit_status);
  290. switch (pid_result) {
  291. case pid_result_t::FAILED:
  292. // Failed startup: no auto-restart.
  293. stop_reason = stopped_reason_t::FAILED;
  294. failed_to_start();
  295. break;
  296. case pid_result_t::TERMINATED:
  297. // started, but immediately terminated
  298. started();
  299. goto begin;
  300. case pid_result_t::OK:
  301. started();
  302. break;
  303. }
  304. }
  305. else {
  306. stop_reason = stopped_reason_t::FAILED;
  307. failed_to_start();
  308. }
  309. }
  310. else if (service_state == service_state_t::STOPPING) {
  311. // We won't log a non-zero exit status or termination due to signal here -
  312. // we assume that the process died because we signalled it.
  313. stopped();
  314. }
  315. else {
  316. // we must be STARTED
  317. if (smooth_recovery && get_target_state() == service_state_t::STARTED) {
  318. restarting = true;
  319. do_smooth_recovery();
  320. return;
  321. }
  322. stop_reason = stopped_reason_t::TERMINATED;
  323. forced_stop();
  324. stop_dependents();
  325. stopped();
  326. }
  327. services->process_queues();
  328. }
  329. void bgproc_service::exec_failed(run_proc_err errcode) noexcept
  330. {
  331. log(loglevel_t::ERROR, get_name(), ": execution failed - ",
  332. exec_stage_descriptions[static_cast<int>(errcode.stage)], ": ", strerror(errcode.st_errno));
  333. // Only time we execute is for startup:
  334. stop_reason = stopped_reason_t::EXECFAILED;
  335. failed_to_start();
  336. }
  337. void scripted_service::handle_exit_status(bp_sys::exit_status exit_status) noexcept
  338. {
  339. bool did_exit = exit_status.did_exit();
  340. bool was_signalled = exit_status.was_signalled();
  341. auto service_state = get_state();
  342. // For a scripted service, a termination occurs in one of three main cases:
  343. // - the start script completed (or failed), when service was STARTING
  344. // - the start script was interrupted to cancel startup; state is STOPPING
  345. // - the stop script complete (or failed), state is STOPPING
  346. if (service_state == service_state_t::STOPPING) {
  347. // We might be running the stop script, or we might be running the start script and have issued
  348. // a cancel order via SIGINT:
  349. if (interrupting_start) {
  350. if (stop_timer_armed) {
  351. restart_timer.stop_timer(event_loop);
  352. stop_timer_armed = false;
  353. }
  354. // We issued a start interrupt, so we expected this failure:
  355. if (did_exit && exit_status.get_exit_status() != 0) {
  356. log(loglevel_t::INFO, "Service ", get_name(), " start cancelled; exit code ",
  357. exit_status.get_exit_status());
  358. // Assume that a command terminating normally (with failure status) requires no cleanup:
  359. stopped();
  360. }
  361. else {
  362. if (was_signalled) {
  363. log(loglevel_t::INFO, "Service ", get_name(), " start cancelled from signal ",
  364. exit_status.get_term_sig());
  365. }
  366. // If the start script completed successfully, or was interrupted via our signal,
  367. // we want to run the stop script to clean up:
  368. bring_down();
  369. }
  370. interrupting_start = false;
  371. }
  372. else if (exit_status.did_exit_clean()) {
  373. // We were running the stop script and finished successfully
  374. stopped();
  375. }
  376. else {
  377. // ??? failed to stop! Let's log it as warning:
  378. if (did_exit) {
  379. log(loglevel_t::WARN, "Service ", get_name(), " stop command failed with exit code ",
  380. exit_status.get_exit_status());
  381. }
  382. else if (was_signalled) {
  383. log(loglevel_t::WARN, "Service ", get_name(), " stop command terminated due to signal ",
  384. exit_status.get_term_sig());
  385. }
  386. // Even if the stop script failed, assume that service is now stopped, so that any dependencies
  387. // can be stopped. There's not really any other useful course of action here.
  388. stopped();
  389. }
  390. services->process_queues();
  391. }
  392. else { // STARTING
  393. if (exit_status.did_exit_clean()) {
  394. started();
  395. }
  396. else if (was_signalled && exit_status.get_term_sig() == SIGINT && onstart_flags.skippable) {
  397. // A skippable service can be skipped by interrupting (eg by ^C if the service
  398. // starts on the console).
  399. start_skipped = true;
  400. started();
  401. }
  402. else {
  403. // failed to start
  404. if (did_exit) {
  405. log(loglevel_t::ERROR, "Service ", get_name(), " command failed with exit code ",
  406. exit_status.get_exit_status());
  407. }
  408. else if (was_signalled) {
  409. log(loglevel_t::ERROR, "Service ", get_name(), " command terminated due to signal ",
  410. exit_status.get_term_sig());
  411. }
  412. stop_reason = stopped_reason_t::FAILED;
  413. failed_to_start();
  414. }
  415. services->process_queues();
  416. }
  417. }
  418. void scripted_service::exec_failed(run_proc_err errcode) noexcept
  419. {
  420. log(loglevel_t::ERROR, get_name(), ": execution failed - ",
  421. exec_stage_descriptions[static_cast<int>(errcode.stage)], ": ", strerror(errcode.st_errno));
  422. auto service_state = get_state();
  423. if (service_state == service_state_t::STARTING) {
  424. stop_reason = stopped_reason_t::EXECFAILED;
  425. failed_to_start();
  426. }
  427. else if (service_state == service_state_t::STOPPING) {
  428. // We've logged the failure, but it's probably better not to leave the service in
  429. // STOPPING state:
  430. stopped();
  431. }
  432. }
  433. // Return a value as an unsigned-type value.
  434. template <typename T> typename std::make_unsigned<T>::type make_unsigned_val(T val)
  435. {
  436. return static_cast<typename std::make_unsigned<T>::type>(val);
  437. }
  438. bgproc_service::pid_result_t
  439. bgproc_service::read_pid_file(bp_sys::exit_status *exit_status) noexcept
  440. {
  441. const char *pid_file_c = pid_file.c_str();
  442. int fd = bp_sys::open(pid_file_c, O_CLOEXEC);
  443. if (fd == -1) {
  444. log(loglevel_t::ERROR, get_name(), ": read pid file: ", strerror(errno));
  445. return pid_result_t::FAILED;
  446. }
  447. char pidbuf[21]; // just enough to hold any 64-bit integer
  448. int r = complete_read(fd, pidbuf, 20);
  449. if (r < 0) {
  450. // Could not read from PID file
  451. log(loglevel_t::ERROR, get_name(), ": could not read from pidfile; ", strerror(errno));
  452. bp_sys::close(fd);
  453. return pid_result_t::FAILED;
  454. }
  455. bp_sys::close(fd);
  456. pidbuf[r] = 0; // store nul terminator
  457. bool valid_pid = false;
  458. try {
  459. unsigned long long v = std::stoull(pidbuf, nullptr, 0);
  460. if (v <= make_unsigned_val(std::numeric_limits<pid_t>::max())) {
  461. pid = (pid_t) v;
  462. valid_pid = true;
  463. }
  464. }
  465. catch (std::out_of_range &exc) {
  466. // Too large?
  467. }
  468. catch (std::invalid_argument &exc) {
  469. // Ok, so it doesn't look like a number: proceed...
  470. }
  471. if (valid_pid) {
  472. pid_t wait_r = waitpid(pid, exit_status, WNOHANG);
  473. if (wait_r == -1 && errno == ECHILD) {
  474. // We can't track this child - check process exists:
  475. if (bp_sys::kill(pid, 0) == 0 || errno != ESRCH) {
  476. tracking_child = false;
  477. return pid_result_t::OK;
  478. }
  479. else {
  480. log(loglevel_t::ERROR, get_name(), ": pid read from pidfile (", pid, ") is not valid");
  481. pid = -1;
  482. return pid_result_t::FAILED;
  483. }
  484. }
  485. else if (wait_r == pid) {
  486. pid = -1;
  487. return pid_result_t::TERMINATED;
  488. }
  489. else if (wait_r == 0) {
  490. // We can track the child
  491. child_listener.add_reserved(event_loop, pid, dasynq::DEFAULT_PRIORITY - 10);
  492. tracking_child = true;
  493. reserved_child_watch = true;
  494. return pid_result_t::OK;
  495. }
  496. }
  497. log(loglevel_t::ERROR, get_name(), ": pid read from pidfile (", pid, ") is not valid");
  498. pid = -1;
  499. return pid_result_t::FAILED;
  500. }
  501. void process_service::bring_down() noexcept
  502. {
  503. if (waiting_for_execstat) {
  504. // The process is still starting. This should be uncommon, but can occur during
  505. // smooth recovery. We can't do much now; we have to wait until we get the
  506. // status, and then act appropriately.
  507. return;
  508. }
  509. else if (pid != -1) {
  510. // The process is still kicking on - must actually kill it. We signal the process
  511. // group (-pid) rather than just the process as there's less risk then of creating
  512. // an orphaned process group:
  513. if (! onstart_flags.no_sigterm) {
  514. kill_pg(SIGTERM);
  515. }
  516. if (term_signal != -1) {
  517. kill_pg(term_signal);
  518. }
  519. // If there's a stop timeout, arm the timer now:
  520. if (stop_timeout != time_val(0,0)) {
  521. restart_timer.arm_timer_rel(event_loop, stop_timeout);
  522. stop_timer_armed = true;
  523. }
  524. // The rest is done in handle_exit_status.
  525. }
  526. else {
  527. // The process is already dead.
  528. stopped();
  529. }
  530. }
  531. void bgproc_service::bring_down() noexcept
  532. {
  533. if (pid != -1) {
  534. // The process is still kicking on - must actually kill it. We signal the process
  535. // group (-pid) rather than just the process as there's less risk then of creating
  536. // an orphaned process group:
  537. if (! onstart_flags.no_sigterm) {
  538. kill_pg(SIGTERM);
  539. }
  540. if (term_signal != -1) {
  541. kill_pg(term_signal);
  542. }
  543. // In most cases, the rest is done in handle_exit_status.
  544. // If we are a BGPROCESS and the process is not our immediate child, however, that
  545. // won't work - check for this now:
  546. if (! tracking_child) {
  547. stopped();
  548. }
  549. else if (stop_timeout != time_val(0,0)) {
  550. restart_timer.arm_timer_rel(event_loop, stop_timeout);
  551. stop_timer_armed = true;
  552. }
  553. }
  554. else {
  555. // The process is already dead.
  556. stopped();
  557. }
  558. }
  559. void scripted_service::bring_down() noexcept
  560. {
  561. if (pid != -1) {
  562. // We're already running the stop script; nothing to do.
  563. return;
  564. }
  565. if (stop_command.length() == 0) {
  566. stopped();
  567. }
  568. else if (! start_ps_process(stop_arg_parts, false)) {
  569. // Couldn't execute stop script, but there's not much we can do:
  570. stopped();
  571. }
  572. else {
  573. // successfully started stop script: start kill timer:
  574. if (stop_timeout != time_val(0,0)) {
  575. restart_timer.arm_timer_rel(event_loop, stop_timeout);
  576. stop_timer_armed = true;
  577. }
  578. }
  579. }
  580. dasynq::rearm process_restart_timer::timer_expiry(eventloop_t &, int expiry_count)
  581. {
  582. service->timer_expired();
  583. // Leave the timer disabled, or, if it has been reset by any processing above, leave it armed:
  584. return dasynq::rearm::NOOP;
  585. }