proc-service.cc 22 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638
  1. #include <cstring>
  2. #include <type_traits>
  3. #include <sys/un.h>
  4. #include <sys/socket.h>
  5. #include "dinit.h"
  6. #include "dinit-socket.h"
  7. #include "dinit-util.h"
  8. #include "dinit-log.h"
  9. #include "proc-service.h"
  10. /*
  11. * Most of the implementation for process-based services (process, scripted, bgprocess) is here.
  12. *
  13. * See proc-service.h header for interface details.
  14. */
  15. // Strings describing the execution stages (failure points).
  16. const char * const exec_stage_descriptions[static_cast<int>(exec_stage::DO_EXEC) + 1] = {
  17. "arranging file descriptors", // ARRANGE_FDS
  18. "reading environment file", // READ_ENV_FILE
  19. "setting environment variable", // SET_NOTIFYFD_VAR
  20. "setting up activation socket", // SETUP_ACTIVATION_SOCKET
  21. "setting up control socket", // SETUP_CONTROL_SOCKET
  22. "changing directory", // CHDIR
  23. "setting up standard input/output descriptors", // SETUP_STDINOUTERR
  24. "setting resource limits", // SET_RLIMITS
  25. "setting user/group ID", // SET_UIDGID
  26. "executing command" // DO_EXEC
  27. };
  // Split a string, in place, into separate nul-terminated arguments.
  //
  // Parameters:
  //   s           - the string holding all the arguments; it is modified (a nul character is
  //                 written at the end index of each argument, when that index lies within
  //                 the string)
  //   arg_indices - one (start, end) index pair for each argument within `s`
  //
  // Returns a vector with a pointer to the start of each argument, in order, followed by a
  // trailing nullptr. The pointers refer to the internal buffer of `s`, so the returned array
  // is invalidated if the string is later modified.
  std::vector<const char *> separate_args(std::string &s,
          const std::list<std::pair<unsigned,unsigned>> &arg_indices)
  {
      std::vector<const char *> arg_ptrs;
      arg_ptrs.reserve(arg_indices.size() + 1);

      // Pass 1: terminate each argument. This must be finished before any pointer is taken.
      for (auto it = arg_indices.begin(); it != arg_indices.end(); ++it) {
          const unsigned end_pos = it->second;
          if (end_pos < s.length()) {
              s[end_pos] = '\0';
          }
      }

      // Pass 2: record where each argument begins within the (now final) C string.
      const char * const base = s.c_str();
      for (auto it = arg_indices.begin(); it != arg_indices.end(); ++it) {
          arg_ptrs.push_back(base + it->first);
      }

      arg_ptrs.push_back(nullptr);
      return arg_ptrs;
  }
  51. void process_service::exec_succeeded() noexcept
  52. {
  53. // This could be a smooth recovery (state already STARTED). Even more, the process
  54. // might be stopped (and killed via a signal) during smooth recovery. We don't to
  55. // process startup again in either case, so we check for state STARTING:
  56. if (get_state() == service_state_t::STARTING) {
  57. if (force_notification_fd != -1 || !notification_var.empty()) {
  58. // Wait for readiness notification:
  59. readiness_watcher.set_enabled(event_loop, true);
  60. }
  61. else {
  62. started();
  63. }
  64. }
  65. else if (get_state() == service_state_t::STOPPING) {
  66. // stopping, but smooth recovery was in process. That's now over so we can
  67. // commence normal stop. Note that if pid == -1 the process already stopped(!),
  68. // that's handled below.
  69. if (pid != -1 && stop_check_dependents()) {
  70. bring_down();
  71. }
  72. }
  73. }
  74. void scripted_service::exec_succeeded() noexcept
  75. {
  76. // For a scripted service, this means nothing other than that the start/stop
  77. // script will now begin.
  78. }
  79. rearm exec_status_pipe_watcher::fd_event(eventloop_t &loop, int fd, int flags) noexcept
  80. {
  81. base_process_service *sr = service;
  82. sr->waiting_for_execstat = false;
  83. run_proc_err exec_status;
  84. int r = read(get_watched_fd(), &exec_status, sizeof(exec_status));
  85. deregister(loop);
  86. close(get_watched_fd());
  87. if (r > 0) {
  88. // We read an errno code; exec() failed, and the service startup failed.
  89. if (sr->pid != -1) {
  90. sr->child_listener.deregister(event_loop, sr->pid);
  91. sr->reserved_child_watch = false;
  92. if (sr->stop_timer_armed) {
  93. sr->restart_timer.stop_timer(loop);
  94. sr->stop_timer_armed = false;
  95. }
  96. }
  97. sr->pid = -1;
  98. sr->exec_failed(exec_status);
  99. }
  100. else {
  101. sr->exec_succeeded();
  102. if (sr->pid == -1) {
  103. // Somehow the process managed to complete before we even saw the exec() status.
  104. sr->handle_exit_status(sr->exit_status);
  105. }
  106. }
  107. sr->services->process_queues();
  108. return rearm::REMOVED;
  109. }
  110. rearm ready_notify_watcher::fd_event(eventloop_t &, int fd, int flags) noexcept
  111. {
  112. char buf[128];
  113. if (service->get_state() == service_state_t::STARTING) {
  114. // can we actually read anything from the notification pipe?
  115. int r = bp_sys::read(fd, buf, sizeof(buf));
  116. if (r > 0) {
  117. service->started();
  118. }
  119. else if (r == 0 || errno != EAGAIN) {
  120. service->failed_to_start(false, false);
  121. service->set_state(service_state_t::STOPPING);
  122. service->bring_down();
  123. }
  124. }
  125. else {
  126. // Just keep consuming data from the pipe:
  127. int r = bp_sys::read(fd, buf, sizeof(buf));
  128. if (r == 0) {
  129. // Process closed write end or terminated
  130. close(fd);
  131. service->notification_fd = -1;
  132. return rearm::DISARM;
  133. }
  134. }
  135. service->services->process_queues();
  136. return rearm::REARM;
  137. }
  138. dasynq::rearm service_child_watcher::status_change(eventloop_t &loop, pid_t child, int status) noexcept
  139. {
  140. base_process_service *sr = service;
  141. sr->pid = -1;
  142. sr->exit_status = bp_sys::exit_status(status);
  143. // Ok, for a process service, any process death which we didn't rig ourselves is a bit... unexpected.
  144. // Probably, the child died because we asked it to (sr->service_state == STOPPING). But even if we
  145. // didn't, there's not much we can do.
  146. if (sr->waiting_for_execstat) {
  147. // We still don't have an exec() status from the forked child, wait for that
  148. // before doing any further processing.
  149. return dasynq::rearm::NOOP; // hold watch reservation
  150. }
  151. // Must stop watch now since handle_exit_status might result in re-launch:
  152. // (stop_watch instead of deregister, so that we hold watch reservation).
  153. stop_watch(loop);
  154. if (sr->stop_timer_armed) {
  155. sr->restart_timer.stop_timer(loop);
  156. sr->stop_timer_armed = false;
  157. }
  158. sr->handle_exit_status(bp_sys::exit_status(status));
  159. return dasynq::rearm::NOOP;
  160. }
  161. void process_service::handle_exit_status(bp_sys::exit_status exit_status) noexcept
  162. {
  163. bool did_exit = exit_status.did_exit();
  164. bool was_signalled = exit_status.was_signalled();
  165. auto service_state = get_state();
  166. if (notification_fd != -1) {
  167. readiness_watcher.deregister(event_loop);
  168. bp_sys::close(notification_fd);
  169. notification_fd = -1;
  170. }
  171. if (!exit_status.did_exit_clean() && service_state != service_state_t::STOPPING) {
  172. if (did_exit) {
  173. log(loglevel_t::ERROR, "Service ", get_name(), " process terminated with exit code ",
  174. exit_status.get_exit_status());
  175. }
  176. else if (was_signalled) {
  177. log(loglevel_t::ERROR, "Service ", get_name(), " terminated due to signal ",
  178. exit_status.get_term_sig());
  179. }
  180. }
  181. #if USE_UTMPX
  182. if (*inittab_id || *inittab_line) {
  183. clear_utmp_entry(inittab_id, inittab_line);
  184. }
  185. #endif
  186. if (service_state == service_state_t::STARTING) {
  187. // If state is STARTING, we must be waiting for readiness notification; the process has
  188. // terminated before becoming ready.
  189. stop_reason = stopped_reason_t::FAILED;
  190. failed_to_start();
  191. }
  192. else if (service_state == service_state_t::STOPPING) {
  193. // We won't log a non-zero exit status or termination due to signal here -
  194. // we assume that the process died because we signalled it.
  195. if (stop_timer_armed) {
  196. restart_timer.stop_timer(event_loop);
  197. }
  198. stopped();
  199. }
  200. else if (smooth_recovery && service_state == service_state_t::STARTED
  201. && get_target_state() == service_state_t::STARTED) {
  202. do_smooth_recovery();
  203. return;
  204. }
  205. else {
  206. stop_reason = stopped_reason_t::TERMINATED;
  207. emergency_stop();
  208. }
  209. services->process_queues();
  210. }
  211. void process_service::exec_failed(run_proc_err errcode) noexcept
  212. {
  213. log(loglevel_t::ERROR, get_name(), ": execution failed - ",
  214. exec_stage_descriptions[static_cast<int>(errcode.stage)], strerror(errcode.st_errno));
  215. if (notification_fd != -1) {
  216. readiness_watcher.deregister(event_loop);
  217. bp_sys::close(notification_fd);
  218. notification_fd = -1;
  219. }
  220. if (get_state() == service_state_t::STARTING) {
  221. stop_reason = stopped_reason_t::EXECFAILED;
  222. failed_to_start();
  223. }
  224. else {
  225. // Process service in smooth recovery:
  226. stop_reason = stopped_reason_t::TERMINATED;
  227. emergency_stop();
  228. }
  229. }
  230. void bgproc_service::handle_exit_status(bp_sys::exit_status exit_status) noexcept
  231. {
  232. begin:
  233. bool did_exit = exit_status.did_exit();
  234. bool was_signalled = exit_status.was_signalled();
  235. auto service_state = get_state();
  236. if (!exit_status.did_exit_clean() && service_state != service_state_t::STOPPING) {
  237. if (did_exit) {
  238. log(loglevel_t::ERROR, "Service ", get_name(), " process terminated with exit code ",
  239. exit_status.get_exit_status());
  240. }
  241. else if (was_signalled) {
  242. log(loglevel_t::ERROR, "Service ", get_name(), " terminated due to signal ",
  243. exit_status.get_term_sig());
  244. }
  245. }
  246. // This may be a "smooth recovery" where we are restarting the process while leaving the
  247. // service in the STARTED state.
  248. if (restarting && service_state == service_state_t::STARTED) {
  249. //restarting = false;
  250. bool need_stop = false;
  251. if ((did_exit && exit_status.get_exit_status() != 0) || was_signalled) {
  252. need_stop = true;
  253. }
  254. else {
  255. // We need to re-read the PID, since it has now changed.
  256. if (pid_file.length() != 0) {
  257. auto pid_result = read_pid_file(&exit_status);
  258. switch (pid_result) {
  259. case pid_result_t::FAILED:
  260. // Failed startup: no auto-restart.
  261. need_stop = true;
  262. break;
  263. case pid_result_t::TERMINATED:
  264. goto begin;
  265. case pid_result_t::OK:
  266. break;
  267. }
  268. }
  269. }
  270. if (need_stop) {
  271. // Failed startup: no auto-restart.
  272. stop_reason = stopped_reason_t::TERMINATED;
  273. emergency_stop();
  274. services->process_queues();
  275. }
  276. return;
  277. }
  278. //restarting = false;
  279. if (service_state == service_state_t::STARTING) {
  280. // POSIX requires that if the process exited clearly with a status code of 0,
  281. // the exit status value will be 0:
  282. if (exit_status.did_exit_clean()) {
  283. auto pid_result = read_pid_file(&exit_status);
  284. switch (pid_result) {
  285. case pid_result_t::FAILED:
  286. // Failed startup: no auto-restart.
  287. stop_reason = stopped_reason_t::FAILED;
  288. failed_to_start();
  289. break;
  290. case pid_result_t::TERMINATED:
  291. // started, but immediately terminated
  292. started();
  293. goto begin;
  294. case pid_result_t::OK:
  295. started();
  296. break;
  297. }
  298. }
  299. else {
  300. stop_reason = stopped_reason_t::FAILED;
  301. failed_to_start();
  302. }
  303. }
  304. else if (service_state == service_state_t::STOPPING) {
  305. // We won't log a non-zero exit status or termination due to signal here -
  306. // we assume that the process died because we signalled it.
  307. stopped();
  308. }
  309. else {
  310. // we must be STARTED
  311. if (smooth_recovery && get_target_state() == service_state_t::STARTED) {
  312. do_smooth_recovery();
  313. return;
  314. }
  315. stop_reason = stopped_reason_t::TERMINATED;
  316. forced_stop();
  317. stop_dependents();
  318. stopped();
  319. }
  320. services->process_queues();
  321. }
  322. void bgproc_service::exec_failed(run_proc_err errcode) noexcept
  323. {
  324. log(loglevel_t::ERROR, get_name(), ": execution failed - ",
  325. exec_stage_descriptions[static_cast<int>(errcode.stage)], strerror(errcode.st_errno));
  326. // Only time we execute is for startup:
  327. stop_reason = stopped_reason_t::EXECFAILED;
  328. failed_to_start();
  329. }
  330. void scripted_service::handle_exit_status(bp_sys::exit_status exit_status) noexcept
  331. {
  332. bool did_exit = exit_status.did_exit();
  333. bool was_signalled = exit_status.was_signalled();
  334. auto service_state = get_state();
  335. // For a scripted service, a termination occurs in one of three main cases:
  336. // - the start script completed (or failed), when service was STARTING
  337. // - the start script was interrupted to cancel startup; state is STOPPING
  338. // - the stop script complete (or failed), state is STOPPING
  339. if (service_state == service_state_t::STOPPING) {
  340. // We might be running the stop script, or we might be running the start script and have issued
  341. // a cancel order via SIGINT:
  342. if (interrupting_start) {
  343. if (stop_timer_armed) {
  344. restart_timer.stop_timer(event_loop);
  345. stop_timer_armed = false;
  346. }
  347. // We issued a start interrupt, so we expected this failure:
  348. if (did_exit && exit_status.get_exit_status() != 0) {
  349. log(loglevel_t::INFO, "Service ", get_name(), " start cancelled; exit code ",
  350. exit_status.get_exit_status());
  351. // Assume that a command terminating normally (with failure status) requires no cleanup:
  352. stopped();
  353. }
  354. else {
  355. if (was_signalled) {
  356. log(loglevel_t::INFO, "Service ", get_name(), " start cancelled from signal ",
  357. exit_status.get_term_sig());
  358. }
  359. // If the start script completed successfully, or was interrupted via our signal,
  360. // we want to run the stop script to clean up:
  361. bring_down();
  362. }
  363. interrupting_start = false;
  364. }
  365. else if (exit_status.did_exit_clean()) {
  366. // We were running the stop script and finished successfully
  367. stopped();
  368. }
  369. else {
  370. // ??? failed to stop! Let's log it as warning:
  371. if (did_exit) {
  372. log(loglevel_t::WARN, "Service ", get_name(), " stop command failed with exit code ",
  373. exit_status.get_exit_status());
  374. }
  375. else if (was_signalled) {
  376. log(loglevel_t::WARN, "Service ", get_name(), " stop command terminated due to signal ",
  377. exit_status.get_term_sig());
  378. }
  379. // Even if the stop script failed, assume that service is now stopped, so that any dependencies
  380. // can be stopped. There's not really any other useful course of action here.
  381. stopped();
  382. }
  383. services->process_queues();
  384. }
  385. else { // STARTING
  386. if (exit_status.did_exit_clean()) {
  387. started();
  388. }
  389. else if (was_signalled && exit_status.get_term_sig() == SIGINT && onstart_flags.skippable) {
  390. // A skippable service can be skipped by interrupting (eg by ^C if the service
  391. // starts on the console).
  392. start_skipped = true;
  393. started();
  394. }
  395. else {
  396. // failed to start
  397. if (did_exit) {
  398. log(loglevel_t::ERROR, "Service ", get_name(), " command failed with exit code ",
  399. exit_status.get_exit_status());
  400. }
  401. else if (was_signalled) {
  402. log(loglevel_t::ERROR, "Service ", get_name(), " command terminated due to signal ",
  403. exit_status.get_term_sig());
  404. }
  405. stop_reason = stopped_reason_t::FAILED;
  406. failed_to_start();
  407. }
  408. services->process_queues();
  409. }
  410. }
  411. void scripted_service::exec_failed(run_proc_err errcode) noexcept
  412. {
  413. log(loglevel_t::ERROR, get_name(), ": execution failed - ",
  414. exec_stage_descriptions[static_cast<int>(errcode.stage)], strerror(errcode.st_errno));
  415. auto service_state = get_state();
  416. if (service_state == service_state_t::STARTING) {
  417. stop_reason = stopped_reason_t::EXECFAILED;
  418. failed_to_start();
  419. }
  420. else if (service_state == service_state_t::STOPPING) {
  421. // We've logged the failure, but it's probably better not to leave the service in
  422. // STOPPING state:
  423. stopped();
  424. }
  425. }
  // Convert an integer value to the unsigned type corresponding to its own type (for example
  // int -> unsigned int). The conversion is a static_cast, so a negative value wraps around.
  template <typename T>
  auto make_unsigned_val(T val) -> typename std::make_unsigned<T>::type
  {
      using unsigned_type = typename std::make_unsigned<T>::type;
      return static_cast<unsigned_type>(val);
  }
  431. bgproc_service::pid_result_t
  432. bgproc_service::read_pid_file(bp_sys::exit_status *exit_status) noexcept
  433. {
  434. const char *pid_file_c = pid_file.c_str();
  435. int fd = open(pid_file_c, O_CLOEXEC);
  436. if (fd == -1) {
  437. log(loglevel_t::ERROR, get_name(), ": read pid file: ", strerror(errno));
  438. return pid_result_t::FAILED;
  439. }
  440. char pidbuf[21]; // just enough to hold any 64-bit integer
  441. int r = complete_read(fd, pidbuf, 20);
  442. if (r < 0) {
  443. // Could not read from PID file
  444. log(loglevel_t::ERROR, get_name(), ": could not read from pidfile; ", strerror(errno));
  445. close(fd);
  446. return pid_result_t::FAILED;
  447. }
  448. close(fd);
  449. pidbuf[r] = 0; // store nul terminator
  450. bool valid_pid = false;
  451. try {
  452. unsigned long long v = std::stoull(pidbuf, nullptr, 0);
  453. if (v <= make_unsigned_val(std::numeric_limits<pid_t>::max())) {
  454. pid = (pid_t) v;
  455. valid_pid = true;
  456. }
  457. }
  458. catch (std::out_of_range &exc) {
  459. // Too large?
  460. }
  461. catch (std::invalid_argument &exc) {
  462. // Ok, so it doesn't look like a number: proceed...
  463. }
  464. if (valid_pid) {
  465. pid_t wait_r = waitpid(pid, exit_status, WNOHANG);
  466. if (wait_r == -1 && errno == ECHILD) {
  467. // We can't track this child - check process exists:
  468. if (kill(pid, 0) == 0 || errno != ESRCH) {
  469. tracking_child = false;
  470. return pid_result_t::OK;
  471. }
  472. else {
  473. log(loglevel_t::ERROR, get_name(), ": pid read from pidfile (", pid, ") is not valid");
  474. pid = -1;
  475. return pid_result_t::FAILED;
  476. }
  477. }
  478. else if (wait_r == pid) {
  479. pid = -1;
  480. return pid_result_t::TERMINATED;
  481. }
  482. else if (wait_r == 0) {
  483. // We can track the child
  484. child_listener.add_reserved(event_loop, pid, dasynq::DEFAULT_PRIORITY - 10);
  485. tracking_child = true;
  486. reserved_child_watch = true;
  487. return pid_result_t::OK;
  488. }
  489. }
  490. log(loglevel_t::ERROR, get_name(), ": pid read from pidfile (", pid, ") is not valid");
  491. pid = -1;
  492. return pid_result_t::FAILED;
  493. }
  494. void process_service::bring_down() noexcept
  495. {
  496. if (waiting_for_execstat) {
  497. // The process is still starting. This should be uncommon, but can occur during
  498. // smooth recovery. We can't do much now; we have to wait until we get the
  499. // status, and then act appropriately.
  500. return;
  501. }
  502. else if (pid != -1) {
  503. // The process is still kicking on - must actually kill it. We signal the process
  504. // group (-pid) rather than just the process as there's less risk then of creating
  505. // an orphaned process group:
  506. if (! onstart_flags.no_sigterm) {
  507. kill_pg(SIGTERM);
  508. }
  509. if (term_signal != -1) {
  510. kill_pg(term_signal);
  511. }
  512. // If there's a stop timeout, arm the timer now:
  513. if (stop_timeout != time_val(0,0)) {
  514. restart_timer.arm_timer_rel(event_loop, stop_timeout);
  515. stop_timer_armed = true;
  516. }
  517. // The rest is done in handle_exit_status.
  518. }
  519. else {
  520. // The process is already dead.
  521. stopped();
  522. }
  523. }
  524. void bgproc_service::bring_down() noexcept
  525. {
  526. if (pid != -1) {
  527. // The process is still kicking on - must actually kill it. We signal the process
  528. // group (-pid) rather than just the process as there's less risk then of creating
  529. // an orphaned process group:
  530. if (! onstart_flags.no_sigterm) {
  531. kill_pg(SIGTERM);
  532. }
  533. if (term_signal != -1) {
  534. kill_pg(term_signal);
  535. }
  536. // In most cases, the rest is done in handle_exit_status.
  537. // If we are a BGPROCESS and the process is not our immediate child, however, that
  538. // won't work - check for this now:
  539. if (! tracking_child) {
  540. stopped();
  541. }
  542. else if (stop_timeout != time_val(0,0)) {
  543. restart_timer.arm_timer_rel(event_loop, stop_timeout);
  544. stop_timer_armed = true;
  545. }
  546. }
  547. else {
  548. // The process is already dead.
  549. stopped();
  550. }
  551. }
  552. void scripted_service::bring_down() noexcept
  553. {
  554. if (pid != -1) {
  555. // We're already running the stop script; nothing to do.
  556. return;
  557. }
  558. if (stop_command.length() == 0) {
  559. stopped();
  560. }
  561. else if (! start_ps_process(stop_arg_parts, false)) {
  562. // Couldn't execute stop script, but there's not much we can do:
  563. stopped();
  564. }
  565. else {
  566. // successfully started stop script: start kill timer:
  567. if (stop_timeout != time_val(0,0)) {
  568. restart_timer.arm_timer_rel(event_loop, stop_timeout);
  569. stop_timer_armed = true;
  570. }
  571. }
  572. }
  573. dasynq::rearm process_restart_timer::timer_expiry(eventloop_t &, int expiry_count)
  574. {
  575. service->timer_expired();
  576. // Leave the timer disabled, or, if it has been reset by any processing above, leave it armed:
  577. return dasynq::rearm::NOOP;
  578. }