Browse Source

Initial support for cgroups

This is mainly in the form of a "run-in-cgroup" setting where a cgroup
name (path) can be specified (a relative path will be resolved relative
to the cgroup of dinit).

Only a single hierarchy is supported. Cgroups v2 are a natural fit (and
have been tested) but Cgroups v1 with a single hierarchy should probably
also work.

There is no functionality to create cgroups or set their resource
constraints; that needs to be done separately.
Davin McCall 1 year ago
parent
commit
8d6ab2f98c

+ 1 - 0
build/Makefile

@@ -11,6 +11,7 @@ includes/mconfig.h: ../mconfig tools/mconfig-gen.cc
 	$(MAKE) -C tools mconfig-gen
 	./tools/mconfig-gen SBINDIR=$(SBINDIR) SYSCONTROLSOCKET=$(SYSCONTROLSOCKET) \
 		SHUTDOWN_PREFIX=$(SHUTDOWN_PREFIX) VERSION=$(VERSION) \
+		$(if $(SUPPORT_CGROUPS),SUPPORT_CGROUPS=$(SUPPORT_CGROUPS),) \
 		$(if $(USE_UTMPX),USE_UTMPX=$(USE_UTMPX),) > includes/mconfig.h
 
 clean:

+ 4 - 1
build/tools/mconfig-gen.cc

@@ -63,10 +63,13 @@ int main(int argc, char **argv)
     cout << "#ifndef DINIT_MCONFIG_H\n";
     cout << "#define DINIT_MCONFIG_H 1\n";
 
-    cout << "\n// Defines\n";
+    cout << "\n// Defines\n\n";
     if (vars.find("USE_UTMPX") != vars.end()) {
         cout << "#define USE_UTMPX " << vars["USE_UTMPX"] << "\n";
     }
+    if (vars.find("SUPPORT_CGROUPS") != vars.end()) {
+        cout << "#define SUPPORT_CGROUPS " << vars["SUPPORT_CGROUPS"] << "\n";
+    }
 
     cout << "\n// Constants\n";
     cout << "\nconstexpr static char DINIT_VERSION[] = " << stringify(vars["VERSION"]) << ";\n";

+ 5 - 0
configs/mconfig.Linux

@@ -26,3 +26,8 @@ SANITIZEOPTS=-fsanitize=address,undefined
 #   -fno-plt  (optional) : Recommended optimisation
 #   -flto     (optional) : Perform link-time optimisation
 #   -fsanitize=address,undefined :  Apply sanitizers (during unit tests)
+
+
+# Features.
+
+SUPPORT_CGROUPS=1

+ 8 - 2
configs/mconfig.Linux.sh

@@ -91,10 +91,16 @@ GENERAL_BUILD_SETTINGS=$(
   echo "#   -fsanitize=address,undefined :  Apply sanitizers (during unit tests)"
 )
 
-#echo "$INST_PATH_OPTS"
-#echo "$GENERAL_BUILD_SETTINGS"
+FEATURE_SETTINGS=$(
+  echo ""
+  echo ""
+  echo "# Feature settings"
+  echo ""
+  echo "SUPPORT_CGROUPS=1"
+)
 
 (
   echo "$INST_PATH_OPTS"
   echo "$GENERAL_BUILD_SETTINGS" 
+  echo "$FEATURE_SETTINGS"
 ) >> ../mconfig

+ 3 - 0
src/baseproc-service.cc

@@ -189,6 +189,9 @@ bool base_process_service::start_ps_process(const std::vector<const char *> &cmd
         run_params.force_notify_fd = force_notification_fd;
         run_params.notify_var = notification_var.c_str();
         run_params.env_file = env_file.c_str();
+        #if SUPPORT_CGROUPS
+        run_params.run_in_cgroup = run_in_cgroup.c_str();
+        #endif
         run_child_proc(run_params);
     }
     else {

+ 119 - 3
src/dinit.cc

@@ -63,8 +63,11 @@ static void close_control_socket() noexcept;
 static void confirm_restart_boot() noexcept;
 static void flush_log() noexcept;
 
-static void control_socket_cb(eventloop_t *loop, int fd);
+static void control_socket_cb(eventloop_t *loop, int fd) noexcept;
 
+#ifdef SUPPORT_CGROUPS
+static void find_cgroup_path() noexcept;
+#endif
 
 // Variables
 
@@ -91,6 +94,11 @@ static bool log_is_syslog = true; // if false, log is a file
 // Set to true (when console_input_watcher is active) if console input becomes available
 static bool console_input_ready = false;
 
+#ifdef SUPPORT_CGROUPS
+// Path of the root cgroup according to dinit. This will be dinit's own cgroup path.
+std::string cgroups_path;
+bool have_cgroups_path = false;
+#endif
 
 namespace {
     // Event-loop handler for a signal, which just delegates to a function (pointer).
@@ -386,10 +394,22 @@ int dinit_main(int argc, char **argv)
         if (! env_file_set) {
             env_file = env_file_path;
         }
+
+        // we will assume an empty cgroups root path
+        #if SUPPORT_CGROUPS
+        have_cgroups_path = true;
+        #endif
     }
 
+    #if SUPPORT_CGROUPS
+    if (!have_cgroups_path) {
+        find_cgroup_path();
+        // We will press on if the cgroup root path could not be identified, since services might
+        // not require cgroups anyway.
+    }
+    #endif
+
     /* Set up signal handlers etc */
-    /* SIG_CHILD is ignored by default: good */
     sigset_t sigwait_set;
     sigemptyset(&sigwait_set);
     sigaddset(&sigwait_set, SIGCHLD);
@@ -734,7 +754,7 @@ static void confirm_restart_boot() noexcept
 }
 
 // Callback for control socket
-static void control_socket_cb(eventloop_t *loop, int sockfd)
+static void control_socket_cb(eventloop_t *loop, int sockfd) noexcept
 {
     // Considered keeping a limit on the number of active connections, however, there doesn't
     // seem much to be gained from that. Only root can create connections and not being
@@ -956,6 +976,102 @@ static void flush_log() noexcept
     }
 }
 
+#ifdef SUPPORT_CGROUPS
+
+// Determine the cgroup that dinit itself is running in, for use as the root against which
+// relative "run-in-cgroup" paths are resolved.
+//
+// Reads /proc/self/cgroup, which is expected to contain a single line of the form
+// "0::/some/path". On success the path, minus its leading slash, is stored in cgroups_path
+// and have_cgroups_path is set to true. If the file cannot be opened or read, the function
+// returns silently; if more than one line is present, the line is malformed, or memory is
+// exhausted, a warning is logged. In all failure cases have_cgroups_path is left unchanged.
+static void find_cgroup_path() noexcept
+{
+    if (have_cgroups_path) {
+        return;
+    }
+
+    int pfd = open("/proc/self/cgroup", O_RDONLY);
+    if (pfd == -1) {
+        return;
+    }
+
+    try {
+        // Sentinel meaning "end of line not yet seen"
+        constexpr size_t no_pos = static_cast<size_t>(-1);
+
+        size_t cgroup_line_sz = 64;
+        size_t cur_read = 0;
+        size_t line_end_pos = no_pos;
+        size_t colon_count = 0; // how many colons have we seen?
+        size_t second_colon_pos = 0;
+        std::vector<char, default_init_allocator<char>> cgroup_line(cgroup_line_sz);
+
+        // Read the whole file (until end-of-file), growing the buffer as necessary. We don't
+        // stop at the first newline, because a second line may only show up in a later read
+        // and must still be detected.
+        while (true) {
+            ssize_t r = read(pfd, cgroup_line.data() + cur_read, cgroup_line_sz - cur_read);
+            if (r == 0) {
+                if (line_end_pos == no_pos) {
+                    // No terminating newline: the line ends where the data ends
+                    line_end_pos = cur_read;
+                }
+                break;
+            }
+            if (r == -1) {
+                close(pfd);
+                return;
+            }
+
+            size_t rr = (size_t)r;
+            for (size_t i = 0; i < rr; ++i) {
+                if (cgroup_line[cur_read + i] == '\n') {
+                    line_end_pos = cur_read + i;
+                }
+                else if (line_end_pos != no_pos) {
+                    // Any character following the first newline means there is a second line
+                    log(loglevel_t::WARN, "In multiple cgroups, cannot determine cgroup root path");
+                    close(pfd);
+                    return;
+                }
+                else if (cgroup_line[cur_read + i] == ':') {
+                    if (++colon_count == 2) {
+                        second_colon_pos = cur_read + i;
+                    }
+                }
+            }
+
+            cur_read += rr;
+
+            if (cur_read == cgroup_line_sz) {
+                // Buffer is full; double it so that the next read has room
+                cgroup_line.resize(cgroup_line_sz * 2);
+                cgroup_line_sz *= 2;
+            }
+        }
+
+        close(pfd);
+        pfd = -1;
+
+        // Now extract the path
+        // The group line should look something like:
+        //
+        //    0::/some/path
+        //
+        // We want "some/path", i.e. we'll skip the leading slash.
+        if (colon_count < 2 || (line_end_pos - second_colon_pos) == 1
+                || cgroup_line[second_colon_pos+1] != '/') {
+            // Fewer than two colons, nothing after the second colon, or no leading slash
+            log(loglevel_t::WARN, "Could not determine cgroup root path");
+            return;
+        }
+
+        // path is from 2nd colon to end (excluding the leading slash)
+        cgroups_path.clear();
+        size_t first_char_pos = second_colon_pos + 2;
+        size_t root_path_len = line_end_pos - first_char_pos;
+        cgroups_path.append(cgroup_line.data() + first_char_pos, root_path_len);
+        have_cgroups_path = true;
+        return;
+    }
+    catch (const std::bad_alloc &) {
+        // pfd is -1 if the file was already closed before the allocation failure
+        if (pfd != -1) {
+            close(pfd);
+        }
+        log(loglevel_t::WARN, "Out-of-memory reading cgroup root path");
+        return;
+    }
+}
+
+#endif // SUPPORT_CGROUPS
+
 /* handle SIGINT signal (generated by Linux kernel when ctrl+alt+del pressed) */
 static void sigint_reboot_cb(eventloop_t &eloop) noexcept
 {

+ 27 - 0
src/includes/dinit-util.h

@@ -88,4 +88,31 @@ inline bool starts_with(const std::string &s, const char *prefix)
     return *prefix == 0;
 }
 
+// An allocator that default-initialises, rather than value-initialises, objects constructed
+// without arguments. For trivially-constructible types such as char this means that storage
+// (e.g. the elements of a std::vector created or resized to a given size) is left
+// uninitialised instead of being zero-filled.
+template <typename T>
+class default_init_allocator : public std::allocator<T>
+{
+    using std::allocator<T>::allocator;
+
+public:
+    // Rebinding must yield this allocator type again (not plain std::allocator)
+    template <typename U>
+    struct rebind {
+        using other = default_init_allocator<U>;
+    };
+
+    // Construct with no arguments: default-initialise (no zeroing for trivial types).
+    // Note this is only a template so that if there is no suitable constructor for T, we won't
+    // error out here
+    template <typename U>
+    void construct(U *obj)
+    {
+        ::new(static_cast<void *>(obj)) U;
+    }
+
+    // Construct from arguments: direct-initialise in place, forwarding the arguments so that
+    // they are not copied on the way. (std::allocator<T>::construct is deprecated in C++17 and
+    // removed in C++20, so it is not used here.)
+    template <typename ...Args>
+    void construct(T *obj, Args &&... args)
+    {
+        // static_cast<Args &&> is equivalent to std::forward<Args>
+        ::new(static_cast<void *>(obj)) T(static_cast<Args &&>(args)...);
+    }
+};
+
 #endif

+ 17 - 2
src/includes/load-service.h

@@ -18,6 +18,7 @@
 #include "dinit-utmp.h"
 #include "dinit-util.h"
 #include "service-constants.h"
+#include "mconfig.h"
 
 struct service_flags_t
 {
@@ -746,7 +747,7 @@ class service_settings_wrapper
     bool do_sub_vars = false;
 
     service_type_t service_type = service_type_t::INTERNAL;
-    std::list<dep_type> depends;
+    list<dep_type> depends;
     string logfile;
     service_flags_t onstart_flags;
     int term_signal = SIGTERM;  // termination signal
@@ -768,7 +769,7 @@ class service_settings_wrapper
     std::vector<service_rlimits> rlimits;
 
     int readiness_fd = -1;      // readiness fd in service process
-    std::string readiness_var;  // environment var to hold readiness fd
+    string readiness_var;  // environment var to hold readiness fd
 
     uid_t run_as_uid = -1;
     gid_t run_as_uid_gid = -1; // primary group of "run as" uid if known
@@ -776,6 +777,10 @@ class service_settings_wrapper
 
     string chain_to_name;
 
+    #if SUPPORT_CGROUPS
+    string run_in_cgroup;
+    #endif
+
     #if USE_UTMPX
     char inittab_id[sizeof(utmpx().ut_id)] = {0};
     char inittab_line[sizeof(utmpx().ut_line)] = {0};
@@ -807,6 +812,11 @@ class service_settings_wrapper
             if (!working_dir.empty()) {
                 report_lint("'working-dir' specified, but 'type' is internal (or not specified).");
             }
+            #if SUPPORT_CGROUPS
+            if (!run_in_cgroup.empty()) {
+                report_lint("'run-in-cgroup' specified, but 'type' is internal (or not specified).");
+            }
+            #endif
             if (run_as_uid != (uid_t)-1) {
                 report_lint("'run-as' specified, but 'type' is internal (or not specified).");
             }
@@ -905,6 +915,11 @@ void process_service_line(settings_wrapper &settings, const char *name, string &
     else if (setting == "env-file") {
         settings.env_file = read_setting_value(i, end, nullptr);
     }
+    #if SUPPORT_CGROUPS
+    else if (setting == "run-in-cgroup") {
+        settings.run_in_cgroup = read_setting_value(i, end, nullptr);
+    }
+    #endif
     else if (setting == "socket-listen") {
         settings.socket_path = read_setting_value(i, end, nullptr);
     }

+ 14 - 0
src/includes/proc-service.h

@@ -28,6 +28,9 @@ struct run_proc_params
     const char *working_dir;  // working directory
     const char *logfile;      // log file or nullptr (stdout/stderr); must be valid if !on_console
     const char *env_file;     // file with environment settings (or nullptr)
+    #if SUPPORT_CGROUPS
+    const char *run_in_cgroup; //  cgroup path
+    #endif
     bool on_console;          // whether to run on console
     bool in_foreground;       // if on console: whether to run in foreground
     int wpipefd;              // pipe to which error status will be sent (if error occurs)
@@ -165,6 +168,10 @@ class base_process_service : public service_record
 
     std::vector<service_rlimits> rlimits; // resource limits
 
+#if SUPPORT_CGROUPS
+    string run_in_cgroup;
+#endif
+
     service_child_watcher child_listener;
     exec_status_pipe_watcher child_status_listener;
     process_restart_timer process_timer; // timer is used for start, stop and restart
@@ -323,6 +330,13 @@ class base_process_service : public service_record
         env_file = std::move(env_file_p);
     }
 
+    #if SUPPORT_CGROUPS
+    // Set the cgroup path in which the service process should be run. The contents of the
+    // supplied string are moved into this service's run_in_cgroup member.
+    void set_cgroup(std::string &&run_in_cgroup_p) noexcept
+    {
+        run_in_cgroup = std::move(run_in_cgroup_p);
+    }
+    #endif
+
     void set_rlimits(std::vector<service_rlimits> &&rlimits_p)
     {
         rlimits = std::move(rlimits_p);

+ 17 - 1
src/includes/service-constants.h

@@ -62,7 +62,10 @@ inline bool did_finish(stopped_reason_t reason)
 /* Execution stage */
 enum class exec_stage {
     ARRANGE_FDS, READ_ENV_FILE, SET_NOTIFYFD_VAR, SETUP_ACTIVATION_SOCKET, SETUP_CONTROL_SOCKET,
-    CHDIR, SETUP_STDINOUTERR, SET_RLIMITS, SET_UIDGID, /* must be last: */ DO_EXEC
+    CHDIR, SETUP_STDINOUTERR, ENTER_CGROUP, SET_RLIMITS, SET_UIDGID,
+    /* values for future expansion: */
+    SPARE1, SPARE2, SPARE3, SPARE4, SPARE5, SPARE6, SPARE7, SPARE8,
+    /* must be last: */ DO_EXEC
 };
 
 /* Strings describing the execution stages (failure points). */
@@ -74,8 +77,21 @@ const char * const exec_stage_descriptions[/* static_cast<int>(exec_stage::DO_EX
         "setting up control socket",    // SETUP_CONTROL_SOCKET
         "changing directory",           // CHDIR
         "setting up standard input/output descriptors", // SETUP_STDINOUTERR
+        #if SUPPORT_CGROUPS
+        "entering cgroup",              // ENTER_CGROUP
+        #else
+        "",                             // ENTER_CGROUP (placeholder)
+        #endif
         "setting resource limits",      // SET_RLIMITS
         "setting user/group ID",        // SET_UIDGID
+        nullptr,                        // SPARE1
+        nullptr,                        // SPARE2
+        nullptr,                        // SPARE3
+        nullptr,                        // SPARE4
+        nullptr,                        // SPARE5
+        nullptr,                        // SPARE6
+        nullptr,                        // SPARE7
+        nullptr,                        // SPARE8
         "executing command"             // DO_EXEC
 };
 

+ 9 - 0
src/load-service.cc

@@ -373,6 +373,9 @@ service_record * dirload_service_set::load_reload_service(const char *name, serv
             rvalps->set_stop_command(std::move(settings.stop_command), std::move(stop_arg_parts));
             rvalps->set_working_dir(std::move(settings.working_dir));
             rvalps->set_env_file(std::move(settings.env_file));
+            #if SUPPORT_CGROUPS
+            rvalps->set_cgroup(std::move(settings.run_in_cgroup));
+            #endif
             rvalps->set_rlimits(std::move(settings.rlimits));
             rvalps->set_restart_interval(settings.restart_interval, settings.max_restarts);
             rvalps->set_restart_delay(settings.restart_delay);
@@ -404,6 +407,9 @@ service_record * dirload_service_set::load_reload_service(const char *name, serv
             rvalps->set_stop_command(std::move(settings.stop_command), std::move(stop_arg_parts));
             rvalps->set_working_dir(std::move(settings.working_dir));
             rvalps->set_env_file(std::move(settings.env_file));
+            #if SUPPORT_CGROUPS
+            rvalps->set_cgroup(std::move(settings.run_in_cgroup));
+            #endif
             rvalps->set_rlimits(std::move(settings.rlimits));
             rvalps->set_pid_file(std::move(settings.pid_file));
             rvalps->set_restart_interval(settings.restart_interval, settings.max_restarts);
@@ -431,6 +437,9 @@ service_record * dirload_service_set::load_reload_service(const char *name, serv
             rvalps->set_stop_command(std::move(settings.stop_command), std::move(stop_arg_parts));
             rvalps->set_working_dir(std::move(settings.working_dir));
             rvalps->set_env_file(std::move(settings.env_file));
+            #if SUPPORT_CGROUPS
+            rvalps->set_cgroup(std::move(settings.run_in_cgroup));
+            #endif
             rvalps->set_rlimits(std::move(settings.rlimits));
             rvalps->set_stop_timeout(settings.stop_timeout);
             rvalps->set_start_timeout(settings.start_timeout);

+ 3 - 0
src/proc-service.cc

@@ -853,6 +853,9 @@ bool process_service::start_stop_process(const std::vector<const char *> &cmd) n
         run_params.force_notify_fd = force_notification_fd;
         run_params.notify_var = nullptr;
         run_params.env_file = env_file.c_str();
+        #if SUPPORT_CGROUPS
+        run_params.run_in_cgroup = run_in_cgroup.c_str();
+        #endif
         run_child_proc(run_params);
     }
     else {

+ 61 - 3
src/run-child-proc.cc

@@ -1,5 +1,6 @@
 #include <cstdlib>
 #include <cstring>
+#include <cstdio>
 
 #include <sys/types.h>
 #include <sys/stat.h>
@@ -13,6 +14,11 @@
 #include "service.h"
 #include "proc-service.h"
 
+#ifdef SUPPORT_CGROUPS
+extern std::string cgroups_path;
+extern bool have_cgroups_path;
+#endif
+
 // Move an fd, if necessary, to another fd. The destination fd must be available (not open).
 // if fd is specified as -1, returns -1 immediately. Returns 0 on success.
 static int move_fd(int fd, int dest)
@@ -132,10 +138,9 @@ void base_process_service::run_child_proc(run_proc_params params) noexcept
         if (notify_fd == -1) goto failure_out;
     }
 
-    err.stage = exec_stage::READ_ENV_FILE;
-
     // Read environment from file
-    if (params.env_file != nullptr) {
+    if (params.env_file != nullptr && *params.env_file != 0) {
+        err.stage = exec_stage::READ_ENV_FILE;
         try {
             read_env_file(params.env_file);
         }
@@ -256,6 +261,59 @@ void base_process_service::run_child_proc(run_proc_params params) noexcept
         if (setrlimit(limit.resource_id, &setlimits) != 0) goto failure_out;
     }
 
+    #if SUPPORT_CGROUPS
+    // If a (non-empty) cgroup path was specified, move this process into that cgroup by
+    // writing our pid to the "cgroup.procs" file in the cgroup's directory under
+    // /sys/fs/cgroup.
+    // NOTE(review): the failure paths below jump to failure_out without closing the
+    // descriptors opened here; presumably the process exits shortly afterwards - confirm.
+    if (params.run_in_cgroup != nullptr && *params.run_in_cgroup != 0) {
+        err.stage = exec_stage::ENTER_CGROUP;
+
+        // This descriptor is only used as the base directory for the openat() calls below
+        int sys_fs_cgroup_fd = open("/sys/fs/cgroup", O_RDONLY | O_DIRECTORY | O_PATH);
+        if (sys_fs_cgroup_fd == -1) goto failure_out;
+
+        const char *run_cgroup_path = params.run_in_cgroup;
+        if (run_cgroup_path[0] != '/') {
+            // A relative cgroup path must be resolved against our own path (cgroups_path)
+            if (!have_cgroups_path) {
+                // Our own cgroup path is not known, so a relative path cannot be resolved
+                errno = ENOENT;
+                goto failure_out;
+            }
+            if (!cgroups_path.empty()) {
+                // Replace the base directory with our own cgroup's directory; if cgroups_path
+                // is empty the base directory remains /sys/fs/cgroup itself
+                int cgrp_root_path = openat(sys_fs_cgroup_fd, cgroups_path.c_str(), O_RDONLY | O_DIRECTORY | O_PATH);
+                if (cgrp_root_path == -1) goto failure_out;
+                close(sys_fs_cgroup_fd);
+                sys_fs_cgroup_fd = cgrp_root_path;
+            }
+        }
+        else {
+            // An absolute cgroup path is resolved directly against /sys/fs/cgroup
+            ++run_cgroup_path; // skip leading slash
+        }
+
+        // Open the target cgroup's directory relative to the base directory chosen above
+        int cgroup_dir_fd = openat(sys_fs_cgroup_fd, run_cgroup_path, O_RDONLY | O_DIRECTORY | O_PATH);
+        if (cgroup_dir_fd == -1) goto failure_out;
+        close(sys_fs_cgroup_fd);
+
+        int cgroup_procs_fd = openat(cgroup_dir_fd, "cgroup.procs", O_WRONLY);
+        if (cgroup_procs_fd == -1) goto failure_out;
+        close(cgroup_dir_fd);
+
+        // We need to write our own pid into the cgroup.procs file
+        // NOTE(review): std::numeric_limits requires <limits>; confirm that it is included.
+        char pidbuf[std::numeric_limits<pid_t>::digits10 + 3];
+        // +1 for most significant digit, +1 for '\n', +1 for nul terminator
+        int num_chars;
+        // Select a format specifier wide enough for pid_t; the pid is formatted as an
+        // unsigned decimal value followed by a newline
+        if (sizeof(pid_t) <= sizeof(unsigned)) {
+            num_chars = sprintf(pidbuf, "%u\n", (unsigned)getpid());
+        }
+        else if (sizeof(pid_t) <= sizeof(unsigned long)) {
+            num_chars = sprintf(pidbuf, "%lu\n", (unsigned long)getpid());
+        }
+        else {
+            static_assert(sizeof(pid_t) <= sizeof(unsigned long long));
+            num_chars = sprintf(pidbuf, "%llu\n", (unsigned long long)getpid());
+        }
+
+        // NOTE(review): only a return of -1 is treated as failure; a short write is not
+        // detected - confirm that this is acceptable.
+        if (write(cgroup_procs_fd, pidbuf, num_chars) == -1) goto failure_out;
+        close(cgroup_procs_fd);
+    }
+    #endif
+
     if (uid != uid_t(-1)) {
         err.stage = exec_stage::SET_UIDGID;
         // We must set group first (i.e. before we drop privileges)