Browse Source

jail: add support for running OCI bundle

Prepare ujail for running OCI bundled Linux containers.
This adds handling of most of the JSON schema defined by the
Open Container Initiative Runtime Specification.

What is supported by this commit:
 * basic OCI process definition
 * seccomp filters (no args yet)
 * capabilities (100%)
 * namespaces (100%)
 * uid/gid mappings for userns (100%)
 * mounts (no free form mounts yet)
 * env (100%, limited to a low number of entries)
 * hostname (100%)
 * terminal (no consoleSize yet)

What is still missing:
 * complex mounts
 * maskedPaths, readonlyPaths
 * referencing existing namespaces
 * all hooks
 * rlimits
 * oomScoreAdj
 * additionalGids
 * cgroups
 * devices
 * sysctl
 * rootfsPropagation
 * personality and bi-arch (ie. 32-bit container on 64-bit host)

Signed-off-by: Daniel Golle <daniel@makrotopia.org>
Daniel Golle 3 years ago
parent
commit
ea7a790f21
11 changed files with 1177 additions and 69 deletions
  1. 5 1
      CMakeLists.txt
  2. 174 2
      jail/capabilities.c
  3. 14 0
      jail/capabilities.h
  4. 604 30
      jail/jail.c
  5. 2 0
      jail/seccomp-bpf.h
  6. 272 0
      jail/seccomp-oci.c
  7. 33 0
      jail/seccomp-oci.h
  8. 37 0
      jail/seccomp-syscalls-helpers.h
  9. 1 21
      jail/seccomp.c
  10. 34 15
      service/instance.c
  11. 1 0
      service/instance.h

+ 5 - 1
CMakeLists.txt

@@ -103,8 +103,12 @@ INSTALL(TARGETS preload-seccomp
 ADD_DEPENDENCIES(preload-seccomp syscall-names-h)
 endif()
 
+IF(SECCOMP_SUPPORT)
+  SET(SOURCES_OCI_SECCOMP jail/seccomp-oci.c)
+ENDIF()
+
 IF(JAIL_SUPPORT)
-ADD_EXECUTABLE(ujail jail/jail.c jail/elf.c jail/fs.c jail/capabilities.c)
+ADD_EXECUTABLE(ujail jail/jail.c jail/elf.c jail/fs.c jail/capabilities.c ${SOURCES_OCI_SECCOMP})
 TARGET_LINK_LIBRARIES(ujail ${ubox} ${ubus} ${blobmsg_json})
 INSTALL(TARGETS ujail
 	RUNTIME DESTINATION ${CMAKE_INSTALL_SBINDIR}

+ 174 - 2
jail/capabilities.c

@@ -1,5 +1,6 @@
 /*
  * Copyright (C) 2015 Etienne CHAMPETIER <champetier.etienne@gmail.com>
+ * Copyright (C) 2020 Daniel Golle <daniel@makrotopia.org>
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License version 2.1
@@ -14,6 +15,7 @@
 #define _GNU_SOURCE 1
 #include <syslog.h>
 #include <sys/prctl.h>
+#include <sys/capability.h>
 
 #include <libubox/blobmsg.h>
 #include <libubox/blobmsg_json.h>
@@ -22,17 +24,187 @@
 #include "../capabilities-names.h"
 #include "capabilities.h"
 
+#define JAIL_CAP_ERROR (1LLU << (CAP_LAST_CAP+1))
+#define JAIL_CAP_ALL (0xffffffffffffffffLLU)
+
 static int find_capabilities(const char *name)
 {
 	int i;
 
 	for (i = 0; i <= CAP_LAST_CAP; i++)
-		if (capabilities_names[i] && !strcmp(capabilities_names[i], name))
+		if (capabilities_names[i] && !strcasecmp(capabilities_names[i], name))
 			return i;
 
 	return -1;
 }
 
+enum {
+	OCI_CAPABILITIES_BOUNDING,
+	OCI_CAPABILITIES_EFFECTIVE,
+	OCI_CAPABILITIES_INHERITABLE,
+	OCI_CAPABILITIES_PERMITTED,
+	OCI_CAPABILITIES_AMBIENT,
+	__OCI_CAPABILITIES_MAX
+};
+
+static const struct blobmsg_policy oci_capabilities_policy[] = {
+	[OCI_CAPABILITIES_BOUNDING] = { "bounding", BLOBMSG_TYPE_ARRAY },
+	[OCI_CAPABILITIES_EFFECTIVE] = { "effective", BLOBMSG_TYPE_ARRAY },
+	[OCI_CAPABILITIES_INHERITABLE] = { "inheritable", BLOBMSG_TYPE_ARRAY },
+	[OCI_CAPABILITIES_PERMITTED] = { "permitted", BLOBMSG_TYPE_ARRAY },
+	[OCI_CAPABILITIES_AMBIENT] = { "ambient", BLOBMSG_TYPE_ARRAY },
+};
+
+/*
+ * Convert one OCI capability array into a bitmask.
+ *
+ * msg is the blobmsg array holding the capability names of a single set,
+ * or NULL when that set is absent from the bundle.
+ *
+ * Returns JAIL_CAP_ALL when msg is NULL, JAIL_CAP_ERROR when an entry is
+ * not a string or names an unknown capability, and otherwise the OR of
+ * (1 << capability number) over all listed names.
+ */
+static uint64_t parseOCIcap(struct blob_attr *msg)
+{
+	struct blob_attr *cur;
+	int rem;
+	uint64_t caps = 0;
+	int capnum;
+
+	/* each capset is optional, set all-1 mask if absent */
+	if (!msg)
+		return JAIL_CAP_ALL;
+
+	blobmsg_for_each_attr(cur, msg, rem) {
+		/* only string entries carry a name that can be looked up */
+		if (blobmsg_type(cur) != BLOBMSG_TYPE_STRING)
+			return JAIL_CAP_ERROR;
+
+		capnum = find_capabilities(blobmsg_get_string(cur));
+		if (capnum < 0)
+			return JAIL_CAP_ERROR;
+
+		caps |= (1LLU << capnum);
+	}
+
+	return caps;
+}
+
+/*
+ * Parse the OCI "capabilities" object into capset.
+ *
+ * Each of the five sets is converted with parseOCIcap() in the order
+ * bounding, effective, inheritable, permitted, ambient. The first set that
+ * fails to parse aborts with EINVAL; sets converted before it have already
+ * been stored. On success capset->apply is set and 0 is returned.
+ */
+int parseOCIcapabilities(struct jail_capset *capset, struct blob_attr *msg)
+{
+	struct blob_attr *tb[__OCI_CAPABILITIES_MAX];
+	/* destination field for each policy index */
+	uint64_t *target[__OCI_CAPABILITIES_MAX] = {
+		[OCI_CAPABILITIES_BOUNDING] = &capset->bounding,
+		[OCI_CAPABILITIES_EFFECTIVE] = &capset->effective,
+		[OCI_CAPABILITIES_INHERITABLE] = &capset->inheritable,
+		[OCI_CAPABILITIES_PERMITTED] = &capset->permitted,
+		[OCI_CAPABILITIES_AMBIENT] = &capset->ambient,
+	};
+	uint64_t mask;
+	int idx;
+
+	blobmsg_parse(oci_capabilities_policy, __OCI_CAPABILITIES_MAX, tb, blobmsg_data(msg), blobmsg_len(msg));
+
+	for (idx = 0; idx < __OCI_CAPABILITIES_MAX; idx++) {
+		mask = parseOCIcap(tb[idx]);
+		if (mask == JAIL_CAP_ERROR)
+			return EINVAL;
+
+		*target[idx] = mask;
+	}
+
+	capset->apply = 1;
+
+	return 0;
+}
+
+
+/*
+ * Apply the capability sets parsed from an OCI bundle to the calling process.
+ *
+ * Does nothing and returns 0 when ocicapset.apply is unset. A set whose
+ * mask equals JAIL_CAP_ALL was absent from the bundle and is left as is.
+ *
+ * Order: bounding set (drop only), then effective/permitted/inheritable via
+ * capget()/capset(), then the ambient set.
+ *
+ * Returns 0 on success, errno after a failed prctl(), or -1 after a failed
+ * capget()/capset().
+ */
+int applyOCIcapabilities(struct jail_capset ocicapset)
+{
+	struct __user_cap_header_struct uh = {};
+	/*
+	 * _LINUX_CAPABILITY_VERSION_3 transfers two 32-bit words per set, so the
+	 * kernel reads and writes two data structs; a single struct is overrun.
+	 */
+	struct __user_cap_data_struct ud[_LINUX_CAPABILITY_U32S_3] = {};
+	int cap;
+	int is_set;
+
+	if (!ocicapset.apply)
+		return 0;
+
+	/* drop from bounding set */
+	if (ocicapset.bounding != JAIL_CAP_ALL) {
+		for (cap = 0; cap <= CAP_LAST_CAP; cap++) {
+			if (!prctl(PR_CAPBSET_READ, cap, 0, 0, 0)) {
+				/* can't raise */
+				if (ocicapset.bounding & (1LLU << cap))
+					ERROR("capability %s (%d) is not in bounding set\n", capabilities_names[cap], cap);
+
+				continue;
+			}
+			if ( (ocicapset.bounding & (1LLU << cap)) == 0) {
+				DEBUG("dropping capability %s (%d) from bounding set\n", capabilities_names[cap], cap);
+				if (prctl(PR_CAPBSET_DROP, cap, 0, 0, 0)) {
+					ERROR("prctl(PR_CAPBSET_DROP, %d) failed: %m\n", cap);
+					return errno;
+				}
+			} else {
+				DEBUG("keeping capability %s (%d)\n", capabilities_names[cap], cap);
+			}
+		}
+	}
+
+	/* set effective, permitted and inheritable */
+	uh.version = _LINUX_CAPABILITY_VERSION_3;
+	uh.pid = getpid();
+
+	if (capget(&uh, ud)) {
+		ERROR("capget() failed\n");
+		return -1;
+	}
+
+	/* ud[1] holds capabilities 32..63, ud[0] holds capabilities 0..31 */
+	DEBUG("old capabilities: Pe=%08x%08x Pp=%08x%08x Pi=%08x%08x\n",
+		ud[1].effective, ud[0].effective,
+		ud[1].permitted, ud[0].permitted,
+		ud[1].inheritable, ud[0].inheritable);
+
+	if (ocicapset.effective != JAIL_CAP_ALL) {
+		ud[0].effective = (uint32_t)(ocicapset.effective & 0xffffffffU);
+		ud[1].effective = (uint32_t)(ocicapset.effective >> 32);
+	}
+
+	if (ocicapset.permitted != JAIL_CAP_ALL) {
+		ud[0].permitted = (uint32_t)(ocicapset.permitted & 0xffffffffU);
+		ud[1].permitted = (uint32_t)(ocicapset.permitted >> 32);
+	}
+
+	if (ocicapset.inheritable != JAIL_CAP_ALL) {
+		ud[0].inheritable = (uint32_t)(ocicapset.inheritable & 0xffffffffU);
+		ud[1].inheritable = (uint32_t)(ocicapset.inheritable >> 32);
+	}
+
+	DEBUG("new capabilities: Pe=%08x%08x Pp=%08x%08x Pi=%08x%08x\n",
+		ud[1].effective, ud[0].effective,
+		ud[1].permitted, ud[0].permitted,
+		ud[1].inheritable, ud[0].inheritable);
+
+	if (capset(&uh, ud)) {
+		ERROR("capset() failed\n");
+		return -1;
+	}
+
+	/* edit ambient set */
+	if (ocicapset.ambient != JAIL_CAP_ALL) {
+		for (cap = 0; cap <= CAP_LAST_CAP; cap++) {
+			is_set = prctl(PR_CAP_AMBIENT, PR_CAP_AMBIENT_IS_SET, cap, 0, 0);
+			if ( (ocicapset.ambient & (1LLU << cap)) == 0) {
+				if (is_set) {
+					DEBUG("dropping capability %s (%d) from ambient set\n", capabilities_names[cap], cap);
+					if (prctl(PR_CAP_AMBIENT, PR_CAP_AMBIENT_LOWER, cap, 0, 0)) {
+						ERROR("prctl(PR_CAP_AMBIENT, PR_CAP_AMBIENT_LOWER, %d, 0, 0) failed: %m\n", cap);
+						return errno;
+					}
+				}
+			} else {
+				if (!is_set) {
+					DEBUG("raising capability %s (%d) to ambient set\n", capabilities_names[cap], cap);
+					if (prctl(PR_CAP_AMBIENT, PR_CAP_AMBIENT_RAISE, cap, 0, 0)) {
+						ERROR("prctl(PR_CAP_AMBIENT, PR_CAP_AMBIENT_RAISE, %d, 0, 0) failed: %m\n", cap);
+						return errno;
+					}
+				}
+			}
+		}
+	}
+
+	return 0;
+}
+
 int drop_capabilities(const char *file)
 {
 	enum {
@@ -81,7 +253,7 @@ int drop_capabilities(const char *file)
 
 	if (capdrop == 0LLU) {
 		DEBUG("cap.keep empty -> only dropping capabilities from cap.drop (blacklist)\n");
-		capdrop = 0xffffffffffffffffLLU;
+		capdrop = JAIL_CAP_ALL;
 	} else {
 		DEBUG("cap.keep has at least one capability -> dropping every capabilities not in cap.keep (whitelist)\n");
 	}

+ 14 - 0
jail/capabilities.h

@@ -13,6 +13,20 @@
 #ifndef _JAIL_CAPABILITIES_H_
 #define _JAIL_CAPABILITIES_H_
 
+#include <libubox/blobmsg.h>
+
+struct jail_capset {
+	uint64_t bounding;
+	uint64_t effective;
+	uint64_t inheritable;
+	uint64_t permitted;
+	uint64_t ambient;
+	uint8_t apply;
+};
+
 int drop_capabilities(const char *file);
 
+int parseOCIcapabilities(struct jail_capset *capset, struct blob_attr *msg);
+int applyOCIcapabilities(struct jail_capset capset);
+
 #endif

+ 604 - 30
jail/jail.c

@@ -28,6 +28,7 @@
 #include <libgen.h>
 #include <sched.h>
 #include <linux/limits.h>
+#include <linux/filter.h>
 #include <signal.h>
 
 #include "capabilities.h"
@@ -35,24 +36,36 @@
 #include "fs.h"
 #include "jail.h"
 #include "log.h"
+#include "seccomp-oci.h"
 
+#include <libubox/utils.h>
+#include <libubox/blobmsg.h>
+#include <libubox/blobmsg_json.h>
+#include <libubox/list.h>
+#include <libubox/vlist.h>
 #include <libubox/uloop.h>
 #include <libubus.h>
 
 #define STACK_SIZE	(1024 * 1024)
-#define OPT_ARGS	"S:C:n:h:r:w:d:psulocU:G:NR:fFO:T:Ey"
+#define OPT_ARGS	"S:C:n:h:r:w:d:psulocU:G:NR:fFO:T:EyJ:"
 
 static struct {
 	char *name;
 	char *hostname;
 	char **jail_argv;
+	char *cwd;
 	char *seccomp;
+	struct sock_fprog *ociseccomp;
 	char *capabilities;
+	struct jail_capset capset;
 	char *user;
 	char *group;
 	char *extroot;
 	char *overlaydir;
 	char *tmpoverlaysize;
+	char **envp;
+	char *uidmap;
+	char *gidmap;
 	int no_new_privs;
 	int namespace;
 	int procfs;
@@ -65,6 +78,7 @@ static struct {
 	int require_jail;
 } opts;
 
+static struct blob_buf ocibuf;
 
 extern int pivot_root(const char *new_root, const char *put_old);
 
@@ -154,9 +168,9 @@ int mount_bind(const char *root, const char *path, int readonly, int error) {
 }
 
 static int mount_overlay(char *jail_root, char *overlaydir) {
-	char *upperdir, *workdir, *optsstr;
+	char *upperdir, *workdir, *optsstr, *upperetc, *upperresolvconf;
 	const char mountoptsformat[] = "lowerdir=%s,upperdir=%s,workdir=%s";
-	int ret = -1;
+	int ret = -1, fd;
 
 	if (asprintf(&upperdir, "%s%s", overlaydir, "/upper") < 0)
 		goto out;
@@ -170,6 +184,31 @@ static int mount_overlay(char *jail_root, char *overlaydir) {
 	if (mkdir_p(upperdir, 0755) || mkdir_p(workdir, 0755))
 		goto opts_printf;
 
+/*
+ * make sure /etc/resolv.conf exists in overlay and is owned by jail userns root
+ * this is to work-around a bug in overlayfs described in the overlayfs-userns
+ * patch:
+ * 3. modification of a file 'hithere' which is in l but not yet
+ * in u, and which is not owned by T, is not allowed, even if
+ * writes to u are allowed.  This may be a bug in overlayfs,
+ * but it is safe behavior.
+ */
+	if (asprintf(&upperetc, "%s/etc", upperdir) < 0)
+		goto opts_printf;
+
+	if (mkdir_p(upperetc, 0755))
+		goto upper_etc_printf;
+
+	if (asprintf(&upperresolvconf, "%s/resolv.conf", upperetc) < 0)
+		goto upper_etc_printf;
+
+	fd = creat(upperresolvconf, 0644);
+	if (fd == -1) {
+		ERROR("creat(%s) failed: %m\n", upperresolvconf);
+		goto upper_resolvconf_printf;
+	}
+	close(fd);
+
 	DEBUG("mount -t overlay %s %s (%s)\n", jail_root, jail_root, optsstr);
 
 	if (mount(jail_root, jail_root, "overlay", MS_NOATIME, optsstr))
@@ -177,6 +216,10 @@ static int mount_overlay(char *jail_root, char *overlaydir) {
 
 	ret = 0;
 
+upper_resolvconf_printf:
+	free(upperresolvconf);
+upper_etc_printf:
+	free(upperetc);
 opts_printf:
 	free(optsstr);
 work_printf:
@@ -398,7 +441,29 @@ static int build_jail_fs(void)
 	return 0;
 }
 
-static int write_uid_gid_map(pid_t child_pid, bool gidmap, int id)
+/*
+ * Write a complete, pre-formatted ID mapping to /proc/<child_pid>/uid_map,
+ * or to gid_map when gidmap is true.
+ *
+ * mapstr is freed on success only; on failure it is left untouched.
+ *
+ * Returns 0 on success, -1 if the path cannot be built, the file cannot be
+ * opened or the write fails.
+ */
+static int write_uid_gid_map(pid_t child_pid, bool gidmap, char *mapstr)
+{
+	int map_file;
+	char map_path[64];
+	int pathlen;
+
+	pathlen = snprintf(map_path, sizeof(map_path), "/proc/%d/%s",
+		child_pid, gidmap?"gid_map":"uid_map");
+	if (pathlen < 0 || (size_t)pathlen >= sizeof(map_path))
+		return -1;
+
+	if ((map_file = open(map_path, O_WRONLY)) == -1)
+		return -1;
+
+	/* dprintf() returns the number of bytes written, negative on error */
+	if (dprintf(map_file, "%s", mapstr) < 0) {
+		close(map_file);
+		return -1;
+	}
+
+	close(map_file);
+	free(mapstr);
+	return 0;
+}
+
+static int write_single_uid_gid_map(pid_t child_pid, bool gidmap, int id)
 {
 	int map_file;
 	char map_path[64];
@@ -433,7 +498,7 @@ static int write_setgroups(pid_t child_pid, bool allow)
 		return -1;
 	}
 
-	if (dprintf(setgroups_file, allow?"allow":"deny") == -1) {
+	if (dprintf(setgroups_file, "%s", allow?"allow":"deny") == -1) {
 		close(setgroups_file);
 		return -1;
 	}
@@ -475,7 +540,7 @@ static void get_jail_user(int *user, int *user_gid, int *gr_gid)
 
 static void set_jail_user(int pw_uid, int user_gid, int gr_gid)
 {
-	if ((user_gid != -1) && initgroups(opts.user, user_gid)) {
+	if (opts.user && (user_gid != -1) && initgroups(opts.user, user_gid)) {
 		ERROR("failed to initgroups() for user %s: %m\n", opts.user);
 		exit(EXIT_FAILURE);
 	}
@@ -492,7 +557,7 @@ static void set_jail_user(int pw_uid, int user_gid, int gr_gid)
 }
 
 #define MAX_ENVP	8
-static char** build_envp(const char *seccomp)
+static char** build_envp(const char *seccomp, char **ocienvp)
 {
 	static char *envp[MAX_ENVP];
 	static char preload_var[PATH_MAX];
@@ -500,6 +565,8 @@ static char** build_envp(const char *seccomp)
 	static char debug_var[] = "LD_DEBUG=all";
 	static char container_var[] = "container=ujail";
 	const char *preload_lib = find_lib("libpreload-seccomp.so");
+	char **addenv;
+
 	int count = 0;
 
 	if (seccomp && !preload_lib) {
@@ -518,6 +585,14 @@ static char** build_envp(const char *seccomp)
 	if (debug > 1)
 		envp[count++] = debug_var;
 
+	addenv = ocienvp;
+	while (addenv && *addenv) {
+		envp[count++] = *(addenv++);
+		if (count >= MAX_ENVP) {
+			ERROR("environment limited to %d extra records, truncating\n", MAX_ENVP);
+			break;
+		}
+	}
 	return envp;
 }
 
@@ -548,6 +623,7 @@ static void usage(void)
 	fprintf(stderr, "  -T <size>\tuse tmpfs r/w overlayfs with <size>\n");
 	fprintf(stderr, "  -E\t\tfail if jail cannot be setup\n");
 	fprintf(stderr, "  -y\t\tprovide jail console\n");
+	fprintf(stderr, "  -J <dir>\tstart OCI bundle\n");
 	fprintf(stderr, "\nWarning: by default root inside the jail is the same\n\
 and he has the same powers as root outside the jail,\n\
 thus he can escape the jail and/or break stuff.\n\
@@ -584,18 +660,18 @@ static int exec_jail(void *pipes_ptr)
 	close(pipes[2]);
 
 	if (opts.namespace & CLONE_NEWUSER) {
-		if (setgid(0) < 0) {
+		if (setregid(0, 0) < 0) {
 			ERROR("setgid\n");
 			exit(EXIT_FAILURE);
 		}
-		if (setuid(0) < 0) {
+		if (setreuid(0, 0) < 0) {
 			ERROR("setuid\n");
 			exit(EXIT_FAILURE);
 		}
-//		if (setgroups(0, NULL) < 0) {
-//			ERROR("setgroups\n");
-//			exit(EXIT_FAILURE);
-//		}
+		if (setgroups(0, NULL) < 0) {
+			ERROR("setgroups\n");
+			exit(EXIT_FAILURE);
+		}
 	}
 
 	if (opts.namespace && opts.hostname && strlen(opts.hostname) > 0
@@ -609,6 +685,9 @@ static int exec_jail(void *pipes_ptr)
 		exit(EXIT_FAILURE);
 	}
 
+	if (applyOCIcapabilities(opts.capset))
+		exit(EXIT_FAILURE);
+
 	if (opts.capabilities && drop_capabilities(opts.capabilities))
 		exit(EXIT_FAILURE);
 
@@ -619,13 +698,17 @@ static int exec_jail(void *pipes_ptr)
 
 	if (!(opts.namespace & CLONE_NEWUSER)) {
 		get_jail_user(&pw_uid, &pw_gid, &gr_gid);
-		set_jail_user(pw_uid, pw_gid, gr_gid);
+
+		set_jail_user(opts.pw_uid?:pw_uid, opts.pw_gid?:pw_gid, opts.gr_gid?:gr_gid);
 	}
 
-	char **envp = build_envp(opts.seccomp);
+	char **envp = build_envp(opts.seccomp, opts.envp);
 	if (!envp)
 		exit(EXIT_FAILURE);
 
+	if (opts.ociseccomp && applyOCIlinuxseccomp(opts.ociseccomp))
+		exit(EXIT_FAILURE);
+
 	INFO("exec-ing %s\n", *opts.jail_argv);
 	execve(*opts.jail_argv, opts.jail_argv, envp);
 	/* we get there only if execve fails */
@@ -702,12 +785,482 @@ static void netns_updown(pid_t pid, bool start)
 	ubus_free(ctx);
 }
 
+
+enum {
+	OCI_ROOT_PATH,
+	OCI_ROOT_READONLY,
+	__OCI_ROOT_MAX,
+};
+
+static const struct blobmsg_policy oci_root_policy[] = {
+	[OCI_ROOT_PATH] = { "path", BLOBMSG_TYPE_STRING },
+	[OCI_ROOT_READONLY] = { "readonly", BLOBMSG_TYPE_BOOL },
+};
+
+/*
+ * Parse the OCI "root" object.
+ *
+ * The root path is appended to the directory part of jsonfile and stored
+ * in a static buffer that opts.extroot then points to. opts.ronly is
+ * updated only when "readonly" is present.
+ *
+ * Returns 0 on success, ENODATA when "path" is missing, ENOTDIR when
+ * jsonfile contains no '/', ENAMETOOLONG when the result would not fit
+ * into PATH_MAX bytes.
+ */
+static int parseOCIroot(const char *jsonfile, struct blob_attr *msg)
+{
+	static char rootpath[PATH_MAX] = { 0 };
+	struct blob_attr *tb[__OCI_ROOT_MAX];
+	const char *relroot;
+	char *cur;
+	size_t dirlen;
+	int n;
+
+	blobmsg_parse(oci_root_policy, __OCI_ROOT_MAX, tb, blobmsg_data(msg), blobmsg_len(msg));
+
+	if (!tb[OCI_ROOT_PATH])
+		return ENODATA;
+
+	/* snprintf always terminates, unlike strncpy with a full-size bound */
+	n = snprintf(rootpath, sizeof(rootpath), "%s", jsonfile);
+	if (n < 0 || (size_t)n >= sizeof(rootpath))
+		return ENAMETOOLONG;
+
+	cur = strrchr(rootpath, '/');
+
+	if (!cur)
+		return ENOTDIR;
+
+	/* cut after the last '/', keeping only the bundle directory */
+	*(++cur) = '\0';
+	dirlen = cur - rootpath;
+
+	relroot = blobmsg_get_string(tb[OCI_ROOT_PATH]);
+	if (strlen(relroot) >= sizeof(rootpath) - dirlen)
+		return ENAMETOOLONG;
+
+	strcpy(cur, relroot);
+
+	opts.extroot = rootpath;
+
+	/* "readonly" is optional; reading an absent attribute dereferences NULL */
+	if (tb[OCI_ROOT_READONLY])
+		opts.ronly = blobmsg_get_bool(tb[OCI_ROOT_READONLY]);
+
+	return 0;
+}
+
+
+enum {
+	OCI_MOUNT_SOURCE,
+	OCI_MOUNT_DESTINATION,
+	OCI_MOUNT_TYPE,
+	OCI_MOUNT_OPTIONS,
+	__OCI_MOUNT_MAX,
+};
+
+static const struct blobmsg_policy oci_mount_policy[] = {
+	[OCI_MOUNT_SOURCE] = { "source", BLOBMSG_TYPE_STRING },
+	[OCI_MOUNT_DESTINATION] = { "destination", BLOBMSG_TYPE_STRING },
+	[OCI_MOUNT_TYPE] = { "type", BLOBMSG_TYPE_STRING },
+	[OCI_MOUNT_OPTIONS] = { "options", BLOBMSG_TYPE_ARRAY },
+};
+
+/*
+ * Parse one entry of the OCI "mounts" array.
+ *
+ * Only three type/destination pairs are recognised: proc on /proc sets
+ * opts.procfs, sysfs on /sys sets opts.sysfs, and tmpfs on /dev is accepted
+ * without action. Every other mount is logged and ignored.
+ *
+ * Returns 0, or EINVAL when "destination" is missing.
+ */
+static int parseOCImount(struct blob_attr *msg)
+{
+	struct blob_attr *tb[__OCI_MOUNT_MAX];
+	const char *source = "";
+	const char *destination;
+	const char *type = "";
+	char *options = NULL;
+
+	blobmsg_parse(oci_mount_policy, __OCI_MOUNT_MAX, tb, blobmsg_data(msg), blobmsg_len(msg));
+
+	if (!tb[OCI_MOUNT_DESTINATION])
+		return EINVAL;
+
+	destination = blobmsg_get_string(tb[OCI_MOUNT_DESTINATION]);
+
+	/* source and type may be absent; never hand NULL to strcmp or %s */
+	if (tb[OCI_MOUNT_SOURCE])
+		source = blobmsg_get_string(tb[OCI_MOUNT_SOURCE]);
+
+	if (tb[OCI_MOUNT_TYPE])
+		type = blobmsg_get_string(tb[OCI_MOUNT_TYPE]);
+
+	if (!strcmp("proc", type) && !strcmp("/proc", destination)) {
+		opts.procfs = true;
+		return 0;
+	}
+
+	if (!strcmp("sysfs", type) && !strcmp("/sys", destination)) {
+		opts.sysfs = true;
+		return 0;
+	}
+
+	if (!strcmp("tmpfs", type) && !strcmp("/dev", destination)) {
+		/* we always mount a small tmpfs on /dev */
+		return 0;
+	}
+
+	/* the formatted JSON string is heap-allocated and must be freed */
+	if (tb[OCI_MOUNT_OPTIONS])
+		options = blobmsg_format_json(tb[OCI_MOUNT_OPTIONS], true);
+
+	INFO("ignoring unsupported mount %s %s -t %s -o %s\n",
+		source, destination, type, options ? options : "");
+
+	free(options);
+
+	return 0;
+}
+
+
+enum {
+	OCI_PROCESS_USER_UID,
+	OCI_PROCESS_USER_GID,
+	OCI_PROCESS_USER_UMASK,
+	OCI_PROCESS_USER_ADDITIONALGIDS,
+	__OCI_PROCESS_USER_MAX,
+};
+
+static const struct blobmsg_policy oci_process_user_policy[] = {
+	[OCI_PROCESS_USER_UID] = { "uid", BLOBMSG_TYPE_INT32 },
+	[OCI_PROCESS_USER_GID] = { "gid", BLOBMSG_TYPE_INT32 },
+	[OCI_PROCESS_USER_UMASK] = { "umask", BLOBMSG_TYPE_INT32 },
+	[OCI_PROCESS_USER_ADDITIONALGIDS] = { "additionalGids", BLOBMSG_TYPE_ARRAY },
+};
+
+/*
+ * Parse the OCI "process.user" object: "uid" goes to opts.pw_uid, "gid" to
+ * both opts.pw_gid and opts.gr_gid. Absent fields leave opts untouched.
+ * Always returns 0.
+ */
+static int parseOCIprocessuser(struct blob_attr *msg) {
+	struct blob_attr *tb[__OCI_PROCESS_USER_MAX];
+	struct blob_attr *uid_attr, *gid_attr;
+
+	blobmsg_parse(oci_process_user_policy, __OCI_PROCESS_USER_MAX, tb, blobmsg_data(msg), blobmsg_len(msg));
+
+	uid_attr = tb[OCI_PROCESS_USER_UID];
+	gid_attr = tb[OCI_PROCESS_USER_GID];
+
+	if (uid_attr)
+		opts.pw_uid = blobmsg_get_u32(uid_attr);
+
+	if (gid_attr) {
+		/* the same gid serves as primary and as group gid */
+		opts.pw_gid = blobmsg_get_u32(gid_attr);
+		opts.gr_gid = blobmsg_get_u32(gid_attr);
+	}
+
+	/* ToDo: umask, additional GIDs */
+
+	return 0;
+}
+
+enum {
+	OCI_PROCESS_ARGS,
+	OCI_PROCESS_CAPABILITIES,
+	OCI_PROCESS_CWD,
+	OCI_PROCESS_ENV,
+	OCI_PROCESS_NONEWPRIVILEGES,
+	OCI_PROCESS_RLIMITS,
+	OCI_PROCESS_TERMINAL,
+	OCI_PROCESS_USER,
+	__OCI_PROCESS_MAX,
+};
+
+static const struct blobmsg_policy oci_process_policy[] = {
+	[OCI_PROCESS_ARGS] = { "args", BLOBMSG_TYPE_ARRAY },
+	[OCI_PROCESS_CAPABILITIES] = { "capabilities", BLOBMSG_TYPE_TABLE },
+	[OCI_PROCESS_CWD] = { "cwd", BLOBMSG_TYPE_STRING },
+	[OCI_PROCESS_ENV] = { "env", BLOBMSG_TYPE_ARRAY },
+	[OCI_PROCESS_NONEWPRIVILEGES] = { "noNewPrivileges", BLOBMSG_TYPE_BOOL },
+	[OCI_PROCESS_RLIMITS] = { "rlimits", BLOBMSG_TYPE_ARRAY },
+	[OCI_PROCESS_TERMINAL] = { "terminal", BLOBMSG_TYPE_BOOL },
+	[OCI_PROCESS_USER] = { "user", BLOBMSG_TYPE_TABLE },
+};
+
+/*
+ * Parse the OCI "process" object into opts.
+ *
+ * "args" is mandatory and becomes opts.jail_argv (the strings still point
+ * into the parsed blob). "terminal", "noNewPrivileges" and "cwd" are taken
+ * over when present. "env" entries are duplicated into opts.envp. "user"
+ * and "capabilities" are handed to their own parsers.
+ *
+ * Returns 0 on success, ENOENT when "args" is missing, ENODATA when it is
+ * empty, ENOMEM on allocation failure, or the error of a sub-parser.
+ */
+static int parseOCIprocess(struct blob_attr *msg)
+{
+	struct blob_attr *tb[__OCI_PROCESS_MAX];
+	struct blob_attr *cur;
+	unsigned int sz = 0;
+	int rem;
+	int res;
+
+	blobmsg_parse(oci_process_policy, __OCI_PROCESS_MAX, tb, blobmsg_data(msg), blobmsg_len(msg));
+
+	if (!tb[OCI_PROCESS_ARGS])
+		return ENOENT;
+
+	blobmsg_for_each_attr(cur, tb[OCI_PROCESS_ARGS], rem)
+		++sz;
+
+	if (!sz)
+		return ENODATA;
+
+	/* one extra zeroed slot terminates the argv array */
+	opts.jail_argv = calloc(1 + sz, sizeof(char*));
+	if (!opts.jail_argv)
+		return ENOMEM;
+
+	sz = 0;
+	blobmsg_for_each_attr(cur, tb[OCI_PROCESS_ARGS], rem)
+		opts.jail_argv[sz++] = blobmsg_get_string(cur);
+
+	/* both booleans are optional; reading an absent one dereferences NULL */
+	if (tb[OCI_PROCESS_TERMINAL])
+		opts.console = blobmsg_get_bool(tb[OCI_PROCESS_TERMINAL]);
+
+	if (tb[OCI_PROCESS_NONEWPRIVILEGES])
+		opts.no_new_privs = blobmsg_get_bool(tb[OCI_PROCESS_NONEWPRIVILEGES]);
+
+	if (tb[OCI_PROCESS_CWD])
+		opts.cwd = blobmsg_get_string(tb[OCI_PROCESS_CWD]);
+
+	sz = 0;
+	blobmsg_for_each_attr(cur, tb[OCI_PROCESS_ENV], rem)
+		++sz;
+
+	if (sz > 0) {
+		/* one extra zeroed slot terminates the envp array */
+		opts.envp = calloc(1 + sz, sizeof(char*));
+		if (!opts.envp)
+			return ENOMEM;
+
+		sz = 0;
+		blobmsg_for_each_attr(cur, tb[OCI_PROCESS_ENV], rem) {
+			opts.envp[sz] = strdup(blobmsg_get_string(cur));
+			if (!opts.envp[sz])
+				return ENOMEM;
+
+			++sz;
+		}
+	}
+
+	if (tb[OCI_PROCESS_USER] && (res = parseOCIprocessuser(tb[OCI_PROCESS_USER])))
+		return res;
+
+	if (tb[OCI_PROCESS_CAPABILITIES] &&
+	    (res = parseOCIcapabilities(&opts.capset, tb[OCI_PROCESS_CAPABILITIES])))
+		return res;
+
+	/* ToDo: rlimits */
+
+	return 0;
+}
+
+enum {
+	OCI_LINUX_NAMESPACE_TYPE,
+	OCI_LINUX_NAMESPACE_PATH,
+	__OCI_LINUX_NAMESPACE_MAX,
+};
+
+static const struct blobmsg_policy oci_linux_namespace_policy[] = {
+	[OCI_LINUX_NAMESPACE_TYPE] = { "type", BLOBMSG_TYPE_STRING },
+	[OCI_LINUX_NAMESPACE_PATH] = { "path", BLOBMSG_TYPE_STRING },
+};
+
+/*
+ * Map an OCI namespace type name to its CLONE_NEW* flag.
+ * Returns 0 for a name that is not in the table.
+ */
+static unsigned int resolve_nstype(char *type) {
+	static const struct {
+		const char *name;
+		unsigned int flag;
+	} nstypes[] = {
+		{ "pid", CLONE_NEWPID },
+		{ "network", CLONE_NEWNET },
+		{ "mount", CLONE_NEWNS },
+		{ "ipc", CLONE_NEWIPC },
+		{ "uts", CLONE_NEWUTS },
+		{ "user", CLONE_NEWUSER },
+		{ "cgroup", CLONE_NEWCGROUP },
+	};
+	size_t i;
+
+	for (i = 0; i < sizeof(nstypes) / sizeof(nstypes[0]); i++) {
+		if (strcmp(nstypes[i].name, type) == 0)
+			return nstypes[i].flag;
+	}
+
+	return 0;
+}
+
+/*
+ * Parse one entry of the OCI "linux.namespaces" array and add the matching
+ * CLONE_NEW* flag to opts.namespace.
+ *
+ * Returns 0 on success, EINVAL when "type" is missing or not a known
+ * namespace type, ENOTSUP when "path" is given (joining an existing
+ * namespace is not implemented).
+ */
+static int parseOCIlinuxns(struct blob_attr *msg)
+{
+	struct blob_attr *tb[__OCI_LINUX_NAMESPACE_MAX];
+	unsigned int nstype;
+
+	blobmsg_parse(oci_linux_namespace_policy, __OCI_LINUX_NAMESPACE_MAX, tb, blobmsg_data(msg), blobmsg_len(msg));
+
+	if (!tb[OCI_LINUX_NAMESPACE_TYPE])
+		return EINVAL;
+
+	if (tb[OCI_LINUX_NAMESPACE_PATH])
+		return ENOTSUP; /* ToDo */
+
+	/* an unknown type must fail instead of silently running with less isolation */
+	nstype = resolve_nstype(blobmsg_get_string(tb[OCI_LINUX_NAMESPACE_TYPE]));
+	if (!nstype)
+		return EINVAL;
+
+	opts.namespace |= nstype;
+
+	return 0;
+}
+
+
+enum {
+	OCI_LINUX_UIDGIDMAP_CONTAINERID,
+	OCI_LINUX_UIDGIDMAP_HOSTID,
+	OCI_LINUX_UIDGIDMAP_SIZE,
+	__OCI_LINUX_UIDGIDMAP_MAX,
+};
+
+static const struct blobmsg_policy oci_linux_uidgidmap_policy[] = {
+	[OCI_LINUX_UIDGIDMAP_CONTAINERID] = { "containerID", BLOBMSG_TYPE_INT32 },
+	[OCI_LINUX_UIDGIDMAP_HOSTID] = { "hostID", BLOBMSG_TYPE_INT32 },
+	[OCI_LINUX_UIDGIDMAP_SIZE] = { "size", BLOBMSG_TYPE_INT32 },
+};
+
+/*
+ * Convert an OCI "uidMappings"/"gidMappings" array into the text written
+ * to /proc/<pid>/uid_map or gid_map: one "containerID hostID size" line
+ * per entry.
+ *
+ * The heap-allocated result is stored in opts.gidmap when is_gidmap is
+ * true, otherwise in opts.uidmap. An empty array stores nothing.
+ *
+ * Returns 0 on success, EINVAL when an entry lacks one of the three
+ * fields, ENOMEM on allocation failure. Nothing is leaked on failure.
+ */
+static int parseOCIuidgidmappings(struct blob_attr *msg, bool is_gidmap)
+{
+	const char *map_format = "%u %u %u\n";
+	struct blob_attr *tb[__OCI_LINUX_UIDGIDMAP_MAX];
+	struct blob_attr *cur;
+	int rem, len;
+	char **mappings;
+	char *map, *pos;
+	unsigned int cnt = 0;
+	unsigned int i;
+	size_t totallen = 0;
+	size_t linelen;
+	int ret = 0;
+
+	/* count number of mappings */
+	blobmsg_for_each_attr(cur, msg, rem)
+		cnt++;
+
+	if (!cnt)
+		return 0;
+
+	/* allocate zeroed array for mappings, last slot stays NULL */
+	mappings = calloc(1 + cnt, sizeof(char*));
+	if (!mappings)
+		return ENOMEM;
+
+	cnt = 0;
+	blobmsg_for_each_attr(cur, msg, rem) {
+		blobmsg_parse(oci_linux_uidgidmap_policy, __OCI_LINUX_UIDGIDMAP_MAX, tb, blobmsg_data(cur), blobmsg_len(cur));
+
+		if (!tb[OCI_LINUX_UIDGIDMAP_CONTAINERID] ||
+		    !tb[OCI_LINUX_UIDGIDMAP_HOSTID] ||
+		    !tb[OCI_LINUX_UIDGIDMAP_SIZE]) {
+			ret = EINVAL;
+			goto out;
+		}
+
+		/* write mapping line into allocated string */
+		len = asprintf(&mappings[cnt], map_format,
+			 blobmsg_get_u32(tb[OCI_LINUX_UIDGIDMAP_CONTAINERID]),
+			 blobmsg_get_u32(tb[OCI_LINUX_UIDGIDMAP_HOSTID]),
+			 blobmsg_get_u32(tb[OCI_LINUX_UIDGIDMAP_SIZE]));
+
+		if (len < 0) {
+			/* the pointer is undefined after a failed asprintf */
+			mappings[cnt] = NULL;
+			ret = ENOMEM;
+			goto out;
+		}
+
+		cnt++;
+		totallen += len;
+	}
+
+	/* allocate combined mapping string, sized for all lines */
+	map = calloc(1 + totallen, sizeof(char));
+	if (!map) {
+		ret = ENOMEM;
+		goto out;
+	}
+
+	/* concatenate mapping strings into combined string */
+	pos = map;
+	for (i = 0; mappings[i]; i++) {
+		linelen = strlen(mappings[i]);
+		memcpy(pos, mappings[i], linelen);
+		pos += linelen;
+	}
+	*pos = '\0';
+
+	if (is_gidmap)
+		opts.gidmap = map;
+	else
+		opts.uidmap = map;
+
+out:
+	/* slots past the last successful line are NULL, so the walk stops there */
+	for (i = 0; mappings[i]; i++)
+		free(mappings[i]);
+	free(mappings);
+
+	return ret;
+}
+
+enum {
+	OCI_LINUX_RESOURCES,
+	OCI_LINUX_SECCOMP,
+	OCI_LINUX_SYSCTL,
+	OCI_LINUX_NAMESPACES,
+	OCI_LINUX_UIDMAPPINGS,
+	OCI_LINUX_GIDMAPPINGS,
+	OCI_LINUX_MASKEDPATHS,
+	OCI_LINUX_READONLYPATHS,
+	OCI_LINUX_ROOTFSPROPAGATION,
+	__OCI_LINUX_MAX,
+};
+
+static const struct blobmsg_policy oci_linux_policy[] = {
+	[OCI_LINUX_RESOURCES] = { "resources", BLOBMSG_TYPE_TABLE },
+	[OCI_LINUX_SECCOMP] = { "seccomp", BLOBMSG_TYPE_TABLE },
+	[OCI_LINUX_SYSCTL] = { "sysctl", BLOBMSG_TYPE_TABLE },
+	[OCI_LINUX_NAMESPACES] = { "namespaces", BLOBMSG_TYPE_ARRAY },
+	[OCI_LINUX_UIDMAPPINGS] = { "uidMappings", BLOBMSG_TYPE_ARRAY },
+	[OCI_LINUX_GIDMAPPINGS] = { "gidMappings", BLOBMSG_TYPE_ARRAY },
+	[OCI_LINUX_MASKEDPATHS] = { "maskedPaths", BLOBMSG_TYPE_ARRAY },
+	[OCI_LINUX_READONLYPATHS] = { "readonlyPaths", BLOBMSG_TYPE_ARRAY },
+	[OCI_LINUX_ROOTFSPROPAGATION] = { "rootfsPropagation", BLOBMSG_TYPE_STRING },
+};
+
+/*
+ * Parse the OCI "linux" object: namespaces, uid/gid mappings and seccomp.
+ * Other members listed in the policy are accepted but not acted upon.
+ *
+ * Returns 0 on success, the error of the first failing sub-parser, or
+ * EINVAL when the seccomp section cannot be turned into a filter.
+ */
+static int parseOCIlinux(struct blob_attr *msg)
+{
+	struct blob_attr *tb[__OCI_LINUX_MAX];
+	struct blob_attr *cur;
+	int rem;
+	int res = 0;
+
+	blobmsg_parse(oci_linux_policy, __OCI_LINUX_MAX, tb, blobmsg_data(msg), blobmsg_len(msg));
+
+	if (tb[OCI_LINUX_NAMESPACES]) {
+		blobmsg_for_each_attr(cur, tb[OCI_LINUX_NAMESPACES], rem) {
+			res = parseOCIlinuxns(cur);
+			if (res)
+				return res;
+		}
+	}
+
+	if (tb[OCI_LINUX_UIDMAPPINGS]) {
+		/* the UID map is built from "uidMappings", not "gidMappings" */
+		res = parseOCIuidgidmappings(tb[OCI_LINUX_UIDMAPPINGS], 0);
+		if (res)
+			return res;
+	}
+
+	if (tb[OCI_LINUX_GIDMAPPINGS]) {
+		res = parseOCIuidgidmappings(tb[OCI_LINUX_GIDMAPPINGS], 1);
+		if (res)
+			return res;
+	}
+
+	if (tb[OCI_LINUX_SECCOMP]) {
+		opts.ociseccomp = parseOCIlinuxseccomp(tb[OCI_LINUX_SECCOMP]);
+		if (!opts.ociseccomp)
+			return EINVAL;
+	}
+
+	return 0;
+}
+
+enum {
+	OCI_VERSION,
+	OCI_HOSTNAME,
+	OCI_PROCESS,
+	OCI_ROOT,
+	OCI_MOUNTS,
+	OCI_LINUX,
+	__OCI_MAX,
+};
+
+static const struct blobmsg_policy oci_policy[] = {
+	[OCI_VERSION] = { "ociVersion", BLOBMSG_TYPE_STRING },
+	[OCI_HOSTNAME] = { "hostname", BLOBMSG_TYPE_STRING },
+	[OCI_PROCESS] = { "process", BLOBMSG_TYPE_TABLE },
+	[OCI_ROOT] = { "root", BLOBMSG_TYPE_TABLE },
+	[OCI_MOUNTS] = { "mounts", BLOBMSG_TYPE_ARRAY },
+	[OCI_LINUX] = { "linux", BLOBMSG_TYPE_TABLE },
+};
+
+/*
+ * Load an OCI config.json into ocibuf and fill opts from it.
+ *
+ * "ociVersion" must start with "1.0". "process", "root" and "mounts" are
+ * mandatory; "hostname" and "linux" are optional. Sections are handled in
+ * that order, so a failure in a later one leaves earlier results in opts.
+ *
+ * Returns 0 on success, ENOENT when the file cannot be loaded, ENOMSG when
+ * "ociVersion" is missing, ENOTSUP for an unsupported version, ENODATA for
+ * a missing mandatory section, or the error of a sub-parser.
+ */
+static int parseOCI(const char *jsonfile)
+{
+	struct blob_attr *tb[__OCI_MAX];
+	struct blob_attr *mnt;
+	int left;
+	int err;
+
+	blob_buf_init(&ocibuf, 0);
+	if (!blobmsg_add_json_from_file(&ocibuf, jsonfile))
+		return ENOENT;
+
+	blobmsg_parse(oci_policy, __OCI_MAX, tb, blob_data(ocibuf.head), blob_len(ocibuf.head));
+
+	if (!tb[OCI_VERSION])
+		return ENOMSG;
+
+	/* only the leading "1.0" of the version string is compared */
+	if (strncmp("1.0", blobmsg_get_string(tb[OCI_VERSION]), 3) != 0) {
+		ERROR("unsupported ociVersion %s\n", blobmsg_get_string(tb[OCI_VERSION]));
+		return ENOTSUP;
+	}
+
+	if (tb[OCI_HOSTNAME])
+		opts.hostname = blobmsg_get_string(tb[OCI_HOSTNAME]);
+
+	if (!tb[OCI_PROCESS])
+		return ENODATA;
+
+	err = parseOCIprocess(tb[OCI_PROCESS]);
+	if (err)
+		return err;
+
+	if (!tb[OCI_ROOT])
+		return ENODATA;
+
+	err = parseOCIroot(jsonfile, tb[OCI_ROOT]);
+	if (err)
+		return err;
+
+	if (!tb[OCI_MOUNTS])
+		return ENODATA;
+
+	blobmsg_for_each_attr(mnt, tb[OCI_MOUNTS], left) {
+		err = parseOCImount(mnt);
+		if (err)
+			return err;
+	}
+
+	if (!tb[OCI_LINUX])
+		return 0;
+
+	return parseOCIlinux(tb[OCI_LINUX]);
+}
+
 int main(int argc, char **argv)
 {
 	sigset_t sigmask;
 	uid_t uid = getuid();
-	char log[] = "/dev/log";
-	char ubus[] = "/var/run/ubus.sock";
+	const char log[] = "/dev/log";
+	const char ubus[] = "/var/run/ubus.sock";
+	char *jsonfile = NULL;
 	int ch, i;
 	int pipes[4];
 	char sig_buf[1];
@@ -802,19 +1355,32 @@ int main(int argc, char **argv)
 		case 'y':
 			opts.console = 1;
 			break;
+		case 'J':
+			asprintf(&jsonfile, "%s/config.json", optarg);
+			break;
 		}
 	}
 
 	if (opts.namespace)
 		opts.namespace |= CLONE_NEWIPC | CLONE_NEWPID;
 
+	if (jsonfile) {
+		int ocires;
+		ocires = parseOCI(jsonfile);
+		free(jsonfile);
+		if (ocires) {
+			ERROR("parsing of OCI JSON spec has failed: %s (%d)\n", strerror(ocires), ocires);
+			return ocires;
+		}
+	}
+
 	if (opts.tmpoverlaysize && strlen(opts.tmpoverlaysize) > 8) {
 		ERROR("size parameter too long: \"%s\"\n", opts.tmpoverlaysize);
 		return -1;
 	}
 
 	/* no <binary> param found */
-	if (argc - optind < 1) {
+	if (!jsonfile && (argc - optind < 1)) {
 		usage();
 		return EXIT_FAILURE;
 	}
@@ -825,12 +1391,14 @@ int main(int argc, char **argv)
 	}
 	DEBUG("Using namespaces(0x%08x), capabilities(%d), seccomp(%d)\n",
 		opts.namespace,
-		opts.capabilities != 0,
-		opts.seccomp != 0);
-
-	opts.jail_argv = &argv[optind];
+		opts.capabilities != 0 || opts.capset.apply,
+		opts.seccomp != 0 || opts.ociseccomp != 0);
 
-	get_jail_user(&opts.pw_uid, &opts.pw_gid, &opts.gr_gid);
+	if (!jsonfile) {
+		opts.jail_argv = &argv[optind];
+		if (opts.namespace & CLONE_NEWUSER)
+			get_jail_user(&opts.pw_uid, &opts.pw_gid, &opts.gr_gid);
+	}
 
 	if (!opts.extroot) {
 		if (opts.namespace && add_path_and_deps(*opts.jail_argv, 1, -1, 0)) {
@@ -908,17 +1476,23 @@ int main(int argc, char **argv)
 		}
 		close(pipes[0]);
 		if (opts.namespace & CLONE_NEWUSER) {
-			bool has_gr = (opts.gr_gid != -1);
-			if (write_setgroups(jail_process.pid, false)) {
+			if (write_setgroups(jail_process.pid, true)) {
 				ERROR("can't write setgroups\n");
 				return -1;
 			}
-			if (opts.pw_uid != -1) {
-				write_uid_gid_map(jail_process.pid, 0, opts.pw_uid);
-				write_uid_gid_map(jail_process.pid, 1, has_gr?opts.gr_gid:opts.pw_gid);
+			if (!opts.uidmap) {
+				bool has_gr = (opts.gr_gid != -1);
+				if (opts.pw_uid != -1) {
+					write_single_uid_gid_map(jail_process.pid, 0, opts.pw_uid);
+					write_single_uid_gid_map(jail_process.pid, 1, has_gr?opts.gr_gid:opts.pw_gid);
+				} else {
+					write_single_uid_gid_map(jail_process.pid, 0, 65534);
+					write_single_uid_gid_map(jail_process.pid, 1, has_gr?opts.gr_gid:65534);
+				}
 			} else {
-				write_uid_gid_map(jail_process.pid, 0, 65534);
-				write_uid_gid_map(jail_process.pid, 1, has_gr?opts.gr_gid:65534);
+				write_uid_gid_map(jail_process.pid, 0, opts.uidmap);
+				if (opts.gidmap)
+					write_uid_gid_map(jail_process.pid, 1, opts.gidmap);
 			}
 		}
 

+ 2 - 0
jail/seccomp-bpf.h

@@ -41,8 +41,10 @@
 #define SECCOMP_RET_TRAP	0x00030000U /* disallow and force a SIGSYS */
 #define SECCOMP_RET_ERRNO	0x00050000U /* returns an errno */
 #define SECCOMP_RET_LOG		0x00070000U
+#define SECCOMP_RET_LOGALLOW	0x7ffc0000U
 #define SECCOMP_RET_TRACE	0x7ff00000U /* pass to a tracer or disallow */
 #define SECCOMP_RET_ALLOW	0x7fff0000U /* allow */
+#define SECCOMP_RET_KILLPROCESS	0x80000000U
 #define SECCOMP_RET_ERROR(x)	(SECCOMP_RET_ERRNO | ((x) & 0x0000ffffU))
 #define SECCOMP_RET_LOGGER(x)	(SECCOMP_RET_LOG | ((x) & 0x0000ffffU))
 

+ 272 - 0
jail/seccomp-oci.c

@@ -0,0 +1,272 @@
+/*
+ * parse and setup OCI seccomp filter
+ * Copyright (c) 2020 Daniel Golle <daniel@makrotopia.org>
+ * seccomp example with syscall reporting
+ * Copyright (c) 2012 The Chromium OS Authors <chromium-os-dev@chromium.org>
+ * Authors:
+ *  Kees Cook <keescook@chromium.org>
+ *  Will Drewry <wad@chromium.org>
+ *
+ * Use of this source code is governed by a BSD-style license that can be
+ * found in the LICENSE file.
+ */
+#define _GNU_SOURCE 1
+#include <stddef.h>
+#include <stdlib.h>
+#include <unistd.h>
+
+#include <libubox/utils.h>
+#include <libubox/blobmsg.h>
+#include <libubox/blobmsg_json.h>
+
+#include "log.h"
+#include "seccomp-bpf.h"
+#include "seccomp-oci.h"
+#include "../syscall-names.h"
+#include "seccomp-syscalls-helpers.h"
+
+/*
+ * Translate an OCI seccomp action name (e.g. "SCMP_ACT_ALLOW") into the
+ * corresponding SECCOMP_RET_* filter return value.
+ *
+ * "SCMP_ACT_ERRNO" and "SCMP_ACT_ERROR" both yield the bare SECCOMP_RET_ERRNO
+ * code; the errno value itself is not encoded here.
+ *
+ * A missing (NULL) or unrecognized name is logged and mapped to
+ * SECCOMP_RET_KILL.
+ */
+static uint32_t resolve_action(const char *actname)
+{
+	/*
+	 * Callers hand in the result of blobmsg_get_string(), which is NULL
+	 * when the attribute is absent; strcmp() must not see that.
+	 */
+	if (!actname) {
+		ERROR("missing seccomp action\n");
+		return SECCOMP_RET_KILL;
+	}
+
+	if (!strcmp(actname, "SCMP_ACT_KILL"))
+		return SECCOMP_RET_KILL;
+	else if (!strcmp(actname, "SCMP_ACT_KILL_PROCESS"))
+		return SECCOMP_RET_KILLPROCESS;
+	else if (!strcmp(actname, "SCMP_ACT_TRAP"))
+		return SECCOMP_RET_TRAP;
+	else if (!strcmp(actname, "SCMP_ACT_ERRNO"))
+		return SECCOMP_RET_ERRNO;
+	else if (!strcmp(actname, "SCMP_ACT_ERROR"))
+		return SECCOMP_RET_ERRNO;
+	else if (!strcmp(actname, "SCMP_ACT_TRACE"))
+		return SECCOMP_RET_TRACE;
+	else if (!strcmp(actname, "SCMP_ACT_ALLOW"))
+		return SECCOMP_RET_ALLOW;
+	else if (!strcmp(actname, "SCMP_ACT_LOG"))
+		return SECCOMP_RET_LOGALLOW;
+	else {
+		ERROR("unknown seccomp action %s\n", actname);
+		return SECCOMP_RET_KILL;
+	}
+}
+
+/*
+ * Translate an OCI seccomp architecture name ("SCMP_ARCH_*") into the
+ * matching AUDIT_ARCH_* constant.
+ *
+ * Returns 0 for "SCMP_ARCH_X32" (silently) and for any unrecognized name
+ * (after logging an error).
+ */
+static uint32_t resolve_architecture(char *archname)
+{
+	static const struct {
+		const char *name;
+		uint32_t audit_arch;
+	} known_archs[] = {
+		{ "SCMP_ARCH_X86", AUDIT_ARCH_I386 },
+		{ "SCMP_ARCH_X86_64", AUDIT_ARCH_X86_64 },
+		/*
+		 * would be AUDIT_ARCH_X86_64, but 32-bit userland on a
+		 * 64-bit kernel is not supported yet
+		 */
+		{ "SCMP_ARCH_X32", 0 },
+		{ "SCMP_ARCH_ARM", AUDIT_ARCH_ARM },
+		{ "SCMP_ARCH_AARCH64", AUDIT_ARCH_AARCH64 },
+		{ "SCMP_ARCH_MIPS", AUDIT_ARCH_MIPS },
+		{ "SCMP_ARCH_MIPS64", AUDIT_ARCH_MIPS64 },
+		{ "SCMP_ARCH_MIPS64N32", AUDIT_ARCH_MIPS64N32 },
+		{ "SCMP_ARCH_MIPSEL", AUDIT_ARCH_MIPSEL },
+		{ "SCMP_ARCH_MIPSEL64", AUDIT_ARCH_MIPSEL64 },
+		{ "SCMP_ARCH_MIPSEL64N32", AUDIT_ARCH_MIPSEL64N32 },
+		{ "SCMP_ARCH_PPC", AUDIT_ARCH_PPC },
+		{ "SCMP_ARCH_PPC64", AUDIT_ARCH_PPC64 },
+		{ "SCMP_ARCH_PPC64LE", AUDIT_ARCH_PPC64LE },
+		{ "SCMP_ARCH_S390", AUDIT_ARCH_S390 },
+		{ "SCMP_ARCH_S390X", AUDIT_ARCH_S390X },
+		{ "SCMP_ARCH_PARISC", AUDIT_ARCH_PARISC },
+		{ "SCMP_ARCH_PARISC64", AUDIT_ARCH_PARISC64 },
+	};
+	size_t i;
+
+	for (i = 0; i < sizeof(known_archs) / sizeof(known_archs[0]); i++) {
+		if (strcmp(archname, known_archs[i].name) == 0)
+			return known_archs[i].audit_arch;
+	}
+
+	ERROR("unknown seccomp architecture %s\n", archname);
+	return 0;
+}
+
+enum {	/* indices into tb[] in parseOCIlinuxseccomp(): top-level seccomp attributes */
+	OCI_LINUX_SECCOMP_DEFAULTACTION,
+	OCI_LINUX_SECCOMP_ARCHITECTURES,
+	OCI_LINUX_SECCOMP_FLAGS,	/* parsed, but never read by parseOCIlinuxseccomp() */
+	OCI_LINUX_SECCOMP_SYSCALLS,
+	__OCI_LINUX_SECCOMP_MAX,
+};
+
+static const struct blobmsg_policy oci_linux_seccomp_policy[] = {	/* JSON key and expected blobmsg type per index */
+	[OCI_LINUX_SECCOMP_DEFAULTACTION] = { "defaultAction", BLOBMSG_TYPE_STRING },
+	[OCI_LINUX_SECCOMP_ARCHITECTURES] = { "architectures", BLOBMSG_TYPE_ARRAY },
+	[OCI_LINUX_SECCOMP_FLAGS] = { "flags", BLOBMSG_TYPE_ARRAY },
+	[OCI_LINUX_SECCOMP_SYSCALLS] = { "syscalls", BLOBMSG_TYPE_ARRAY },
+};
+
+enum {	/* indices into tbn[]: attributes of one entry of the "syscalls" array */
+	OCI_LINUX_SECCOMP_SYSCALLS_NAMES,
+	OCI_LINUX_SECCOMP_SYSCALLS_ACTION,
+	OCI_LINUX_SECCOMP_SYSCALLS_ERRNORET,
+	OCI_LINUX_SECCOMP_SYSCALLS_ARGS,	/* counted for sizing only; contents are not processed yet */
+	__OCI_LINUX_SECCOMP_SYSCALLS_MAX
+};
+
+static const struct blobmsg_policy oci_linux_seccomp_syscalls_policy[] = {	/* designated initializers: order here need not follow the enum */
+	[OCI_LINUX_SECCOMP_SYSCALLS_NAMES] = { "names", BLOBMSG_TYPE_ARRAY },
+	[OCI_LINUX_SECCOMP_SYSCALLS_ERRNORET] = { "errnoRet", BLOBMSG_TYPE_INT32 },
+	[OCI_LINUX_SECCOMP_SYSCALLS_ARGS] = { "args", BLOBMSG_TYPE_ARRAY },
+	[OCI_LINUX_SECCOMP_SYSCALLS_ACTION] = { "action", BLOBMSG_TYPE_STRING },
+};
+
+enum {	/* indices into tba[]: attributes of one entry of a rule's "args" array */
+	OCI_LINUX_SECCOMP_SYSCALLS_ARGS_INDEX,
+	OCI_LINUX_SECCOMP_SYSCALLS_ARGS_VALUE,
+	OCI_LINUX_SECCOMP_SYSCALLS_ARGS_VALUETWO,
+	OCI_LINUX_SECCOMP_SYSCALLS_ARGS_OP,
+	__OCI_LINUX_SECCOMP_SYSCALLS_ARGS_MAX
+};
+
+static const struct blobmsg_policy oci_linux_seccomp_syscalls_args_policy[] = {	/* JSON key and expected blobmsg type per index */
+	[OCI_LINUX_SECCOMP_SYSCALLS_ARGS_INDEX] = { "index", BLOBMSG_TYPE_INT32 },
+	[OCI_LINUX_SECCOMP_SYSCALLS_ARGS_VALUE] = { "value", BLOBMSG_TYPE_INT64 },
+	[OCI_LINUX_SECCOMP_SYSCALLS_ARGS_VALUETWO] = { "valueTwo", BLOBMSG_TYPE_INT64 },
+	[OCI_LINUX_SECCOMP_SYSCALLS_ARGS_OP] = { "op", BLOBMSG_TYPE_STRING },
+};
+
+/*
+ * Build a classic BPF seccomp program from an OCI seccomp description.
+ *
+ * msg is parsed with oci_linux_seccomp_policy, i.e. it is expected to carry
+ * "defaultAction" (mandatory), "architectures", "flags" and "syscalls".
+ *
+ * Layout of the generated program:
+ *   - load the audit architecture and kill unless it equals ARCH_NR
+ *   - load the syscall number
+ *   - per listed syscall name: compare, return the rule's action on match
+ *   - return the default action
+ *
+ * Limitations: "flags" is ignored, and "args" conditions are not evaluated
+ * yet, so every rule applies to all invocations of the syscalls it names.
+ *
+ * Returns a malloc()ed sock_fprog whose filter array is heap-allocated as
+ * well; the caller owns both. Returns NULL on any error.
+ */
+struct sock_fprog *parseOCIlinuxseccomp(struct blob_attr *msg)
+{
+	struct blob_attr *tb[__OCI_LINUX_SECCOMP_MAX];
+	struct blob_attr *tbn[__OCI_LINUX_SECCOMP_SYSCALLS_MAX];
+	struct blob_attr *tba[__OCI_LINUX_SECCOMP_SYSCALLS_ARGS_MAX];
+	struct blob_attr *cur, *curn, *curarg;
+	int rem, remn, remargs, sc;
+	struct sock_filter *filter;
+	struct sock_fprog *prog;
+	/* 3 insns arch check + 1 load syscall nr + 1 default return */
+	int sz = 5, idx = 0;
+	uint32_t default_policy = 0;
+	/*
+	 * Stays 0 when "architectures" is absent or empty, so the comparison
+	 * against ARCH_NR below never reads an indeterminate value.
+	 */
+	uint32_t seccomp_arch = 0;
+
+	blobmsg_parse(oci_linux_seccomp_policy, __OCI_LINUX_SECCOMP_MAX, tb, blobmsg_data(msg), blobmsg_len(msg));
+
+	if (!tb[OCI_LINUX_SECCOMP_DEFAULTACTION]) {
+		ERROR("seccomp: no default action set\n");
+		return NULL;
+	}
+
+	default_policy = resolve_action(blobmsg_get_string(tb[OCI_LINUX_SECCOMP_DEFAULTACTION]));
+
+	/* verify architecture while ignoring the x86_64 anomaly for now */
+	blobmsg_for_each_attr(cur, tb[OCI_LINUX_SECCOMP_ARCHITECTURES], rem) {
+		seccomp_arch = resolve_architecture(blobmsg_get_string(cur));
+		/* take the first useful arch for now */
+		if (seccomp_arch)
+			break;
+	}
+
+	if (ARCH_NR != seccomp_arch) {
+		ERROR("seccomp architecture doesn't match system\n");
+		return NULL;
+	}
+
+	/* first pass: count instructions so the filter can be allocated at once */
+	blobmsg_for_each_attr(cur, tb[OCI_LINUX_SECCOMP_SYSCALLS], rem) {
+		blobmsg_parse(oci_linux_seccomp_syscalls_policy, __OCI_LINUX_SECCOMP_SYSCALLS_MAX, tbn, blobmsg_data(cur), blobmsg_len(cur));
+		/* one compare plus one return per syscall name */
+		blobmsg_for_each_attr(curn, tbn[OCI_LINUX_SECCOMP_SYSCALLS_NAMES], remn)
+			sz += 2;
+
+		/* room reserved for future argument checks */
+		if (tbn[OCI_LINUX_SECCOMP_SYSCALLS_ARGS]) {
+			blobmsg_for_each_attr(curarg, tbn[OCI_LINUX_SECCOMP_SYSCALLS_ARGS], remargs)
+				sz++;
+		}
+	}
+
+	prog = malloc(sizeof(struct sock_fprog));
+	if (!prog)
+		return NULL;
+
+	filter = calloc(sz, sizeof(struct sock_filter));
+	if (!filter) {
+		ERROR("failed to allocate memory for seccomp filter\n");
+		goto errout2;
+	}
+
+	/* validate arch */
+	set_filter(&filter[idx++], BPF_LD + BPF_W + BPF_ABS, 0, 0, arch_nr);
+	set_filter(&filter[idx++], BPF_JMP + BPF_JEQ + BPF_K, 1, 0, ARCH_NR);
+	set_filter(&filter[idx++], BPF_RET + BPF_K, 0, 0, SECCOMP_RET_KILL);
+
+	/* get syscall */
+	set_filter(&filter[idx++], BPF_LD + BPF_W + BPF_ABS, 0, 0, syscall_nr);
+
+	/* second pass: emit the rules */
+	blobmsg_for_each_attr(cur, tb[OCI_LINUX_SECCOMP_SYSCALLS], rem) {
+		uint32_t action;
+		blobmsg_parse(oci_linux_seccomp_syscalls_policy, __OCI_LINUX_SECCOMP_SYSCALLS_MAX, tbn, blobmsg_data(cur), blobmsg_len(cur));
+
+		/* without this check a NULL string would reach resolve_action() */
+		if (!tbn[OCI_LINUX_SECCOMP_SYSCALLS_ACTION]) {
+			ERROR("seccomp: syscall rule without action\n");
+			goto errout1;
+		}
+
+		action = resolve_action(blobmsg_get_string(tbn[OCI_LINUX_SECCOMP_SYSCALLS_ACTION]));
+		if (tbn[OCI_LINUX_SECCOMP_SYSCALLS_ERRNORET]) {
+			if (action != SECCOMP_RET_ERRNO) {
+				ERROR("seccomp: errnoRet requires an errno action\n");
+				goto errout1;
+			}
+
+			action = SECCOMP_RET_ERROR(blobmsg_get_u32(tbn[OCI_LINUX_SECCOMP_SYSCALLS_ERRNORET]));
+		} else if (action == SECCOMP_RET_ERRNO) {
+			/* errno action without explicit value defaults to EPERM */
+			action = SECCOMP_RET_ERROR(EPERM);
+		}
+
+		blobmsg_for_each_attr(curn, tbn[OCI_LINUX_SECCOMP_SYSCALLS_NAMES], remn) {
+			sc = find_syscall(blobmsg_get_string(curn));
+			if (sc == -1) {
+				ERROR("unknown syscall '%s'\n", blobmsg_get_string(curn));
+				goto errout1;
+			}
+
+			/* add rule to filter: on mismatch skip the return insn */
+			set_filter(&filter[idx++], BPF_JMP + BPF_JEQ + BPF_K, 0, 1, sc);
+			set_filter(&filter[idx++], BPF_RET + BPF_K, 0, 0, action);
+
+		}
+		blobmsg_for_each_attr(curn, tbn[OCI_LINUX_SECCOMP_SYSCALLS_ARGS], remn) {
+			blobmsg_parse(oci_linux_seccomp_syscalls_args_policy, __OCI_LINUX_SECCOMP_SYSCALLS_ARGS_MAX, tba, blobmsg_data(curn), blobmsg_len(curn));
+			/* ToDo: process args */
+		}
+	}
+
+	set_filter(&filter[idx], BPF_RET + BPF_K, 0, 0, default_policy);
+
+	/* prog->len is an unsigned short; refuse instead of truncating */
+	if (idx + 1 > BPF_MAXINSNS) {
+		ERROR("seccomp: filter too long\n");
+		goto errout1;
+	}
+
+	prog->len = (unsigned short) (idx + 1);
+	prog->filter = filter;
+
+	return prog;
+
+errout1:
+	/* prog->filter has not been assigned yet, release the array itself */
+	free(filter);
+errout2:
+	free(prog);
+	return NULL;
+}
+
+
+/*
+ * Install a seccomp filter program on the calling process.
+ *
+ * Sets PR_SET_NO_NEW_PRIVS first, then loads prog with
+ * PR_SET_SECCOMP/SECCOMP_MODE_FILTER.
+ *
+ * Takes ownership of prog and prog->filter: both are released before
+ * returning, on success as well as on failure.
+ *
+ * Returns 0 on success, EINVAL if prog is NULL, otherwise the errno value
+ * of the prctl() call that failed.
+ */
+int applyOCIlinuxseccomp(struct sock_fprog *prog)
+{
+	int err = 0;
+
+	if (!prog)
+		return EINVAL;
+
+	if (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0)) {
+		/* capture errno before logging or free() can overwrite it */
+		err = errno;
+		ERROR("prctl(PR_SET_NO_NEW_PRIVS) failed: %m\n");
+		goto out;
+	}
+
+	if (prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, prog)) {
+		err = errno;
+		ERROR("prctl(PR_SET_SECCOMP) failed: %m\n");
+	}
+
+out:
+	free(prog->filter);
+	free(prog);
+	return err;
+}

+ 33 - 0
jail/seccomp-oci.h

@@ -0,0 +1,33 @@
+/*
+ * Copyright (C) 2020 Daniel Golle <daniel@makrotopia.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License version 2.1
+ * as published by the Free Software Foundation
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ */
+#ifndef _JAIL_SECCOMP_OCI_H_
+#define _JAIL_SECCOMP_OCI_H_
+
+#include <linux/filter.h>
+
+#ifdef SECCOMP_SUPPORT
+struct sock_fprog *parseOCIlinuxseccomp(struct blob_attr *msg);
+int applyOCIlinuxseccomp(struct sock_fprog *prog);
+#else
+
+
+/*
+ * Stub for builds without SECCOMP_SUPPORT: no filter is ever produced.
+ * static inline, so that including this header from several source files
+ * does not create duplicate external definitions.
+ */
+static inline struct sock_fprog *parseOCIlinuxseccomp(struct blob_attr *msg) {
+	return NULL;
+}
+
+/*
+ * Stub for builds without SECCOMP_SUPPORT: always reports ENOTSUP.
+ * static inline, so that including this header from several source files
+ * does not create duplicate external definitions.
+ */
+static inline int applyOCIlinuxseccomp(struct sock_fprog *prog) {
+	return ENOTSUP;
+}
+#endif
+
+#endif

+ 37 - 0
jail/seccomp-syscalls-helpers.h

@@ -0,0 +1,37 @@
+/*
+ * Copyright (C) 2015 John Crispin <blogic@openwrt.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License version 2.1
+ * as published by the Free Software Foundation
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ */
+#ifndef _JAIL_SECCOMP_HELPERS_H_
+#define _JAIL_SECCOMP_HELPERS_H_
+
+/*
+ * Look up a syscall by name.
+ *
+ * Walks all SYSCALL_COUNT table indices, maps each to its syscall number and
+ * compares that number's name (if it has one) with the requested name.
+ *
+ * Returns the syscall number of the first match, or -1 if nothing matches.
+ */
+static int find_syscall(const char *name)
+{
+	int idx = 0;
+
+	while (idx < SYSCALL_COUNT) {
+		int nr = syscall_index_to_number(idx++);
+		const char *candidate = syscall_name(nr);
+
+		if (candidate && strcmp(candidate, name) == 0)
+			return nr;
+	}
+
+	return -1;
+}
+
+/*
+ * Fill one BPF instruction slot: opcode, jump offsets taken when the
+ * condition is true (jt) or false (jf), and the generic operand k.
+ */
+static void set_filter(struct sock_filter *filter, __u16 code, __u8 jt, __u8 jf, __u32 k)
+{
+	*filter = (struct sock_filter) {
+		.code = code,
+		.jt = jt,
+		.jf = jf,
+		.k = k,
+	};
+}
+
+#endif

+ 1 - 21
jail/seccomp.c

@@ -21,27 +21,7 @@
 #include "seccomp-bpf.h"
 #include "seccomp.h"
 #include "../syscall-names.h"
-
-static int find_syscall(const char *name)
-{
-	int i;
-
-	for (i = 0; i < SYSCALL_COUNT; i++) {
-		int sc = syscall_index_to_number(i);
-		if (syscall_name(sc) && !strcmp(syscall_name(sc), name))
-			return sc;
-	}
-
-	return -1;
-}
-
-static void set_filter(struct sock_filter *filter, __u16 code, __u8 jt, __u8 jf, __u32 k)
-{
-	filter->code = code;
-	filter->jt = jt;
-	filter->jf = jf;
-	filter->k = k;
-}
+#include "seccomp-syscalls-helpers.h"
 
 int install_syscall_filter(const char *argv, const char *file)
 {

+ 34 - 15
service/instance.c

@@ -65,6 +65,7 @@ enum {
 	INSTANCE_ATTR_EXTROOT,
 	INSTANCE_ATTR_OVERLAYDIR,
 	INSTANCE_ATTR_TMPOVERLAYSIZE,
+	INSTANCE_ATTR_BUNDLE,
 	__INSTANCE_ATTR_MAX
 };
 
@@ -95,6 +96,7 @@ static const struct blobmsg_policy instance_attr[__INSTANCE_ATTR_MAX] = {
 	[INSTANCE_ATTR_EXTROOT] = { "extroot", BLOBMSG_TYPE_STRING },
 	[INSTANCE_ATTR_OVERLAYDIR] = { "overlaydir", BLOBMSG_TYPE_STRING },
 	[INSTANCE_ATTR_TMPOVERLAYSIZE] = { "tmpoverlaysize", BLOBMSG_TYPE_STRING },
+	[INSTANCE_ATTR_BUNDLE] = { "bundle", BLOBMSG_TYPE_STRING },
 };
 
 enum {
@@ -294,6 +296,11 @@ jail_run(struct service_instance *in, char **argv)
 		argv[argc++] = in->tmpoverlaysize;
 	}
 
+	if (in->bundle) {
+		argv[argc++] = "-J";
+		argv[argc++] = in->bundle;
+	}
+
 	if (in->require_jail)
 		argv[argc++] = "-E";
 
@@ -484,7 +491,7 @@ instance_start(struct service_instance *in)
 		return;
 	}
 
-	if (!in->command) {
+	if (!in->bundle && !in->command) {
 		LOG("Not starting instance %s::%s, command not set\n", in->srv->name, in->name);
 		return;
 	}
@@ -802,7 +809,8 @@ instance_config_changed(struct service_instance *in, struct service_instance *in
 		return true;
 	if (in->respawn_timeout != in_new->respawn_timeout)
 		return true;
-
+	if (in->bundle && in_new->bundle && strcmp(in->bundle, in_new->bundle))
+		return true;
 	if ((!in->seccomp && in_new->seccomp) ||
 	    (in->seccomp && !in_new->seccomp) ||
 	    (in->seccomp && in_new->seccomp && strcmp(in->seccomp, in_new->seccomp)))
@@ -996,6 +1004,9 @@ instance_jail_parse(struct service_instance *in, struct blob_attr *attr)
 	if (in->no_new_privs)
 		jail->argc++;
 
+	if (in->bundle)
+		jail->argc += 2;
+
 	return true;
 }
 
@@ -1035,8 +1046,8 @@ instance_config_parse(struct service_instance *in)
 	blobmsg_parse(instance_attr, __INSTANCE_ATTR_MAX, tb,
 		blobmsg_data(in->config), blobmsg_data_len(in->config));
 
-	if (!instance_config_parse_command(in, tb))
-		return false;
+	if (!tb[INSTANCE_ATTR_BUNDLE] && !instance_config_parse_command(in, tb))
+			return false;
 
 	if (tb[INSTANCE_ATTR_TERMTIMEOUT])
 		in->term_timeout = blobmsg_get_u32(tb[INSTANCE_ATTR_TERMTIMEOUT]);
@@ -1113,6 +1124,9 @@ instance_config_parse(struct service_instance *in)
 	if (tb[INSTANCE_ATTR_TMPOVERLAYSIZE])
 		in->tmpoverlaysize = strdup(blobmsg_get_string(tb[INSTANCE_ATTR_TMPOVERLAYSIZE]));
 
+	if (tb[INSTANCE_ATTR_BUNDLE])
+		in->bundle = strdup(blobmsg_get_string(tb[INSTANCE_ATTR_BUNDLE]));
+
 	if (tb[INSTANCE_ATTR_PIDFILE]) {
 		char *pidfile = blobmsg_get_string(tb[INSTANCE_ATTR_PIDFILE]);
 		if (pidfile)
@@ -1264,6 +1278,7 @@ instance_free(struct service_instance *in)
 	free(in->extroot);
 	free(in->overlaydir);
 	free(in->tmpoverlaysize);
+	free(in->bundle);
 	free(in->jail.name);
 	free(in->jail.hostname);
 	free(in->seccomp);
@@ -1324,6 +1339,8 @@ void instance_dump(struct blob_buf *b, struct service_instance *in, int verbose)
 		blobmsg_add_u32(b, "pid", in->proc.pid);
 	if (in->command)
 		blobmsg_add_blob(b, in->command);
+	if (in->bundle)
+		blobmsg_add_string(b, "bundle", in->bundle);
 	blobmsg_add_u32(b, "term_timeout", in->term_timeout);
 	if (!in->proc.pending)
 		blobmsg_add_u32(b, "exit_code", in->exit_code);
@@ -1393,17 +1410,19 @@ void instance_dump(struct blob_buf *b, struct service_instance *in, int verbose)
 		void *r = blobmsg_open_table(b, "jail");
 		if (in->jail.name)
 			blobmsg_add_string(b, "name", in->jail.name);
-		if (in->jail.hostname)
-			blobmsg_add_string(b, "hostname", in->jail.hostname);
-
-		blobmsg_add_u8(b, "procfs", in->jail.procfs);
-		blobmsg_add_u8(b, "sysfs", in->jail.sysfs);
-		blobmsg_add_u8(b, "ubus", in->jail.ubus);
-		blobmsg_add_u8(b, "log", in->jail.log);
-		blobmsg_add_u8(b, "ronly", in->jail.ronly);
-		blobmsg_add_u8(b, "netns", in->jail.netns);
-		blobmsg_add_u8(b, "userns", in->jail.userns);
-		blobmsg_add_u8(b, "cgroupsns", in->jail.cgroupsns);
+		if (!in->bundle) {
+			if (in->jail.hostname)
+				blobmsg_add_string(b, "hostname", in->jail.hostname);
+
+			blobmsg_add_u8(b, "procfs", in->jail.procfs);
+			blobmsg_add_u8(b, "sysfs", in->jail.sysfs);
+			blobmsg_add_u8(b, "ubus", in->jail.ubus);
+			blobmsg_add_u8(b, "log", in->jail.log);
+			blobmsg_add_u8(b, "ronly", in->jail.ronly);
+			blobmsg_add_u8(b, "netns", in->jail.netns);
+			blobmsg_add_u8(b, "userns", in->jail.userns);
+			blobmsg_add_u8(b, "cgroupsns", in->jail.cgroupsns);
+		}
 		blobmsg_add_u8(b, "console", (in->console.fd.fd > -1));
 		blobmsg_close_table(b, r);
 		if (!avl_is_empty(&in->jail.mount.avl)) {

+ 1 - 0
service/instance.h

@@ -70,6 +70,7 @@ struct service_instance {
 	char *extroot;
 	char *overlaydir;
 	char *tmpoverlaysize;
+	char *bundle;
 	int syslog_facility;
 	int exit_code;