From 0219eb42c5ff8f14cc9d5f7a55e8234ee41cfd8e Mon Sep 17 00:00:00 2001 From: Demi Marie Obenour Date: Fri, 29 Sep 2023 16:32:07 -0400 Subject: [PATCH 01/44] Drop "exec" argument to devtmpfs mount Newer kernels do not allow it. --- runtime/init-container/src/init.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/runtime/init-container/src/init.c b/runtime/init-container/src/init.c index 92d94fb0..6e81a473 100644 --- a/runtime/init-container/src/init.c +++ b/runtime/init-container/src/init.c @@ -1699,7 +1699,7 @@ int main(void) { NULL)); CHECK(mount("devtmpfs", "/dev", "devtmpfs", MS_NOSUID, - "exec,mode=0755,size=2M")); + "mode=0755,size=2M")); CHECK(mount("tmpfs", "/tmp", "tmpfs", MS_NOSUID, "mode=0777")); From c9d5f2967b020e8889d23c9de5588f301910cf29 Mon Sep 17 00:00:00 2001 From: Demi Marie Obenour Date: Mon, 4 Sep 2023 22:12:10 -0400 Subject: [PATCH 02/44] Pass explicit arguments to boolean QEMU options Omitting the argument or using the "no" prefix is deprecated. --- runtime/examples/direct.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/runtime/examples/direct.rs b/runtime/examples/direct.rs index 502b8415..d6c33967 100644 --- a/runtime/examples/direct.rs +++ b/runtime/examples/direct.rs @@ -126,7 +126,7 @@ fn spawn_vm<'a, P: AsRef>(temp_path: P, mount_args: &'a [(&'a str, impl To "virtio-rng-pci", "-chardev", format!( - "socket,path={},server,nowait,id=manager_cdev", + "socket,path={},server=true,wait=false,id=manager_cdev", temp_path.as_ref().join("manager.sock").display() ) .as_str(), From f6c6ff934635c03c33c314abb2711c476311bc59 Mon Sep 17 00:00:00 2001 From: Demi Marie Obenour Date: Mon, 4 Sep 2023 22:14:09 -0400 Subject: [PATCH 03/44] Add new CHECK_BOOL macro This is useful when a failure return is not -1. 
--- runtime/init-container/src/init.c | 34 ++++++++++++++++++++----------- 1 file changed, 22 insertions(+), 12 deletions(-) diff --git a/runtime/init-container/src/init.c b/runtime/init-container/src/init.c index 6e81a473..57a97e68 100644 --- a/runtime/init-container/src/init.c +++ b/runtime/init-container/src/init.c @@ -113,6 +113,16 @@ static noreturn void die(void) { } } +#define CHECK_BOOL(x) ({ \ + __typeof__(x) _x = (x); \ + if (_x == 0) { \ + fprintf(stderr, "Error at %s:%d: %m\n", __FILE__, __LINE__); \ + die(); \ + } \ + _x; \ +}) + + #define CHECK(x) ({ \ __typeof__(x) _x = (x); \ if (_x == -1) { \ @@ -124,8 +134,8 @@ static noreturn void die(void) { static void load_module(const char* path) { int fd = CHECK(open(path, O_RDONLY | O_CLOEXEC)); - CHECK(syscall(SYS_finit_module, fd, "", 0)); - CHECK(close(fd)); + CHECK_BOOL(syscall(SYS_finit_module, fd, "", 0) == 0); + CHECK_BOOL(close(fd) == 0); } int make_nonblocking(int fd) { @@ -378,12 +388,12 @@ static int set_network_ns(char *entries[], int n) { fprintf(f, "search example.com\n"); for (int i = 0; i < n; ++i) { - fprintf(f, "nameserver %s\n", entries[i]); + CHECK_BOOL(fprintf(f, "nameserver %s\n", entries[i]) > 0); } - fflush(f); - fsync(fileno(f)); - fclose(f); + CHECK_BOOL(fflush(f) == 0); + CHECK_BOOL(fsync(fileno(f)) == 0); + CHECK_BOOL(fclose(f) == 0); return 0; } @@ -394,9 +404,9 @@ int write_sys(char *path, size_t value) { return -1; } - fprintf(f, "%ld", value); - fflush(f); - fclose(f); + CHECK_BOOL(fprintf(f, "%ld", value) > 0); + CHECK_BOOL(fflush(f) == 0); + CHECK_BOOL(fclose(f) == 0); return 0; } @@ -1634,9 +1644,9 @@ static void create_dir(const char *pathname, mode_t mode) { } int main(void) { - setbuf(stdin, NULL); - setbuf(stdout, NULL); - setbuf(stderr, NULL); + CHECK_BOOL(setvbuf(stdin, NULL, _IONBF, BUFSIZ) == 0); + CHECK_BOOL(setvbuf(stdout, NULL, _IONBF, BUFSIZ) == 0); + CHECK_BOOL(setvbuf(stderr, NULL, _IONBF, BUFSIZ) == 0); create_dir("/dev", DEFAULT_DIR_PERMS); 
CHECK(mount("devtmpfs", "/dev", "devtmpfs", MS_NOSUID, From 9856b2591b1bf32a334ec784be16f0b3a33a84c5 Mon Sep 17 00:00:00 2001 From: Demi Marie Obenour Date: Mon, 4 Sep 2023 22:14:41 -0400 Subject: [PATCH 04/44] Fix "direct" example It did not work due to an incorrect command line. --- runtime/examples/direct.rs | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/runtime/examples/direct.rs b/runtime/examples/direct.rs index d6c33967..c3b9fe3b 100644 --- a/runtime/examples/direct.rs +++ b/runtime/examples/direct.rs @@ -96,11 +96,10 @@ fn join_as_string>(path: P, file: impl ToString) -> String { fn spawn_vm<'a, P: AsRef>(temp_path: P, mount_args: &'a [(&'a str, impl ToString)]) -> Child { let root_dir = get_root_dir(); let project_dir = get_project_dir(); - let runtime_dir = project_dir.join("poc").join("runtime"); let init_dir = project_dir.join("init-container"); - let mut cmd = Command::new("vmrt"); - cmd.current_dir(runtime_dir).args([ + let mut cmd = Command::new("qemu-system-x86_64"); + cmd.current_dir(&init_dir).args([ "-m", "256m", "-nographic", From dbc6684071c303547429a961e4b0fe77ad486573 Mon Sep 17 00:00:00 2001 From: Demi Marie Obenour Date: Mon, 4 Sep 2023 22:15:23 -0400 Subject: [PATCH 05/44] Drop -enable-kvm and -cpu host This allows QEMU to fall back to TCG when hardware virtualization is not available, such as in Qubes OS. 
--- runtime/examples/direct.rs | 3 --- 1 file changed, 3 deletions(-) diff --git a/runtime/examples/direct.rs b/runtime/examples/direct.rs index c3b9fe3b..eb809af8 100644 --- a/runtime/examples/direct.rs +++ b/runtime/examples/direct.rs @@ -112,9 +112,6 @@ fn spawn_vm<'a, P: AsRef>(temp_path: P, mount_args: &'a [(&'a str, impl To "-no-reboot", "-net", "none", - "-enable-kvm", - "-cpu", - "host", "-smp", "1", "-append", From 7a122be8621396f526b71d22f9ff167cfc8d7642 Mon Sep 17 00:00:00 2001 From: Demi Marie Obenour Date: Mon, 4 Sep 2023 22:16:17 -0400 Subject: [PATCH 06/44] Remove "search example.com" from /etc/resolv.conf It can cause DNS queries to be resolved incorrectly. --- runtime/init-container/src/init.c | 1 - 1 file changed, 1 deletion(-) diff --git a/runtime/init-container/src/init.c b/runtime/init-container/src/init.c index 57a97e68..4df81f45 100644 --- a/runtime/init-container/src/init.c +++ b/runtime/init-container/src/init.c @@ -386,7 +386,6 @@ static int set_network_ns(char *entries[], int n) { return -1; } - fprintf(f, "search example.com\n"); for (int i = 0; i < n; ++i) { CHECK_BOOL(fprintf(f, "nameserver %s\n", entries[i]) > 0); } From d57132041fd0169e18141b27de593834ffc44b02 Mon Sep 17 00:00:00 2001 From: Demi Marie Obenour Date: Mon, 4 Sep 2023 22:17:30 -0400 Subject: [PATCH 07/44] Respect POSIX requirements for fork() and threads The parent process is multi-threaded, so in the child process after fork() only async-signal-safe interfaces can be used. 
--- runtime/init-container/src/init.c | 18 +++++++----------- 1 file changed, 7 insertions(+), 11 deletions(-) diff --git a/runtime/init-container/src/init.c b/runtime/init-container/src/init.c index 4df81f45..a9b677aa 100644 --- a/runtime/init-container/src/init.c +++ b/runtime/init-container/src/init.c @@ -586,20 +586,10 @@ static bool redirect_fd_to_path(int fd, const char* path) { // lives in a separate memory segment (after forking) static int child_pipe = -1; -static void close_child_pipe() { - if (child_pipe != -1) { - char c = '\0'; - /* Can't do anything with errors here. */ - (void)write(child_pipe, &c, sizeof(c)); - close(child_pipe); - } -} - static noreturn void child_wrapper(int parent_pipe[2], struct new_process_args* new_proc_args, struct redir_fd_desc fd_descs[3]) { child_pipe = parent_pipe[1]; - atexit(close_child_pipe); if (close(parent_pipe[0]) < 0) { goto out; @@ -658,7 +648,13 @@ static noreturn void child_wrapper(int parent_pipe[2], new_proc_args->envp ?: environ); out: - exit(errno); + if (child_pipe != -1) { + char c = '\0'; + /* Can't do anything with errors here. */ + (void)write(child_pipe, &c, sizeof(c)); + close(child_pipe); + } + _exit(errno); } /* 0 is considered an invalid ID. */ From ebb6cd6f3fa60a9799052a636566405164d79b80 Mon Sep 17 00:00:00 2001 From: Demi Marie Obenour Date: Mon, 4 Sep 2023 22:37:18 -0400 Subject: [PATCH 08/44] Only chroot() in the child process This ensures that the parent process's root directory is not located in a filesystem that will be untrusted in the future. 
--- runtime/init-container/src/init.c | 81 +++++++++++++++++++++---------- 1 file changed, 55 insertions(+), 26 deletions(-) diff --git a/runtime/init-container/src/init.c b/runtime/init-container/src/init.c index a9b677aa..e82e5d9a 100644 --- a/runtime/init-container/src/init.c +++ b/runtime/init-container/src/init.c @@ -49,9 +49,10 @@ #define DEV_VPN "eth0" #define DEV_INET "eth1" +#define SYSROOT "/mnt/newroot" #define MODE_RW_UGO (S_IRUSR | S_IWUSR | S_IRGRP | S_IWGRP | S_IROTH | S_IWOTH) -#define OUTPUT_PATH_PREFIX "/var/tmp/guest_agent_private/fds" +#define OUTPUT_PATH_PREFIX SYSROOT "/var/tmp/guest_agent_private/fds" #define NET_MEM_DEFAULT 1048576 #define NET_MEM_MAX 2097152 @@ -365,7 +366,7 @@ static void setup_agent_directories(void) { static int add_network_hosts(char *entries[][2], int n) { FILE *f; - if ((f = fopen("/etc/hosts", "a")) == 0) { + if ((f = fopen(SYSROOT "/etc/hosts", "a")) == 0) { return -1; } @@ -382,7 +383,7 @@ static int add_network_hosts(char *entries[][2], int n) { static int set_network_ns(char *entries[], int n) { FILE *f; - if ((f = fopen("/etc/resolv.conf", "w")) == 0) { + if ((f = fopen(SYSROOT "/etc/resolv.conf", "w")) == 0) { return -1; } @@ -603,15 +604,43 @@ static noreturn void child_wrapper(int parent_pipe[2], goto out; } +#ifdef MASSIVEDEBUGGING +#define X(a) do if (write(2, a "\n", sizeof(a)) != sizeof(a)) goto out; while (0) +#else +#define X(a) do (void)(a ""); while (0) +#endif + X("ENTERING CHROOT"); + if (chdir(SYSROOT) != 0 || chroot(".") != 0) { + goto out; + } + + X("chdir(\"/\")"); + if (chdir("/") != 0) { + goto out; + } + if (new_proc_args->cwd) { + X("chdir(\"command dir\")"); if (chdir(new_proc_args->cwd) < 0) { goto out; } } + X("fd processing"); for (int fd = 0; fd < 3; ++fd) { + X("processing an FD"); switch (fd_descs[fd].type) { case REDIRECT_FD_FILE: + X("redirecting an FD to a file"); + if (strncmp(fd_descs[fd].path, SYSROOT, sizeof SYSROOT - 1) != 0) + abort(); + fd_descs[fd].path += sizeof SYSROOT - 
1; +#ifdef MASSIVEDEBUGGING + if ((size_t)write(2, fd_descs[fd].path, strlen(fd_descs[fd].path)) != strlen(fd_descs[fd].path)) { + goto out; + } + X(""); +#endif if (!redirect_fd_to_path(fd, fd_descs[fd].path)) { goto out; } @@ -633,15 +662,18 @@ static noreturn void child_wrapper(int parent_pipe[2], } gid_t gid = new_proc_args->gid; + X("setresgid"); if (setresgid(gid, gid, gid) < 0) { goto out; } uid_t uid = new_proc_args->uid; + X("setresuid"); if (setresuid(uid, uid, uid) < 0) { goto out; } + X("execve"); /* If execve returns we know an error happened. */ (void)execve(new_proc_args->bin, new_proc_args->argv, @@ -1670,62 +1702,59 @@ int main(void) { g_cmds_fd = CHECK(open(VPORT_CMD, O_RDWR | O_CLOEXEC)); CHECK(mkdir("/mnt", S_IRWXU)); + CHECK(mkdir("/proc", S_IRWXU)); CHECK(mkdir("/mnt/image", S_IRWXU)); CHECK(mkdir("/mnt/overlay", S_IRWXU)); - CHECK(mkdir("/mnt/newroot", DEFAULT_DIR_PERMS)); + CHECK(mkdir(SYSROOT, DEFAULT_DIR_PERMS)); // 'workdir' and 'upperdir' have to be on the same filesystem CHECK(mount("tmpfs", "/mnt/overlay", "tmpfs", MS_NOSUID, - "mode=0777,size=128M")); + "mode=0700,size=128M")); CHECK(mkdir("/mnt/overlay/upper", S_IRWXU)); CHECK(mkdir("/mnt/overlay/work", S_IRWXU)); CHECK(mount("/dev/vda", "/mnt/image", "squashfs", MS_RDONLY, "")); - CHECK(mount("overlay", "/mnt/newroot", "overlay", 0, + CHECK(mount("overlay", SYSROOT, "overlay", 0, "lowerdir=/mnt/image,upperdir=/mnt/overlay/upper,workdir=/mnt/overlay/work")); - CHECK(umount2("/dev", MNT_DETACH)); - - CHECK(chdir("/mnt/newroot")); - CHECK(mount(".", "/", "none", MS_MOVE, NULL)); - CHECK(chroot(".")); - CHECK(chdir("/")); - - create_dir("/dev", DEFAULT_DIR_PERMS); - create_dir("/tmp", DEFAULT_DIR_PERMS); + create_dir(SYSROOT "/dev", DEFAULT_DIR_PERMS); + create_dir(SYSROOT "/tmp", DEFAULT_DIR_PERMS); CHECK(mount("proc", "/proc", "proc", MS_NODEV | MS_NOSUID | MS_NOEXEC, NULL)); - CHECK(mount("sysfs", "/sys", "sysfs", + CHECK(mount("proc", SYSROOT "/proc", "proc", + MS_NODEV | MS_NOSUID 
| MS_NOEXEC, + NULL)); + CHECK(mount("sysfs", SYSROOT "/sys", "sysfs", MS_NODEV | MS_NOSUID | MS_NOEXEC, NULL)); - CHECK(mount("devtmpfs", "/dev", "devtmpfs", + CHECK(mount("devtmpfs", SYSROOT "/dev", "devtmpfs", MS_NOSUID, "mode=0755,size=2M")); - CHECK(mount("tmpfs", "/tmp", "tmpfs", + CHECK(mount("tmpfs", SYSROOT "/tmp", "tmpfs", MS_NOSUID, "mode=0777")); - create_dir("/dev/pts", DEFAULT_DIR_PERMS); - create_dir("/dev/shm", DEFAULT_DIR_PERMS); + create_dir(SYSROOT "/dev/pts", DEFAULT_DIR_PERMS); + create_dir(SYSROOT "/dev/shm", DEFAULT_DIR_PERMS); - CHECK(mount("devpts", "/dev/pts", "devpts", + CHECK(mount("devpts", SYSROOT "/dev/pts", "devpts", MS_NOSUID | MS_NOEXEC, "gid=5,mode=0620")); - CHECK(mount("tmpfs", "/dev/shm", "tmpfs", + CHECK(mount("tmpfs", SYSROOT "/dev/shm", "tmpfs", MS_NODEV | MS_NOSUID | MS_NOEXEC, NULL)); - if (access("/dev/null", F_OK) != 0) { - CHECK(mknod("/dev/null", + if (access(SYSROOT "/dev/null", F_OK) != 0) { + CHECK(mknod(SYSROOT "/dev/null", MODE_RW_UGO | S_IFCHR, makedev(1, 3))); } - if (access("/dev/ptmx", F_OK) != 0) { - CHECK(mknod("/dev/ptmx", + if (access(SYSROOT "/dev/ptmx", F_OK) != 0) { + CHECK(mknod(SYSROOT "/dev/ptmx", MODE_RW_UGO | S_IFCHR, makedev(5, 2))); } From 2113f7f1f0009d31c75fa1433ce17fab33068cb8 Mon Sep 17 00:00:00 2001 From: Demi Marie Obenour Date: Tue, 5 Sep 2023 11:50:17 -0400 Subject: [PATCH 09/44] Check that paths start with / This avoids an assertion failure or out-of-bounds read in the event of malformed input from the host. 
--- runtime/init-container/src/init.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/runtime/init-container/src/init.c b/runtime/init-container/src/init.c index e82e5d9a..6d0ad07a 100644 --- a/runtime/init-container/src/init.c +++ b/runtime/init-container/src/init.c @@ -1130,7 +1130,7 @@ static void handle_mount(msg_id_t msg_id) { } } - if (!tag || !path) { + if (!tag || !path || path[0] != '/') { ret = EINVAL; goto out; } From ccbc2d7d219c1077fbec48e0841a9a5c9217353a Mon Sep 17 00:00:00 2001 From: Demi Marie Obenour Date: Tue, 5 Sep 2023 14:40:43 -0400 Subject: [PATCH 10/44] Use mkdirat() to create certain directories This is preparation for preventing symlink races in the future. --- runtime/init-container/src/init.c | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/runtime/init-container/src/init.c b/runtime/init-container/src/init.c index 6d0ad07a..31cecd97 100644 --- a/runtime/init-container/src/init.c +++ b/runtime/init-container/src/init.c @@ -59,6 +59,7 @@ #define MTU_VPN 1220 #define MTU_INET 65521 +static int g_sysroot_fd = AT_FDCWD; struct new_process_args { char* bin; @@ -1664,7 +1665,7 @@ static noreturn void main_loop(void) { } static void create_dir(const char *pathname, mode_t mode) { - if (mkdir(pathname, mode) < 0 && errno != EEXIST) { + if (mkdirat(g_sysroot_fd, pathname, mode) < 0 && errno != EEXIST) { fprintf(stderr, "mkdir(%s) failed with: %m\n", pathname); die(); } @@ -1719,8 +1720,11 @@ int main(void) { CHECK(mount("overlay", SYSROOT, "overlay", 0, "lowerdir=/mnt/image,upperdir=/mnt/overlay/upper,workdir=/mnt/overlay/work")); - create_dir(SYSROOT "/dev", DEFAULT_DIR_PERMS); - create_dir(SYSROOT "/tmp", DEFAULT_DIR_PERMS); + g_sysroot_fd = CHECK(open(SYSROOT, O_RDONLY | O_DIRECTORY | O_CLOEXEC)); + assert(g_sysroot_fd >= 3); + + create_dir("dev", DEFAULT_DIR_PERMS); + create_dir("tmp", DEFAULT_DIR_PERMS); CHECK(mount("proc", "/proc", "proc", MS_NODEV | MS_NOSUID | MS_NOEXEC, @@ -1738,8 +1742,8 
@@ int main(void) { MS_NOSUID, "mode=0777")); - create_dir(SYSROOT "/dev/pts", DEFAULT_DIR_PERMS); - create_dir(SYSROOT "/dev/shm", DEFAULT_DIR_PERMS); + create_dir("dev/pts", DEFAULT_DIR_PERMS); + create_dir("dev/shm", DEFAULT_DIR_PERMS); CHECK(mount("devpts", SYSROOT "/dev/pts", "devpts", MS_NOSUID | MS_NOEXEC, From 70eb4bcce51e79041f02f6e3852e385a0b1e950d Mon Sep 17 00:00:00 2001 From: Demi Marie Obenour Date: Tue, 5 Sep 2023 16:45:04 -0400 Subject: [PATCH 11/44] Add missing error checks in add_network_hosts Errors from fprintf(), fflush(), fsync(), and fclose() were ignored. --- runtime/init-container/src/init.c | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/runtime/init-container/src/init.c b/runtime/init-container/src/init.c index 31cecd97..62dd5ad2 100644 --- a/runtime/init-container/src/init.c +++ b/runtime/init-container/src/init.c @@ -372,12 +372,20 @@ static int add_network_hosts(char *entries[][2], int n) { } for (int i = 0; i < n; ++i) { - fprintf(f, "%s\t%s\n", entries[i][0], entries[i][1]); + if (fprintf(f, "%s\t%s\n", entries[i][0], entries[i][1]) < 2) { + return -1; + } } - fflush(f); - fsync(fileno(f)); - fclose(f); + if (fflush(f)) { + return -1; + } + if (fsync(fileno(f))) { + return -1; + } + if (fclose(f)) { + return -1; + } return 0; } From a41d9979f9290b29d406cde16ce7d79b05bd6d3f Mon Sep 17 00:00:00 2001 From: Demi Marie Obenour Date: Tue, 5 Sep 2023 16:50:59 -0400 Subject: [PATCH 12/44] Ensure that status_pipe is initialized before use If create_process_fds_dir() failed, status_pipe would be used uninitialized. 
--- runtime/init-container/src/init.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/runtime/init-container/src/init.c b/runtime/init-container/src/init.c index 62dd5ad2..ec852ee2 100644 --- a/runtime/init-container/src/init.c +++ b/runtime/init-container/src/init.c @@ -747,6 +747,7 @@ static uint32_t spawn_new_process(struct new_process_args* new_proc_args, for (size_t fd = 0; fd < 3; ++fd) { proc_desc->redirs[fd].type = REDIRECT_FD_INVALID; } + int status_pipe[2] = { -1, -1 }; proc_desc->id = get_next_id(); if (create_process_fds_dir(proc_desc->id) < 0) { @@ -756,7 +757,6 @@ static uint32_t spawn_new_process(struct new_process_args* new_proc_args, /* All these shenanigans with pipes are so that we can distinguish internal * failures from spawned process exiting. */ - int status_pipe[2] = { -1, -1 }; if (pipe2(status_pipe, O_CLOEXEC | O_DIRECT) < 0) { ret = errno; goto out_err; From 5c19686bbb220a9ff24685718f65569647611fa9 Mon Sep 17 00:00:00 2001 From: Demi Marie Obenour Date: Tue, 5 Sep 2023 16:52:58 -0400 Subject: [PATCH 13/44] Add logging in case process spawning fails This allows debugging problems in this process. --- runtime/init-container/src/init.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/runtime/init-container/src/init.c b/runtime/init-container/src/init.c index ec852ee2..7f53402f 100644 --- a/runtime/init-container/src/init.c +++ b/runtime/init-container/src/init.c @@ -752,6 +752,7 @@ static uint32_t spawn_new_process(struct new_process_args* new_proc_args, proc_desc->id = get_next_id(); if (create_process_fds_dir(proc_desc->id) < 0) { ret = errno; + fprintf(stderr, "Failed to create file descriptor directory: %m\n"); goto out_err; } @@ -759,6 +760,7 @@ static uint32_t spawn_new_process(struct new_process_args* new_proc_args, * failures from spawned process exiting. 
*/ if (pipe2(status_pipe, O_CLOEXEC | O_DIRECT) < 0) { ret = errno; + fprintf(stderr, "Failed to create status pipe: %m\n"); goto out_err; } @@ -800,6 +802,7 @@ static uint32_t spawn_new_process(struct new_process_args* new_proc_args, if (pipe2(proc_desc->redirs[fd].buffer.fds, O_CLOEXEC) < 0) { ret = errno; + fprintf(stderr, "Failed to create redirection pipe: %m\n"); goto out_err; } break; @@ -811,6 +814,7 @@ static uint32_t spawn_new_process(struct new_process_args* new_proc_args, p = fork(); if (p < 0) { ret = errno; + fprintf(stderr, "Failed to fork: %m\n"); goto out_err; } else if (p == 0) { child_wrapper(status_pipe, new_proc_args, proc_desc->redirs); @@ -823,8 +827,10 @@ static uint32_t spawn_new_process(struct new_process_args* new_proc_args, ssize_t x = read(status_pipe[0], &c, sizeof(c)); if (x < 0) { ret = errno; + fprintf(stderr, "Failed to read from pipe: %m\n"); goto out_err; } else if (x > 0) { + fprintf(stderr, "Failed to spawn process\n"); /* Process failed to spawn. */ int status = 0; CHECK(waitpid(p, &status, 0)); @@ -854,6 +860,7 @@ static uint32_t spawn_new_process(struct new_process_args* new_proc_args, &epoll_fd_descs[fd]) < 0) { if (errno == ENOMEM || errno == ENOSPC) { ret = errno; + fprintf(stderr, "Failed to add epoll descriptor: %m\n"); goto out_err; } CHECK(-1); From f6e53b16be7a2b7a2315fd8b259aba7845d16577 Mon Sep 17 00:00:00 2001 From: Demi Marie Obenour Date: Tue, 5 Sep 2023 20:30:24 -0400 Subject: [PATCH 14/44] Ensure that mounts are free from symlink races This ensures that mounts are safe, even if the malicious code is concurrently renaming directories. This prevents QSB-015-style attacks. 
--- runtime/init-container/src/init.c | 122 ++++++++++++++++++++++-------- 1 file changed, 92 insertions(+), 30 deletions(-) diff --git a/runtime/init-container/src/init.c b/runtime/init-container/src/init.c index 7f53402f..0eb29dc9 100644 --- a/runtime/init-container/src/init.c +++ b/runtime/init-container/src/init.c @@ -21,6 +21,7 @@ #include #include #include +#include #include "communication.h" #include "cyclic_buffer.h" @@ -52,7 +53,7 @@ #define SYSROOT "/mnt/newroot" #define MODE_RW_UGO (S_IRUSR | S_IWUSR | S_IRGRP | S_IWGRP | S_IROTH | S_IWOTH) -#define OUTPUT_PATH_PREFIX SYSROOT "/var/tmp/guest_agent_private/fds" +#define OUTPUT_PATH_PREFIX "/var/tmp/guest_agent_private/fds" #define NET_MEM_DEFAULT 1048576 #define NET_MEM_MAX 2097152 @@ -330,27 +331,59 @@ static void setup_sigfd(void) { g_sig_fd = CHECK(signalfd(g_sig_fd, &set, SFD_CLOEXEC)); } -static int create_dir_path(char* path) { +static int create_dir_path(char* path, int perms, int *out_fd) { assert(path[0] == '/'); char* next = path; - while (1) { - next = strchr(next + 1, '/'); - if (!next) { - break; + int fd = g_sysroot_fd; + int rc = -1; + char *prev; + do { + next++; + prev = next; + next = strchr(next, '/'); + if (next != NULL) { + *next = '\0'; } - *next = '\0'; - int ret = mkdir(path, DEFAULT_DIR_PERMS); - *next = '/'; - if (ret < 0 && errno != EEXIST) { - return -1; + if (*prev == '\0' || strcmp(prev, ".") == 0 || strcmp(prev, "..") == 0) { + fprintf(stderr, "Invalid path component '%s'\n", prev); + errno = EINVAL; + goto fail; + } + int ret = mkdirat(fd, prev, perms); + if (ret != 0 && errno != EEXIST) { + int tmp = errno; + assert(errno != EBADF); + fprintf(stderr, "mkdirat() failed: %m\n"); + errno = tmp; + goto fail; } - } - if (mkdir(path, DEFAULT_DIR_PERMS) < 0 && errno != EEXIST) { - return -1; - } - return 0; + int new_fd = openat(fd, prev, O_DIRECTORY | O_RDONLY | O_NOFOLLOW | O_CLOEXEC); + if (new_fd == -1) { + int tmp = errno; + assert(tmp != EBADF); + fprintf(stderr, "openat() 
failed: %m\n"); + errno = tmp; + goto fail; + } + if (fd != g_sysroot_fd) { + close(fd); + } + fd = new_fd; + } while (next); + rc = 0; + if (out_fd) { + *out_fd = fd; + fd = g_sysroot_fd; + } +fail: + if (fd != g_sysroot_fd) { + int save = errno; + close(fd); + errno = save; + } + return rc; } static void setup_agent_directories(void) { @@ -360,7 +393,7 @@ static void setup_agent_directories(void) { die(); } - CHECK(create_dir_path(path)); + CHECK(create_dir_path(path, DEFAULT_DIR_PERMS, NULL)); free(path); } @@ -613,24 +646,38 @@ static noreturn void child_wrapper(int parent_pipe[2], goto out; } +#define MASSIVEDEBUGGING #ifdef MASSIVEDEBUGGING -#define X(a) do if (write(2, a "\n", sizeof(a)) != sizeof(a)) goto out; while (0) +#define X(a) do { \ + int tmp = errno;\ + if (write(2, a "\n", sizeof(a)) != sizeof(a)) { \ + errno = tmp; \ + goto out; \ + } \ + errno = tmp; \ +} while (0) #else #define X(a) do (void)(a ""); while (0) #endif X("ENTERING CHROOT"); - if (chdir(SYSROOT) != 0 || chroot(".") != 0) { + if (chdir(SYSROOT) != 0) { + X("cannot enter " SYSROOT); + goto out; + } + if (chroot(".") != 0) { + X("cannot chroot(\".\")"); goto out; } - X("chdir(\"/\")"); if (chdir("/") != 0) { + X("cannot chdir(\"/\")"); goto out; } if (new_proc_args->cwd) { X("chdir(\"command dir\")"); if (chdir(new_proc_args->cwd) < 0) { + X("cannot chdir"); goto out; } } @@ -641,9 +688,6 @@ static noreturn void child_wrapper(int parent_pipe[2], switch (fd_descs[fd].type) { case REDIRECT_FD_FILE: X("redirecting an FD to a file"); - if (strncmp(fd_descs[fd].path, SYSROOT, sizeof SYSROOT - 1) != 0) - abort(); - fd_descs[fd].path += sizeof SYSROOT - 1; #ifdef MASSIVEDEBUGGING if ((size_t)write(2, fd_descs[fd].path, strlen(fd_descs[fd].path)) != strlen(fd_descs[fd].path)) { goto out; @@ -687,6 +731,8 @@ static noreturn void child_wrapper(int parent_pipe[2], (void)execve(new_proc_args->bin, new_proc_args->argv, new_proc_args->envp ?: environ); + X("execve failed"); + out: if (child_pipe != 
-1) { @@ -710,7 +756,7 @@ static int create_process_fds_dir(uint64_t id) { return -1; } - if (mkdir(path, S_IRWXU) < 0) { + if (create_dir_path(path, S_IRWXU, NULL) < 0) { int tmp = errno; free(path); errno = tmp; @@ -737,11 +783,13 @@ static uint32_t spawn_new_process(struct new_process_args* new_proc_args, struct epoll_fd_desc* epoll_fd_descs[3] = { NULL }; if (new_proc_args->is_entrypoint && g_entrypoint_desc) { + fprintf(stderr, "Caller bug, returning EEXIST\n"); return EEXIST; } struct process_desc* proc_desc = calloc(1, sizeof(*proc_desc)); if (!proc_desc) { + fprintf(stderr, "Memory allocation failed\n"); return ENOMEM; } for (size_t fd = 0; fd < 3; ++fd) { @@ -772,6 +820,7 @@ static uint32_t spawn_new_process(struct new_process_args* new_proc_args, proc_desc->redirs[fd].path = strdup(fd_descs[fd].path); if (!proc_desc->redirs[fd].path) { ret = errno; + fprintf(stderr, "Memory allocation failed\n"); goto out_err; } } else { @@ -779,13 +828,15 @@ static uint32_t spawn_new_process(struct new_process_args* new_proc_args, construct_output_path(proc_desc->id, fd); if (!proc_desc->redirs[fd].path) { ret = errno; + fprintf(stderr, "Cannot construct output path: %m\n"); goto out_err; } - int tmp_fd = open(proc_desc->redirs[fd].path, - O_RDWR | O_CREAT | O_EXCL, + int tmp_fd = openat(g_sysroot_fd, proc_desc->redirs[fd].path + 1, + O_RDWR | O_CREAT | O_EXCL | O_CLOEXEC | O_NOCTTY, S_IRWXU); if (tmp_fd < 0 || close(tmp_fd) < 0) { ret = errno; + fprintf(stderr, "Cannot open %s: %m\n", proc_desc->redirs[fd].path); goto out_err; } } @@ -875,6 +926,7 @@ static uint32_t spawn_new_process(struct new_process_args* new_proc_args, *id = proc_desc->id; + fprintf(stderr, "Adding process with id %" PRIu64 "\n", *id); add_process(proc_desc); if (new_proc_args->is_entrypoint) { g_entrypoint_desc = proc_desc; @@ -1109,13 +1161,21 @@ static void handle_kill_process(msg_id_t msg_id) { } static uint32_t do_mount(const char* tag, char* path) { - if (create_dir_path(path) < 0) { + int fd; 
+ char buf[sizeof "/proc/self/fd/" + 10]; + if (create_dir_path(path, DEFAULT_DIR_PERMS, &fd) < 0) { return errno; } - if (mount(tag, path, "9p", 0, "trans=virtio,version=9p2000.L") < 0) { - return errno; + CHECK_BOOL(fd > 2); + int res = snprintf(buf, sizeof buf, "/proc/self/fd/%d", fd); + CHECK_BOOL(res >= (int)sizeof "/proc/self/fd/" && res < (int)sizeof buf); + if (mount(tag, buf, "9p", 0, "trans=virtio,version=9p2000.L") < 0) { + res = errno; + } else { + res = 0; } - return 0; + close(fd); + return res; } static void handle_mount(msg_id_t msg_id) { @@ -1262,12 +1322,14 @@ static void handle_query_output(msg_id_t msg_id) { } if (!id || !len || !fd || fd > 2) { + fprintf(stderr, "caller bug, returning EINVAL\n"); ret = EINVAL; goto out_err; } struct process_desc* proc_desc = find_process_by_id(id); if (!proc_desc) { + fprintf(stderr, "no process, returning ESRCH\n"); ret = ESRCH; goto out_err; } From 60f615f9ce21b8a2bc1cb31b7711f50b34d771e2 Mon Sep 17 00:00:00 2001 From: Demi Marie Obenour Date: Tue, 5 Sep 2023 20:31:51 -0400 Subject: [PATCH 15/44] Make gcc check for possibly uninitialized variables --- runtime/init-container/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/runtime/init-container/Makefile b/runtime/init-container/Makefile index cc91055c..921efd05 100644 --- a/runtime/init-container/Makefile +++ b/runtime/init-container/Makefile @@ -1,7 +1,7 @@ CC := musl-gcc CXX := /bin/false # -MMD to create dependency files (*.d) on first compilation -CFLAGS := -MMD -std=c11 -O2 -Wall -Wextra -Werror -fPIE -pie -Iinclude/ +CFLAGS := -MMD -std=c11 -O2 -Wall -Wextra -Werror -fPIE -pie -Iinclude/ -Wmaybe-uninitialized ifneq ($(DEBUG), "") CFLAGS += -DNDEBUG From b37130d365eb9f26b2a9852fff8755b5fb1c302a Mon Sep 17 00:00:00 2001 From: Demi Marie Obenour Date: Tue, 5 Sep 2023 21:44:11 -0400 Subject: [PATCH 16/44] Ensure all opens are contained in sysroot This is done using openat2() with RESOLVE_IN_ROOT. 
To work around a limitation of Arch Linux's musl 1.2.4-1 package, the needed definitions from linux/openat2.h are open-coded. The only opens that do not use RESOLVE_IN_ROOT are either outside of the chroot or are before any processes are launched. --- runtime/init-container/src/init.c | 57 ++++++++++++++++++++++--------- 1 file changed, 41 insertions(+), 16 deletions(-) diff --git a/runtime/init-container/src/init.c b/runtime/init-container/src/init.c index 0eb29dc9..8abc1ea6 100644 --- a/runtime/init-container/src/init.c +++ b/runtime/init-container/src/init.c @@ -22,6 +22,7 @@ #include #include #include +#include #include "communication.h" #include "cyclic_buffer.h" @@ -32,9 +33,6 @@ #define CONTAINER_OF(ptr, type, member) (type*)((char*)(ptr) - offsetof(type, member)) -// XXX: maybe obtain this with sysconf? -#define PAGE_SIZE 0x1000 - #define DEFAULT_UID 0 #define DEFAULT_GID 0 #define DEFAULT_OUT_FILE_PERM S_IRWXU @@ -167,6 +165,25 @@ int make_cloexec(int fd) { } */ +static int open_relative(const char *path, uint64_t flags, uint64_t mode) { + /* + * Arch's musl 1.2.4-1 doesn't include linux/openat2.h, so + * open-code the parts that are needed. 
+ */ + struct { + uint64_t flags; + uint64_t mode; + uint64_t resolve; + } how; + memset(&how, 0, sizeof how); + how.flags = flags | O_NOCTTY | O_CLOEXEC; + how.mode = mode; + how.resolve = 0x10 /* RESOLVE_IN_ROOT */; + long r = syscall(SYS_openat2, g_sysroot_fd, path, &how, sizeof how); + CHECK_BOOL(r >= -1 && r <= INT_MAX); + return r; +} + static void cleanup_fd_desc(struct redir_fd_desc* fd_desc) { switch (fd_desc->type) { case REDIRECT_FD_FILE: @@ -189,18 +206,20 @@ static void cleanup_fd_desc(struct redir_fd_desc* fd_desc) { } static bool redir_buffers_empty(struct redir_fd_desc *redirs, size_t len) { - FILE *f; for (size_t fd = 0; fd < len; ++fd) { switch (redirs[fd].type) { - case REDIRECT_FD_FILE: - if ((f = fopen(redirs[fd].path, "r")) == 0) { + case REDIRECT_FD_FILE:; + int this_fd = open_relative(redirs[fd].path, O_RDONLY, 0); + if (this_fd == -1) { continue; } - fseek(f, 0, SEEK_END); - bool empty = ftell(f) == 0; - fclose(f); - - if (!empty) { + struct stat statbuf; + int res = fstat(this_fd, &statbuf); + close(this_fd); + if (res != 0) { + continue; + } + if (statbuf.st_size) { return false; } break; @@ -312,6 +331,7 @@ static void handle_sigchld(void) { } if (redir_buffers_empty(proc_desc->redirs, 3)) { + fprintf(stderr, "Deleting process %" PRIu64 "\n", proc_desc->id); delete_proc(proc_desc); } } @@ -596,12 +616,17 @@ static int del_epoll_fd_desc(struct epoll_fd_desc* epoll_fd_desc) { * Returns whether call was successful (setting errno on failures). 
*/ static bool redirect_fd_to_path(int fd, const char* path) { assert(fd == 0 || fd == 1 || fd == 2); + if (path[0] != '/' || path[1] == '/') { + errno = EINVAL; + return false; + } + path++; int source_fd = -1; if (fd == 0) { - source_fd = open(path, O_RDONLY); + source_fd = open_relative(path, O_RDONLY, 0); } else { - source_fd = open(path, O_WRONLY | O_CREAT, DEFAULT_OUT_FILE_PERM); + source_fd = open_relative(path, O_WRONLY | O_CREAT, DEFAULT_OUT_FILE_PERM); } if (source_fd < 0) { @@ -831,7 +856,7 @@ static uint32_t spawn_new_process(struct new_process_args* new_proc_args, fprintf(stderr, "Cannot construct output path: %m\n"); goto out_err; } - int tmp_fd = openat(g_sysroot_fd, proc_desc->redirs[fd].path + 1, + int tmp_fd = open_relative(proc_desc->redirs[fd].path, O_RDWR | O_CREAT | O_EXCL | O_CLOEXEC | O_NOCTTY, S_IRWXU); if (tmp_fd < 0 || close(tmp_fd) < 0) { @@ -1229,7 +1254,7 @@ static uint32_t do_query_output_path(char* path, uint64_t off, char** buf_ptr, char* buf = MAP_FAILED; size_t len = 0; - int fd = open(path, O_RDONLY); + int fd = open_relative(path, O_RDONLY, 0); if (fd < 0) { return errno; } @@ -1329,7 +1354,7 @@ static void handle_query_output(msg_id_t msg_id) { struct process_desc* proc_desc = find_process_by_id(id); if (!proc_desc) { - fprintf(stderr, "no process, returning ESRCH\n"); + fprintf(stderr, "no process %" PRIu64 ", returning ESRCH\n", id); ret = ESRCH; goto out_err; } From aad4abd564e851a17f8702fe08c5b6172678164b Mon Sep 17 00:00:00 2001 From: Demi Marie Obenour Date: Thu, 7 Sep 2023 16:39:04 -0400 Subject: [PATCH 17/44] Create new user namespace This ensures that child processes have no capabilities(7) in the initial user namespace, and therefore cannot perform certain privileged operations such as mounting block devices, loading kernel modules, or invoking dangerous ioctls. 
--- runtime/init-container/Makefile | 2 +- runtime/init-container/src/init.c | 69 +++++++++++++++++++++++++++++-- 2 files changed, 67 insertions(+), 4 deletions(-) diff --git a/runtime/init-container/Makefile b/runtime/init-container/Makefile index 921efd05..58d7bcb3 100644 --- a/runtime/init-container/Makefile +++ b/runtime/init-container/Makefile @@ -1,7 +1,7 @@ CC := musl-gcc CXX := /bin/false # -MMD to create dependency files (*.d) on first compilation -CFLAGS := -MMD -std=c11 -O2 -Wall -Wextra -Werror -fPIE -pie -Iinclude/ -Wmaybe-uninitialized +CFLAGS := -MMD -std=c11 -O2 -Wall -Wextra -Werror -fPIE -pie -Iinclude/ -Wmaybe-uninitialized -Iunpacked_headers/usr/include ifneq ($(DEBUG), "") CFLAGS += -DNDEBUG diff --git a/runtime/init-container/src/init.c b/runtime/init-container/src/init.c index 8abc1ea6..b7915c27 100644 --- a/runtime/init-container/src/init.c +++ b/runtime/init-container/src/init.c @@ -23,6 +23,9 @@ #include #include #include +#include +#include +#include #include "communication.h" #include "cyclic_buffer.h" @@ -284,6 +287,9 @@ static struct exit_reason encode_status(int status, int type) { return exit_reason; } +pid_t global_zombie_pid = -1; +pid_t global_pidfd = -1; + static void handle_sigchld(void) { struct signalfd_siginfo siginfo = { 0 }; @@ -299,6 +305,10 @@ static void handle_sigchld(void) { } pid_t child_pid = (pid_t)siginfo.ssi_pid; + if (child_pid == global_zombie_pid) { + /* This process is deliberately kept as a zombie, ignore it */ + return; + } if (siginfo.ssi_code != CLD_EXITED && siginfo.ssi_code != CLD_KILLED @@ -654,6 +664,10 @@ static bool redirect_fd_to_path(int fd, const char* path) { // lives in a separate memory segment (after forking) static int child_pipe = -1; +#define NAMESPACES \ + (CLONE_NEWUSER | /* new user namespace */ \ + 0) + static noreturn void child_wrapper(int parent_pipe[2], struct new_process_args* new_proc_args, struct redir_fd_desc fd_descs[3]) { @@ -670,13 +684,10 @@ static noreturn void 
child_wrapper(int parent_pipe[2], if (sigprocmask(SIG_SETMASK, &set, NULL) < 0) { goto out; } - -#define MASSIVEDEBUGGING #ifdef MASSIVEDEBUGGING #define X(a) do { \ int tmp = errno;\ if (write(2, a "\n", sizeof(a)) != sizeof(a)) { \ - errno = tmp; \ goto out; \ } \ errno = tmp; \ @@ -684,6 +695,12 @@ static noreturn void child_wrapper(int parent_pipe[2], #else #define X(a) do (void)(a ""); while (0) #endif + X("ENTERING NAMESPACE"); + if (setns(global_pidfd, NAMESPACES)) { + X("CANNOT ENTER NAMESPACE"); + goto out; + } + X("ENTERING CHROOT"); if (chdir(SYSROOT) != 0) { X("cannot enter " SYSROOT); @@ -1773,6 +1790,51 @@ static void create_dir(const char *pathname, mode_t mode) { } } +static void get_namespace_fd(void) { + char buf[sizeof "/proc//uid_map" + 10]; + struct clone_args args = { + .flags = CLONE_CLEAR_SIGHAND | + CLONE_PIDFD | /* alloc a PID FD */ + NAMESPACES, + .pidfd = (uint64_t)&global_pidfd, + .child_tid = 0, + .parent_tid = 0, + .exit_signal = (uint64_t)SIGCHLD, + .stack = 0, + .stack_size = 0, + .tls = 0, + .set_tid = 0, + .set_tid_size = 0, + .cgroup = 0, + }; + sigset_t set; + CHECK(sigemptyset(&set)); + errno = 0; + global_zombie_pid = syscall(SYS_clone3, &args, sizeof args); + CHECK_BOOL(global_zombie_pid >= 0); + if (global_zombie_pid == 0) { + for (;;) { + const struct timespec x = { + .tv_sec = INT32_MAX, + .tv_nsec = 999999999, + }; + (void)(nanosleep(&x, NULL)); + } + } + /* parent */ + CHECK(global_pidfd); + int snprintf_res = snprintf(buf, sizeof buf, "/proc/%d/uid_map", global_zombie_pid); + CHECK_BOOL(snprintf_res > (int)sizeof buf - 10); + CHECK_BOOL(snprintf_res < (int)sizeof buf); + for (int i = 0; i < 2; ++i) { + int uidmapfd = CHECK(open(buf, O_NOFOLLOW | O_CLOEXEC | O_NOCTTY | O_WRONLY)); +#define UIDMAP "0 0 4294967295" + CHECK_BOOL(write(uidmapfd, UIDMAP, sizeof UIDMAP - 1) == sizeof UIDMAP - 1); + CHECK_BOOL(close(uidmapfd) == 0); + buf[snprintf_res - 7] = 'g'; + } +} + int main(void) { CHECK_BOOL(setvbuf(stdin, NULL, _IONBF, 
BUFSIZ) == 0); CHECK_BOOL(setvbuf(stdout, NULL, _IONBF, BUFSIZ) == 0); @@ -1869,6 +1931,7 @@ int main(void) { setup_agent_directories(); block_signals(); + get_namespace_fd(); setup_sigfd(); main_loop(); From fda190e1d4a62fe41b67ef2cdf7665dd3f312083 Mon Sep 17 00:00:00 2001 From: Demi Marie Obenour Date: Thu, 7 Sep 2023 19:18:33 -0400 Subject: [PATCH 18/44] Stop waiting for output notification It never comes and the process deadlocks. --- runtime/examples/direct.rs | 1 - 1 file changed, 1 deletion(-) diff --git a/runtime/examples/direct.rs b/runtime/examples/direct.rs index eb809af8..b9ed59ad 100644 --- a/runtime/examples/direct.rs +++ b/runtime/examples/direct.rs @@ -63,7 +63,6 @@ async fn run_process_with_output( .expect("Run process failed"); println!("Spawned process with id: {}", id); notifications.process_died.notified().await; - notifications.output_available.notified().await; match ga.query_output(id, 1, 0, u64::MAX).await? { Ok(out) => { println!("Output:"); From 75c21cde0dd6779d819da400a6697ad3a5bfcb4c Mon Sep 17 00:00:00 2001 From: Demi Marie Obenour Date: Thu, 7 Sep 2023 19:18:51 -0400 Subject: [PATCH 19/44] Move the quit call further down This allows successful execution of the "direct" example. --- runtime/examples/direct.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/runtime/examples/direct.rs b/runtime/examples/direct.rs index b9ed59ad..0940c5fc 100644 --- a/runtime/examples/direct.rs +++ b/runtime/examples/direct.rs @@ -319,8 +319,6 @@ async fn main() -> io::Result<()> { .expect("Output query failed"); println!("Big output 2: {}, expected 0", out.len()); - // ga.quit().await?.expect("Quit failed"); - let id = ga .run_entrypoint("/bin/sleep", &["sleep", "2"], None, 0, 0, &no_redir, None) .await? @@ -328,6 +326,8 @@ async fn main() -> io::Result<()> { println!("Spawned process with id: {}", id); notifications.process_died.notified().await; + ga.quit().await?.expect("Quit failed"); + /* VM should quit now. 
*/ let e = child.wait().await.expect("failed to wait on child"); println!("{:?}", e); From 1764d3c924d3434bc92eca3dceb3bc6f7992b3fb Mon Sep 17 00:00:00 2001 From: Demi Marie Obenour Date: Thu, 7 Sep 2023 19:28:12 -0400 Subject: [PATCH 20/44] examples/direct: Make the final sleep command run for longer This reduces the likelihood of a deadlock --- runtime/examples/direct.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/runtime/examples/direct.rs b/runtime/examples/direct.rs index 0940c5fc..069a0144 100644 --- a/runtime/examples/direct.rs +++ b/runtime/examples/direct.rs @@ -320,7 +320,7 @@ async fn main() -> io::Result<()> { println!("Big output 2: {}, expected 0", out.len()); let id = ga - .run_entrypoint("/bin/sleep", &["sleep", "2"], None, 0, 0, &no_redir, None) + .run_entrypoint("/bin/sleep", &["sleep", "100"], None, 0, 0, &no_redir, None) .await? .expect("Run process failed"); println!("Spawned process with id: {}", id); From 385f51afceda746fec1df2aea695462feadf2632 Mon Sep 17 00:00:00 2001 From: Demi Marie Obenour Date: Thu, 7 Sep 2023 19:32:49 -0400 Subject: [PATCH 21/44] Use modern Cargo package resolver This avoids a warning from Cargo. --- Cargo.toml | 1 + 1 file changed, 1 insertion(+) diff --git a/Cargo.toml b/Cargo.toml index a4f7a94b..c4e2676d 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -3,6 +3,7 @@ members = [ "runtime", "gvmkit", ] +resolver = "2" [patch.crates-io] ya-runtime-sdk = { git = "https://github.com/golemfactory/ya-runtime-sdk.git", rev = "0395b0c704ef644d7f0554ac41e319f03b11c068" } From 351054f111e40b400bd52626f6afc00b05c4b04d Mon Sep 17 00:00:00 2001 From: Demi Marie Obenour Date: Thu, 7 Sep 2023 22:31:47 -0400 Subject: [PATCH 22/44] Avoid writing junk into stderr log This is reserved for the program being run.
--- runtime/init-container/src/init.c | 4 ---- 1 file changed, 4 deletions(-) diff --git a/runtime/init-container/src/init.c b/runtime/init-container/src/init.c index b7915c27..ef150326 100644 --- a/runtime/init-container/src/init.c +++ b/runtime/init-container/src/init.c @@ -757,23 +757,19 @@ static noreturn void child_wrapper(int parent_pipe[2], } gid_t gid = new_proc_args->gid; - X("setresgid"); if (setresgid(gid, gid, gid) < 0) { goto out; } uid_t uid = new_proc_args->uid; - X("setresuid"); if (setresuid(uid, uid, uid) < 0) { goto out; } - X("execve"); /* If execve returns we know an error happened. */ (void)execve(new_proc_args->bin, new_proc_args->argv, new_proc_args->envp ?: environ); - X("execve failed"); out: From 8cf02988d8151bdb748f56c1c3d4435b391b093c Mon Sep 17 00:00:00 2001 From: Demi Marie Obenour Date: Thu, 7 Sep 2023 22:32:07 -0400 Subject: [PATCH 23/44] Add more debug logging No change in behavior unless MASSIVEDEBUGGING is defined. --- runtime/init-container/src/init.c | 29 ++++++++++++++++++----------- 1 file changed, 18 insertions(+), 11 deletions(-) diff --git a/runtime/init-container/src/init.c b/runtime/init-container/src/init.c index ef150326..5279d3b7 100644 --- a/runtime/init-container/src/init.c +++ b/runtime/init-container/src/init.c @@ -672,29 +672,33 @@ static noreturn void child_wrapper(int parent_pipe[2], struct new_process_args* new_proc_args, struct redir_fd_desc fd_descs[3]) { child_pipe = parent_pipe[1]; +#define MASSIVEDEBUGGING +#ifdef MASSIVEDEBUGGING +#define X(a) do { \ + int tmp = errno;\ + if (write(2, a "\n", sizeof(a)) != sizeof(a)) { \ + goto out; \ + } \ + errno = tmp; \ +} while (0) +#else +#define X(a) do (void)(a ""); while (0) +#endif if (close(parent_pipe[0]) < 0) { + X("close problem"); goto out; } sigset_t set; if (sigemptyset(&set) < 0) { + X("sigemptyset problem"); goto out; } if (sigprocmask(SIG_SETMASK, &set, NULL) < 0) { + X("sigprocmask problem"); goto out; } -#ifdef MASSIVEDEBUGGING -#define X(a) do { \ 
- int tmp = errno;\ - if (write(2, a "\n", sizeof(a)) != sizeof(a)) { \ - goto out; \ - } \ - errno = tmp; \ -} while (0) -#else -#define X(a) do (void)(a ""); while (0) -#endif X("ENTERING NAMESPACE"); if (setns(global_pidfd, NAMESPACES)) { X("CANNOT ENTER NAMESPACE"); @@ -743,14 +747,17 @@ static noreturn void child_wrapper(int parent_pipe[2], case REDIRECT_FD_PIPE_BLOCKING: case REDIRECT_FD_PIPE_CYCLIC: if (dup2(fd_descs[fd].buffer.fds[fd ? 1 : 0], fd) < 0) { + X("dup2 problem"); goto out; } if (close(fd_descs[fd].buffer.fds[0]) < 0 || close(fd_descs[fd].buffer.fds[1]) < 0) { + X("close problem"); goto out; } break; default: + X("bad command"); errno = ENOTRECOVERABLE; goto out; } From 77432c6e438d11328169744912f5621838a6dbc4 Mon Sep 17 00:00:00 2001 From: Demi Marie Obenour Date: Thu, 7 Sep 2023 22:35:14 -0400 Subject: [PATCH 24/44] Check that access(2) failed with ENOENT Any other error is fatal. --- runtime/init-container/src/init.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/runtime/init-container/src/init.c b/runtime/init-container/src/init.c index 5279d3b7..5f044834 100644 --- a/runtime/init-container/src/init.c +++ b/runtime/init-container/src/init.c @@ -1920,11 +1920,13 @@ int main(void) { NULL)); if (access(SYSROOT "/dev/null", F_OK) != 0) { + CHECK_BOOL(errno == ENOENT); CHECK(mknod(SYSROOT "/dev/null", MODE_RW_UGO | S_IFCHR, makedev(1, 3))); } if (access(SYSROOT "/dev/ptmx", F_OK) != 0) { + CHECK_BOOL(errno == ENOENT); CHECK(mknod(SYSROOT "/dev/ptmx", MODE_RW_UGO | S_IFCHR, makedev(5, 2))); From b26559c8bc93b9dc0eb10d399814aa9076145530 Mon Sep 17 00:00:00 2001 From: Demi Marie Obenour Date: Fri, 8 Sep 2023 16:21:07 -0400 Subject: [PATCH 25/44] Ensure that process is not dumpable This blocks using /proc/self/exe and /proc/self/fd/* to access protected files on the initramfs. This may already have been blocked, but one cannot have too much defense in depth. 
--- runtime/init-container/src/init.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/runtime/init-container/src/init.c b/runtime/init-container/src/init.c index 5f044834..a533bbe8 100644 --- a/runtime/init-container/src/init.c +++ b/runtime/init-container/src/init.c @@ -14,6 +14,7 @@ #include #include #include +#include #include #include #include @@ -1842,6 +1843,8 @@ int main(void) { CHECK_BOOL(setvbuf(stdin, NULL, _IONBF, BUFSIZ) == 0); CHECK_BOOL(setvbuf(stdout, NULL, _IONBF, BUFSIZ) == 0); CHECK_BOOL(setvbuf(stderr, NULL, _IONBF, BUFSIZ) == 0); + int res = prctl(PR_SET_DUMPABLE, 0, 0, 0, 0); + CHECK_BOOL(res == 0 || res == 1); create_dir("/dev", DEFAULT_DIR_PERMS); CHECK(mount("devtmpfs", "/dev", "devtmpfs", MS_NOSUID, From c42378273d95c4328154d814cbd27b5543ada6ba Mon Sep 17 00:00:00 2001 From: Demi Marie Obenour Date: Fri, 8 Sep 2023 16:56:03 -0400 Subject: [PATCH 26/44] Add close_range calls This prevents file descriptors from leaking into child processes. It also ensures that the process doesn't have any file descriptors an attacker could use if it managed to ptrace() the process or use /proc/self/fd/*. This should be prevented by other protections anyway, so this is purely defense in depth. 
--- runtime/init-container/src/init.c | 63 +++++++++++++++++-------------- 1 file changed, 34 insertions(+), 29 deletions(-) diff --git a/runtime/init-container/src/init.c b/runtime/init-container/src/init.c index a533bbe8..34f3de79 100644 --- a/runtime/init-container/src/init.c +++ b/runtime/init-container/src/init.c @@ -700,35 +700,6 @@ static noreturn void child_wrapper(int parent_pipe[2], X("sigprocmask problem"); goto out; } - X("ENTERING NAMESPACE"); - if (setns(global_pidfd, NAMESPACES)) { - X("CANNOT ENTER NAMESPACE"); - goto out; - } - - X("ENTERING CHROOT"); - if (chdir(SYSROOT) != 0) { - X("cannot enter " SYSROOT); - goto out; - } - if (chroot(".") != 0) { - X("cannot chroot(\".\")"); - goto out; - } - - if (chdir("/") != 0) { - X("cannot chdir(\"/\")"); - goto out; - } - - if (new_proc_args->cwd) { - X("chdir(\"command dir\")"); - if (chdir(new_proc_args->cwd) < 0) { - X("cannot chdir"); - goto out; - } - } - X("fd processing"); for (int fd = 0; fd < 3; ++fd) { X("processing an FD"); @@ -764,6 +735,40 @@ static noreturn void child_wrapper(int parent_pipe[2], } } + if (syscall(SYS_close_range, (unsigned int)global_pidfd + 1, ~0U, 0) != 0) { + abort(); + } + + if (global_pidfd > 3 && syscall(SYS_close_range, 3U, (unsigned int)(global_pidfd - 1), 0U) != 0) { + abort(); + } + + if (setns(global_pidfd, NAMESPACES)) { + goto out; + } + + if (close(global_pidfd)) { + goto out; + } + + if (chdir(SYSROOT) != 0) { + goto out; + } + + if (chroot(".") != 0) { + goto out; + } + + if (chdir("/") != 0) { + goto out; + } + + if (new_proc_args->cwd) { + if (chdir(new_proc_args->cwd) < 0) { + goto out; + } + } + gid_t gid = new_proc_args->gid; if (setresgid(gid, gid, gid) < 0) { goto out; From 3e5e40582c0b8c7ccf5bba6cc36af984154bfc60 Mon Sep 17 00:00:00 2001 From: Demi Marie Obenour Date: Sat, 9 Sep 2023 21:05:34 -0400 Subject: [PATCH 27/44] Add seccomp filters This helps prevent certain attacks on the kernel. 
For instance, it prevents perf_event_open() from being called, which has a huge attack surface. It also prevents user namespaces from being created, which also have a massive kernel-mode attack surface. --- .gitmodules | 3 + runtime/init-container/Makefile | 24 +- runtime/init-container/include/init-seccomp.h | 7 + runtime/init-container/libseccomp | 1 + runtime/init-container/src/init.c | 4 + runtime/init-container/src/seccomp.c | 512 ++++++++++++++++++ 6 files changed, 545 insertions(+), 6 deletions(-) create mode 100644 runtime/init-container/include/init-seccomp.h create mode 160000 runtime/init-container/libseccomp create mode 100644 runtime/init-container/src/seccomp.c diff --git a/.gitmodules b/.gitmodules index 17d1e89e..4ee70134 100644 --- a/.gitmodules +++ b/.gitmodules @@ -1,3 +1,6 @@ [submodule "runtime/init-container/liburing"] path = runtime/init-container/liburing url = https://github.com/axboe/liburing +[submodule "runtime/init-container/libseccomp"] + path = runtime/init-container/libseccomp + url = https://github.com/seccomp/libseccomp.git diff --git a/runtime/init-container/Makefile b/runtime/init-container/Makefile index 58d7bcb3..aa0efde1 100644 --- a/runtime/init-container/Makefile +++ b/runtime/init-container/Makefile @@ -1,7 +1,8 @@ CC := musl-gcc CXX := /bin/false +LIBSECCOMP_SUBMODULE ?= libseccomp # -MMD to create dependency files (*.d) on first compilation -CFLAGS := -MMD -std=c11 -O2 -Wall -Wextra -Werror -fPIE -pie -Iinclude/ -Wmaybe-uninitialized -Iunpacked_headers/usr/include +CFLAGS := -MMD -std=c11 -O2 -Wall -Wextra -Werror -fPIE -pie -Iinclude/ -Wmaybe-uninitialized -Iunpacked_headers/usr/include -I$(CURDIR)/$(LIBSECCOMP_SUBMODULE)/include ifneq ($(DEBUG), "") CFLAGS += -DNDEBUG @@ -26,7 +27,7 @@ LIBURING_SUBMODULE ?= liburing SRC_DIR ?= src TEST_DIR ?= tests -OBJECTS = $(addprefix $(SRC_DIR)/,init.o communication.o process_bookkeeping.o cyclic_buffer.o) +OBJECTS = $(addprefix $(SRC_DIR)/,init.o communication.o 
process_bookkeeping.o cyclic_buffer.o seccomp.o) OBJECTS_EXT = $(addprefix $(SRC_DIR)/,network.o forward.o) # Add headers to object dependencies for conditional recompilation on header change @@ -54,7 +55,8 @@ $(SRC_DIR)/network.o: $(SRC_DIR)/network.c -I"$(CURDIR)/$(UNPACKED_HEADERS)/usr/include" \ -o $@ -c $< -$(SRC_DIR)/forward.o: $(SRC_DIR)/forward.c uring +$(SRC_DIR)/seccomp.o: $(CURDIR)/$(LIBSECCOMP_SUBMODULE)/include/seccomp.h +$(SRC_DIR)/forward.o: $(SRC_DIR)/forward.c uring $(CURDIR)/$(LIBSECCOMP_SUBMODULE)/src/.libs/libseccomp.a $(QUIET_CC)$(CC) -MMD -O2 -Wall -Wextra -Werror -fPIE -pie \ -I"$(CURDIR)/$(UNPACKED_HEADERS)/usr/include/" \ -I"$(CURDIR)/$(LIBURING_SUBMODULE)/src/include/" \ @@ -65,9 +67,9 @@ $(SRC_DIR)/forward.o: $(SRC_DIR)/forward.c uring %.o: %.c $(QUIET_CC)$(CC) $(CFLAGS) -o $@ -c $< -init: $(UNPACKED_HEADERS) uring $(OBJECTS) $(OBJECTS_EXT) +init: $(UNPACKED_HEADERS) uring $(OBJECTS) $(OBJECTS_EXT) $(CURDIR)/$(LIBSECCOMP_SUBMODULE)/src/.libs/libseccomp.a @echo init - $(QUIET_CC)$(CC) $(CFLAGS) -static -o $@ $(wordlist 3, $(words $^), $^) "$(CURDIR)/$(LIBURING_SUBMODULE)/src/liburing.a" + $(QUIET_CC)$(CC) $(CFLAGS) -static -o $@ $(wordlist 3, $(words $^), $^) "$(CURDIR)/$(LIBURING_SUBMODULE)/src/liburing.a" "$(CURDIR)/$(LIBSECCOMP_SUBMODULE)/src/.libs/libseccomp.a" @# default musl libs on some distros have debug symbols, lets strip them (and everything else) strip $@ @@ -91,6 +93,15 @@ uring: $(UNPACKED_HEADERS) (cd $(LIBURING_SUBMODULE) && CC=$(CC) CXX=$(CXX) ./configure > /dev/null) $(MAKE) -e CC=$(CC) -e CFLAGS=-I"$(CURDIR)/$(UNPACKED_HEADERS)/usr/include" -C "$(LIBURING_SUBMODULE)/src" all +SHELL := /bin/bash +$(CURDIR)/$(LIBSECCOMP_SUBMODULE)/src/.libs/libseccomp.a $(CURDIR)/$(LIBSECCOMP_SUBMODULE)/include/seccomp.h: $(UNPACKED_HEADERS) $(LIBSECCOMP_SUBMODULE) + set -euo pipefail; \ + cd $(LIBSECCOMP_SUBMODULE); \ + export CC=$(CC) CXX=$(CXX) CFLAGS=-I"$$PWD/../$(UNPACKED_HEADERS)/usr/include";\ + ./autogen.sh; \ + ./configure 
--disable-python;\ + $(MAKE) all + vmlinuz-virt: $(UNPACKED_KERNEL) cp $(UNPACKED_KERNEL)/boot/vmlinuz-virt . @@ -117,7 +128,7 @@ initramfs.cpio.gz: init $(UNPACKED_KERNEL) cp $(UNPACKED_KERNEL)/lib/modules/$(KERNEL_VER)/kernel/net/core/failover.ko initramfs cp $(UNPACKED_KERNEL)/lib/modules/$(KERNEL_VER)/kernel/net/ipv6/ipv6.ko initramfs cp $(UNPACKED_KERNEL)/lib/modules/$(KERNEL_VER)/kernel/net/packet/af_packet.ko initramfs - cd initramfs && find . | cpio --quiet -o -H newc -R 0:0 | gzip -9 > ../$@ + set -euo pipefail; cd initramfs && find . | cpio --quiet -o -H newc -R 0:0 | gzip -9 > ../$@ $(RM) -rf initramfs TESTS_NAMES := cyclic_buffer @@ -137,6 +148,7 @@ clean: $(RM) init $(SRC_DIR)/*.o $(SRC_DIR)/*.d $(TEST_DIR)/*.o *.o $(TESTS) $(RM) vmlinuz-virt initramfs.cpio.gz $(MAKE) -s -C $(LIBURING_SUBMODULE) clean + $(MAKE) -s -C $(LIBSECCOMP_SUBMODULE) clean .PHONY: distclean distclean: diff --git a/runtime/init-container/include/init-seccomp.h b/runtime/init-container/include/init-seccomp.h new file mode 100644 index 00000000..176a461f --- /dev/null +++ b/runtime/init-container/include/init-seccomp.h @@ -0,0 +1,7 @@ +#ifndef GOLEM_INIT_SANDBOX_H +#define GOLEM_INIT_SANDBOX_H GOLEM_INIT_SANDBOX_H +// Prepares for sandbox setup +void setup_sandbox(void); +// Actually enforces the sandbox. 
+void sandbox_apply(void); +#endif diff --git a/runtime/init-container/libseccomp b/runtime/init-container/libseccomp new file mode 160000 index 00000000..f1c3196d --- /dev/null +++ b/runtime/init-container/libseccomp @@ -0,0 +1 @@ +Subproject commit f1c3196d9b95de22dde8f23c5befcbeabef5711c diff --git a/runtime/init-container/src/init.c b/runtime/init-container/src/init.c index 34f3de79..00b3757a 100644 --- a/runtime/init-container/src/init.c +++ b/runtime/init-container/src/init.c @@ -34,6 +34,8 @@ #include "process_bookkeeping.h" #include "proto.h" #include "forward.h" +#include "init-seccomp.h" +#define SYSROOT "/mnt/newroot" #define CONTAINER_OF(ptr, type, member) (type*)((char*)(ptr) - offsetof(type, member)) @@ -779,6 +781,7 @@ static noreturn void child_wrapper(int parent_pipe[2], goto out; } + sandbox_apply(); /* If execve returns we know an error happened. */ (void)execve(new_proc_args->bin, new_proc_args->argv, @@ -1940,6 +1943,7 @@ int main(void) { makedev(5, 2))); } + setup_sandbox(); setup_network(); setup_agent_directories(); diff --git a/runtime/init-container/src/seccomp.c b/runtime/init-container/src/seccomp.c new file mode 100644 index 00000000..760c9fca --- /dev/null +++ b/runtime/init-container/src/seccomp.c @@ -0,0 +1,512 @@ +#define _GNU_SOURCE +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "init-seccomp.h" + +static const char *allow_syscalls[] = { + "_llseek", + "_newselect", + "accept", + "accept4", + "access", + "adjtimex", + "alarm", + "bind", + "brk", + "capget", + "capset", + "chdir", + "chmod", + "chown", + "chown32", + "clock_adjtime", + "clock_adjtime64", + "clock_getres", + "clock_getres_time64", + "clock_gettime", + "clock_gettime64", + "clock_nanosleep", + "clock_nanosleep_time64", + "clone", + "clone3", + "close", + "close_range", + "connect", + "copy_file_range", + 
"creat", + "dup", + "dup2", + "dup3", + "epoll_create", + "epoll_create1", + "epoll_ctl", + "epoll_ctl_old", + "epoll_pwait", + "epoll_pwait2", + "epoll_wait", + "epoll_wait_old", + "eventfd", + "eventfd2", + "execve", + "execveat", + "exit", + "exit_group", + "faccessat", + "faccessat2", + "fadvise64", + "fadvise64_64", + "fallocate", + "fanotify_mark", + "fchdir", + "fchmod", + "fchmodat", + "fchown", + "fchown32", + "fchownat", + "fcntl", + "fcntl64", + "fdatasync", + "fgetxattr", + "flistxattr", + "flock", + "fork", + "fremovexattr", + "fsconfig", + "fsetxattr", + "fsmount", + "fsopen", + "fspick", + "fstat", + "fstat64", + "fstatat64", + "fstatfs", + "fstatfs64", + "fsync", + "ftruncate", + "ftruncate64", + "futex", + "futex_time64", + "futimesat", + "get_mempolicy", + "get_robust_list", + "get_thread_area", + "getcpu", + "getcwd", + "getdents", + "getdents64", + "getegid", + "getegid32", + "geteuid", + "geteuid32", + "getgid", + "getgid32", + "getgroups", + "getgroups32", + "getitimer", + "getpeername", + "getpgid", + "getpgrp", + "getpid", + "getppid", + "getpriority", + "getrandom", + "getresgid", + "getresgid32", + "getresuid", + "getresuid32", + "getrlimit", + "getrusage", + "getsid", + "getsockname", + "getsockopt", + "gettid", + "gettimeofday", + "getuid", + "getuid32", + "getxattr", + "inotify_add_watch", + "inotify_init", + "inotify_init1", + "inotify_rm_watch", + "io_cancel", + "io_destroy", + "io_getevents", + "io_setup", + "io_submit", + "ioctl", + "ioprio_get", + "ioprio_set", + "ipc", + "keyctl", + "kill", + "landlock_add_rule", + "landlock_create_ruleset", + "landlock_restrict_self", + "lchown", + "lchown32", + "lgetxattr", + "link", + "linkat", + "listen", + "listxattr", + "llistxattr", + "lremovexattr", + "lseek", + "lsetxattr", + "lstat", + "lstat64", + "madvise", + "mbind", + "membarrier", + "memfd_create", + "memfd_secret", + "mincore", + "mkdir", + "mkdirat", + "mknod", + "mknodat", + "mlock", + "mlock2", + "mlockall", + "mmap", + "mmap2", 
+ "mount", + "mount_setattr", + "move_mount", + "mprotect", + "mq_getsetattr", + "mq_notify", + "mq_open", + "mq_timedreceive", + "mq_timedreceive_time64", + "mq_timedsend", + "mq_timedsend_time64", + "mq_unlink", + "mremap", +#if 0 + "msgctl", + "msgget", + "msgrcv", + "msgsnd", +#endif + "msync", + "munlock", + "munlockall", + "munmap", + "name_to_handle_at", + "nanosleep", + "newfstatat", + "open", + "open_tree", + "openat", + "openat2", + "pause", + "pidfd_getfd", + "pidfd_open", + "pidfd_send_signal", + "pipe", + "pipe2", + "pivot_root", + "pkey_alloc", + "pkey_free", + "pkey_mprotect", + "poll", + "ppoll", + "ppoll_time64", + "prctl", + "pread64", + "preadv", + "preadv2", + "prlimit64", + "process_mrelease", + "process_vm_readv", + "process_vm_writev", + "pselect6", + "pselect6_time64", + "ptrace", + "pwrite64", + "pwritev", + "pwritev2", + "read", + "readahead", + "readdir", + "readlink", + "readlinkat", + "readv", + "reboot", + "recv", + "recvfrom", + "recvmmsg", + "recvmmsg_time64", + "recvmsg", + "remap_file_pages", + "removexattr", + "rename", + "renameat", + "renameat2", + "restart_syscall", + "rmdir", + "rseq", + "rt_sigaction", + "rt_sigpending", + "rt_sigprocmask", + "rt_sigqueueinfo", + "rt_sigreturn", + "rt_sigsuspend", + "rt_sigtimedwait", + "rt_sigtimedwait_time64", + "rt_tgsigqueueinfo", + "sched_get_priority_max", + "sched_get_priority_min", + "sched_getaffinity", + "sched_getattr", + "sched_getparam", + "sched_getscheduler", + "sched_rr_get_interval", + "sched_rr_get_interval_time64", + "sched_setaffinity", + "sched_setattr", + "sched_setparam", + "sched_setscheduler", + "sched_yield", + "seccomp", + "select", +#if 0 + "semctl", + "semget", + "semop", + "semtimedop", + "semtimedop_time64", +#endif + "send", + "sendfile", + "sendfile64", + "sendmmsg", + "sendmsg", + "sendto", + "set_mempolicy", + "set_robust_list", + "set_thread_area", + "set_tid_address", + "setfsgid", + "setfsgid32", + "setfsuid", + "setfsuid32", + "setgid", + "setgid32", + 
"setgroups", + "setgroups32", + "setitimer", + "setpgid", + "setpriority", + "setregid", + "setregid32", + "setresgid", + "setresgid32", + "setresuid", + "setresuid32", + "setreuid", + "setreuid32", + "setrlimit", + "setsid", + "setsockopt", + "setuid", + "setuid32", + "setxattr", + "shmat", + "shmctl", + "shmdt", + "shmget", + "shutdown", + "sigaction", + "sigaltstack", + "signal", + "signalfd", + "signalfd4", + "sigpending", + "sigprocmask", + "sigreturn", + "sigsuspend", + "socket", + "socketcall", + "socketpair", + "splice", + "stat", + "stat64", + "statfs", + "statfs64", + "statx", + "symlink", + "symlinkat", + "sync", + "sync_file_range", + "syncfs", + "syscall", + "sysinfo", + "syslog", + "tee", + "tgkill", + "time", + "timer_create", + "timer_delete", + "timer_getoverrun", + "timer_gettime", + "timer_gettime64", + "timer_settime", + "timer_settime64", + "timerfd", + "timerfd_create", + "timerfd_gettime", + "timerfd_gettime64", + "timerfd_settime", + "timerfd_settime64", + "times", + "tkill", + "truncate", + "truncate64", + "ugetrlimit", + "umask", + "umount", + "umount2", + "uname", + "unlink", + "unlinkat", + "utime", + "utimensat", + "utimensat_time64", + "utimes", + "vfork", + "wait4", + "waitid", + "waitpid", + "write", + "writev", +}; + +static const char *arm_syscalls[] = { + "arm_fadvise64_64", + "arm_sync_file_range", + "breakpoint", + "cacheflush", + "set_tls", + "sync_file_range2", +}; + +static const char *x86_syscalls[] = { + "arch_prctl", +}; + +static const char *eperm_syscalls[] = { + "bdflush", + "io_pgetevents", + "kexec_file_load", + "kexec_load", + "migrate_pages", + "move_pages", + "nfsservctl", + "nice", + "oldfstat", + "oldlstat", + "oldolduname", + "oldstat", + "olduname", + "pciconfig_iobase", + "pciconfig_read", + "pciconfig_write", + "sgetmask", + "ssetmask", + "swapcontext", + "swapoff", + "swapon", + "sysfs", + "uselib", + "userfaultfd", + "ustat", + "vm86", + "vm86old", + "vmsplice", + "bpf", + "fanotify_init", + 
"perf_event_open", + "quotactl", + "setdomainname", + "sethostname", + "setns", + "unshare", +}; + +#define ARRAY_SIZE(x) (sizeof(x)/sizeof(x[0])) + +static void +ya_runtime_add_syscalls(scmp_filter_ctx ctx, const char *const *syscalls, + size_t count, uint32_t arch, uint32_t action) { + for (size_t i = 0; i < count; ++i) { + int syscall_number = seccomp_syscall_resolve_name_rewrite(arch, syscalls[i]); + if (syscall_number == __NR_SCMP_ERROR) + abort(); + int status = seccomp_rule_add(ctx, action, syscall_number, 0); + if (status != 0) + abort(); + } +} + +static scmp_filter_ctx ctx; + +void setup_sandbox(void) { + uint32_t const arch = seccomp_arch_native(); + ctx = seccomp_init(SCMP_ACT_ERRNO(ENOSYS)); + + if (ctx == NULL) + abort(); + + ya_runtime_add_syscalls(ctx, allow_syscalls, ARRAY_SIZE(allow_syscalls), arch, SCMP_ACT_ALLOW); + int status = seccomp_rule_add(ctx, SCMP_ACT_ALLOW, + SCMP_SYS(personality), 1, SCMP_CMP64(0, SCMP_CMP_EQ, 0, 0)); + if (status != 0) { + abort(); + } + + switch (arch) { + case SCMP_ARCH_ARM: + case SCMP_ARCH_AARCH64: + ya_runtime_add_syscalls(ctx, arm_syscalls, ARRAY_SIZE(arm_syscalls), + arch, SCMP_ACT_ALLOW); + break; + case SCMP_ARCH_X86: + case SCMP_ARCH_X86_64: + ya_runtime_add_syscalls(ctx, x86_syscalls, ARRAY_SIZE(x86_syscalls), + arch, SCMP_ACT_ALLOW); + default: + break; + } + + ya_runtime_add_syscalls(ctx, eperm_syscalls, ARRAY_SIZE(eperm_syscalls), arch, SCMP_ACT_ERRNO(EPERM)); + int fd = memfd_create("fake", MFD_CLOEXEC); + if (fd < 3) + abort(); + if (seccomp_export_bpf(ctx, fd)) + abort(); +} + +void sandbox_apply(void) { + if (seccomp_load(ctx)) + abort(); +} From e953449497274c73d1712643905ad4eac8315c05 Mon Sep 17 00:00:00 2001 From: Demi Marie Obenour Date: Mon, 11 Sep 2023 22:22:36 -0400 Subject: [PATCH 28/44] Drop capabilities(7) and add Nvidia GPU nodes This ensures that the child processes cannot e.g. manipulate firewall rules or use bind mounts, and ensures that they _can_ access Nvidia GPU device nodes. 
--- runtime/init-container/src/init.c | 56 +++++++++++++++++++++++++++++++ 1 file changed, 56 insertions(+) diff --git a/runtime/init-container/src/init.c b/runtime/init-container/src/init.c index 00b3757a..aa3ed026 100644 --- a/runtime/init-container/src/init.c +++ b/runtime/init-container/src/init.c @@ -25,8 +25,10 @@ #include #include #include +#include #include #include +#include #include "communication.h" #include "cyclic_buffer.h" @@ -671,6 +673,9 @@ static int child_pipe = -1; (CLONE_NEWUSER | /* new user namespace */ \ 0) +static int capset(cap_user_header_t hdrp, cap_user_data_t datap) { + return syscall(SYS_capset, hdrp, datap); +} static noreturn void child_wrapper(int parent_pipe[2], struct new_process_args* new_proc_args, struct redir_fd_desc fd_descs[3]) { @@ -782,6 +787,42 @@ static noreturn void child_wrapper(int parent_pipe[2], } sandbox_apply(); + + struct __user_cap_header_struct hdr = { + .version = _LINUX_CAPABILITY_VERSION_3, + }; + struct __user_cap_data_struct data[_LINUX_CAPABILITY_U32S_3] = { 0 }; + + for (int i = 0; i < _LINUX_CAPABILITY_U32S_3 * 32; ++i) { + switch (i) { + case CAP_SETUID: + case CAP_SETGID: + case CAP_SYS_NICE: + case CAP_SYS_CHROOT: + case CAP_SYS_RESOURCE: + case CAP_NET_BIND_SERVICE: + case CAP_KILL: + case CAP_FSETID: + case CAP_DAC_OVERRIDE: + case CAP_DAC_READ_SEARCH: + case CAP_CHOWN: + case CAP_IPC_LOCK: + case CAP_IPC_OWNER: { + data[i / 32].permitted |= (UINT32_C(1) << (i % 32)); + data[i / 32].effective |= (UINT32_C(1) << (i % 32)); + break; + } + default:; + int res = prctl(PR_CAPBSET_DROP, i); + if (res != 0 && (res != -1 && errno == EINVAL)) + goto out; + } + } + + if (capset(&hdr, &*data)) { + goto out; + } + /* If execve returns we know an error happened. 
*/ (void)execve(new_proc_args->bin, new_proc_args->argv, @@ -1930,6 +1971,21 @@ int main(void) { MS_NODEV | MS_NOSUID | MS_NOEXEC, NULL)); + glob_t nvidia_nodes; + res = glob("/dev/nvidia[0-9]*", GLOB_ERR, NULL, &nvidia_nodes); + if (res == 0) { + struct stat statbuf; + for (size_t i = 0; i < nvidia_nodes.gl_pathc; ++i) { + CHECK_BOOL(strncmp(nvidia_nodes.gl_pathv[i], "/dev/nvidia", sizeof "/dev/nvidia" - 1) == 0); + CHECK_BOOL(stat(nvidia_nodes.gl_pathv[i], &statbuf) == 0); + CHECK_BOOL(S_ISCHR(statbuf.st_mode)); + res = mknodat(g_sysroot_fd, nvidia_nodes.gl_pathv[i] + 1, statbuf.st_dev, statbuf.st_mode & 0777); + CHECK_BOOL(res == 0 || (res == -1 && errno == EEXIST)); + } + } else { + CHECK_BOOL(res == GLOB_NOMATCH); + } + if (access(SYSROOT "/dev/null", F_OK) != 0) { CHECK_BOOL(errno == ENOENT); CHECK(mknod(SYSROOT "/dev/null", From a5db4ed2032fcc07b9d2c01a143c9f8e3f3f724d Mon Sep 17 00:00:00 2001 From: Demi Marie Obenour Date: Tue, 12 Sep 2023 22:12:16 -0400 Subject: [PATCH 29/44] Make sandboxing conditional If no Nvidia GPUs are present, allow sandboxing to be disabled. 
--- runtime/init-container/src/init.c | 120 +++++++++++++++++++----------- 1 file changed, 75 insertions(+), 45 deletions(-) diff --git a/runtime/init-container/src/init.c b/runtime/init-container/src/init.c index aa3ed026..328ccde0 100644 --- a/runtime/init-container/src/init.c +++ b/runtime/init-container/src/init.c @@ -742,20 +742,26 @@ static noreturn void child_wrapper(int parent_pipe[2], } } - if (syscall(SYS_close_range, (unsigned int)global_pidfd + 1, ~0U, 0) != 0) { - abort(); - } + if (global_pidfd != -1) { + if (syscall(SYS_close_range, (unsigned int)global_pidfd + 1, ~0U, 0) != 0) { + abort(); + } - if (global_pidfd > 3 && syscall(SYS_close_range, 3U, (unsigned int)(global_pidfd - 1), 0U) != 0) { - abort(); - } + if (global_pidfd > 3 && syscall(SYS_close_range, 3U, (unsigned int)(global_pidfd - 1), 0U) != 0) { + abort(); + } - if (setns(global_pidfd, NAMESPACES)) { - goto out; - } + if (setns(global_pidfd, NAMESPACES)) { + goto out; + } - if (close(global_pidfd)) { - goto out; + if (close(global_pidfd)) { + goto out; + } + } else { + if (syscall(SYS_close_range, 3U, ~0U, 0U) != 0) { + abort(); + } } if (chdir(SYSROOT) != 0) { @@ -786,41 +792,43 @@ static noreturn void child_wrapper(int parent_pipe[2], goto out; } - sandbox_apply(); - - struct __user_cap_header_struct hdr = { - .version = _LINUX_CAPABILITY_VERSION_3, - }; - struct __user_cap_data_struct data[_LINUX_CAPABILITY_U32S_3] = { 0 }; - - for (int i = 0; i < _LINUX_CAPABILITY_U32S_3 * 32; ++i) { - switch (i) { - case CAP_SETUID: - case CAP_SETGID: - case CAP_SYS_NICE: - case CAP_SYS_CHROOT: - case CAP_SYS_RESOURCE: - case CAP_NET_BIND_SERVICE: - case CAP_KILL: - case CAP_FSETID: - case CAP_DAC_OVERRIDE: - case CAP_DAC_READ_SEARCH: - case CAP_CHOWN: - case CAP_IPC_LOCK: - case CAP_IPC_OWNER: { - data[i / 32].permitted |= (UINT32_C(1) << (i % 32)); - data[i / 32].effective |= (UINT32_C(1) << (i % 32)); - break; + if (global_pidfd != -1) { + sandbox_apply(); + + struct __user_cap_header_struct hdr = 
{ + .version = _LINUX_CAPABILITY_VERSION_3, + }; + struct __user_cap_data_struct data[_LINUX_CAPABILITY_U32S_3] = { 0 }; + + for (int i = 0; i < _LINUX_CAPABILITY_U32S_3 * 32; ++i) { + switch (i) { + case CAP_SETUID: + case CAP_SETGID: + case CAP_SYS_NICE: + case CAP_SYS_CHROOT: + case CAP_SYS_RESOURCE: + case CAP_NET_BIND_SERVICE: + case CAP_KILL: + case CAP_FSETID: + case CAP_DAC_OVERRIDE: + case CAP_DAC_READ_SEARCH: + case CAP_CHOWN: + case CAP_IPC_LOCK: + case CAP_IPC_OWNER: { + data[i / 32].permitted |= (UINT32_C(1) << (i % 32)); + data[i / 32].effective |= (UINT32_C(1) << (i % 32)); + break; + } + default:; + int res = prctl(PR_CAPBSET_DROP, i); + if (res != 0 && (res != -1 && errno == EINVAL)) + goto out; } - default:; - int res = prctl(PR_CAPBSET_DROP, i); - if (res != 0 && (res != -1 && errno == EINVAL)) - goto out; } - } - if (capset(&hdr, &*data)) { - goto out; + if (capset(&hdr, &*data)) { + goto out; + } } /* If execve returns we know an error happened. */ @@ -1888,7 +1896,7 @@ static void get_namespace_fd(void) { } } -int main(void) { +int main(int argc, char **argv) { CHECK_BOOL(setvbuf(stdin, NULL, _IONBF, BUFSIZ) == 0); CHECK_BOOL(setvbuf(stdout, NULL, _IONBF, BUFSIZ) == 0); CHECK_BOOL(setvbuf(stderr, NULL, _IONBF, BUFSIZ) == 0); @@ -1971,9 +1979,29 @@ int main(void) { MS_NODEV | MS_NOSUID | MS_NOEXEC, NULL)); + bool do_sandbox = true; + for (int i = 1; i < argc; ++i) { + fprintf(stderr, "Command line argument: %s\n", argv[i]); + if (strcmp(argv[i], "sandbox=yes") == 0) { + do_sandbox = true; + } else if (strcmp(argv[i], "sandbox=no") == 0) { + fprintf(stderr, "WARNING: Disabling sandboxing.\n"); + do_sandbox = false; + } + } + for (char **p = environ; *p; ++p) { + fprintf(stderr, "Environment variable: %s\n", *p); + } + glob_t nvidia_nodes; res = glob("/dev/nvidia[0-9]*", GLOB_ERR, NULL, &nvidia_nodes); if (res == 0) { + if (do_sandbox == false) { + fprintf(stderr, "Sandboxing is disabled, refusing to enable Nvidia GPU passthrough.\n"); + 
fprintf(stderr, "Please re-run the container with sandboxing enabled or disable GPU passthrough.\n"); + errno = 0; + CHECK_BOOL(0); + } struct stat statbuf; for (size_t i = 0; i < nvidia_nodes.gl_pathc; ++i) { CHECK_BOOL(strncmp(nvidia_nodes.gl_pathv[i], "/dev/nvidia", sizeof "/dev/nvidia" - 1) == 0); @@ -2004,7 +2032,9 @@ int main(void) { setup_agent_directories(); block_signals(); - get_namespace_fd(); + if (do_sandbox) { + get_namespace_fd(); + } setup_sigfd(); main_loop(); From 4cd4f44fde151550c0345e1cde497ed56d9ca2a0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marek=20Marczykowski-G=C3=B3recki?= Date: Thu, 28 Sep 2023 16:13:36 +0200 Subject: [PATCH 30/44] Update to Ubuntu 22.04 for building container init It's necessary to get new kernel headers and seccomp. --- .github/workflows/release.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index 14f8ff72..e741ebc2 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -47,7 +47,7 @@ jobs: } build-init: name: Build container Init - runs-on: ubuntu-20.04 + runs-on: ubuntu-22.04 steps: - name: Install Musl run: sudo apt-get install -y musl-tools musl From 9e12ffb407b845aa3d734e508854a4bf7d94c735 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marek=20Marczykowski-G=C3=B3recki?= Date: Fri, 29 Sep 2023 02:37:00 +0200 Subject: [PATCH 31/44] Install libseccomp build dependencies in CI --- .github/workflows/release.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index e741ebc2..015bd988 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -50,7 +50,7 @@ jobs: runs-on: ubuntu-22.04 steps: - name: Install Musl - run: sudo apt-get install -y musl-tools musl + run: sudo apt-get install -y musl-tools musl autoconf gperf libtool automake - uses: actions/checkout@v1 - name: Make run: | From 
2576a0c643c4cedb18922092bb65f914dc7b094e Mon Sep 17 00:00:00 2001 From: Demi Marie Obenour Date: Fri, 29 Sep 2023 20:57:31 -0400 Subject: [PATCH 32/44] Mount squashfs with MS_NODEV This fixes a security hole: a malicious device node could be used to escape the sandbox. --- runtime/init-container/src/init.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/runtime/init-container/src/init.c b/runtime/init-container/src/init.c index 328ccde0..fd0aaf00 100644 --- a/runtime/init-container/src/init.c +++ b/runtime/init-container/src/init.c @@ -1937,14 +1937,14 @@ int main(int argc, char **argv) { // 'workdir' and 'upperdir' have to be on the same filesystem CHECK(mount("tmpfs", "/mnt/overlay", "tmpfs", - MS_NOSUID, + MS_NOSUID | MS_NODEV, "mode=0700,size=128M")); CHECK(mkdir("/mnt/overlay/upper", S_IRWXU)); CHECK(mkdir("/mnt/overlay/work", S_IRWXU)); - CHECK(mount("/dev/vda", "/mnt/image", "squashfs", MS_RDONLY, "")); - CHECK(mount("overlay", SYSROOT, "overlay", 0, + CHECK(mount("/dev/vda", "/mnt/image", "squashfs", MS_RDONLY | MS_NODEV, "")); + CHECK(mount("overlay", SYSROOT, "overlay", MS_NODEV, "lowerdir=/mnt/image,upperdir=/mnt/overlay/upper,workdir=/mnt/overlay/work")); g_sysroot_fd = CHECK(open(SYSROOT, O_RDONLY | O_DIRECTORY | O_CLOEXEC)); From c0f534be7759ae4da0e53fbbe555fa3429bba793 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marek=20Marczykowski-G=C3=B3recki?= Date: Sat, 30 Sep 2023 05:02:46 +0200 Subject: [PATCH 33/44] Load nvidia kernel modules and create device nodes Since device nodes are not created automatically, looking for them isn't a good test to detect GPU passthrough. Instead, consider GPU passthrough enabled if nvidia kernel modules are there. Device nodes need to be created manually according to documentation at: https://docs.nvidia.com/cuda/cuda-installation-guide-linux/index.html#device-node-verification This is likely due to the kernel API for doing it automatically being unavailable for non-GPL drivers.
--- runtime/init-container/src/init.c | 75 +++++++++++++++++++++++++------ 1 file changed, 62 insertions(+), 13 deletions(-) diff --git a/runtime/init-container/src/init.c b/runtime/init-container/src/init.c index 328ccde0..e6515a79 100644 --- a/runtime/init-container/src/init.c +++ b/runtime/init-container/src/init.c @@ -1896,12 +1896,42 @@ static void get_namespace_fd(void) { } } +static int find_device_major(const char *name) { + FILE *f; + char *line = NULL; + size_t line_size; /* size of the buffer */ + ssize_t line_len; + char entry_name[32]; + int entry_major; + int major = -1; + + if ((f = fopen("/proc/devices", "r")) == 0) + return -1; + + while ((line_len = getline(&line, &line_size, f)) != -1) { + if (strcmp(line, "Character devices:\n") == 0) { + /* initial header, nothing to do yet */ + } else if (strcmp(line, "\n") == 0 || + strcmp(line, "Block devices:\n") == 0) { + /* end of character devices, entry not found */ + break; + } else if (sscanf(line, " %d %31s", &entry_major, entry_name) == 2 && + strcmp(entry_name, name) == 0) { + major = entry_major; + break; + } + } + free(line); + return major; +} + int main(int argc, char **argv) { CHECK_BOOL(setvbuf(stdin, NULL, _IONBF, BUFSIZ) == 0); CHECK_BOOL(setvbuf(stdout, NULL, _IONBF, BUFSIZ) == 0); CHECK_BOOL(setvbuf(stderr, NULL, _IONBF, BUFSIZ) == 0); int res = prctl(PR_SET_DUMPABLE, 0, 0, 0, 0); CHECK_BOOL(res == 0 || res == 1); + bool nvidia_loaded = false; create_dir("/dev", DEFAULT_DIR_PERMS); CHECK(mount("devtmpfs", "/dev", "devtmpfs", MS_NOSUID, @@ -1927,6 +1957,28 @@ int main(int argc, char **argv) { load_module("/9pnet_virtio.ko"); load_module("/9p.ko"); + if (access("/nvidia.ko", R_OK) == 0) { + load_module("/i2c-core.ko"); + load_module("/drm_panel_orientation_quirks.ko"); + load_module("/firmware_class.ko"); + load_module("/drm.ko"); + load_module("/nvidia.ko"); + load_module("/nvidia-uvm.ko"); + load_module("/fbdev.ko"); + load_module("/fb.ko"); + load_module("/fb_sys_fops.ko"); + 
load_module("/cfbcopyarea.ko"); + load_module("/cfbfillrect.ko"); + load_module("/cfbimgblt.ko"); + load_module("/syscopyarea.ko"); + load_module("/sysfillrect.ko"); + load_module("/sysimgblt.ko"); + load_module("/drm_kms_helper.ko"); + load_module("/nvidia-modeset.ko"); + load_module("/nvidia-drm.ko"); + nvidia_loaded = true; + } + g_cmds_fd = CHECK(open(VPORT_CMD, O_RDWR | O_CLOEXEC)); CHECK(mkdir("/mnt", S_IRWXU)); @@ -1993,25 +2045,22 @@ int main(int argc, char **argv) { fprintf(stderr, "Environment variable: %s\n", *p); } - glob_t nvidia_nodes; - res = glob("/dev/nvidia[0-9]*", GLOB_ERR, NULL, &nvidia_nodes); - if (res == 0) { + if (nvidia_loaded) { if (do_sandbox == false) { fprintf(stderr, "Sandboxing is disabled, refusing to enable Nvidia GPU passthrough.\n"); fprintf(stderr, "Please re-run the container with sandboxing enabled or disable GPU passthrough.\n"); errno = 0; CHECK_BOOL(0); } - struct stat statbuf; - for (size_t i = 0; i < nvidia_nodes.gl_pathc; ++i) { - CHECK_BOOL(strncmp(nvidia_nodes.gl_pathv[i], "/dev/nvidia", sizeof "/dev/nvidia" - 1) == 0); - CHECK_BOOL(stat(nvidia_nodes.gl_pathv[i], &statbuf) == 0); - CHECK_BOOL(S_ISCHR(statbuf.st_mode)); - res = mknodat(g_sysroot_fd, nvidia_nodes.gl_pathv[i] + 1, statbuf.st_dev, statbuf.st_mode & 0777); - CHECK_BOOL(res == 0 || (res == -1 && errno == EEXIST)); - } - } else { - CHECK_BOOL(res == GLOB_NOMATCH); + int nvidia_major = CHECK(find_device_major("nvidia-frontend")); + /* TODO: multi-card support needs more /dev/nvidia%d nodes */ + res = mknodat(g_sysroot_fd, "dev/nvidia0", S_IFCHR | (0666 & 0777), nvidia_major << 8 | 0); + CHECK_BOOL(res == 0 || (res == -1 && errno == EEXIST)); + res = mknodat(g_sysroot_fd, "dev/nvidiactl", S_IFCHR | (0666 & 0777), nvidia_major << 8 | 255); + CHECK_BOOL(res == 0 || (res == -1 && errno == EEXIST)); + nvidia_major = CHECK(find_device_major("nvidia-uvm")); + res = mknodat(g_sysroot_fd, "dev/nvidia-uvm", S_IFCHR | (0666 & 0777), nvidia_major << 8 | 0); + 
CHECK_BOOL(res == 0 || (res == -1 && errno == EEXIST)); } if (access(SYSROOT "/dev/null", F_OK) != 0) { From ade43e87339b467dee00b60864babbc107393367 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marek=20Marczykowski-G=C3=B3recki?= Date: Mon, 16 Oct 2023 05:59:21 +0200 Subject: [PATCH 34/44] Load extra modules necessary in 6.1.x kernel virtio_pci_modern_dev and virtio_pci_legacy_dev are dependencies of virtio_pci. netfs is a dependency of fscache and 9p. Kernel config doesn't allow building those as built-in when actual functionality is in a module, so load the extra modules manually when necessary. Make it optional, to still work with older kernels too. --- runtime/init-container/src/init.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/runtime/init-container/src/init.c b/runtime/init-container/src/init.c index 160903a5..264ce948 100644 --- a/runtime/init-container/src/init.c +++ b/runtime/init-container/src/init.c @@ -1940,6 +1940,10 @@ int main(int argc, char **argv) { load_module("/failover.ko"); load_module("/virtio.ko"); load_module("/virtio_ring.ko"); + if (access("/virtio_pci_modern_dev.ko", R_OK) == 0) + load_module("/virtio_pci_modern_dev.ko"); + if (access("/virtio_pci_legacy_dev.ko", R_OK) == 0) + load_module("/virtio_pci_legacy_dev.ko"); load_module("/virtio_pci.ko"); load_module("/net_failover.ko"); load_module("/virtio_net.ko"); @@ -1949,6 +1953,8 @@ int main(int argc, char **argv) { load_module("/virtio_blk.ko"); load_module("/squashfs.ko"); load_module("/overlay.ko"); + if (access("/netfs.ko", R_OK) == 0) + load_module("/netfs.ko"); load_module("/fscache.ko"); load_module("/af_packet.ko"); load_module("/ipv6.ko"); From 7f4b96cb7fb2d90e6b18f2a7d8c2eb3dc9c49b1d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marek=20Marczykowski-G=C3=B3recki?= Date: Tue, 3 Oct 2023 16:36:22 +0200 Subject: [PATCH 35/44] Mount overlay with nvidia libraries Nvidia libraries must match the kernel module version.
To avoid the need to rebuild app containers frequently, mount nvidia libraries as an overlay over base image. --- runtime/init-container/src/init.c | 11 +++++++++-- runtime/src/vmrt.rs | 12 ++++++++++++ 2 files changed, 21 insertions(+), 2 deletions(-) diff --git a/runtime/init-container/src/init.c b/runtime/init-container/src/init.c index e6515a79..160903a5 100644 --- a/runtime/init-container/src/init.c +++ b/runtime/init-container/src/init.c @@ -1996,8 +1996,15 @@ int main(int argc, char **argv) { CHECK(mkdir("/mnt/overlay/work", S_IRWXU)); CHECK(mount("/dev/vda", "/mnt/image", "squashfs", MS_RDONLY, "")); - CHECK(mount("overlay", SYSROOT, "overlay", 0, - "lowerdir=/mnt/image,upperdir=/mnt/overlay/upper,workdir=/mnt/overlay/work")); + if (access("/dev/vdb", R_OK) == 0) { + CHECK(mkdir("/mnt/gpu-files", S_IRWXU)); + CHECK(mount("/dev/vdb", "/mnt/gpu-files", "squashfs", MS_RDONLY, "")); + CHECK(mount("overlay", SYSROOT, "overlay", 0, + "lowerdir=/mnt/image:/mnt/gpu-files,upperdir=/mnt/overlay/upper,workdir=/mnt/overlay/work")); + } else { + CHECK(mount("overlay", SYSROOT, "overlay", 0, + "lowerdir=/mnt/image,upperdir=/mnt/overlay/upper,workdir=/mnt/overlay/work")); + } g_sysroot_fd = CHECK(open(SYSROOT, O_RDONLY | O_DIRECTORY | O_CLOEXEC)); assert(g_sysroot_fd >= 3); diff --git a/runtime/src/vmrt.rs b/runtime/src/vmrt.rs index 354b4f42..e5a23e27 100755 --- a/runtime/src/vmrt.rs +++ b/runtime/src/vmrt.rs @@ -21,6 +21,7 @@ const DIR_RUNTIME: &str = "runtime"; const FILE_RUNTIME: &str = "vmrt"; const FILE_VMLINUZ: &str = "vmlinuz-virt"; const FILE_INITRAMFS: &str = "initramfs.cpio.gz"; +const FILE_NVIDIA_FILES: &str = "nvidia-files.squashfs"; #[derive(Default)] pub struct RuntimeData { @@ -115,6 +116,17 @@ pub async fn start_vmrt( cmd.arg("none"); } + if Path::new(FILE_NVIDIA_FILES).exists() { + cmd.arg("-drive"); + cmd.arg( + format!( + "file={},cache=unsafe,readonly=on,format=raw,if=virtio", + FILE_NVIDIA_FILES + ) + .as_str(), + ); + } + let (vpn, inet) = // 
backward-compatibility mode if vpn_remote.is_none() && inet_remote.is_none() { From 918947258c8246658cff69d3c3903d6dcc49ec2c Mon Sep 17 00:00:00 2001 From: Demi Marie Obenour Date: Wed, 15 Nov 2023 18:45:22 -0500 Subject: [PATCH 36/44] Allow processes to bind to privileged ports The container has CAP_NET_BIND_SERVICE, but since it does not own its network namespace, CAP_NET_BIND_SERVICE is useless. Use /proc/sys/net/ipv4/ip_unprivileged_port_start instead. --- runtime/init-container/src/init.c | 1 + 1 file changed, 1 insertion(+) diff --git a/runtime/init-container/src/init.c b/runtime/init-container/src/init.c index 0806ffa5..0a3b2c26 100644 --- a/runtime/init-container/src/init.c +++ b/runtime/init-container/src/init.c @@ -2095,6 +2095,7 @@ int main(int argc, char **argv) { block_signals(); if (do_sandbox) { + write_sys("/proc/sys/net/ipv4/ip_unprivileged_port_start", 0); get_namespace_fd(); } setup_sigfd(); From 6e19baf9ca3879560049ffbd8e2daa1cfbf19c44 Mon Sep 17 00:00:00 2001 From: Demi Marie Obenour Date: Wed, 15 Nov 2023 18:55:03 -0500 Subject: [PATCH 37/44] Ensure that sandboxed code cannot create new user namespaces User namespaces are a huge attack surface, so disallow creating new ones. 
--- runtime/init-container/src/init.c | 1 + 1 file changed, 1 insertion(+) diff --git a/runtime/init-container/src/init.c b/runtime/init-container/src/init.c index 0a3b2c26..6ff047b1 100644 --- a/runtime/init-container/src/init.c +++ b/runtime/init-container/src/init.c @@ -2096,6 +2096,7 @@ int main(int argc, char **argv) { block_signals(); if (do_sandbox) { write_sys("/proc/sys/net/ipv4/ip_unprivileged_port_start", 0); + write_sys("/proc/sys/user/max_user_namespaces", 1); get_namespace_fd(); } setup_sigfd(); From c7fd377d6de8f99e21e5500e390c57130c7828f8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marek=20Marczykowski-G=C3=B3recki?= Date: Fri, 17 Nov 2023 06:09:11 +0100 Subject: [PATCH 38/44] Fix setting up overlay with nvidia files - make them override (likely older version) in the task image - fix resolving squashfs path Fixes https://github.com/fepitre/golem-gpu-live/issues/8 --- runtime/init-container/src/init.c | 2 +- runtime/src/vmrt.rs | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/runtime/init-container/src/init.c b/runtime/init-container/src/init.c index 6ff047b1..fa81121d 100644 --- a/runtime/init-container/src/init.c +++ b/runtime/init-container/src/init.c @@ -2006,7 +2006,7 @@ int main(int argc, char **argv) { CHECK(mkdir("/mnt/gpu-files", S_IRWXU)); CHECK(mount("/dev/vdb", "/mnt/gpu-files", "squashfs", MS_RDONLY | MS_NODEV, "")); CHECK(mount("overlay", SYSROOT, "overlay", MS_NODEV, - "lowerdir=/mnt/image:/mnt/gpu-files,upperdir=/mnt/overlay/upper,workdir=/mnt/overlay/work")); + "lowerdir=/mnt/gpu-files:/mnt/image,upperdir=/mnt/overlay/upper,workdir=/mnt/overlay/work")); } else { CHECK(mount("overlay", SYSROOT, "overlay", MS_NODEV, "lowerdir=/mnt/image,upperdir=/mnt/overlay/upper,workdir=/mnt/overlay/work")); diff --git a/runtime/src/vmrt.rs b/runtime/src/vmrt.rs index e5a23e27..4f89fcc6 100755 --- a/runtime/src/vmrt.rs +++ b/runtime/src/vmrt.rs @@ -116,12 +116,12 @@ pub async fn start_vmrt( cmd.arg("none"); } - if 
Path::new(FILE_NVIDIA_FILES).exists() { + if runtime_dir.join(FILE_NVIDIA_FILES).exists() { cmd.arg("-drive"); cmd.arg( format!( "file={},cache=unsafe,readonly=on,format=raw,if=virtio", - FILE_NVIDIA_FILES + runtime_dir.join(FILE_NVIDIA_FILES).display() ) .as_str(), ); From 986d8a15ba72bd01f42c4317206f2408efc04ec8 Mon Sep 17 00:00:00 2001 From: Demi Marie Obenour Date: Fri, 1 Dec 2023 15:23:28 -0500 Subject: [PATCH 39/44] Move initramfs contents to separate filesystem This is needed so that pivot_root() can be used later. --- runtime/init-container/Makefile | 4 +- runtime/init-container/src/init.c | 86 ++++++++++++++++++++++++++++++- 2 files changed, 87 insertions(+), 3 deletions(-) diff --git a/runtime/init-container/Makefile b/runtime/init-container/Makefile index aa0efde1..8f24e30c 100644 --- a/runtime/init-container/Makefile +++ b/runtime/init-container/Makefile @@ -1,8 +1,9 @@ CC := musl-gcc CXX := /bin/false LIBSECCOMP_SUBMODULE ?= libseccomp +NEW_ROOT := newroot # -MMD to create dependency files (*.d) on first compilation -CFLAGS := -MMD -std=c11 -O2 -Wall -Wextra -Werror -fPIE -pie -Iinclude/ -Wmaybe-uninitialized -Iunpacked_headers/usr/include -I$(CURDIR)/$(LIBSECCOMP_SUBMODULE)/include +CFLAGS := -MMD -std=c11 -O2 -Wall -Wextra -Werror -fPIE -pie -Iinclude/ -Wmaybe-uninitialized -Iunpacked_headers/usr/include -I$(CURDIR)/$(LIBSECCOMP_SUBMODULE)/include '-DNEW_ROOT="$(NEW_ROOT)"' ifneq ($(DEBUG), "") CFLAGS += -DNDEBUG @@ -128,6 +129,7 @@ initramfs.cpio.gz: init $(UNPACKED_KERNEL) cp $(UNPACKED_KERNEL)/lib/modules/$(KERNEL_VER)/kernel/net/core/failover.ko initramfs cp $(UNPACKED_KERNEL)/lib/modules/$(KERNEL_VER)/kernel/net/ipv6/ipv6.ko initramfs cp $(UNPACKED_KERNEL)/lib/modules/$(KERNEL_VER)/kernel/net/packet/af_packet.ko initramfs + mkdir initramfs/$(NEW_ROOT) set -euo pipefail; cd initramfs && find . 
| cpio --quiet -o -H newc -R 0:0 | gzip -9 > ../$@ $(RM) -rf initramfs diff --git a/runtime/init-container/src/init.c b/runtime/init-container/src/init.c index fa81121d..ee1c217f 100644 --- a/runtime/init-container/src/init.c +++ b/runtime/init-container/src/init.c @@ -21,6 +21,8 @@ #include #include #include +#include +#include #include #include #include @@ -29,6 +31,7 @@ #include #include #include +#include #include "communication.h" #include "cyclic_buffer.h" @@ -124,7 +127,7 @@ static noreturn void die(void) { #define CHECK_BOOL(x) ({ \ __typeof__(x) _x = (x); \ - if (_x == 0) { \ + if (!_x) { \ fprintf(stderr, "Error at %s:%d: %m\n", __FILE__, __LINE__); \ die(); \ } \ @@ -140,6 +143,7 @@ static noreturn void die(void) { } \ _x; \ }) +#pragma GCC poison _x static void load_module(const char* path) { int fd = CHECK(open(path, O_RDONLY | O_CLOEXEC)); @@ -878,6 +882,84 @@ static char* construct_output_path(uint64_t id, unsigned int fd) { return path; } +// This is recursive, but will only ever run on trusted input. +// FIXME: get this fixed in upstream Linux. 
+static void copy_initramfs_recursive(int dirfd, int newdirfd, const char *skip_name) { + CHECK_BOOL(newdirfd != dirfd); + DIR *d = fdopendir(dirfd); + CHECK_BOOL(d != NULL); + for (;;) { + errno = 0; + const struct dirent *entry = readdir(d); + if (entry == NULL) { + CHECK_BOOL(errno == 0); + break; + } + if (strcmp(entry->d_name, ".") == 0 || + strcmp(entry->d_name, "..") == 0 || + strcmp(entry->d_name, skip_name) == 0) + { + continue; // skip this entry + } + struct stat statbuf; + CHECK(fstatat(dirfd, entry->d_name, &statbuf, AT_SYMLINK_NOFOLLOW)); + switch (statbuf.st_mode & S_IFMT) { + case S_IFCHR: + case S_IFBLK: + case S_IFSOCK: + case S_IFIFO: + CHECK(mknodat(newdirfd, entry->d_name, statbuf.st_mode, statbuf.st_rdev)); + break; + case S_IFLNK: { + char *buf = CHECK_BOOL(malloc(statbuf.st_size + 1)); + ssize_t size = CHECK(readlinkat(dirfd, entry->d_name, buf, statbuf.st_size + 1)); + CHECK_BOOL(size == statbuf.st_size); + buf[size] = 0; + CHECK(symlinkat(buf, newdirfd, entry->d_name)); + free(buf); + break; + } + case S_IFREG: { + uint64_t size = statbuf.st_size; + int srcfd = CHECK(openat(dirfd, entry->d_name, O_RDONLY | O_NOFOLLOW | O_CLOEXEC)); + int dstfd = CHECK(openat(newdirfd, entry->d_name, O_WRONLY | O_NOFOLLOW | O_CLOEXEC | O_CREAT, statbuf.st_mode & 07777)); + while (size > 0) { + size_t res = (size_t)CHECK(sendfile(dstfd, srcfd, NULL, size > SIZE_MAX ? SIZE_MAX : size)); + size -= res; + } + close(dstfd); + close(srcfd); + break; + } + case S_IFDIR: { + int old_child_dirfd = CHECK(openat(dirfd, entry->d_name, O_DIRECTORY | O_NOFOLLOW | O_CLOEXEC | O_RDONLY)); + CHECK(mkdirat(newdirfd, entry->d_name, statbuf.st_mode & 07777)); + int new_child_dirfd = CHECK(openat(newdirfd, entry->d_name, O_DIRECTORY | O_NOFOLLOW | O_CLOEXEC | O_RDONLY)); + copy_initramfs_recursive(old_child_dirfd, new_child_dirfd, ""); + break; + } + default: + CHECK_BOOL(false); + break; + } + CHECK(unlinkat(dirfd, entry->d_name, S_ISDIR(statbuf.st_mode) ? 
AT_REMOVEDIR : 0)); + } + CHECK(closedir(d)); + CHECK(close(newdirfd)); +} + +static void copy_initramfs(void) { + int rootfd = CHECK(open("/", O_DIRECTORY | O_NOFOLLOW | O_RDONLY | O_CLOEXEC)); + struct stat stats; + CHECK(fstat(rootfd, &stats)); + CHECK_BOOL(mount("", "/" NEW_ROOT, "tmpfs", 0, "") == 0); + int newdirfd = CHECK(open("/" NEW_ROOT, O_DIRECTORY | O_NOFOLLOW | O_RDONLY | O_CLOEXEC)); + copy_initramfs_recursive(rootfd, newdirfd, NEW_ROOT); + CHECK_BOOL(chdir("/" NEW_ROOT) == 0); + CHECK_BOOL(mount(".", "/", NULL, MS_MOVE, NULL) == 0); + CHECK_BOOL(chroot(".") == 0); +} + static uint32_t spawn_new_process(struct new_process_args* new_proc_args, struct redir_fd_desc fd_descs[3], uint64_t* id) { @@ -889,7 +971,6 @@ static uint32_t spawn_new_process(struct new_process_args* new_proc_args, fprintf(stderr, "Caller bug, returning EEXIST\n"); return EEXIST; } - struct process_desc* proc_desc = calloc(1, sizeof(*proc_desc)); if (!proc_desc) { fprintf(stderr, "Memory allocation failed\n"); @@ -1932,6 +2013,7 @@ int main(int argc, char **argv) { int res = prctl(PR_SET_DUMPABLE, 0, 0, 0, 0); CHECK_BOOL(res == 0 || res == 1); bool nvidia_loaded = false; + copy_initramfs(); create_dir("/dev", DEFAULT_DIR_PERMS); CHECK(mount("devtmpfs", "/dev", "devtmpfs", MS_NOSUID, From 8bae90f4a274e439e3c9ab8716e9b970e1969ab1 Mon Sep 17 00:00:00 2001 From: Demi Marie Obenour Date: Sun, 3 Dec 2023 13:04:46 -0500 Subject: [PATCH 40/44] Use pivot_root() to clear old root filesystem from mount namespace This has two advantages over chroot(): - It allows chroot() to work in the sandbox, which allows OpenSSH's sshd to run. - It removes the old root filesystem from the mount namespace, so chroot escapes are not possible. 
--- runtime/examples/direct.rs | 2 + runtime/init-container/src/init.c | 64 ++++++++++++++++++++++++------- 2 files changed, 53 insertions(+), 13 deletions(-) diff --git a/runtime/examples/direct.rs b/runtime/examples/direct.rs index 069a0144..868f7dc0 100644 --- a/runtime/examples/direct.rs +++ b/runtime/examples/direct.rs @@ -208,6 +208,8 @@ async fn main() -> io::Result<()> { ) .await?; + run_process_with_output(&mut ga, &notifications, "/bin/mount", &["mount"]).await?; + let fds = [ None, Some(RedirectFdType::RedirectFdFile( diff --git a/runtime/init-container/src/init.c b/runtime/init-container/src/init.c index ee1c217f..1902b505 100644 --- a/runtime/init-container/src/init.c +++ b/runtime/init-container/src/init.c @@ -59,7 +59,6 @@ #define DEV_VPN "eth0" #define DEV_INET "eth1" -#define SYSROOT "/mnt/newroot" #define MODE_RW_UGO (S_IRUSR | S_IWUSR | S_IRGRP | S_IWGRP | S_IROTH | S_IWOTH) #define OUTPUT_PATH_PREFIX "/var/tmp/guest_agent_private/fds" @@ -675,7 +674,8 @@ static int child_pipe = -1; #define NAMESPACES \ (CLONE_NEWUSER | /* new user namespace */ \ - 0) + CLONE_NEWNS | /* new mount namespace */ \ + 0) static int capset(cap_user_header_t hdrp, cap_user_data_t datap) { return syscall(SYS_capset, hdrp, datap); @@ -762,22 +762,26 @@ static noreturn void child_wrapper(int parent_pipe[2], if (close(global_pidfd)) { goto out; } + + if (chdir("/") != 0) { + goto out; + } + + if (chroot(".") != 0) { + goto out; + } } else { if (syscall(SYS_close_range, 3U, ~0U, 0U) != 0) { abort(); } - } - - if (chdir(SYSROOT) != 0) { - goto out; - } - if (chroot(".") != 0) { - goto out; - } + if (chroot(SYSROOT) != 0) { + goto out; + } - if (chdir("/") != 0) { - goto out; + if (chdir("/") != 0) { + goto out; + } } if (new_proc_args->cwd) { @@ -958,6 +962,7 @@ static void copy_initramfs(void) { CHECK_BOOL(chdir("/" NEW_ROOT) == 0); CHECK_BOOL(mount(".", "/", NULL, MS_MOVE, NULL) == 0); CHECK_BOOL(chroot(".") == 0); + CHECK_BOOL(mount(NULL, "/", NULL, MS_SHARED, NULL) == 0); }
static uint32_t spawn_new_process(struct new_process_args* new_proc_args, @@ -1951,10 +1956,38 @@ static void get_namespace_fd(void) { }; sigset_t set; CHECK(sigemptyset(&set)); + int fds[2], status = 0; + CHECK_BOOL(pipe2(fds, O_CLOEXEC) == 0); errno = 0; global_zombie_pid = syscall(SYS_clone3, &args, sizeof args); CHECK_BOOL(global_zombie_pid >= 0); if (global_zombie_pid == 0) { + if (close(fds[0])) + abort(); + if (mount(SYSROOT, SYSROOT, NULL, MS_BIND | MS_REC, NULL)) { + status = errno; + goto bad; + } + if (mount(NULL, SYSROOT, NULL, MS_SLAVE | MS_REC, NULL)) { + status = errno; + goto bad; + } + if (chdir(SYSROOT)) + abort(); + if (syscall(SYS_pivot_root, ".", ".")) { + status = errno; + goto bad; + } + if (umount2(".", MNT_DETACH)) { + status = errno; + goto bad; + } + if (chdir("/")) { + status = errno; + } +bad: + if (write(fds[1], &status, sizeof status) != sizeof status || close(fds[1]) != 0) + _exit(1); for (;;) { const struct timespec x = { .tv_sec = INT32_MAX, @@ -1963,8 +1996,13 @@ static void get_namespace_fd(void) { (void)(nanosleep(&x, NULL)); } } - /* parent */ CHECK(global_pidfd); + /* parent */ + CHECK_BOOL(close(fds[1]) == 0); + CHECK_BOOL(read(fds[0], &status, sizeof status) == sizeof status); + errno = status; + CHECK_BOOL(status == 0); + CHECK_BOOL(close(fds[0]) == 0); int snprintf_res = snprintf(buf, sizeof buf, "/proc/%d/uid_map", global_zombie_pid); CHECK_BOOL(snprintf_res > (int)sizeof buf - 10); CHECK_BOOL(snprintf_res < (int)sizeof buf); From e9a3ed5da759f7a67b04f281707b3939043d1594 Mon Sep 17 00:00:00 2001 From: Demi Marie Obenour Date: Tue, 5 Dec 2023 19:40:20 -0500 Subject: [PATCH 41/44] Avoid leaving a pointless process Use bind-mounts of namespace FDs instead. 
--- runtime/init-container/src/init.c | 60 ++++++++++++++++++++++--------- 1 file changed, 43 insertions(+), 17 deletions(-) diff --git a/runtime/init-container/src/init.c b/runtime/init-container/src/init.c index 1902b505..5788ea6d 100644 --- a/runtime/init-container/src/init.c +++ b/runtime/init-container/src/init.c @@ -19,6 +19,7 @@ #include #include #include +#include #include #include #include @@ -297,6 +298,8 @@ static struct exit_reason encode_status(int status, int type) { pid_t global_zombie_pid = -1; pid_t global_pidfd = -1; +int global_userns_fd = -1; +int global_mountns_fd = -1; static void handle_sigchld(void) { struct signalfd_siginfo siginfo = { 0 }; @@ -745,21 +748,29 @@ static noreturn void child_wrapper(int parent_pipe[2], goto out; } } - if (global_pidfd != -1) { - if (syscall(SYS_close_range, (unsigned int)global_pidfd + 1, ~0U, 0) != 0) { + int low_fd = global_userns_fd > global_mountns_fd ? global_mountns_fd : global_userns_fd; + int high_fd = global_userns_fd > global_mountns_fd ? 
global_userns_fd : global_mountns_fd; + if (low_fd < 3) abort(); + if (low_fd > 3 && syscall(SYS_close_range, 3, (unsigned int)low_fd - 1, 0) != 0) { + goto out; + } + if (high_fd - low_fd > 1 && + syscall(SYS_close_range, (unsigned int)low_fd + 1, (unsigned int)high_fd - 1, 0)) + { + goto out; } - if (global_pidfd > 3 && syscall(SYS_close_range, 3U, (unsigned int)(global_pidfd - 1), 0U) != 0) { - abort(); + if (setns(global_mountns_fd, CLONE_NEWNS) || close(global_mountns_fd)) { + goto out; } - if (setns(global_pidfd, NAMESPACES)) { + if (setns(global_userns_fd, CLONE_NEWUSER)) { goto out; } - if (close(global_pidfd)) { + if (close(global_userns_fd)) { goto out; } @@ -1938,6 +1949,10 @@ static void create_dir(const char *pathname, mode_t mode) { } static void get_namespace_fd(void) { + int tmp_fd = CHECK(open("/user_namespace", O_RDWR|O_CREAT|O_NOFOLLOW|O_CLOEXEC|O_EXCL|O_NOCTTY, 0600)); + CHECK(close(tmp_fd)); + tmp_fd = CHECK(open("/mount_namespace", O_RDWR|O_CREAT|O_NOFOLLOW|O_CLOEXEC|O_EXCL|O_NOCTTY, 0600)); + CHECK(close(tmp_fd)); char buf[sizeof "/proc//uid_map" + 10]; struct clone_args args = { .flags = CLONE_CLEAR_SIGHAND | @@ -1957,7 +1972,7 @@ static void get_namespace_fd(void) { sigset_t set; CHECK(sigemptyset(&set)); int fds[2], status = 0; - CHECK_BOOL(pipe2(fds, O_CLOEXEC) == 0); + CHECK_BOOL(socketpair(AF_UNIX, SOCK_STREAM|SOCK_CLOEXEC, 0, fds) == 0); errno = 0; global_zombie_pid = syscall(SYS_clone3, &args, sizeof args); CHECK_BOOL(global_zombie_pid >= 0); @@ -1986,15 +2001,10 @@ static void get_namespace_fd(void) { status = errno; } bad: - if (write(fds[1], &status, sizeof status) != sizeof status || close(fds[1]) != 0) + if (write(fds[1], &status, sizeof status) != sizeof status || shutdown(fds[1], SHUT_WR) != 0) _exit(1); - for (;;) { - const struct timespec x = { - .tv_sec = INT32_MAX, - .tv_nsec = 999999999, - }; - (void)(nanosleep(&x, NULL)); - } + (void)read(fds[1], &status, 1); + _exit(0); } CHECK(global_pidfd); /* parent */ @@ -2002,9 
+2012,8 @@ static void get_namespace_fd(void) { CHECK_BOOL(read(fds[0], &status, sizeof status) == sizeof status); errno = status; CHECK_BOOL(status == 0); - CHECK_BOOL(close(fds[0]) == 0); int snprintf_res = snprintf(buf, sizeof buf, "/proc/%d/uid_map", global_zombie_pid); - CHECK_BOOL(snprintf_res > (int)sizeof buf - 10); + CHECK_BOOL(snprintf_res >= (int)sizeof("/proc/1/uid_map") - 1); CHECK_BOOL(snprintf_res < (int)sizeof buf); for (int i = 0; i < 2; ++i) { int uidmapfd = CHECK(open(buf, O_NOFOLLOW | O_CLOEXEC | O_NOCTTY | O_WRONLY)); @@ -2013,6 +2022,23 @@ static void get_namespace_fd(void) { CHECK_BOOL(close(uidmapfd) == 0); buf[snprintf_res - 7] = 'g'; } + static_assert(sizeof("ns/user") <= sizeof("uid_map"), "string size oops"); + static_assert(sizeof("ns/mnt") <= sizeof("uid_map"), "string size oops"); + snprintf_res = snprintf(buf, sizeof buf, "/proc/%d/ns/user", global_zombie_pid); + CHECK_BOOL(snprintf_res >= (int)sizeof "/proc/1/ns/user" - 1); + CHECK_BOOL(snprintf_res < (int)sizeof "/proc/1/ns/user" + 9); + CHECK(mount(buf, "/user_namespace", NULL, MS_BIND, NULL)); + global_userns_fd = CHECK(open("/user_namespace", O_RDONLY|O_NOFOLLOW|O_CLOEXEC|O_NOCTTY)); + snprintf_res = snprintf(buf, sizeof buf, "/proc/%d/ns/mnt", global_zombie_pid); + CHECK_BOOL(snprintf_res >= (int)sizeof "/proc/1/ns/mnt" - 1); + CHECK_BOOL(snprintf_res < (int)sizeof "/proc/1/ns/mnt" + 9); + CHECK(mount(buf, "/mount_namespace", NULL, MS_BIND, NULL)); + global_mountns_fd = CHECK(open("/mount_namespace", O_RDONLY|O_NOFOLLOW|O_CLOEXEC|O_NOCTTY)); + CHECK(write(fds[0], "", 1)); + int v; + CHECK_BOOL(waitpid(global_zombie_pid, &v, 0) == global_zombie_pid); + CHECK_BOOL(WIFEXITED(v)); + CHECK_BOOL(WEXITSTATUS(v) == 0); } static int find_device_major(const char *name) { From 566953194639bb74fba531c86441c9ab7027688f Mon Sep 17 00:00:00 2001 From: Demi Marie Obenour Date: Sat, 9 Dec 2023 22:11:13 -0500 Subject: [PATCH 42/44] Allow chroot() syscall Now that pivot_root() is used instead of 
chroot(), allowing chroot() is safe. --- runtime/init-container/src/seccomp.c | 1 + 1 file changed, 1 insertion(+) diff --git a/runtime/init-container/src/seccomp.c b/runtime/init-container/src/seccomp.c index 760c9fca..7017554b 100644 --- a/runtime/init-container/src/seccomp.c +++ b/runtime/init-container/src/seccomp.c @@ -37,6 +37,7 @@ static const char *allow_syscalls[] = { "chmod", "chown", "chown32", + "chroot", "clock_adjtime", "clock_adjtime64", "clock_getres", From 0a0241a857e9e25a99e4afc2f679bb5e750ae0cd Mon Sep 17 00:00:00 2001 From: Demi Marie Obenour Date: Sun, 10 Dec 2023 11:48:02 -0500 Subject: [PATCH 43/44] Disallow mount-related system calls These should be blocked by not having CAP_SYS_ADMIN, but better safe than sorry. --- runtime/init-container/src/seccomp.c | 34 ++++++++++++++-------------- 1 file changed, 17 insertions(+), 17 deletions(-) diff --git a/runtime/init-container/src/seccomp.c b/runtime/init-container/src/seccomp.c index 7017554b..08879927 100644 --- a/runtime/init-container/src/seccomp.c +++ b/runtime/init-container/src/seccomp.c @@ -90,11 +90,7 @@ static const char *allow_syscalls[] = { "flock", "fork", "fremovexattr", - "fsconfig", "fsetxattr", - "fsmount", - "fsopen", - "fspick", "fstat", "fstat64", "fstatat64", @@ -189,9 +185,6 @@ static const char *allow_syscalls[] = { "mlockall", "mmap", "mmap2", - "mount", - "mount_setattr", - "move_mount", "mprotect", "mq_getsetattr", "mq_notify", @@ -385,8 +378,6 @@ static const char *allow_syscalls[] = { "truncate64", "ugetrlimit", "umask", - "umount", - "umount2", "uname", "unlink", "unlinkat", @@ -417,10 +408,19 @@ static const char *x86_syscalls[] = { static const char *eperm_syscalls[] = { "bdflush", + "bpf", + "fanotify_init", + "fsconfig", + "fsmount", + "fsopen", + "fspick", "io_pgetevents", "kexec_file_load", "kexec_load", "migrate_pages", + "mount", + "mount_setattr", + "move_mount", "move_pages", "nfsservctl", "nice", @@ -432,26 +432,26 @@ static const char *eperm_syscalls[] = { 
"pciconfig_iobase", "pciconfig_read", "pciconfig_write", + "perf_event_open", + "quotactl", + "setdomainname", + "sethostname", + "setns", "sgetmask", "ssetmask", "swapcontext", "swapoff", "swapon", "sysfs", + "umount", + "umount2", + "unshare", "uselib", "userfaultfd", "ustat", "vm86", "vm86old", "vmsplice", - "bpf", - "fanotify_init", - "perf_event_open", - "quotactl", - "setdomainname", - "sethostname", - "setns", - "unshare", }; #define ARRAY_SIZE(x) (sizeof(x)/sizeof(x[0])) From 613cf642fb3e0ff3ce70210a5a298fbe63afaba7 Mon Sep 17 00:00:00 2001 From: Demi Marie Obenour Date: Sun, 10 Dec 2023 11:48:40 -0500 Subject: [PATCH 44/44] Add missing capabilities(7) This allows chown(), setting file capabilities, and BSD process accounting to work. --- runtime/init-container/src/init.c | 47 ++++++++++++++++++++++++------- 1 file changed, 37 insertions(+), 10 deletions(-) diff --git a/runtime/init-container/src/init.c b/runtime/init-container/src/init.c index 5788ea6d..08b00f46 100644 --- a/runtime/init-container/src/init.c +++ b/runtime/init-container/src/init.c @@ -821,19 +821,46 @@ static noreturn void child_wrapper(int parent_pipe[2], for (int i = 0; i < _LINUX_CAPABILITY_U32S_3 * 32; ++i) { switch (i) { - case CAP_SETUID: - case CAP_SETGID: - case CAP_SYS_NICE: - case CAP_SYS_CHROOT: - case CAP_SYS_RESOURCE: - case CAP_NET_BIND_SERVICE: - case CAP_KILL: - case CAP_FSETID: + // CAP_AUDIT_CONTROL: no + // CAP_AUDIT_READ: no + // CAP_AUDIT_WRITE: no + case CAP_BLOCK_SUSPEND: + // case CAP_BPF: + // case CAP_CHECKPOINT_RESTORE: + case CAP_CHOWN: case CAP_DAC_OVERRIDE: case CAP_DAC_READ_SEARCH: - case CAP_CHOWN: + case CAP_FOWNER: + case CAP_FSETID: case CAP_IPC_LOCK: - case CAP_IPC_OWNER: { + case CAP_IPC_OWNER: + case CAP_KILL: + case CAP_LEASE: + case CAP_LINUX_IMMUTABLE: + // case CAP_MKNOD: + // case CAP_NET_ADMIN: + case CAP_NET_BIND_SERVICE: + case CAP_NET_BROADCAST: + case CAP_NET_RAW: + // case CAP_PERFMON: + case CAP_SETGID: + case CAP_SETFCAP: + case
CAP_SETPCAP: + case CAP_SETUID: + // case CAP_SYS_ADMIN: + case CAP_SYS_BOOT: + case CAP_SYS_CHROOT: + // case CAP_SYS_MODULE: + case CAP_SYS_NICE: + case CAP_SYS_PACCT: + case CAP_SYS_PTRACE: + // case CAP_SYS_RAWIO: + case CAP_SYS_RESOURCE: + // case CAP_SYS_TIME: + // case CAP_SYS_TTY_CONFIG: + // case CAP_SYSLOG: + case CAP_WAKE_ALARM: + { data[i / 32].permitted |= (UINT32_C(1) << (i % 32)); data[i / 32].effective |= (UINT32_C(1) << (i % 32)); break;