diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index 14f8ff72..015bd988 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -47,10 +47,10 @@ jobs: } build-init: name: Build container Init - runs-on: ubuntu-20.04 + runs-on: ubuntu-22.04 steps: - name: Install Musl - run: sudo apt-get install -y musl-tools musl + run: sudo apt-get install -y musl-tools musl autoconf gperf libtool automake - uses: actions/checkout@v1 - name: Make run: | diff --git a/.gitmodules b/.gitmodules index 17d1e89e..4ee70134 100644 --- a/.gitmodules +++ b/.gitmodules @@ -1,3 +1,6 @@ [submodule "runtime/init-container/liburing"] path = runtime/init-container/liburing url = https://github.com/axboe/liburing +[submodule "runtime/init-container/libseccomp"] + path = runtime/init-container/libseccomp + url = https://github.com/seccomp/libseccomp.git diff --git a/Cargo.toml b/Cargo.toml index a4f7a94b..c4e2676d 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -3,6 +3,7 @@ members = [ "runtime", "gvmkit", ] +resolver = "2" [patch.crates-io] ya-runtime-sdk = { git = "https://github.com/golemfactory/ya-runtime-sdk.git", rev = "0395b0c704ef644d7f0554ac41e319f03b11c068" } diff --git a/runtime/examples/direct.rs b/runtime/examples/direct.rs index 502b8415..868f7dc0 100644 --- a/runtime/examples/direct.rs +++ b/runtime/examples/direct.rs @@ -63,7 +63,6 @@ async fn run_process_with_output( .expect("Run process failed"); println!("Spawned process with id: {}", id); notifications.process_died.notified().await; - notifications.output_available.notified().await; match ga.query_output(id, 1, 0, u64::MAX).await? { Ok(out) => { println!("Output:"); @@ -96,11 +95,10 @@ fn join_as_string>(path: P, file: impl ToString) -> String { fn spawn_vm<'a, P: AsRef>(temp_path: P, mount_args: &'a [(&'a str, impl ToString)]) -> Child { let root_dir = get_root_dir(); let project_dir = get_project_dir(); - let runtime_dir = project_dir.join("poc").join("runtime"); let init_dir = project_dir.join("init-container"); - let mut cmd = Command::new("vmrt"); - cmd.current_dir(runtime_dir).args([ + let mut cmd = Command::new("qemu-system-x86_64"); + cmd.current_dir(&init_dir).args([ "-m", "256m", "-nographic", @@ -113,9 +111,6 @@ fn spawn_vm<'a, P: AsRef>(temp_path: P, mount_args: &'a [(&'a str, impl To "-no-reboot", "-net", "none", - "-enable-kvm", - "-cpu", - "host", "-smp", "1", "-append", @@ -126,7 +121,7 @@ fn spawn_vm<'a, P: AsRef>(temp_path: P, mount_args: &'a [(&'a str, impl To "virtio-rng-pci", "-chardev", format!( - "socket,path={},server,nowait,id=manager_cdev", + "socket,path={},server=true,wait=false,id=manager_cdev", temp_path.as_ref().join("manager.sock").display() ) .as_str(), @@ -213,6 +208,8 @@ async fn main() -> io::Result<()> { ) .await?; + run_process_with_output(&mut ga, ¬ifications, "/bin/mount", &["mount"]).await?; + let fds = [ None, Some(RedirectFdType::RedirectFdFile( @@ -324,15 +321,15 @@ async fn main() -> io::Result<()> { .expect("Output query failed"); println!("Big output 2: {}, expected 0", out.len()); - // ga.quit().await?.expect("Quit failed"); - let id = ga - .run_entrypoint("/bin/sleep", &["sleep", "2"], None, 0, 0, &no_redir, None) + .run_entrypoint("/bin/sleep", &["sleep", "100"], None, 0, 0, &no_redir, None) .await? .expect("Run process failed"); println!("Spawned process with id: {}", id); notifications.process_died.notified().await; + ga.quit().await?.expect("Quit failed"); + /* VM should quit now. */ let e = child.wait().await.expect("failed to wait on child"); println!("{:?}", e); diff --git a/runtime/init-container/Makefile b/runtime/init-container/Makefile index cc91055c..8f24e30c 100644 --- a/runtime/init-container/Makefile +++ b/runtime/init-container/Makefile @@ -1,7 +1,9 @@ CC := musl-gcc CXX := /bin/false +LIBSECCOMP_SUBMODULE ?= libseccomp +NEW_ROOT := newroot # -MMD to create dependency files (*.d) on first compilation -CFLAGS := -MMD -std=c11 -O2 -Wall -Wextra -Werror -fPIE -pie -Iinclude/ +CFLAGS := -MMD -std=c11 -O2 -Wall -Wextra -Werror -fPIE -pie -Iinclude/ -Wmaybe-uninitialized -Iunpacked_headers/usr/include -I$(CURDIR)/$(LIBSECCOMP_SUBMODULE)/include '-DNEW_ROOT="$(NEW_ROOT)"' ifneq ($(DEBUG), "") CFLAGS += -DNDEBUG @@ -26,7 +28,7 @@ LIBURING_SUBMODULE ?= liburing SRC_DIR ?= src TEST_DIR ?= tests -OBJECTS = $(addprefix $(SRC_DIR)/,init.o communication.o process_bookkeeping.o cyclic_buffer.o) +OBJECTS = $(addprefix $(SRC_DIR)/,init.o communication.o process_bookkeeping.o cyclic_buffer.o seccomp.o) OBJECTS_EXT = $(addprefix $(SRC_DIR)/,network.o forward.o) # Add headers to object dependencies for conditional recompilation on header change @@ -54,7 +56,8 @@ $(SRC_DIR)/network.o: $(SRC_DIR)/network.c -I"$(CURDIR)/$(UNPACKED_HEADERS)/usr/include" \ -o $@ -c $< -$(SRC_DIR)/forward.o: $(SRC_DIR)/forward.c uring +$(SRC_DIR)/seccomp.o: $(CURDIR)/$(LIBSECCOMP_SUBMODULE)/include/seccomp.h +$(SRC_DIR)/forward.o: $(SRC_DIR)/forward.c uring $(CURDIR)/$(LIBSECCOMP_SUBMODULE)/src/.libs/libseccomp.a $(QUIET_CC)$(CC) -MMD -O2 -Wall -Wextra -Werror -fPIE -pie \ -I"$(CURDIR)/$(UNPACKED_HEADERS)/usr/include/" \ -I"$(CURDIR)/$(LIBURING_SUBMODULE)/src/include/" \ @@ -65,9 +68,9 @@ $(SRC_DIR)/forward.o: $(SRC_DIR)/forward.c uring %.o: %.c $(QUIET_CC)$(CC) $(CFLAGS) -o $@ -c $< -init: $(UNPACKED_HEADERS) uring $(OBJECTS) $(OBJECTS_EXT) +init: $(UNPACKED_HEADERS) uring $(OBJECTS) $(OBJECTS_EXT) $(CURDIR)/$(LIBSECCOMP_SUBMODULE)/src/.libs/libseccomp.a @echo init - $(QUIET_CC)$(CC) $(CFLAGS) -static -o $@ $(wordlist 3, $(words $^), $^) "$(CURDIR)/$(LIBURING_SUBMODULE)/src/liburing.a" + $(QUIET_CC)$(CC) $(CFLAGS) -static -o $@ $(wordlist 3, $(words $^), $^) "$(CURDIR)/$(LIBURING_SUBMODULE)/src/liburing.a" "$(CURDIR)/$(LIBSECCOMP_SUBMODULE)/src/.libs/libseccomp.a" @# default musl libs on some distros have debug symbols, lets strip them (and everything else) strip $@ @@ -91,6 +94,15 @@ uring: $(UNPACKED_HEADERS) (cd $(LIBURING_SUBMODULE) && CC=$(CC) CXX=$(CXX) ./configure > /dev/null) $(MAKE) -e CC=$(CC) -e CFLAGS=-I"$(CURDIR)/$(UNPACKED_HEADERS)/usr/include" -C "$(LIBURING_SUBMODULE)/src" all +SHELL := /bin/bash +$(CURDIR)/$(LIBSECCOMP_SUBMODULE)/src/.libs/libseccomp.a $(CURDIR)/$(LIBSECCOMP_SUBMODULE)/include/seccomp.h: $(UNPACKED_HEADERS) $(LIBSECCOMP_SUBMODULE) + set -euo pipefail; \ + cd $(LIBSECCOMP_SUBMODULE); \ + export CC=$(CC) CXX=$(CXX) CFLAGS=-I"$$PWD/../$(UNPACKED_HEADERS)/usr/include";\ + ./autogen.sh; \ + ./configure --disable-python;\ + $(MAKE) all + vmlinuz-virt: $(UNPACKED_KERNEL) cp $(UNPACKED_KERNEL)/boot/vmlinuz-virt . @@ -117,7 +129,8 @@ initramfs.cpio.gz: init $(UNPACKED_KERNEL) cp $(UNPACKED_KERNEL)/lib/modules/$(KERNEL_VER)/kernel/net/core/failover.ko initramfs cp $(UNPACKED_KERNEL)/lib/modules/$(KERNEL_VER)/kernel/net/ipv6/ipv6.ko initramfs cp $(UNPACKED_KERNEL)/lib/modules/$(KERNEL_VER)/kernel/net/packet/af_packet.ko initramfs - cd initramfs && find . | cpio --quiet -o -H newc -R 0:0 | gzip -9 > ../$@ + mkdir initramfs/$(NEW_ROOT) + set -euo pipefail; cd initramfs && find . | cpio --quiet -o -H newc -R 0:0 | gzip -9 > ../$@ $(RM) -rf initramfs TESTS_NAMES := cyclic_buffer @@ -137,6 +150,7 @@ clean: $(RM) init $(SRC_DIR)/*.o $(SRC_DIR)/*.d $(TEST_DIR)/*.o *.o $(TESTS) $(RM) vmlinuz-virt initramfs.cpio.gz $(MAKE) -s -C $(LIBURING_SUBMODULE) clean + $(MAKE) -s -C $(LIBSECCOMP_SUBMODULE) clean .PHONY: distclean distclean: diff --git a/runtime/init-container/include/init-seccomp.h b/runtime/init-container/include/init-seccomp.h new file mode 100644 index 00000000..176a461f --- /dev/null +++ b/runtime/init-container/include/init-seccomp.h @@ -0,0 +1,7 @@ +#ifndef GOLEM_INIT_SANDBOX_H +#define GOLEM_INIT_SANDBOX_H GOLEM_INIT_SANDBOX_H +// Prepares for sandbox setup +void setup_sandbox(void); +// Actually enforces the sandbox. +void sandbox_apply(void); +#endif diff --git a/runtime/init-container/libseccomp b/runtime/init-container/libseccomp new file mode 160000 index 00000000..f1c3196d --- /dev/null +++ b/runtime/init-container/libseccomp @@ -0,0 +1 @@ +Subproject commit f1c3196d9b95de22dde8f23c5befcbeabef5711c diff --git a/runtime/init-container/src/init.c b/runtime/init-container/src/init.c index 92d94fb0..08b00f46 100644 --- a/runtime/init-container/src/init.c +++ b/runtime/init-container/src/init.c @@ -14,13 +14,25 @@ #include #include #include +#include #include #include #include #include +#include #include #include +#include +#include #include +#include +#include +#include +#include +#include +#include +#include +#include #include "communication.h" #include "cyclic_buffer.h" @@ -28,12 +40,11 @@ #include "process_bookkeeping.h" #include "proto.h" #include "forward.h" +#include "init-seccomp.h" +#define SYSROOT "/mnt/newroot" #define CONTAINER_OF(ptr, type, member) (type*)((char*)(ptr) - offsetof(type, member)) -// XXX: maybe obtain this with sysconf? -#define PAGE_SIZE 0x1000 - #define DEFAULT_UID 0 #define DEFAULT_GID 0 #define DEFAULT_OUT_FILE_PERM S_IRWXU @@ -58,6 +69,7 @@ #define MTU_VPN 1220 #define MTU_INET 65521 +static int g_sysroot_fd = AT_FDCWD; struct new_process_args { char* bin; @@ -113,6 +125,16 @@ static noreturn void die(void) { } } +#define CHECK_BOOL(x) ({ \ + __typeof__(x) _x = (x); \ + if (!_x) { \ + fprintf(stderr, "Error at %s:%d: %m\n", __FILE__, __LINE__); \ + die(); \ + } \ + _x; \ +}) + + #define CHECK(x) ({ \ __typeof__(x) _x = (x); \ if (_x == -1) { \ @@ -121,11 +143,12 @@ static noreturn void die(void) { } \ _x; \ }) +#pragma GCC poison _x static void load_module(const char* path) { int fd = CHECK(open(path, O_RDONLY | O_CLOEXEC)); - CHECK(syscall(SYS_finit_module, fd, "", 0)); - CHECK(close(fd)); + CHECK_BOOL(syscall(SYS_finit_module, fd, "", 0) == 0); + CHECK_BOOL(close(fd) == 0); } int make_nonblocking(int fd) { @@ -154,6 +177,25 @@ int make_cloexec(int fd) { } */ +static int open_relative(const char *path, uint64_t flags, uint64_t mode) { + /* + * Arch's musl 1.2.4-1 doesn't include , so + * open-code the parts that are needed. + */ + struct { + uint64_t flags; + uint64_t mode; + uint64_t resolve; + } how; + memset(&how, 0, sizeof how); + how.flags = flags | O_NOCTTY | O_CLOEXEC; + how.mode = mode; + how.resolve = 0x10 /* RESOLVE_IN_ROOT */; + long r = syscall(SYS_openat2, g_sysroot_fd, path, &how, sizeof how); + CHECK_BOOL(r >= -1 && r <= INT_MAX); + return r; +} + static void cleanup_fd_desc(struct redir_fd_desc* fd_desc) { switch (fd_desc->type) { case REDIRECT_FD_FILE: @@ -176,18 +218,20 @@ static void cleanup_fd_desc(struct redir_fd_desc* fd_desc) { } static bool redir_buffers_empty(struct redir_fd_desc *redirs, size_t len) { - FILE *f; for (size_t fd = 0; fd < len; ++fd) { switch (redirs[fd].type) { - case REDIRECT_FD_FILE: - if ((f = fopen(redirs[fd].path, "r")) == 0) { + case REDIRECT_FD_FILE:; + int this_fd = open_relative(redirs[fd].path, O_RDONLY, 0); + if (this_fd == -1) { continue; } - fseek(f, 0, SEEK_END); - bool empty = ftell(f) == 0; - fclose(f); - - if (!empty) { + struct stat statbuf; + int res = fstat(this_fd, &statbuf); + close(this_fd); + if (res != 0) { + continue; + } + if (statbuf.st_size) { return false; } break; @@ -252,6 +296,11 @@ static struct exit_reason encode_status(int status, int type) { return exit_reason; } +pid_t global_zombie_pid = -1; +pid_t global_pidfd = -1; +int global_userns_fd = -1; +int global_mountns_fd = -1; + static void handle_sigchld(void) { struct signalfd_siginfo siginfo = { 0 }; @@ -267,6 +316,10 @@ static void handle_sigchld(void) { } pid_t child_pid = (pid_t)siginfo.ssi_pid; + if (child_pid == global_zombie_pid) { + /* This process is deliberately kept as a zombie, ignore it */ + return; + } if (siginfo.ssi_code != CLD_EXITED && siginfo.ssi_code != CLD_KILLED @@ -299,6 +352,7 @@ static void handle_sigchld(void) { } if (redir_buffers_empty(proc_desc->redirs, 3)) { + fprintf(stderr, "Deleting process %" PRIu64 "\n", proc_desc->id); delete_proc(proc_desc); } } @@ -318,27 +372,59 @@ static void setup_sigfd(void) { g_sig_fd = CHECK(signalfd(g_sig_fd, &set, SFD_CLOEXEC)); } -static int create_dir_path(char* path) { +static int create_dir_path(char* path, int perms, int *out_fd) { assert(path[0] == '/'); char* next = path; - while (1) { - next = strchr(next + 1, '/'); - if (!next) { - break; + int fd = g_sysroot_fd; + int rc = -1; + char *prev; + do { + next++; + prev = next; + next = strchr(next, '/'); + if (next != NULL) { + *next = '\0'; } - *next = '\0'; - int ret = mkdir(path, DEFAULT_DIR_PERMS); - *next = '/'; - if (ret < 0 && errno != EEXIST) { - return -1; + if (*prev == '\0' || strcmp(prev, ".") == 0 || strcmp(prev, "..") == 0) { + fprintf(stderr, "Invalid path component '%s'\n", prev); + errno = EINVAL; + goto fail; + } + int ret = mkdirat(fd, prev, perms); + if (ret != 0 && errno != EEXIST) { + int tmp = errno; + assert(errno != EBADF); + fprintf(stderr, "mkdirat() failed: %m\n"); + errno = tmp; + goto fail; } - } - if (mkdir(path, DEFAULT_DIR_PERMS) < 0 && errno != EEXIST) { - return -1; - } - return 0; + int new_fd = openat(fd, prev, O_DIRECTORY | O_RDONLY | O_NOFOLLOW | O_CLOEXEC); + if (new_fd == -1) { + int tmp = errno; + assert(tmp != EBADF); + fprintf(stderr, "openat() failed: %m\n"); + errno = tmp; + goto fail; + } + if (fd != g_sysroot_fd) { + close(fd); + } + fd = new_fd; + } while (next); + rc = 0; + if (out_fd) { + *out_fd = fd; + fd = g_sysroot_fd; + } +fail: + if (fd != g_sysroot_fd) { + int save = errno; + close(fd); + errno = save; + } + return rc; } static void setup_agent_directories(void) { @@ -348,42 +434,49 @@ static void setup_agent_directories(void) { die(); } - CHECK(create_dir_path(path)); + CHECK(create_dir_path(path, DEFAULT_DIR_PERMS, NULL)); free(path); } static int add_network_hosts(char *entries[][2], int n) { FILE *f; - if ((f = fopen("/etc/hosts", "a")) == 0) { + if ((f = fopen(SYSROOT "/etc/hosts", "a")) == 0) { return -1; } for (int i = 0; i < n; ++i) { - fprintf(f, "%s\t%s\n", entries[i][0], entries[i][1]); + if (fprintf(f, "%s\t%s\n", entries[i][0], entries[i][1]) < 2) { + return -1; + } } - fflush(f); - fsync(fileno(f)); - fclose(f); + if (fflush(f)) { + return -1; + } + if (fsync(fileno(f))) { + return -1; + } + if (fclose(f)) { + return -1; + } return 0; } static int set_network_ns(char *entries[], int n) { FILE *f; - if ((f = fopen("/etc/resolv.conf", "w")) == 0) { + if ((f = fopen(SYSROOT "/etc/resolv.conf", "w")) == 0) { return -1; } - fprintf(f, "search example.com\n"); for (int i = 0; i < n; ++i) { - fprintf(f, "nameserver %s\n", entries[i]); + CHECK_BOOL(fprintf(f, "nameserver %s\n", entries[i]) > 0); } - fflush(f); - fsync(fileno(f)); - fclose(f); + CHECK_BOOL(fflush(f) == 0); + CHECK_BOOL(fsync(fileno(f)) == 0); + CHECK_BOOL(fclose(f) == 0); return 0; } @@ -394,9 +487,9 @@ int write_sys(char *path, size_t value) { return -1; } - fprintf(f, "%ld", value); - fflush(f); - fclose(f); + CHECK_BOOL(fprintf(f, "%ld", value) > 0); + CHECK_BOOL(fflush(f) == 0); + CHECK_BOOL(fclose(f) == 0); return 0; } @@ -544,12 +637,17 @@ static int del_epoll_fd_desc(struct epoll_fd_desc* epoll_fd_desc) { * Returns whether call was successful (setting errno on failures). */ static bool redirect_fd_to_path(int fd, const char* path) { assert(fd == 0 || fd == 1 || fd == 2); + if (path[0] != '/' || path[1] == '/') { + errno = EINVAL; + return false; + } + path++; int source_fd = -1; if (fd == 0) { - source_fd = open(path, O_RDONLY); + source_fd = open_relative(path, O_RDONLY, 0); } else { - source_fd = open(path, O_WRONLY | O_CREAT, DEFAULT_OUT_FILE_PERM); + source_fd = open_relative(path, O_WRONLY | O_CREAT, DEFAULT_OUT_FILE_PERM); } if (source_fd < 0) { @@ -577,42 +675,57 @@ static bool redirect_fd_to_path(int fd, const char* path) { // lives in a separate memory segment (after forking) static int child_pipe = -1; -static void close_child_pipe() { - if (child_pipe != -1) { - char c = '\0'; - /* Can't do anything with errors here. */ - (void)write(child_pipe, &c, sizeof(c)); - close(child_pipe); - } -} +#define NAMESPACES \ + (CLONE_NEWUSER | /* new user namespace */ \ + CLONE_NEWNS | /* new mount namespace */ \ + 0) +static int capset(cap_user_header_t hdrp, cap_user_data_t datap) { + return syscall(SYS_capset, hdrp, datap); +} static noreturn void child_wrapper(int parent_pipe[2], struct new_process_args* new_proc_args, struct redir_fd_desc fd_descs[3]) { child_pipe = parent_pipe[1]; - atexit(close_child_pipe); +#define MASSIVEDEBUGGING +#ifdef MASSIVEDEBUGGING +#define X(a) do { \ + int tmp = errno;\ + if (write(2, a "\n", sizeof(a)) != sizeof(a)) { \ + goto out; \ + } \ + errno = tmp; \ +} while (0) +#else +#define X(a) do (void)(a ""); while (0) +#endif if (close(parent_pipe[0]) < 0) { + X("close problem"); goto out; } sigset_t set; if (sigemptyset(&set) < 0) { + X("sigemptyset problem"); goto out; } if (sigprocmask(SIG_SETMASK, &set, NULL) < 0) { + X("sigprocmask problem"); goto out; } - - if (new_proc_args->cwd) { - if (chdir(new_proc_args->cwd) < 0) { - goto out; - } - } - + X("fd processing"); for (int fd = 0; fd < 3; ++fd) { + X("processing an FD"); switch (fd_descs[fd].type) { case REDIRECT_FD_FILE: + X("redirecting an FD to a file"); +#ifdef MASSIVEDEBUGGING + if ((size_t)write(2, fd_descs[fd].path, strlen(fd_descs[fd].path)) != strlen(fd_descs[fd].path)) { + goto out; + } + X(""); +#endif if (!redirect_fd_to_path(fd, fd_descs[fd].path)) { goto out; } @@ -620,18 +733,73 @@ static noreturn void child_wrapper(int parent_pipe[2], case REDIRECT_FD_PIPE_BLOCKING: case REDIRECT_FD_PIPE_CYCLIC: if (dup2(fd_descs[fd].buffer.fds[fd ? 1 : 0], fd) < 0) { + X("dup2 problem"); goto out; } if (close(fd_descs[fd].buffer.fds[0]) < 0 || close(fd_descs[fd].buffer.fds[1]) < 0) { + X("close problem"); goto out; } break; default: + X("bad command"); errno = ENOTRECOVERABLE; goto out; } } + if (global_pidfd != -1) { + int low_fd = global_userns_fd > global_mountns_fd ? global_mountns_fd : global_userns_fd; + int high_fd = global_userns_fd > global_mountns_fd ? global_userns_fd : global_mountns_fd; + if (low_fd < 3) + abort(); + if (low_fd > 3 && syscall(SYS_close_range, 3, (unsigned int)low_fd - 1, 0) != 0) { + goto out; + } + if (high_fd - low_fd > 1 && + syscall(SYS_close_range, (unsigned int)low_fd + 1, (unsigned int)high_fd - 1, 0)) + { + goto out; + } + + if (setns(global_mountns_fd, CLONE_NEWNS) || close(global_mountns_fd)) { + goto out; + } + + if (setns(global_userns_fd, CLONE_NEWUSER)) { + goto out; + } + + if (close(global_userns_fd)) { + goto out; + } + + if (chdir("/") != 0) { + goto out; + } + + if (chroot(".") != 0) { + goto out; + } + } else { + if (syscall(SYS_close_range, 3U, ~0U, 0U) != 0) { + abort(); + } + + if (chroot(SYSROOT) != 0) { + goto out; + } + + if (chdir("/") != 0) { + goto out; + } + } + + if (new_proc_args->cwd) { + if (chdir(new_proc_args->cwd) < 0) { + goto out; + } + } gid_t gid = new_proc_args->gid; if (setresgid(gid, gid, gid) < 0) { @@ -643,13 +811,86 @@ static noreturn void child_wrapper(int parent_pipe[2], goto out; } + if (global_pidfd != -1) { + sandbox_apply(); + + struct __user_cap_header_struct hdr = { + .version = _LINUX_CAPABILITY_VERSION_3, + }; + struct __user_cap_data_struct data[_LINUX_CAPABILITY_U32S_3] = { 0 }; + + for (int i = 0; i < _LINUX_CAPABILITY_U32S_3 * 32; ++i) { + switch (i) { + // CAP_AUDIT_CONTROL: no + // CAP_AUDIT_READ: no + // CAP_AUDIT_WRITE: no + case CAP_BLOCK_SUSPEND: + // case CAP_BPF: + // case CAP_CHECKPOINT_RESTORE: + case CAP_CHOWN: + case CAP_DAC_OVERRIDE: + case CAP_DAC_READ_SEARCH: + case CAP_FOWNER: + case CAP_FSETID: + case CAP_IPC_LOCK: + case CAP_IPC_OWNER: + case CAP_KILL: + case CAP_LEASE: + case CAP_LINUX_IMMUTABLE: + // case CAP_MKNOD: + // cas CAP_NET_ADMIN: + case CAP_NET_BIND_SERVICE: + case CAP_NET_BROADCAST: + case CAP_NET_RAW: + // case CAP_PERFMON: + case CAP_SETGID: + case CAP_SETFCAP: + case CAP_SETPCAP: + case CAP_SETUID: + // case CAP_SYS_ADMIN: + case CAP_SYS_BOOT: + case CAP_SYS_CHROOT: + // case CAP_SYS_MODULE: + case CAP_SYS_NICE: + case CAP_SYS_PACCT: + case CAP_SYS_PTRACE: + // case CAP_SYS_RAWIO + case CAP_SYS_RESOURCE: + // case CAP_SYS_TIME: + // case CAP_SYS_TTY_CONFIG: + // case CAP_SYSLOG: + case CAP_WAKE_ALARM: + { + data[i / 32].permitted |= (UINT32_C(1) << (i % 32)); + data[i / 32].effective |= (UINT32_C(1) << (i % 32)); + break; + } + default:; + int res = prctl(PR_CAPBSET_DROP, i); + if (res != 0 && (res != -1 && errno == EINVAL)) + goto out; + } + } + + if (capset(&hdr, &*data)) { + goto out; + } + } + /* If execve returns we know an error happened. */ (void)execve(new_proc_args->bin, new_proc_args->argv, new_proc_args->envp ?: environ); + out: - exit(errno); + if (child_pipe != -1) { + char c = '\0'; + /* Can't do anything with errors here. */ + (void)write(child_pipe, &c, sizeof(c)); + close(child_pipe); + } + _exit(errno); } /* 0 is considered an invalid ID. */ @@ -664,7 +905,7 @@ static int create_process_fds_dir(uint64_t id) { return -1; } - if (mkdir(path, S_IRWXU) < 0) { + if (create_dir_path(path, S_IRWXU, NULL) < 0) { int tmp = errno; free(path); errno = tmp; @@ -683,6 +924,85 @@ static char* construct_output_path(uint64_t id, unsigned int fd) { return path; } +// This is recursive, but will only ever run on trusted input. +// FIXME: get this fixed in upstream Linux. +static void copy_initramfs_recursive(int dirfd, int newdirfd, const char *skip_name) { + CHECK_BOOL(newdirfd != dirfd); + DIR *d = fdopendir(dirfd); + CHECK_BOOL(d != NULL); + for (;;) { + errno = 0; + const struct dirent *entry = readdir(d); + if (entry == NULL) { + CHECK_BOOL(errno == 0); + break; + } + if (strcmp(entry->d_name, ".") == 0 || + strcmp(entry->d_name, "..") == 0 || + strcmp(entry->d_name, skip_name) == 0) + { + continue; // skip this entry + } + struct stat statbuf; + CHECK(fstatat(dirfd, entry->d_name, &statbuf, AT_SYMLINK_NOFOLLOW)); + switch (statbuf.st_mode & S_IFMT) { + case S_IFCHR: + case S_IFBLK: + case S_IFSOCK: + case S_IFIFO: + CHECK(mknodat(newdirfd, entry->d_name, statbuf.st_mode, statbuf.st_rdev)); + break; + case S_IFLNK: { + char *buf = CHECK_BOOL(malloc(statbuf.st_size + 1)); + ssize_t size = CHECK(readlinkat(dirfd, entry->d_name, buf, statbuf.st_size + 1)); + CHECK_BOOL(size == statbuf.st_size); + buf[size] = 0; + CHECK(symlinkat(buf, newdirfd, entry->d_name)); + free(buf); + break; + } + case S_IFREG: { + uint64_t size = statbuf.st_size; + int srcfd = CHECK(openat(dirfd, entry->d_name, O_RDONLY | O_NOFOLLOW | O_CLOEXEC)); + int dstfd = CHECK(openat(newdirfd, entry->d_name, O_WRONLY | O_NOFOLLOW | O_CLOEXEC | O_CREAT, statbuf.st_mode & 07777)); + while (size > 0) { + size_t res = (size_t)CHECK(sendfile(dstfd, srcfd, NULL, size > SIZE_MAX ? SIZE_MAX : size)); + size -= res; + } + close(dstfd); + close(srcfd); + break; + } + case S_IFDIR: { + int old_child_dirfd = CHECK(openat(dirfd, entry->d_name, O_DIRECTORY | O_NOFOLLOW | O_CLOEXEC | O_RDONLY)); + CHECK(mkdirat(newdirfd, entry->d_name, statbuf.st_mode & 07777)); + int new_child_dirfd = CHECK(openat(newdirfd, entry->d_name, O_DIRECTORY | O_NOFOLLOW | O_CLOEXEC | O_RDONLY)); + copy_initramfs_recursive(old_child_dirfd, new_child_dirfd, ""); + break; + } + default: + CHECK_BOOL(false); + break; + } + CHECK(unlinkat(dirfd, entry->d_name, S_ISDIR(statbuf.st_mode) ? AT_REMOVEDIR : 0)); + } + CHECK(closedir(d)); + CHECK(close(newdirfd)); +} + +static void copy_initramfs(void) { + int rootfd = CHECK(open("/", O_DIRECTORY | O_NOFOLLOW | O_RDONLY | O_CLOEXEC)); + struct stat stats; + CHECK(fstat(rootfd, &stats)); + CHECK_BOOL(mount("", "/" NEW_ROOT, "tmpfs", 0, "") == 0); + int newdirfd = CHECK(open("/" NEW_ROOT, O_DIRECTORY | O_NOFOLLOW | O_RDONLY | O_CLOEXEC)); + copy_initramfs_recursive(rootfd, newdirfd, NEW_ROOT); + CHECK_BOOL(chdir("/" NEW_ROOT) == 0); + CHECK_BOOL(mount(".", "/", NULL, MS_MOVE, NULL) == 0); + CHECK_BOOL(chroot(".") == 0); + CHECK_BOOL(mount(NULL, "/", NULL, MS_SHARED, NULL) == 0); +} + static uint32_t spawn_new_process(struct new_process_args* new_proc_args, struct redir_fd_desc fd_descs[3], uint64_t* id) { @@ -691,28 +1011,31 @@ static uint32_t spawn_new_process(struct new_process_args* new_proc_args, struct epoll_fd_desc* epoll_fd_descs[3] = { NULL }; if (new_proc_args->is_entrypoint && g_entrypoint_desc) { + fprintf(stderr, "Caller bug, returning EEXIST\n"); return EEXIST; } - struct process_desc* proc_desc = calloc(1, sizeof(*proc_desc)); if (!proc_desc) { + fprintf(stderr, "Memory allocation failed\n"); return ENOMEM; } for (size_t fd = 0; fd < 3; ++fd) { proc_desc->redirs[fd].type = REDIRECT_FD_INVALID; } + int status_pipe[2] = { -1, -1 }; proc_desc->id = get_next_id(); if (create_process_fds_dir(proc_desc->id) < 0) { ret = errno; + fprintf(stderr, "Failed to create file descriptor directory: %m\n"); goto out_err; } /* All these shenanigans with pipes are so that we can distinguish internal * failures from spawned process exiting. */ - int status_pipe[2] = { -1, -1 }; if (pipe2(status_pipe, O_CLOEXEC | O_DIRECT) < 0) { ret = errno; + fprintf(stderr, "Failed to create status pipe: %m\n"); goto out_err; } @@ -724,6 +1047,7 @@ static uint32_t spawn_new_process(struct new_process_args* new_proc_args, proc_desc->redirs[fd].path = strdup(fd_descs[fd].path); if (!proc_desc->redirs[fd].path) { ret = errno; + fprintf(stderr, "Memory allocation failed\n"); goto out_err; } } else { @@ -731,13 +1055,15 @@ static uint32_t spawn_new_process(struct new_process_args* new_proc_args, construct_output_path(proc_desc->id, fd); if (!proc_desc->redirs[fd].path) { ret = errno; + fprintf(stderr, "Cannot construct output path: %m\n"); goto out_err; } - int tmp_fd = open(proc_desc->redirs[fd].path, - O_RDWR | O_CREAT | O_EXCL, + int tmp_fd = open_relative(proc_desc->redirs[fd].path, + O_RDWR | O_CREAT | O_EXCL | O_CLOEXEC | O_NOCTTY, S_IRWXU); if (tmp_fd < 0 || close(tmp_fd) < 0) { ret = errno; + fprintf(stderr, "Cannot open %s: %m\n", proc_desc->redirs[fd].path); goto out_err; } } @@ -754,6 +1080,7 @@ static uint32_t spawn_new_process(struct new_process_args* new_proc_args, if (pipe2(proc_desc->redirs[fd].buffer.fds, O_CLOEXEC) < 0) { ret = errno; + fprintf(stderr, "Failed to create redirection pipe: %m\n"); goto out_err; } break; @@ -765,6 +1092,7 @@ static uint32_t spawn_new_process(struct new_process_args* new_proc_args, p = fork(); if (p < 0) { ret = errno; + fprintf(stderr, "Failed to fork: %m\n"); goto out_err; } else if (p == 0) { child_wrapper(status_pipe, new_proc_args, proc_desc->redirs); @@ -777,8 +1105,10 @@ static uint32_t spawn_new_process(struct new_process_args* new_proc_args, ssize_t x = read(status_pipe[0], &c, sizeof(c)); if (x < 0) { ret = errno; + fprintf(stderr, "Failed to read from pipe: %m\n"); goto out_err; } else if (x > 0) { + fprintf(stderr, "Failed to spawn process\n"); /* Process failed to spawn. */ int status = 0; CHECK(waitpid(p, &status, 0)); @@ -808,6 +1138,7 @@ static uint32_t spawn_new_process(struct new_process_args* new_proc_args, &epoll_fd_descs[fd]) < 0) { if (errno == ENOMEM || errno == ENOSPC) { ret = errno; + fprintf(stderr, "Failed to add epoll descriptor: %m\n"); goto out_err; } CHECK(-1); @@ -822,6 +1153,7 @@ static uint32_t spawn_new_process(struct new_process_args* new_proc_args, *id = proc_desc->id; + fprintf(stderr, "Adding process with id %" PRIu64 "\n", *id); add_process(proc_desc); if (new_proc_args->is_entrypoint) { g_entrypoint_desc = proc_desc; @@ -1056,13 +1388,21 @@ static void handle_kill_process(msg_id_t msg_id) { } static uint32_t do_mount(const char* tag, char* path) { - if (create_dir_path(path) < 0) { + int fd; + char buf[sizeof "/proc/self/fd/" + 10]; + if (create_dir_path(path, DEFAULT_DIR_PERMS, &fd) < 0) { return errno; } - if (mount(tag, path, "9p", 0, "trans=virtio,version=9p2000.L") < 0) { - return errno; + CHECK_BOOL(fd > 2); + int res = snprintf(buf, sizeof buf, "/proc/self/fd/%d", fd); + CHECK_BOOL(res >= (int)sizeof "/proc/self/fd/" && res < (int)sizeof buf); + if (mount(tag, buf, "9p", 0, "trans=virtio,version=9p2000.L") < 0) { + res = errno; + } else { + res = 0; } - return 0; + close(fd); + return res; } static void handle_mount(msg_id_t msg_id) { @@ -1093,7 +1433,7 @@ static void handle_mount(msg_id_t msg_id) { } } - if (!tag || !path) { + if (!tag || !path || path[0] != '/') { ret = EINVAL; goto out; } @@ -1116,7 +1456,7 @@ static uint32_t do_query_output_path(char* path, uint64_t off, char** buf_ptr, char* buf = MAP_FAILED; size_t len = 0; - int fd = open(path, O_RDONLY); + int fd = open_relative(path, O_RDONLY, 0); if (fd < 0) { return errno; } @@ -1209,12 +1549,14 @@ static void handle_query_output(msg_id_t msg_id) { } if (!id || !len || !fd || fd > 2) { + fprintf(stderr, "caller bug, returning EINVAL\n"); ret = EINVAL; goto out_err; } struct process_desc* proc_desc = find_process_by_id(id); if (!proc_desc) { + fprintf(stderr, "no process %" PRIu64 ", returning ESRCH\n", id); ret = ESRCH; goto out_err; } @@ -1627,16 +1969,142 @@ static noreturn void main_loop(void) { } static void create_dir(const char *pathname, mode_t mode) { - if (mkdir(pathname, mode) < 0 && errno != EEXIST) { + if (mkdirat(g_sysroot_fd, pathname, mode) < 0 && errno != EEXIST) { fprintf(stderr, "mkdir(%s) failed with: %m\n", pathname); die(); } } -int main(void) { - setbuf(stdin, NULL); - setbuf(stdout, NULL); - setbuf(stderr, NULL); +static void get_namespace_fd(void) { + int tmp_fd = CHECK(open("/user_namespace", O_RDWR|O_CREAT|O_NOFOLLOW|O_CLOEXEC|O_EXCL|O_NOCTTY, 0600)); + CHECK(close(tmp_fd)); + tmp_fd = CHECK(open("/mount_namespace", O_RDWR|O_CREAT|O_NOFOLLOW|O_CLOEXEC|O_EXCL|O_NOCTTY, 0600)); + CHECK(close(tmp_fd)); + char buf[sizeof "/proc//uid_map" + 10]; + struct clone_args args = { + .flags = CLONE_CLEAR_SIGHAND | + CLONE_PIDFD | /* alloc a PID FD */ + NAMESPACES, + .pidfd = (uint64_t)&global_pidfd, + .child_tid = 0, + .parent_tid = 0, + .exit_signal = (uint64_t)SIGCHLD, + .stack = 0, + .stack_size = 0, + .tls = 0, + .set_tid = 0, + .set_tid_size = 0, + .cgroup = 0, + }; + sigset_t set; + CHECK(sigemptyset(&set)); + int fds[2], status = 0; + CHECK_BOOL(socketpair(AF_UNIX, SOCK_STREAM|SOCK_CLOEXEC, 0, fds) == 0); + errno = 0; + global_zombie_pid = syscall(SYS_clone3, &args, sizeof args); + CHECK_BOOL(global_zombie_pid >= 0); + if (global_zombie_pid == 0) { + if (close(fds[0])) + abort(); + if (mount(SYSROOT, SYSROOT, NULL, MS_BIND | MS_REC, NULL)) { + status = errno; + goto bad; + } + if (mount(NULL, SYSROOT, NULL, MS_SLAVE | MS_REC, NULL)) { + status = errno; + goto bad; + } + if (chdir(SYSROOT)) + abort(); + if (syscall(SYS_pivot_root, ".", ".")) { + status = errno; + goto bad; + } + if (umount2(".", MNT_DETACH)) { + status = errno; + goto bad; + } + if (chdir("/")) { + status = errno; + } +bad: + if (write(fds[1], &status, sizeof status) != sizeof status || shutdown(fds[1], SHUT_WR) != 0) + _exit(1); + (void)read(fds[1], &status, 1); + _exit(0); + } + CHECK(global_pidfd); + /* parent */ + CHECK_BOOL(close(fds[1]) == 0); + CHECK_BOOL(read(fds[0], &status, sizeof status) == sizeof status); + errno = status; + CHECK_BOOL(status == 0); + int snprintf_res = snprintf(buf, sizeof buf, "/proc/%d/uid_map", global_zombie_pid); + CHECK_BOOL(snprintf_res >= (int)sizeof("/proc/1/uid_map") - 1); + CHECK_BOOL(snprintf_res < (int)sizeof buf); + for (int i = 0; i < 2; ++i) { + int uidmapfd = CHECK(open(buf, O_NOFOLLOW | O_CLOEXEC | O_NOCTTY | O_WRONLY)); +#define UIDMAP "0 0 4294967295" + CHECK_BOOL(write(uidmapfd, UIDMAP, sizeof UIDMAP - 1) == sizeof UIDMAP - 1); + CHECK_BOOL(close(uidmapfd) == 0); + buf[snprintf_res - 7] = 'g'; + } + static_assert(sizeof("ns/user") <= sizeof("uid_map"), "string size oops"); + static_assert(sizeof("ns/mnt") <= sizeof("uid_map"), "string size oops"); + snprintf_res = snprintf(buf, sizeof buf, "/proc/%d/ns/user", global_zombie_pid); + CHECK_BOOL(snprintf_res >= (int)sizeof "/proc/1/ns/user" - 1); + CHECK_BOOL(snprintf_res < (int)sizeof "/proc/1/ns/user" + 9); + CHECK(mount(buf, "/user_namespace", NULL, MS_BIND, NULL)); + global_userns_fd = CHECK(open("/user_namespace", O_RDONLY|O_NOFOLLOW|O_CLOEXEC|O_NOCTTY)); + snprintf_res = snprintf(buf, sizeof buf, "/proc/%d/ns/mnt", global_zombie_pid); + CHECK_BOOL(snprintf_res >= (int)sizeof "/proc/1/ns/mnt" - 1); + CHECK_BOOL(snprintf_res < (int)sizeof "/proc/1/ns/mnt" + 9); + CHECK(mount(buf, "/mount_namespace", NULL, MS_BIND, NULL)); + global_mountns_fd = CHECK(open("/mount_namespace", O_RDONLY|O_NOFOLLOW|O_CLOEXEC|O_NOCTTY)); + CHECK(write(fds[0], "", 1)); + int v; + CHECK_BOOL(waitpid(global_zombie_pid, &v, 0) == global_zombie_pid); + CHECK_BOOL(WIFEXITED(v)); + CHECK_BOOL(WEXITSTATUS(v) == 0); +} + +static int find_device_major(const char *name) { + FILE *f; + char *line = NULL; + size_t line_size; /* size of the buffer */ + ssize_t line_len; + char entry_name[32]; + int entry_major; + int major = -1; + + if ((f = fopen("/proc/devices", "r")) == 0) + return -1; + + while ((line_len = getline(&line, &line_size, f)) != -1) { + if (strcmp(line, "Character devices:\n") == 0) { + /* initial header, nothing to do yet */ + } else if (strcmp(line, "\n") == 0 || + strcmp(line, "Block devices:\n") == 0) { + /* end of character devices, entry not found */ + break; + } else if (sscanf(line, " %d %31s", &entry_major, entry_name) == 2 && + strcmp(entry_name, name) == 0) { + major = entry_major; + break; + } + } + free(line); + return major; +} + +int main(int argc, char **argv) { + CHECK_BOOL(setvbuf(stdin, NULL, _IONBF, BUFSIZ) == 0); + CHECK_BOOL(setvbuf(stdout, NULL, _IONBF, BUFSIZ) == 0); + CHECK_BOOL(setvbuf(stderr, NULL, _IONBF, BUFSIZ) == 0); + int res = prctl(PR_SET_DUMPABLE, 0, 0, 0, 0); + CHECK_BOOL(res == 0 || res == 1); + bool nvidia_loaded = false; + copy_initramfs(); create_dir("/dev", DEFAULT_DIR_PERMS); CHECK(mount("devtmpfs", "/dev", "devtmpfs", MS_NOSUID, @@ -1645,6 +2113,10 @@ int main(void) { load_module("/failover.ko"); load_module("/virtio.ko"); load_module("/virtio_ring.ko"); + if (access("/virtio_pci_modern_dev.ko", R_OK) == 0) + load_module("/virtio_pci_modern_dev.ko"); + if (access("/virtio_pci_legacy_dev.ko", R_OK) == 0) + load_module("/virtio_pci_legacy_dev.ko"); load_module("/virtio_pci.ko"); load_module("/net_failover.ko"); load_module("/virtio_net.ko"); @@ -1654,6 +2126,8 @@ int main(void) { load_module("/virtio_blk.ko"); load_module("/squashfs.ko"); load_module("/overlay.ko"); + if (access("/netfs.ko", R_OK) == 0) + load_module("/netfs.ko"); load_module("/fscache.ko"); load_module("/af_packet.ko"); load_module("/ipv6.ko"); @@ -1662,73 +2136,142 @@ int main(void) { load_module("/9pnet_virtio.ko"); load_module("/9p.ko"); + if (access("/nvidia.ko", R_OK) == 0) { + load_module("/i2c-core.ko"); + load_module("/drm_panel_orientation_quirks.ko"); + load_module("/firmware_class.ko"); + load_module("/drm.ko"); + load_module("/nvidia.ko"); + load_module("/nvidia-uvm.ko"); + load_module("/fbdev.ko"); + load_module("/fb.ko"); + load_module("/fb_sys_fops.ko"); + load_module("/cfbcopyarea.ko"); + load_module("/cfbfillrect.ko"); + load_module("/cfbimgblt.ko"); + load_module("/syscopyarea.ko"); + load_module("/sysfillrect.ko"); + load_module("/sysimgblt.ko"); + load_module("/drm_kms_helper.ko"); + load_module("/nvidia-modeset.ko"); + load_module("/nvidia-drm.ko"); + nvidia_loaded = true; + } + g_cmds_fd = CHECK(open(VPORT_CMD, O_RDWR | O_CLOEXEC)); CHECK(mkdir("/mnt", S_IRWXU)); + CHECK(mkdir("/proc", S_IRWXU)); CHECK(mkdir("/mnt/image", S_IRWXU)); CHECK(mkdir("/mnt/overlay", S_IRWXU)); - CHECK(mkdir("/mnt/newroot", DEFAULT_DIR_PERMS)); + CHECK(mkdir(SYSROOT, DEFAULT_DIR_PERMS)); // 'workdir' and 'upperdir' have to be on the same filesystem CHECK(mount("tmpfs", "/mnt/overlay", "tmpfs", - MS_NOSUID, - "mode=0777,size=128M")); + MS_NOSUID | MS_NODEV, + "mode=0700,size=128M")); CHECK(mkdir("/mnt/overlay/upper", S_IRWXU)); CHECK(mkdir("/mnt/overlay/work", S_IRWXU)); - CHECK(mount("/dev/vda", "/mnt/image", "squashfs", MS_RDONLY, "")); - CHECK(mount("overlay", "/mnt/newroot", "overlay", 0, - "lowerdir=/mnt/image,upperdir=/mnt/overlay/upper,workdir=/mnt/overlay/work")); + CHECK(mount("/dev/vda", "/mnt/image", "squashfs", MS_RDONLY | MS_NODEV, "")); + if (access("/dev/vdb", R_OK) == 0) { + CHECK(mkdir("/mnt/gpu-files", S_IRWXU)); + CHECK(mount("/dev/vdb", "/mnt/gpu-files", "squashfs", MS_RDONLY | MS_NODEV, "")); + CHECK(mount("overlay", SYSROOT, "overlay", MS_NODEV, + "lowerdir=/mnt/gpu-files:/mnt/image,upperdir=/mnt/overlay/upper,workdir=/mnt/overlay/work")); + } else { + CHECK(mount("overlay", SYSROOT, "overlay", MS_NODEV, + "lowerdir=/mnt/image,upperdir=/mnt/overlay/upper,workdir=/mnt/overlay/work")); + } - CHECK(umount2("/dev", MNT_DETACH)); + g_sysroot_fd = CHECK(open(SYSROOT, O_RDONLY | O_DIRECTORY | O_CLOEXEC)); + assert(g_sysroot_fd >= 3); - CHECK(chdir("/mnt/newroot")); - CHECK(mount(".", "/", "none", MS_MOVE, NULL)); - CHECK(chroot(".")); - CHECK(chdir("/")); - - create_dir("/dev", DEFAULT_DIR_PERMS); - create_dir("/tmp", DEFAULT_DIR_PERMS); + create_dir("dev", DEFAULT_DIR_PERMS); + create_dir("tmp", DEFAULT_DIR_PERMS); CHECK(mount("proc", "/proc", "proc", MS_NODEV | MS_NOSUID | MS_NOEXEC, NULL)); - CHECK(mount("sysfs", "/sys", "sysfs", + CHECK(mount("proc", SYSROOT "/proc", "proc", + MS_NODEV | MS_NOSUID | MS_NOEXEC, + NULL)); + CHECK(mount("sysfs", SYSROOT "/sys", "sysfs", MS_NODEV | MS_NOSUID | MS_NOEXEC, NULL)); - CHECK(mount("devtmpfs", "/dev", "devtmpfs", + CHECK(mount("devtmpfs", SYSROOT "/dev", "devtmpfs", MS_NOSUID, - "exec,mode=0755,size=2M")); - CHECK(mount("tmpfs", "/tmp", "tmpfs", + "mode=0755,size=2M")); + CHECK(mount("tmpfs", SYSROOT "/tmp", "tmpfs", MS_NOSUID, "mode=0777")); - create_dir("/dev/pts", DEFAULT_DIR_PERMS); - create_dir("/dev/shm", DEFAULT_DIR_PERMS); + create_dir("dev/pts", DEFAULT_DIR_PERMS); + create_dir("dev/shm", DEFAULT_DIR_PERMS); - CHECK(mount("devpts", "/dev/pts", "devpts", + CHECK(mount("devpts", SYSROOT "/dev/pts", "devpts", MS_NOSUID | MS_NOEXEC, "gid=5,mode=0620")); - CHECK(mount("tmpfs", "/dev/shm", "tmpfs", + CHECK(mount("tmpfs", SYSROOT "/dev/shm", "tmpfs", MS_NODEV | MS_NOSUID | MS_NOEXEC, NULL)); - if (access("/dev/null", F_OK) != 0) { - CHECK(mknod("/dev/null", + bool do_sandbox = true; + for (int i = 1; i < argc; ++i) { + fprintf(stderr, "Command line argument: %s\n", argv[i]); + if (strcmp(argv[i], "sandbox=yes") == 0) { + do_sandbox = true; + } else if (strcmp(argv[i], "sandbox=no") == 0) { + fprintf(stderr, "WARNING: Disabling sandboxing.\n"); + do_sandbox = false; + } + } + for (char **p = environ; *p; ++p) { + fprintf(stderr, "Environment variable: %s\n", *p); + } + + if (nvidia_loaded) { + if (do_sandbox == false) { + fprintf(stderr, "Sandboxing is disabled, refusing to enable Nvidia GPU passthrough.\n"); + fprintf(stderr, "Please re-run the container with sandboxing enabled or disable GPU passthrough.\n"); + errno = 0; + CHECK_BOOL(0); + } + int nvidia_major = CHECK(find_device_major("nvidia-frontend")); + /* TODO: multi-card support needs more /dev/nvidia%d nodes */ + res = mknodat(g_sysroot_fd, "dev/nvidia0", S_IFCHR | (0666 & 0777), nvidia_major << 8 | 0); + CHECK_BOOL(res == 0 || (res == -1 && errno == EEXIST)); + res = mknodat(g_sysroot_fd, "dev/nvidiactl", S_IFCHR | (0666 & 0777), nvidia_major << 8 | 255); + CHECK_BOOL(res == 0 || (res == -1 && errno == EEXIST)); + nvidia_major = CHECK(find_device_major("nvidia-uvm")); + res = mknodat(g_sysroot_fd, "dev/nvidia-uvm", S_IFCHR | (0666 & 0777), nvidia_major << 8 | 0); + CHECK_BOOL(res == 0 || (res == -1 && errno == EEXIST)); + } + + if (access(SYSROOT "/dev/null", F_OK) != 0) { + CHECK_BOOL(errno == ENOENT); + CHECK(mknod(SYSROOT "/dev/null", MODE_RW_UGO | S_IFCHR, makedev(1, 3))); } - if (access("/dev/ptmx", F_OK) != 0) { - CHECK(mknod("/dev/ptmx", + if (access(SYSROOT "/dev/ptmx", F_OK) != 0) { + CHECK_BOOL(errno == ENOENT); + CHECK(mknod(SYSROOT "/dev/ptmx", MODE_RW_UGO | S_IFCHR, makedev(5, 2))); } + setup_sandbox(); setup_network(); setup_agent_directories(); block_signals(); + if (do_sandbox) { + write_sys("/proc/sys/net/ipv4/ip_unprivileged_port_start", 0); + write_sys("/proc/sys/user/max_user_namespaces", 1); + get_namespace_fd(); + } setup_sigfd(); main_loop(); diff --git a/runtime/init-container/src/seccomp.c b/runtime/init-container/src/seccomp.c new file mode 100644 index 00000000..08879927 --- /dev/null +++ b/runtime/init-container/src/seccomp.c @@ -0,0 +1,513 @@ +#define _GNU_SOURCE +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "init-seccomp.h" + +static const char *allow_syscalls[] = { + "_llseek", + "_newselect", + "accept", + "accept4", + "access", + "adjtimex", + "alarm", + "bind", + "brk", + "capget", + "capset", + "chdir", + "chmod", + "chown", + "chown32", + "chroot", + "clock_adjtime", + "clock_adjtime64", + "clock_getres", + "clock_getres_time64", + "clock_gettime", + "clock_gettime64", + "clock_nanosleep", + "clock_nanosleep_time64", + "clone", + "clone3", + "close", + "close_range", + "connect", + "copy_file_range", + "creat", + "dup", + "dup2", + "dup3", + "epoll_create", + "epoll_create1", + "epoll_ctl", + "epoll_ctl_old", + "epoll_pwait", + "epoll_pwait2", + "epoll_wait", + "epoll_wait_old", + "eventfd", + "eventfd2", + "execve", + "execveat", + "exit", + "exit_group", + "faccessat", + "faccessat2", + "fadvise64", + "fadvise64_64", + "fallocate", + "fanotify_mark", + "fchdir", + "fchmod", + "fchmodat", + "fchown", + "fchown32", + "fchownat", + "fcntl", + "fcntl64", + "fdatasync", + "fgetxattr", + "flistxattr", + "flock", + "fork", + "fremovexattr", + "fsetxattr", + "fstat", + "fstat64", + "fstatat64", + "fstatfs", + "fstatfs64", + "fsync", + "ftruncate", + "ftruncate64", + "futex", + "futex_time64", + "futimesat", + "get_mempolicy", + "get_robust_list", + "get_thread_area", + "getcpu", + "getcwd", + "getdents", + "getdents64", + "getegid", + "getegid32", + "geteuid", + "geteuid32", + "getgid", + "getgid32", + "getgroups", + "getgroups32", + "getitimer", + "getpeername", + "getpgid", + "getpgrp", + "getpid", + "getppid", + "getpriority", + "getrandom", + "getresgid", + "getresgid32", + "getresuid", + "getresuid32", + "getrlimit", + "getrusage", + "getsid", + "getsockname", + "getsockopt", + "gettid", + "gettimeofday", + "getuid", + "getuid32", + "getxattr", + "inotify_add_watch", + "inotify_init", + "inotify_init1", + "inotify_rm_watch", + "io_cancel", + "io_destroy", + "io_getevents", + "io_setup", + "io_submit", + "ioctl", + "ioprio_get", + "ioprio_set", + "ipc", + "keyctl", + "kill", + "landlock_add_rule", + "landlock_create_ruleset", + "landlock_restrict_self", + "lchown", + "lchown32", + "lgetxattr", + "link", + "linkat", + "listen", + "listxattr", + "llistxattr", + "lremovexattr", + "lseek", + "lsetxattr", + "lstat", + "lstat64", + "madvise", + "mbind", + "membarrier", + "memfd_create", + "memfd_secret", + "mincore", + "mkdir", + "mkdirat", + "mknod", + "mknodat", + "mlock", + "mlock2", + "mlockall", + "mmap", + "mmap2", + "mprotect", + "mq_getsetattr", + "mq_notify", + "mq_open", + "mq_timedreceive", + "mq_timedreceive_time64", + "mq_timedsend", + "mq_timedsend_time64", + "mq_unlink", + "mremap", +#if 0 + "msgctl", + "msgget", + "msgrcv", + "msgsnd", +#endif + "msync", + "munlock", + "munlockall", + "munmap", + "name_to_handle_at", + "nanosleep", + "newfstatat", + "open", + "open_tree", + "openat", + "openat2", + "pause", + "pidfd_getfd", + "pidfd_open", + "pidfd_send_signal", + "pipe", + "pipe2", + "pivot_root", + "pkey_alloc", + "pkey_free", + "pkey_mprotect", + "poll", + "ppoll", + "ppoll_time64", + "prctl", + "pread64", + "preadv", + "preadv2", + "prlimit64", + "process_mrelease", + "process_vm_readv", + "process_vm_writev", + "pselect6", + "pselect6_time64", + "ptrace", + "pwrite64", + "pwritev", + "pwritev2", + "read", + "readahead", + "readdir", + "readlink", + "readlinkat", + "readv", + "reboot", + "recv", + "recvfrom", + "recvmmsg", + "recvmmsg_time64", + "recvmsg", + "remap_file_pages", + "removexattr", + "rename", + "renameat", + "renameat2", + "restart_syscall", + "rmdir", + "rseq", + "rt_sigaction", + "rt_sigpending", + "rt_sigprocmask", + "rt_sigqueueinfo", + "rt_sigreturn", + "rt_sigsuspend", + "rt_sigtimedwait", + "rt_sigtimedwait_time64", + "rt_tgsigqueueinfo", + "sched_get_priority_max", + "sched_get_priority_min", + "sched_getaffinity", + "sched_getattr", + "sched_getparam", + "sched_getscheduler", + "sched_rr_get_interval", + "sched_rr_get_interval_time64", + "sched_setaffinity", + "sched_setattr", + "sched_setparam", + "sched_setscheduler", + "sched_yield", + "seccomp", + "select", +#if 0 + "semctl", + "semget", + "semop", + "semtimedop", + "semtimedop_time64", +#endif + "send", + "sendfile", + "sendfile64", + "sendmmsg", + "sendmsg", + "sendto", + "set_mempolicy", + "set_robust_list", + "set_thread_area", + "set_tid_address", + "setfsgid", + "setfsgid32", + "setfsuid", + "setfsuid32", + "setgid", + "setgid32", + "setgroups", + "setgroups32", + "setitimer", + "setpgid", + "setpriority", + "setregid", + "setregid32", + "setresgid", + "setresgid32", + "setresuid", + "setresuid32", + "setreuid", + "setreuid32", + "setrlimit", + "setsid", + "setsockopt", + "setuid", + "setuid32", + "setxattr", + "shmat", + "shmctl", + "shmdt", + "shmget", + "shutdown", + "sigaction", + "sigaltstack", + "signal", + "signalfd", + "signalfd4", + "sigpending", + "sigprocmask", + "sigreturn", + "sigsuspend", + "socket", + "socketcall", + "socketpair", + "splice", + "stat", + "stat64", + "statfs", + "statfs64", + "statx", + "symlink", + "symlinkat", + "sync", + "sync_file_range", + "syncfs", + "syscall", + "sysinfo", + "syslog", + "tee", + "tgkill", + "time", + "timer_create", + "timer_delete", + "timer_getoverrun", + "timer_gettime", + "timer_gettime64", + "timer_settime", + "timer_settime64", + "timerfd", + "timerfd_create", + "timerfd_gettime", + "timerfd_gettime64", + "timerfd_settime", + "timerfd_settime64", + "times", + "tkill", + "truncate", + "truncate64", + "ugetrlimit", + "umask", + "uname", + "unlink", + "unlinkat", + "utime", + "utimensat", + "utimensat_time64", + "utimes", + "vfork", + "wait4", + "waitid", + "waitpid", + "write", + "writev", +}; + +static const char *arm_syscalls[] = { + "arm_fadvise64_64", + "arm_sync_file_range", + "breakpoint", + "cacheflush", + "set_tls", + "sync_file_range2", +}; + +static const char *x86_syscalls[] = { + "arch_prctl", +}; + +static const char *eperm_syscalls[] = { + "bdflush", + "bpf", + "fanotify_init", + "fsconfig", + "fsmount", + "fsopen", + "fspick", + "io_pgetevents", + "kexec_file_load", + "kexec_load", + "migrate_pages", + "mount", + "mount_setattr", + "move_mount", + "move_pages", + "nfsservctl", + "nice", + "oldfstat", + "oldlstat", + "oldolduname", + "oldstat", + "olduname", + "pciconfig_iobase", + "pciconfig_read", + "pciconfig_write", + "perf_event_open", + "quotactl", + "setdomainname", + "sethostname", + "setns", + "sgetmask", + "ssetmask", + "swapcontext", + "swapoff", + "swapon", + "sysfs", + "umount", + "umount2", + "unshare", + "uselib", + "userfaultfd", + "ustat", + "vm86", + "vm86old", + "vmsplice", +}; + +#define ARRAY_SIZE(x) (sizeof(x)/sizeof(x[0])) + +static void +ya_runtime_add_syscalls(scmp_filter_ctx ctx, const char *const *syscalls, + size_t count, uint32_t arch, uint32_t action) { + for (size_t i = 0; i < count; ++i) { + int syscall_number = seccomp_syscall_resolve_name_rewrite(arch, syscalls[i]); + if (syscall_number == __NR_SCMP_ERROR) + abort(); + int status = seccomp_rule_add(ctx, action, syscall_number, 0); + if (status != 0) + abort(); + } +} + +static scmp_filter_ctx ctx; + +void setup_sandbox(void) { + uint32_t const arch = seccomp_arch_native(); + ctx = seccomp_init(SCMP_ACT_ERRNO(ENOSYS)); + + if (ctx == NULL) + abort(); + + ya_runtime_add_syscalls(ctx, allow_syscalls, ARRAY_SIZE(allow_syscalls), arch, SCMP_ACT_ALLOW); + int status = seccomp_rule_add(ctx, SCMP_ACT_ALLOW, + SCMP_SYS(personality), 1, SCMP_CMP64(0, SCMP_CMP_EQ, 0, 0)); + if (status != 0) { + abort(); + } + + switch (arch) { + case SCMP_ARCH_ARM: + case SCMP_ARCH_AARCH64: + ya_runtime_add_syscalls(ctx, arm_syscalls, ARRAY_SIZE(arm_syscalls), + arch, SCMP_ACT_ALLOW); + break; + case SCMP_ARCH_X86: + case SCMP_ARCH_X86_64: + ya_runtime_add_syscalls(ctx, x86_syscalls, ARRAY_SIZE(x86_syscalls), + arch, SCMP_ACT_ALLOW); + default: + break; + } + + ya_runtime_add_syscalls(ctx, eperm_syscalls, ARRAY_SIZE(eperm_syscalls), arch, SCMP_ACT_ERRNO(EPERM)); + int fd = memfd_create("fake", MFD_CLOEXEC); + if (fd < 3) + abort(); + if (seccomp_export_bpf(ctx, fd)) + abort(); +} + +void sandbox_apply(void) { + if (seccomp_load(ctx)) + abort(); +} diff --git a/runtime/src/vmrt.rs b/runtime/src/vmrt.rs index 354b4f42..4f89fcc6 100755 --- a/runtime/src/vmrt.rs +++ b/runtime/src/vmrt.rs @@ -21,6 +21,7 @@ const DIR_RUNTIME: &str = "runtime"; const FILE_RUNTIME: &str = "vmrt"; const FILE_VMLINUZ: &str = "vmlinuz-virt"; const FILE_INITRAMFS: &str = "initramfs.cpio.gz"; +const FILE_NVIDIA_FILES: &str = "nvidia-files.squashfs"; #[derive(Default)] pub struct RuntimeData { @@ -115,6 +116,17 @@ pub async fn start_vmrt( cmd.arg("none"); } + if runtime_dir.join(FILE_NVIDIA_FILES).exists() { + cmd.arg("-drive"); + cmd.arg( + format!( + "file={},cache=unsafe,readonly=on,format=raw,if=virtio", + runtime_dir.join(FILE_NVIDIA_FILES).display() + ) + .as_str(), + ); + } + let (vpn, inet) = // backward-compatibility mode if vpn_remote.is_none() && inet_remote.is_none() {