From bc92ec5abcbcd9ae67c59e7f07d86e3eee5c1bcc Mon Sep 17 00:00:00 2001 From: Reid Priedhorsky Date: Tue, 28 Nov 2023 17:08:20 -0700 Subject: [PATCH 01/19] draft docs --- doc/ch-run.rst | 35 +++++++++++++++++++++++++++-------- 1 file changed, 27 insertions(+), 8 deletions(-) diff --git a/doc/ch-run.rst b/doc/ch-run.rst index 8b96d5a29..6965e5940 100644 --- a/doc/ch-run.rst +++ b/doc/ch-run.rst @@ -29,12 +29,13 @@ mounting SquashFS images with FUSE. not specified is to use the same path as the host; i.e., the default is :code:`--bind=SRC:SRC`. Can be repeated. - If :code:`--write` is given and :code:`DST` does not exist, it will be - created as an empty directory. However, :code:`DST` must be entirely - within the image itself; :code:`DST` cannot enter a previous bind mount. - For example, :code:`--bind /foo:/tmp/foo` will fail because :code:`/tmp` - is shared with the host via bind-mount (unless :code:`$TMPDIR` is set to - something else or :code:`--private-tmp` is given). + If :code:`--write` or :code:`--write-fake` are given and :code:`DST` does + not exist, it will be created as an empty directory. However, :code:`DST` + must be entirely within the image itself; :code:`DST` cannot enter a + previous bind mount. For example, :code:`--bind /foo:/tmp/foo` will fail + because :code:`/tmp` is shared with the host via bind-mount (unless + :code:`$TMPDIR` is set to something else or :code:`--private-tmp` is + given). Most images do have ten directories :code:`/mnt/[0-9]` already available as mount points. @@ -132,8 +133,26 @@ mounting SquashFS images with FUSE. :code:`-v`, :code:`--verbose` Be more verbose (can be repeated). + :code:`-W`, :code:`--write-fake[=SIZE]` + Overlay a writeable tmpfs over the image. This makes the image appear + read-write, but it actually remains read-only and unchanged. All data + written are discarded when the container exits. 
The size of the writeable + filesystem :code:`SIZE` is any :code:`size` specification acceptable to + :code:`tmpfs`, e.g. :code:`4m` for 4MiB or :code:`50%` for half of + physical memory. The default is :code:`12%`. Note that this limit is a + maximum: only actually stored files consume virtual memory. This requires + a kernel that supports unprivileged overlayfs (`upstream 5.11 + `_, + but distributions vary considerably). + :code:`-w`, :code:`--write` - Mount image read-write (by default, the image is mounted read-only). + Mount image read-write. By default, the image is mounted read-only. *This + option should be avoided for most use cases,* because (1) changing images + live (as opposed to prescriptively with a Dockerfile) destroys their + provenance and (2) SquashFS images, which is the best-practice format on + parallel filesystems, must be read-only. It is better to use + `--write-fake` (for disposable data) or bind-mount host directories (for + retained data). :code:`-?`, :code:`--help` Print help and exit. @@ -696,4 +715,4 @@ status is 1 regardless of the signal value. .. include:: ./see_also.rst .. LocalWords: mtune NEWROOT hugetlbfs UsrMerge fusermount mybox IMG HOSTPATH -.. LocalWords: noprofile norc SHLVL PWD +.. LocalWords: noprofile norc SHLVL PWD kernelnewbies From a6fe7bc5dc7ad553576c07a5e198b608a9a13124 Mon Sep 17 00:00:00 2001 From: Reid Priedhorsky Date: Thu, 30 Nov 2023 12:00:59 -0700 Subject: [PATCH 02/19] implement overlayfs (untested) --- bin/ch-run.c | 15 +++++++++++++-- bin/ch_core.c | 35 +++++++++++++++++++++++++---------- bin/ch_core.h | 1 + doc/ch-run.rst | 41 ++++++++++++++++++++++------------------- 4 files changed, 61 insertions(+), 31 deletions(-) diff --git a/bin/ch-run.c b/bin/ch-run.c index f2b13d8d9..47f68a784 100644 --- a/bin/ch-run.c +++ b/bin/ch-run.c @@ -27,6 +27,9 @@ char *JOIN_CT_ENV[] = { "OMPI_COMM_WORLD_LOCAL_SIZE", char *JOIN_TAG_ENV[] = { "SLURM_STEP_ID", NULL }; +/* Default overlaid tmpfs size. 
*/ +char *WRITE_FAKE_DEFAULT = "12%"; + /** Command line options **/ @@ -75,7 +78,9 @@ const struct argp_option options[] = { { "verbose", 'v', 0, 0, "be more verbose (can be repeated)" }, { "version", 'V', 0, 0, "print version and exit" }, { "warnings", -16, "NUM", 0, "log NUM warnings and exit" }, - { "write", 'w', 0, 0, "mount image read-write"}, + { "write", 'w', 0, 0, "mount image read-write (avoid)"}, + { "write-fake", 'W', "SIZE", OPTION_ARG_OPTIONAL, + "overlay read-write tmpfs on top of image" }, { 0 } }; @@ -155,6 +160,7 @@ int main(int argc, char *argv[]) .join_ct = 0, .join_pid = 0, .join_tag = NULL, + .overlay_size = NULL, .private_passwd = false, .private_tmp = false, .type = IMG_NONE, @@ -173,7 +179,7 @@ int main(int argc, char *argv[]) argp_help_fmt_set = true; else { argp_help_fmt_set = false; - Z_ (setenv("ARGP_HELP_FMT", "opt-doc-col=25,no-dup-args-note", 0)); + Z_ (setenv("ARGP_HELP_FMT", "opt-doc-col=27,no-dup-args-note", 0)); } Z_ (argp_parse(&argp, argc, argv, 0, &arg_next, &args)); if (!argp_help_fmt_set) @@ -457,6 +463,8 @@ static error_t parse_opt(int key, char *arg, struct argp_state *state) break; case -12: // --home Tf (args->c.host_home = getenv("HOME"), "--home failed: $HOME not set"); + if (args->c.overlay_size == NULL) + args->c.overlay_size = WRITE_FAKE_DEFAULT; break; case -13: // --unsafe args->unsafe = true; @@ -534,6 +542,9 @@ static error_t parse_opt(int key, char *arg, struct argp_state *state) case 'w': // --write args->c.writable = true; break; + case 'W': // --write-fake + args->c.overlay_size = arg != NULL ? arg : WRITE_FAKE_DEFAULT; + break; case ARGP_KEY_NO_ARGS: argp_state_help(state, stderr, ( ARGP_HELP_SHORT_USAGE | ARGP_HELP_PRE_DOC diff --git a/bin/ch_core.c b/bin/ch_core.c index 304502411..d78b2f965 100644 --- a/bin/ch_core.c +++ b/bin/ch_core.c @@ -266,7 +266,8 @@ void containerize(struct container *c) } -/* Enter the UDSS. After this, we are inside the UDSS. +/* Enter the new root (UDSS). 
On entry, the namespaces are set up, and this + does the mounting and filesystem setup. Note that pivot_root(2) requires a complex dance to work, i.e., to avoid multiple undocumented error conditions. This dance is explained in detail @@ -276,9 +277,27 @@ void enter_udss(struct container *c) char *newroot_parent, *newroot_base; LOG_IDS; - path_split(c->newroot, &newroot_parent, &newroot_base); + // Overlay a tmpfs for --write-fake. See for useful details: + // https://www.kernel.org/doc/html/v5.7/filesystems/tmpfs.html + // https://www.kernel.org/doc/html/v5.7/filesystems/overlayfs.html + if (c->overlay_size != NULL) { + char *options; + T_ (1 <= asprintf(&options, "size=%s", c->overlay_size)); + Zf (mount(NULL, "/mnt", "tmpfs", 0, options), // host should have /mnt + "cannot mount tmpfs for overlay"); + free(options); + Z_ (mkdir("/mnt/upper", 0700)); + Z_ (mkdir("/mnt/work", 0700)); + T_ (1 <= asprintf(&options, "lowerdir=%s,upperdir=%s,workdir=%s," + "redirect_dir=on,metacopy=on", + c->newroot, "/mnt/upper", "/mnt/work")); + Zf (mount(NULL, c->newroot, "overlay", 0, options), + "cannot mount overlayfs"); + free(options); + } + // Claim new root for this namespace. We do need both calls to avoid // pivot_root(2) failing with EBUSY later. bind_mount(c->newroot, c->newroot, BD_REQUIRED, "/", MS_PRIVATE); @@ -294,15 +313,11 @@ void enter_udss(struct container *c) } else { bind_mount(host_tmp, "/tmp", BD_REQUIRED, c->newroot, 0); } - // Container /home. + // Bind-mount user’s home directory at /home/$USER if requested. if (c->host_home) { - char *newhome; - // Mount tmpfs on guest /home because guest root may be read-only. - tmpfs_mount("/home", c->newroot, "size=4m"); - // Bind-mount user's home directory at /home/$USER. 
- newhome = cat("/home/", username); - Z_ (mkdir(cat(c->newroot, newhome), 0755)); - bind_mount(c->host_home, newhome, BD_REQUIRED, c->newroot, 0); + T_ (c->overlay_size != NULL); + bind_mount(c->host_home, cat("/home/", username), BD_MAKE_DST, + c->newroot, 0); } // Re-mount new root read-only unless --write or already read-only. if (!c->writable && !(access(c->newroot, W_OK) == -1 && errno == EROFS)) { diff --git a/bin/ch_core.h b/bin/ch_core.h index 69a389e0a..f65cfc083 100644 --- a/bin/ch_core.h +++ b/bin/ch_core.h @@ -39,6 +39,7 @@ struct container { int join_ct; // number of peers in a synchronized join pid_t join_pid; // process in existing namespace to join char *join_tag; // identifier for synchronized join + char *overlay_size; // size of overlaid tmpfs (NULL for no overlay) bool private_passwd; // don't bind custom /etc/{passwd,group} bool private_tmp; // don't bind host's /tmp enum img_type type; // directory, SquashFS, etc. diff --git a/doc/ch-run.rst b/doc/ch-run.rst index 6965e5940..da0475562 100644 --- a/doc/ch-run.rst +++ b/doc/ch-run.rst @@ -67,10 +67,8 @@ mounting SquashFS images with FUSE. :code:`--home` Bind-mount your host home directory (i.e., :code:`$HOME`) at guest - :code:`/home/$USER`. This is accomplished by over-mounting a new - :code:`tmpfs` at :code:`/home`, which hides any image content under that - path. By default, neither of these things happens and the image’s - :code:`/home` is exposed unaltered. + :code:`/home/$USER` (hiding any image content that exists at that path). + Implies :code:`--write-fake`. :code:`-j`, :code:`--join` Use the same container (namespaces) as peer :code:`ch-run` invocations. @@ -133,26 +131,31 @@ mounting SquashFS images with FUSE. :code:`-v`, :code:`--verbose` Be more verbose (can be repeated). - :code:`-W`, :code:`--write-fake[=SIZE]` - Overlay a writeable tmpfs over the image. This makes the image appear - read-write, but it actually remains read-only and unchanged. 
All data - written are discarded when the container exits. The size of the writeable - filesystem :code:`SIZE` is any :code:`size` specification acceptable to - :code:`tmpfs`, e.g. :code:`4m` for 4MiB or :code:`50%` for half of - physical memory. The default is :code:`12%`. Note that this limit is a - maximum: only actually stored files consume virtual memory. This requires - a kernel that supports unprivileged overlayfs (`upstream 5.11 - `_, - but distributions vary considerably). - :code:`-w`, :code:`--write` Mount image read-write. By default, the image is mounted read-only. *This option should be avoided for most use cases,* because (1) changing images live (as opposed to prescriptively with a Dockerfile) destroys their provenance and (2) SquashFS images, which is the best-practice format on parallel filesystems, must be read-only. It is better to use - `--write-fake` (for disposable data) or bind-mount host directories (for - retained data). + :code:`--write-fake` (for disposable data) or bind-mount host directories + (for retained data). + + :code:`-W`, :code:`--write-fake[=SIZE]` + Overlay a writeable tmpfs on top of the image. This makes the image appear + read-write, but it actually remains read-only and unchanged. All data + “written” to the image are discarded when the container exits. + + The size of the writeable filesystem :code:`SIZE` is any size + specification acceptable to :code:`tmpfs`, e.g. :code:`4m` for 4MiB or + :code:`50%` for half of physical memory. If this option is specified + without :code:`SIZE`, the default is :code:`12%`. Note that this limit is + a maximum: only actually stored files consume virtual memory. + + This requires a kernel that supports unprivileged overlayfs (`upstream + 5.11 + `_, + but distributions vary considerably). If it does not, the error is + “operation not permitted”. :code:`-?`, :code:`--help` Print help and exit. @@ -715,4 +718,4 @@ status is 1 regardless of the signal value. .. include:: ./see_also.rst .. 
LocalWords: mtune NEWROOT hugetlbfs UsrMerge fusermount mybox IMG HOSTPATH -.. LocalWords: noprofile norc SHLVL PWD kernelnewbies +.. LocalWords: noprofile norc SHLVL PWD kernelnewbies extglob From 0d161ec93d9915c097ed70374154f4f801e53c86 Mon Sep 17 00:00:00 2001 From: Reid Priedhorsky Date: Wed, 6 Dec 2023 10:51:01 -0700 Subject: [PATCH 03/19] implement overlay --- bin/ch_core.c | 80 ++++++++++++++++++++++++++++---------------------- bin/ch_misc.c | 6 ++-- doc/ch-run.rst | 42 +++++++++++++++++++++----- 3 files changed, 83 insertions(+), 45 deletions(-) diff --git a/bin/ch_core.c b/bin/ch_core.c index d78b2f965..c1cdf56b1 100644 --- a/bin/ch_core.c +++ b/bin/ch_core.c @@ -274,74 +274,84 @@ void containerize(struct container *c) in bin/ch-checkns.c. */ void enter_udss(struct container *c) { - char *newroot_parent, *newroot_base; + char *newroot, *newroot_parent, *newroot_base; LOG_IDS; - path_split(c->newroot, &newroot_parent, &newroot_base); + newroot = c->newroot; + path_split(newroot, &newroot_parent, &newroot_base); - // Overlay a tmpfs for --write-fake. See for useful details: - // https://www.kernel.org/doc/html/v5.7/filesystems/tmpfs.html - // https://www.kernel.org/doc/html/v5.7/filesystems/overlayfs.html + // Claim new root for this namespace. Despite MS_REC in bind_mount(), we do + // need both calls to avoid pivot_root(2) failing with EBUSY later. + DEBUG("claiming new root for this namespace") + bind_mount(newroot, newroot, BD_REQUIRED, "/", MS_PRIVATE); + bind_mount(newroot_parent, newroot_parent, BD_REQUIRED, "/", MS_PRIVATE); + // Re-mount new root read-only unless --write or already read-only. + if (!c->writable && !(access(newroot, W_OK) == -1 && errno == EROFS)) { + unsigned long flags = path_mount_flags(newroot) + | MS_REMOUNT // Re-mount ... + | MS_BIND // only this mount point ... + | MS_RDONLY; // read-only. 
+ Zf (mount(NULL, newroot, NULL, flags, NULL), + "can't re-mount image read-only (is it on NFS?)"); + } + // Overlay a tmpfs if --write-fake. See for useful details: + // https://www.kernel.org/doc/html/v5.11/filesystems/tmpfs.html + // https://www.kernel.org/doc/html/v5.11/filesystems/overlayfs.html if (c->overlay_size != NULL) { + VERBOSE("overlaying tmpfs for --write-fake"); char *options; T_ (1 <= asprintf(&options, "size=%s", c->overlay_size)); Zf (mount(NULL, "/mnt", "tmpfs", 0, options), // host should have /mnt "cannot mount tmpfs for overlay"); free(options); - Z_ (mkdir("/mnt/upper", 0700)); - Z_ (mkdir("/mnt/work", 0700)); + Z_ (mkdir("/mnt/upper", 0755)); + Z_ (mkdir("/mnt/work", 0755)); + Z_ (mkdir("/mnt/merged", 0755)); T_ (1 <= asprintf(&options, "lowerdir=%s,upperdir=%s,workdir=%s," - "redirect_dir=on,metacopy=on", - c->newroot, "/mnt/upper", "/mnt/work")); - Zf (mount(NULL, c->newroot, "overlay", 0, options), + "index=on,userxattr,volatile", + newroot, "/mnt/upper", "/mnt/work")); + // update newroot + newroot = "/mnt/merged"; + free(newroot_parent); + free(newroot_base); + path_split(newroot, &newroot_parent, &newroot_base); + Zf (mount(NULL, newroot, "overlay", 0, options), "cannot mount overlayfs"); free(options); } - - // Claim new root for this namespace. We do need both calls to avoid - // pivot_root(2) failing with EBUSY later. - bind_mount(c->newroot, c->newroot, BD_REQUIRED, "/", MS_PRIVATE); - bind_mount(newroot_parent, newroot_parent, BD_REQUIRED, "/", MS_PRIVATE); + DEBUG("starting bind-mounts"); // Bind-mount default files and directories. - bind_mounts(BINDS_DEFAULT, c->newroot, MS_RDONLY); + bind_mounts(BINDS_DEFAULT, newroot, MS_RDONLY); // /etc/passwd and /etc/group. if (!c->private_passwd) setup_passwd(c); // Container /tmp. 
if (c->private_tmp) { - tmpfs_mount("/tmp", c->newroot, NULL); + tmpfs_mount("/tmp", newroot, NULL); } else { - bind_mount(host_tmp, "/tmp", BD_REQUIRED, c->newroot, 0); + bind_mount(host_tmp, "/tmp", BD_REQUIRED, newroot, 0); } // Bind-mount user’s home directory at /home/$USER if requested. if (c->host_home) { T_ (c->overlay_size != NULL); - bind_mount(c->host_home, cat("/home/", username), BD_MAKE_DST, - c->newroot, 0); - } - // Re-mount new root read-only unless --write or already read-only. - if (!c->writable && !(access(c->newroot, W_OK) == -1 && errno == EROFS)) { - unsigned long flags = path_mount_flags(c->newroot) - | MS_REMOUNT // Re-mount ... - | MS_BIND // only this mount point ... - | MS_RDONLY; // read-only. - Zf (mount(NULL, c->newroot, NULL, flags, NULL), - "can't re-mount image read-only (is it on NFS?)"); + bind_mount(c->host_home, cat("/home/", username), + BD_MAKE_DST, newroot, 0); } // Bind-mount user-specified directories. - bind_mounts(c->binds, c->newroot, 0); - // Overmount / to avoid EINVAL if it's a rootfs. + bind_mounts(c->binds, newroot, 0); + // Overmount / to avoid EINVAL if it’s a rootfs. Z_ (chdir(newroot_parent)); Z_ (mount(newroot_parent, "/", NULL, MS_MOVE, NULL)); Z_ (chroot(".")); - c->newroot = cat("/", newroot_base); - // Pivot into the new root. Use /dev because it's available even in + // Pivot into the new root. Use /dev because it’s available even in // extremely minimal images. - Zf (chdir(c->newroot), "can't chdir into new root"); - Zf (syscall(SYS_pivot_root, c->newroot, cat(c->newroot, "/dev")), + newroot = cat("/", newroot_base); + Zf (chdir(newroot), "can't chdir into new root"); + Zf (syscall(SYS_pivot_root, newroot, cat(newroot, "/dev")), "can't pivot_root(2)"); Zf (chroot("."), "can't chroot(2) into new root"); Zf (umount2("/dev", MNT_DETACH), "can't umount old root"); + DEBUG("pivot_root(2) dance successful") } /* Return image type of path, or exit with error if not a valid type. 
*/ diff --git a/bin/ch_misc.c b/bin/ch_misc.c index 92034613e..2104e0da2 100644 --- a/bin/ch_misc.c +++ b/bin/ch_misc.c @@ -605,9 +605,11 @@ void path_split(const char *path, char **dir, char **base) char *path2; T_ (path2 = strdup(path)); - *dir = dirname(path2); + T_ (*dir = strdup(dirname(path2))); + free(path2); T_ (path2 = strdup(path)); - *base = basename(path2); + T_ (*base = strdup(basename(path2))); + free(path2); } /* Return true if path is a subdirectory of base, false otherwise. Acts on the diff --git a/doc/ch-run.rst b/doc/ch-run.rst index da0475562..9723cc203 100644 --- a/doc/ch-run.rst +++ b/doc/ch-run.rst @@ -141,9 +141,9 @@ mounting SquashFS images with FUSE. (for retained data). :code:`-W`, :code:`--write-fake[=SIZE]` - Overlay a writeable tmpfs on top of the image. This makes the image appear - read-write, but it actually remains read-only and unchanged. All data - “written” to the image are discarded when the container exits. + Overlay a writeable tmpfs on top of the image. This makes the image + *appear* read-write, but it actually remains read-only and unchanged. All + data “written” to the image are discarded when the container exits. The size of the writeable filesystem :code:`SIZE` is any size specification acceptable to :code:`tmpfs`, e.g. :code:`4m` for 4MiB or @@ -151,11 +151,8 @@ mounting SquashFS images with FUSE. without :code:`SIZE`, the default is :code:`12%`. Note that this limit is a maximum: only actually stored files consume virtual memory. - This requires a kernel that supports unprivileged overlayfs (`upstream - 5.11 - `_, - but distributions vary considerably). If it does not, the error is - “operation not permitted”. + This requires kernel support and there are some caveats. See section + “:ref:`ch-run_overlay`” below for details. :code:`-?`, :code:`--help` Print help and exit. 
@@ -332,6 +329,35 @@ Caveats: * Many of the arguments given to the race losers, such as the image path and :code:`--bind`, will be ignored in favor of what was given to the winner. +.. _ch-run_overlay: + +Writeable overlay with :code:`--write-fake` +=========================================== + +If you need the image to stay read-only but appear writeable, you may be able +to use :code:`--write-fake` to overlay a writeable tmpfs atop the image. This +requires kernel support. Specifically: + +1. To use the feature at all, you need unprivileged overlayfs support. This is + available in `upstream 5.11 + `_ + (February 2021), but distributions vary considerably. If you don’t have + this, the container will fail to start with error “operation not + permitted”. + +2. To allow fully arbitrary changes in the overlay, you need a tmpfs that + supports xattrs in the :code:`user` namespace. This is available in + `upstream 6.6 `_ (October 2023). + If you don’t have this, most things will work fine, but some operations + will fail with “I/O error”, for example creating a directory with the same + path as a previously deleted directory. There will also be syslog noise + about xattr problems. + + (overlayfs can also use xattrs in the :code:`trusted` namespace, but this + requires :code:`CAP_SYS_ADMIN` `on the host + `_ + and thus is not helpful for unprivileged containers.) 
+ Environment variables ===================== From 12eca652127b1c642556e4fed0bb5974cedb8745 Mon Sep 17 00:00:00 2001 From: Reid Priedhorsky Date: Wed, 6 Dec 2023 13:48:59 -0700 Subject: [PATCH 04/19] configure stuff --- bin/ch_core.c | 6 ++-- configure.ac | 88 +++++++++++++++++++++++++++++++++++++++++++++++--- doc/ch-run.rst | 14 ++++---- 3 files changed, 94 insertions(+), 14 deletions(-) diff --git a/bin/ch_core.c b/bin/ch_core.c index c1cdf56b1..6a4221019 100644 --- a/bin/ch_core.c +++ b/bin/ch_core.c @@ -304,9 +304,9 @@ void enter_udss(struct container *c) Zf (mount(NULL, "/mnt", "tmpfs", 0, options), // host should have /mnt "cannot mount tmpfs for overlay"); free(options); - Z_ (mkdir("/mnt/upper", 0755)); - Z_ (mkdir("/mnt/work", 0755)); - Z_ (mkdir("/mnt/merged", 0755)); + Z_ (mkdir("/mnt/upper", 0700)); + Z_ (mkdir("/mnt/work", 0700)); + Z_ (mkdir("/mnt/merged", 0700)); T_ (1 <= asprintf(&options, "lowerdir=%s,upperdir=%s,workdir=%s," "index=on,userxattr,volatile", newroot, "/mnt/upper", "/mnt/work")); diff --git a/configure.ac b/configure.ac index c1cb1acb2..3c74f6999 100644 --- a/configure.ac +++ b/configure.ac @@ -335,6 +335,83 @@ AC_RUN_IFELSE([AC_LANG_SOURCE([[ [AC_MSG_ERROR([cross-compilation not supported])]) AC_MSG_RESULT($have_userns) +# overlayfs +AC_DEFUN([CH_OVERLAY_C], [[ + #define _GNU_SOURCE + #include + #include + #include + #include + #include + #include + #include + #include + + #define T_(x) if (!(x)) fatal_(__FILE__, __LINE__, errno, #x) + #define Z_(x) if (x) fatal_(__FILE__, __LINE__, errno, #x) + + void fatal_(const char *file, int line, int errno_, const char *str) + { + fprintf(stderr, "error: %s: %d: %s\n", file, line, str); + fprintf(stderr, "errno: %d: %s\n", errno_, strerror(errno_)); + exit(1); + } + + int main(void) + { + int fd; + uid_t euid = geteuid(); + gid_t egid = getegid(); + + // enter namespaces + Z_ (unshare(CLONE_NEWNS|CLONE_NEWUSER)); + + // set up ID maps + T_ (-1 != (fd = open("/proc/self/uid_map", O_WRONLY))); + 
T_ (1 <= dprintf(fd, "%d %d 1\n", 0, euid)); + Z_ (close(fd)); + T_ (-1 != (fd = open("/proc/self/setgroups", O_WRONLY))); + T_ (1 <= dprintf(fd, "deny\n")); + Z_ (close(fd)); + T_ (-1 != (fd = open("/proc/self/gid_map", O_WRONLY))); + T_ (1 <= dprintf(fd, "%d %d 1\n", 0, egid)); + Z_ (close(fd)); + + // set up overlayfs + Z_ (mount("/", "/", NULL, MS_BIND | MS_REC | MS_PRIVATE, NULL)); + Z_ (mount(NULL, "/mnt", "tmpfs", 0, NULL)); + Z_ (mkdir("/mnt/upper", 0700)); + Z_ (mkdir("/mnt/lower", 0700)); + Z_ (mkdir("/mnt/lower/test", 0700)); + Z_ (mkdir("/mnt/work", 0700)); + Z_ (mkdir("/mnt/merged", 0700)); + Z_ (mount(NULL, "/mnt/merged", "overlay", MS_NOATIME, + "lowerdir=/mnt/lower," + "upperdir=/mnt/upper," + "workdir=/mnt/work," + "index=on,userxattr,volatile")); + + // test if user xattrs are working + #ifdef XATTRS + Z_ (rmdir("/mnt/merged/test")); + Z_ (mkdir("/mnt/merged/test", 0700)); + #endif + } +]]) +AC_MSG_CHECKING([for unprivileged overlayfs]) +AC_RUN_IFELSE([AC_LANG_SOURCE(CH_OVERLAY_C)], + [have_overlayfs=yes], + [have_overlayfs=no], + [AC_MSG_ERROR([cross-compilation not supported])]) +AC_MSG_RESULT($have_overlayfs) +AC_MSG_CHECKING([for tmpfs user xattrs]) +AC_RUN_IFELSE([#define XATTRS + AC_LANG_SOURCE(CH_OVERLAY_C)], + [have_tmpfs_xattrs=yes], + [have_tmpfs_xattrs=no], + [AC_MSG_ERROR([cross-compilation not supported])]) +AC_MSG_RESULT($have_tmpfs_xattrs) + ### ch-run optional ########################################################## @@ -847,10 +924,6 @@ Building Charliecloud libsquashfuse_ll ... ${have_libsquashfuse_ll} ll.h header ... ${have_ll_h} - fake system calls with seccomp(2): ${have_seccomp} - enabled ... ${msg_seccomp} - tested working ... ${test_seccomp} - documentation: ${have_docs} sphinx-build(1) ≥ $vmin_sphinx ... ${SPHINX_VERSION_NOTE} sphinx-build(1) Python ... ${sphinx_python:-n/a} @@ -903,6 +976,13 @@ Running containers manual mount with SquashFUSE ≥ $vmin_squashfuse ... 
${SQUASHFUSE_VERSION_NOTE} internal mount with libsquashfuse ... ${have_libsquashfuse} + fake system calls with seccomp(2): ${have_seccomp} + enabled ... ${msg_seccomp} + tested working ... ${test_seccomp} + + writeable overlay (--write-fake): ${have_overlayfs} + fully functional ... ${have_tmpfs_xattrs} + inject nVidia GPU libraries: ${have_nvidia} nvidia-container-cli(1) ≥ $vmin_nvidia_cli ... ${NVIDIA_CLI_VERSION_NOTE} nVidia libraries & executables present ... ${have_nvidia_libs} diff --git a/doc/ch-run.rst b/doc/ch-run.rst index 9723cc203..8d9d47e4d 100644 --- a/doc/ch-run.rst +++ b/doc/ch-run.rst @@ -345,13 +345,13 @@ requires kernel support. Specifically: this, the container will fail to start with error “operation not permitted”. -2. To allow fully arbitrary changes in the overlay, you need a tmpfs that - supports xattrs in the :code:`user` namespace. This is available in - `upstream 6.6 `_ (October 2023). - If you don’t have this, most things will work fine, but some operations - will fail with “I/O error”, for example creating a directory with the same - path as a previously deleted directory. There will also be syslog noise - about xattr problems. +2. For a fully functional overlay, you need a tmpfs that supports xattrs in + the :code:`user` namespace. This is available in `upstream 6.6 + `_ (October 2023). If you don’t + have this, most things will work fine, but some operations will fail with + “I/O error”, for example creating a directory with the same path as a + previously deleted directory. There will also be syslog noise about xattr + problems. 
(overlayfs can also use xattrs in the :code:`trusted` namespace, but this requires :code:`CAP_SYS_ADMIN` `on the host From 2640eebdb4356ae8bcec28703e714919ea9decdc Mon Sep 17 00:00:00 2001 From: Reid Priedhorsky Date: Wed, 6 Dec 2023 13:58:39 -0700 Subject: [PATCH 05/19] document SIZE has no limit --- bin/ch_core.c | 2 +- doc/ch-run.rst | 5 +++-- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/bin/ch_core.c b/bin/ch_core.c index 6a4221019..11e594526 100644 --- a/bin/ch_core.c +++ b/bin/ch_core.c @@ -298,7 +298,7 @@ void enter_udss(struct container *c) // https://www.kernel.org/doc/html/v5.11/filesystems/tmpfs.html // https://www.kernel.org/doc/html/v5.11/filesystems/overlayfs.html if (c->overlay_size != NULL) { - VERBOSE("overlaying tmpfs for --write-fake"); + VERBOSE("overlaying tmpfs for --write-fake (%s)", c->overlay_size); char *options; T_ (1 <= asprintf(&options, "size=%s", c->overlay_size)); Zf (mount(NULL, "/mnt", "tmpfs", 0, options), // host should have /mnt diff --git a/doc/ch-run.rst b/doc/ch-run.rst index 8d9d47e4d..557535e62 100644 --- a/doc/ch-run.rst +++ b/doc/ch-run.rst @@ -148,8 +148,9 @@ mounting SquashFS images with FUSE. The size of the writeable filesystem :code:`SIZE` is any size specification acceptable to :code:`tmpfs`, e.g. :code:`4m` for 4MiB or :code:`50%` for half of physical memory. If this option is specified - without :code:`SIZE`, the default is :code:`12%`. Note that this limit is - a maximum: only actually stored files consume virtual memory. + without :code:`SIZE`, the default is :code:`12%`. Note (1) this limit is a + maximum — only actually stored files consume virtual memory, and + (2) :code:`SIZE` larger than memory can be requested without error. This requires kernel support and there are some caveats. See section “:ref:`ch-run_overlay`” below for details. 
From 2c6a11771010c4d9b77fb36f072ee49679c3d9f2 Mon Sep 17 00:00:00 2001 From: Reid Priedhorsky Date: Thu, 7 Dec 2023 10:32:32 -0700 Subject: [PATCH 06/19] add/update tests --- bin/ch-run.c | 4 +- test/common.bash | 1 - test/run/ch-run_misc.bats | 184 +++++++++++++++++--------------------- test/run_first.bats | 9 -- 4 files changed, 85 insertions(+), 113 deletions(-) diff --git a/bin/ch-run.c b/bin/ch-run.c index 47f68a784..5444e761e 100644 --- a/bin/ch-run.c +++ b/bin/ch-run.c @@ -463,8 +463,10 @@ static error_t parse_opt(int key, char *arg, struct argp_state *state) break; case -12: // --home Tf (args->c.host_home = getenv("HOME"), "--home failed: $HOME not set"); - if (args->c.overlay_size == NULL) + if (args->c.overlay_size == NULL) { + VERBOSE("--home specified; also setting --write-fake"); args->c.overlay_size = WRITE_FAKE_DEFAULT; + } break; case -13: // --unsafe args->unsafe = true; diff --git a/test/common.bash b/test/common.bash index f0cef9743..c9f7a1d30 100644 --- a/test/common.bash +++ b/test/common.bash @@ -352,7 +352,6 @@ ch_version_docker=$(echo "$ch_version" | tr '~+' '--') # [2]: http://man7.org/linux/man-pages/man1/readlink.1.html ch_imgdir=$(readlink -m "$CH_TEST_IMGDIR") ch_tardir=$(readlink -m "$CH_TEST_TARDIR") -ch_mounts="${ch_imgdir}/mounts" # Image information. 
ch_tag=${CH_TEST_TAG:-NO_TAG_SET} # set by Makefile; many tests don’t need it diff --git a/test/run/ch-run_misc.bats b/test/run/ch-run_misc.bats index e7aa0d42d..75cd1aae5 100644 --- a/test/run/ch-run_misc.bats +++ b/test/run/ch-run_misc.bats @@ -1,5 +1,20 @@ load ../common +bind1_dir=$BATS_TMPDIR/bind1 +bind2_dir=$BATS_TMPDIR/bind2 + +setup () { + mkdir -p "$bind1_dir" + echo bind1_dir.file1 > "${bind1_dir}/file1" + mkdir -p "$bind2_dir" + echo bind2_dir.file2 > "${bind2_dir}/file2" +} + + +demand-overlayfs () { + ch-run -W "$ch_timg" -- true || pedantic_fail 'no unpriv overlayfs' +} + @test 'relative path to image' { # issue #6 scope full @@ -89,6 +104,15 @@ EOF [[ $status -eq 0 ]] [[ $output = /home/$USER ]] + # /home is merged if --home + run ch-run --home "$ch_timg" -- ls -1 /home + echo "$output" + [[ $status -eq 0 ]] + cat < Date: Wed, 13 Dec 2023 15:58:59 -0700 Subject: [PATCH 07/19] add --disable-impolite-check --- configure.ac | 26 +++++++++++++++++--------- doc/install.rst | 15 +++++++++++++++ 2 files changed, 32 insertions(+), 9 deletions(-) diff --git a/configure.ac b/configure.ac index 6e1677595..34625b00d 100644 --- a/configure.ac +++ b/configure.ac @@ -110,6 +110,10 @@ AC_ARG_ENABLE([html], AS_HELP_STRING([--disable-html], [HTML documentation]), [], [enable_html=yes]) +AC_ARG_ENABLE([impolite-checks], + AS_HELP_STRING([--disable-impolite-checks], [potentially troublesome informational checks]), + [], [enable_impolite_checks=yes]) + AC_ARG_ENABLE([man], AS_HELP_STRING([--disable-man], [man pages]), [], [enable_man=yes]) @@ -405,17 +409,21 @@ AC_DEFUN([CH_OVERLAY_C], [[ } ]]) AC_MSG_CHECKING([for unprivileged overlayfs]) -AC_RUN_IFELSE([AC_LANG_SOURCE(CH_OVERLAY_C)], - [have_overlayfs=yes], - [have_overlayfs=no], - [AC_MSG_ERROR([cross-compilation not supported])]) +have_overlayfs="check disabled" +AS_IF([test $enable_impolite_checks = yes], + [AC_RUN_IFELSE([AC_LANG_SOURCE(CH_OVERLAY_C)], + [have_overlayfs=yes], + [have_overlayfs=no], + 
[AC_MSG_ERROR([cross-compilation not supported])])]) AC_MSG_RESULT($have_overlayfs) +have_tmpfs_xattrs="check disabled" AC_MSG_CHECKING([for tmpfs user xattrs]) -AC_RUN_IFELSE([#define XATTRS - AC_LANG_SOURCE(CH_OVERLAY_C)], - [have_tmpfs_xattrs=yes], - [have_tmpfs_xattrs=no], - [AC_MSG_ERROR([cross-compilation not supported])]) +AS_IF([test $enable_impolite_checks = yes], + [AC_RUN_IFELSE([#define XATTRS + AC_LANG_SOURCE(CH_OVERLAY_C)], + [have_tmpfs_xattrs=yes], + [have_tmpfs_xattrs=no], + [AC_MSG_ERROR([cross-compilation not supported])])]) AC_MSG_RESULT($have_tmpfs_xattrs) diff --git a/doc/install.rst b/doc/install.rst index 849b850dd..9cbacbf2d 100644 --- a/doc/install.rst +++ b/doc/install.rst @@ -199,6 +199,21 @@ The main use case for these options is to support package maintainers. If this is you and does not meet your needs, please get in touch with us and we will help. +Avoid potentially troublesome informational tests: :code:`--disable-impolite-checks` +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +:code:`configure` performs a lot of checks that do not inform decisions but +are simply informational, for the report at the end. These checks replicate +run-time decisions; their purpose is to offer guidance on what to expect at +run time. + +Some of these checks trigger alerts in some situations. For example, writing +files in :code:`/proc` confuses the Gentoo package build `sandbox +`_. + +Option :code:`--disable-impolite-checks` skips these checks. The only +consequence is a somewhat less informative report. 
+ Install with package manager ============================ From 93c82fa612324239c96ed1f352eee23c52c29672 Mon Sep 17 00:00:00 2001 From: Reid Priedhorsky Date: Thu, 14 Dec 2023 09:28:16 -0700 Subject: [PATCH 08/19] put the revised newroot in the struct container --- bin/ch_core.c | 38 +++++++++++++++++++------------------- 1 file changed, 19 insertions(+), 19 deletions(-) diff --git a/bin/ch_core.c b/bin/ch_core.c index ad6d75a31..4f26cd99e 100644 --- a/bin/ch_core.c +++ b/bin/ch_core.c @@ -277,24 +277,23 @@ void containerize(struct container *c) in bin/ch-checkns.c. */ void enter_udss(struct container *c) { - char *newroot, *newroot_parent, *newroot_base; + char *newroot_parent, *newroot_base; LOG_IDS; - newroot = c->newroot; - path_split(newroot, &newroot_parent, &newroot_base); + path_split(c->newroot, &newroot_parent, &newroot_base); // Claim new root for this namespace. Despite MS_REC in bind_mount(), we do // need both calls to avoid pivot_root(2) failing with EBUSY later. DEBUG("claiming new root for this namespace") - bind_mount(newroot, newroot, BD_REQUIRED, "/", MS_PRIVATE); + bind_mount(c->newroot, c->newroot, BD_REQUIRED, "/", MS_PRIVATE); bind_mount(newroot_parent, newroot_parent, BD_REQUIRED, "/", MS_PRIVATE); // Re-mount new root read-only unless --write or already read-only. - if (!c->writable && !(access(newroot, W_OK) == -1 && errno == EROFS)) { - unsigned long flags = path_mount_flags(newroot) + if (!c->writable && !(access(c->newroot, W_OK) == -1 && errno == EROFS)) { + unsigned long flags = path_mount_flags(c->newroot) | MS_REMOUNT // Re-mount ... | MS_BIND // only this mount point ... | MS_RDONLY; // read-only. - Zf (mount(NULL, newroot, NULL, flags, NULL), + Zf (mount(NULL, c->newroot, NULL, flags, NULL), "can't re-mount image read-only (is it on NFS?)"); } // Overlay a tmpfs if --write-fake. 
See for useful details: @@ -312,45 +311,46 @@ void enter_udss(struct container *c) Z_ (mkdir("/mnt/merged", 0700)); T_ (1 <= asprintf(&options, "lowerdir=%s,upperdir=%s,workdir=%s," "index=on,userxattr,volatile", - newroot, "/mnt/upper", "/mnt/work")); + c->newroot, "/mnt/upper", "/mnt/work")); // update newroot - newroot = "/mnt/merged"; + c->newroot = "/mnt/merged"; free(newroot_parent); free(newroot_base); - path_split(newroot, &newroot_parent, &newroot_base); - Zf (mount(NULL, newroot, "overlay", 0, options), + path_split(c->newroot, &newroot_parent, &newroot_base); + Zf (mount(NULL, c->newroot, "overlay", 0, options), "cannot mount overlayfs"); + VERBOSE("newroot updated: %s", c->newroot); free(options); } DEBUG("starting bind-mounts"); // Bind-mount default files and directories. - bind_mounts(BINDS_DEFAULT, newroot, MS_RDONLY); + bind_mounts(BINDS_DEFAULT, c->newroot, MS_RDONLY); // /etc/passwd and /etc/group. if (!c->private_passwd) setup_passwd(c); // Container /tmp. if (c->private_tmp) { - tmpfs_mount("/tmp", newroot, NULL); + tmpfs_mount("/tmp", c->newroot, NULL); } else { - bind_mount(host_tmp, "/tmp", BD_REQUIRED, newroot, 0); + bind_mount(host_tmp, "/tmp", BD_REQUIRED, c->newroot, 0); } // Bind-mount user’s home directory at /home/$USER if requested. if (c->host_home) { T_ (c->overlay_size != NULL); bind_mount(c->host_home, cat("/home/", username), - BD_MAKE_DST, newroot, 0); + BD_MAKE_DST, c->newroot, 0); } // Bind-mount user-specified directories. - bind_mounts(c->binds, newroot, 0); + bind_mounts(c->binds, c->newroot, 0); // Overmount / to avoid EINVAL if it’s a rootfs. Z_ (chdir(newroot_parent)); Z_ (mount(newroot_parent, "/", NULL, MS_MOVE, NULL)); Z_ (chroot(".")); // Pivot into the new root. Use /dev because it’s available even in // extremely minimal images. 
- newroot = cat("/", newroot_base); - Zf (chdir(newroot), "can't chdir into new root"); - Zf (syscall(SYS_pivot_root, newroot, cat(newroot, "/dev")), + c->newroot = cat("/", newroot_base); + Zf (chdir(c->newroot), "can't chdir into new root"); + Zf (syscall(SYS_pivot_root, c->newroot, cat(c->newroot, "/dev")), "can't pivot_root(2)"); Zf (chroot("."), "can't chroot(2) into new root"); Zf (umount2("/dev", MNT_DETACH), "can't umount old root"); From 43e8a28d238c6c272961aaa75a37c1c27600ef51 Mon Sep 17 00:00:00 2001 From: Reid Priedhorsky Date: Fri, 15 Dec 2023 16:11:34 -0700 Subject: [PATCH 09/19] =?UTF-8?q?IT=E2=80=99S=20ALIVE=20[skip=20ci]?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- bin/ch_core.c | 56 ++++++++++---------- bin/ch_fuse.c | 2 +- bin/ch_misc.c | 142 ++++++++++++++++++++++++++++++++++++++++++++++++-- bin/ch_misc.h | 10 +++- 4 files changed, 177 insertions(+), 33 deletions(-) diff --git a/bin/ch_core.c b/bin/ch_core.c index 4f26cd99e..ac71620d9 100644 --- a/bin/ch_core.c +++ b/bin/ch_core.c @@ -174,9 +174,9 @@ char **bind_mount_paths = NULL; /** Function prototypes (private) **/ void bind_mount(const char *src, const char *dst, enum bind_dep, - const char *newroot, unsigned long flags); + const char *newroot, unsigned long flags, const char *scratch); void bind_mounts(const struct bind *binds, const char *newroot, - unsigned long flags); + unsigned long flags, const char * scratch); void enter_udss(struct container *c); #ifdef HAVE_SECCOMP void iw(struct sock_fprog *p, int i, @@ -197,7 +197,7 @@ void tmpfs_mount(const char *dst, const char *newroot, const char *data); /* Bind-mount the given path into the container image. 
*/ void bind_mount(const char *src, const char *dst, enum bind_dep dep, - const char *newroot, unsigned long flags) + const char *newroot, unsigned long flags, const char *scratch) { char *dst_fullc, *newrootc; char *dst_full = cat(newroot, dst); @@ -218,7 +218,7 @@ void bind_mount(const char *src, const char *dst, enum bind_dep dep, case BD_OPTIONAL: return; case BD_MAKE_DST: - mkdirs(newroot, dst, bind_mount_paths); + mkdirs(newroot, dst, bind_mount_paths, scratch); break; } @@ -235,10 +235,11 @@ void bind_mount(const char *src, const char *dst, enum bind_dep dep, /* Bind-mount a null-terminated array of struct bind objects. */ void bind_mounts(const struct bind *binds, const char *newroot, - unsigned long flags) + unsigned long flags, const char * scratch) { for (int i = 0; binds[i].src != NULL; i++) - bind_mount(binds[i].src, binds[i].dst, binds[i].dep, newroot, flags); + bind_mount(binds[i].src, binds[i].dst, binds[i].dep, + newroot, flags, scratch); } /* Set up new namespaces or join existing namespaces. */ @@ -277,24 +278,24 @@ void containerize(struct container *c) in bin/ch-checkns.c. */ void enter_udss(struct container *c) { - char *newroot_parent, *newroot_base; + char *nr_parent, *nr_base, *mkdir_scratch; LOG_IDS; - path_split(c->newroot, &newroot_parent, &newroot_base); + mkdir_scratch = NULL; + path_split(c->newroot, &nr_parent, &nr_base); // Claim new root for this namespace. Despite MS_REC in bind_mount(), we do // need both calls to avoid pivot_root(2) failing with EBUSY later. DEBUG("claiming new root for this namespace") - bind_mount(c->newroot, c->newroot, BD_REQUIRED, "/", MS_PRIVATE); - bind_mount(newroot_parent, newroot_parent, BD_REQUIRED, "/", MS_PRIVATE); + bind_mount(c->newroot, c->newroot, BD_REQUIRED, "/", MS_PRIVATE, NULL); + bind_mount(nr_parent, nr_parent, BD_REQUIRED, "/", MS_PRIVATE, NULL); // Re-mount new root read-only unless --write or already read-only. 
if (!c->writable && !(access(c->newroot, W_OK) == -1 && errno == EROFS)) { unsigned long flags = path_mount_flags(c->newroot) | MS_REMOUNT // Re-mount ... | MS_BIND // only this mount point ... | MS_RDONLY; // read-only. - Zf (mount(NULL, c->newroot, NULL, flags, NULL), - "can't re-mount image read-only (is it on NFS?)"); + Z_ (mount(NULL, c->newroot, NULL, flags, NULL)); } // Overlay a tmpfs if --write-fake. See for useful details: // https://www.kernel.org/doc/html/v5.11/filesystems/tmpfs.html @@ -309,22 +310,23 @@ void enter_udss(struct container *c) Z_ (mkdir("/mnt/upper", 0700)); Z_ (mkdir("/mnt/work", 0700)); Z_ (mkdir("/mnt/merged", 0700)); + mkdir_scratch = "/mnt/mkdir_overmount"; + Z_ (mkdir(mkdir_scratch, 0700)); T_ (1 <= asprintf(&options, "lowerdir=%s,upperdir=%s,workdir=%s," "index=on,userxattr,volatile", c->newroot, "/mnt/upper", "/mnt/work")); // update newroot c->newroot = "/mnt/merged"; - free(newroot_parent); - free(newroot_base); - path_split(c->newroot, &newroot_parent, &newroot_base); - Zf (mount(NULL, c->newroot, "overlay", 0, options), - "cannot mount overlayfs"); + free(nr_parent); + free(nr_base); + path_split(c->newroot, &nr_parent, &nr_base); + Zf (mount(NULL, c->newroot, "overlay", 0, options), "can't overlay"); VERBOSE("newroot updated: %s", c->newroot); free(options); } DEBUG("starting bind-mounts"); // Bind-mount default files and directories. - bind_mounts(BINDS_DEFAULT, c->newroot, MS_RDONLY); + bind_mounts(BINDS_DEFAULT, c->newroot, MS_RDONLY, NULL); // /etc/passwd and /etc/group. if (!c->private_passwd) setup_passwd(c); @@ -332,25 +334,25 @@ void enter_udss(struct container *c) if (c->private_tmp) { tmpfs_mount("/tmp", c->newroot, NULL); } else { - bind_mount(host_tmp, "/tmp", BD_REQUIRED, c->newroot, 0); + bind_mount(host_tmp, "/tmp", BD_REQUIRED, c->newroot, 0, NULL); } // Bind-mount user’s home directory at /home/$USER if requested. 
if (c->host_home) { T_ (c->overlay_size != NULL); bind_mount(c->host_home, cat("/home/", username), - BD_MAKE_DST, c->newroot, 0); + BD_MAKE_DST, c->newroot, 0, mkdir_scratch); } // Bind-mount user-specified directories. - bind_mounts(c->binds, c->newroot, 0); + bind_mounts(c->binds, c->newroot, 0, mkdir_scratch); // Overmount / to avoid EINVAL if it’s a rootfs. - Z_ (chdir(newroot_parent)); - Z_ (mount(newroot_parent, "/", NULL, MS_MOVE, NULL)); + Z_ (chdir(nr_parent)); + Z_ (mount(nr_parent, "/", NULL, MS_MOVE, NULL)); Z_ (chroot(".")); // Pivot into the new root. Use /dev because it’s available even in // extremely minimal images. - c->newroot = cat("/", newroot_base); + c->newroot = cat("/", nr_base); Zf (chdir(c->newroot), "can't chdir into new root"); - Zf (syscall(SYS_pivot_root, c->newroot, cat(c->newroot, "/dev")), + Zf (syscall(SYS_pivot_root, c->newroot, path_join(c->newroot, "dev")), "can't pivot_root(2)"); Zf (chroot("."), "can't chroot(2) into new root"); Zf (umount2("/dev", MNT_DETACH), "can't umount old root"); @@ -725,7 +727,7 @@ void setup_passwd(const struct container *c) } } Z_ (close(fd)); - bind_mount(path, "/etc/passwd", BD_REQUIRED, c->newroot, 0); + bind_mount(path, "/etc/passwd", BD_REQUIRED, c->newroot, 0, NULL); Z_ (unlink(path)); // /etc/group @@ -748,7 +750,7 @@ void setup_passwd(const struct container *c) } } Z_ (close(fd)); - bind_mount(path, "/etc/group", BD_REQUIRED, c->newroot, 0); + bind_mount(path, "/etc/group", BD_REQUIRED, c->newroot, 0, NULL); Z_ (unlink(path)); } diff --git a/bin/ch_fuse.c b/bin/ch_fuse.c index 657ee759b..0ebda0a59 100644 --- a/bin/ch_fuse.c +++ b/bin/ch_fuse.c @@ -120,7 +120,7 @@ void sq_fork(struct container *c) T_ (asprintf(&subdir, "/%s.ch/mnt", username) > 0); c->newroot = cat("/var/tmp", subdir); VERBOSE("using default mount point: %s", c->newroot); - mkdirs("/var/tmp", subdir, NULL); + mkdirs("/var/tmp", subdir, NULL, NULL); } // Verify mount point exists and is a directory. 
(SquashFS file path diff --git a/bin/ch_misc.c b/bin/ch_misc.c index 2104e0da2..7ecac0eb7 100644 --- a/bin/ch_misc.c +++ b/bin/ch_misc.c @@ -2,6 +2,7 @@ #define _GNU_SOURCE #include +#include #include #include #include @@ -20,6 +21,13 @@ /** Macros **/ +/* When making a directory writeable with mkdirs_overlay(), this is the + maximum number of entries to bind-mount. It seems Linux can handle a very + large number of mounts [1] but I don’t want to explode /proc/mounts beyond + comprehensibility. + [1]: https://serverfault.com/questions/102588 */ +#define MKDIRS_OVERMOUNT_ENTRY_MAX 15 + /* FNM_EXTMATCH is a GNU extension to support extended globs in fnmatch(3). If not available, define as 0 to ignore this flag. */ #ifndef HAVE_FNM_EXTMATCH @@ -56,6 +64,7 @@ size_t warnings_offset = 0; /** Function prototypes (private) **/ +void mkdir_overlay(const char *path, const char *overscratch); void msgv(enum log_level level, const char *file, int line, int errno_, const char *fmt, va_list ap); @@ -191,6 +200,39 @@ char *cat(const char *a, const char *b) return ret; } +/* Like scandir(3), but (1) filter excludes “.” and “..”, (2) results are not + sorted, and (3) cannot fail (exits with an error instead). */ +int dir_ls(const char *path, struct dirent ***namelist) +{ + int entry_ct; + + entry_ct = scandir(path, namelist, dir_ls_filter, NULL); + Tf (entry_ct >= 0, "can't scan dir", path); + return entry_ct; +} + +/* Return the number of entries in directory path, not including “.” and “..”; + i.e., the empty directory returns 0 despite them. */ +int dir_ls_count(const char *path) +{ + int ct; + struct dirent **namelist; + + ct = dir_ls(path, &namelist); + for (size_t i = 0; i < ct; i++) + free(namelist[i]); + free(namelist); + + return ct; +} + +/* scandir(3) filter that excludes “.” and “..”: Return 0 if e->d_name is one + of those strings, else 1. 
*/ +int dir_ls_filter(const struct dirent *e) +{ + return !(!strcmp(e->d_name, ".") || !strcmp(e->d_name, "..")); +} + /* Read the file listing environment variables at path, with records separated by delim, and return a corresponding list of struct env_var. Reads the entire file one time without seeking. If there is a problem reading the @@ -406,13 +448,86 @@ void log_ids(const char *func, int line) } } +/* Create the directory at path, despite its parent not allowing write access, + by overmounting a new, writeable directory with the existing contents of + the old directory bind-mounted in. The new directory lives initially in + scratch, which must not be used for any other purpose. No cleanup is done + here, so a disposable tmpfs is best. If anything goes wrong, exit with an + error message. */ +void mkdir_overmount(const char *path, const char *scratch) +{ + char *parent, *path2, *over; + int entry_ct; + struct dirent **entries; + + VERBOSE("making writeable via overmount trick: %s", path); + path2 = strdup(path); + parent = dirname(path2); + T_ (1 <= asprintf(&over, "%s/%d", scratch, dir_ls_count(scratch) + 1)); + + // bind-mount existing contents + Z_ (mkdir(over, 0755)); + entry_ct = dir_ls(parent, &entries); + DEBUG("existing entries: %d", entry_ct); + if (entry_ct > MKDIRS_OVERMOUNT_ENTRY_MAX) + WARNING("mkdir overmount: %d entries > limit %d, skipping extras: %s", + entry_ct, MKDIRS_OVERMOUNT_ENTRY_MAX, parent); + for (int i = 0; i < entry_ct; i++) { + if (i < MKDIRS_OVERMOUNT_ENTRY_MAX) { + char * src = path_join(parent, entries[i]->d_name); + char * dst = path_join(over, entries[i]->d_name); + struct stat st; + DEBUG("bind-mount %d: %s -> %s", i, src, dst); + + // Linux should always have the d_type field (if not, this won’t + // compile), but on some common filesystems (e.g. NFS?) it does not + // return a meaningful value, so we have to fall back to lstat(2). 
+ if (entries[i]->d_type == DT_UNKNOWN) + st.st_mode = DTTOIF(entries[i]->d_type); + else + Zf (lstat(src, &st), "can't stat", src); + + // Create the mount point. + if (S_ISDIR(st.st_mode)) { + Z_ (mkdir(dst, 0755)); + } else { + // FIXME: not actually tested with non-regular-files + int fd = open(dst, O_WRONLY|O_CREAT|O_EXCL, 0600); + Zf (fd == -1, "can't open: %s", dst); + Zf (close(fd), "can't close: %s", dst); + } + + Zf (mount(src, dst, NULL, MS_REC|MS_BIND, NULL), + "can't bind-mount: %s -> %s", src, dst); + + free(src); + free(dst); + } + free(entries[i]); + } + free(entries); + + DEBUG("overmounting: %s -> %s", over, parent); + Zf (mount(over, parent, NULL, MS_REC|MS_BIND, NULL), + "can't bind-mount: %s- > %s", over, parent); + Zf (mkdir(path, 0755), "can't mkdir even after overmount: %s", path); + + free(over); + free(path2); +} + /* Create directories in path under base. Exit with an error if anything goes wrong. For example, mkdirs("/foo", "/bar/baz") will create directories /foo/bar and /foo/bar/baz if they don't already exist, but /foo must exist already. Symlinks are followed. path must remain under base, i.e. you can't use symlinks or ".." to climb out. denylist is a null-terminated array of - paths under which no directories may be created, or NULL if none. */ -void mkdirs(const char *base, const char *path, char **denylist) + paths under which no directories may be created, or NULL if none. + + Can defeat an un-writeable directory by overmounting a new writeable + directory atop it. To enable this behavior, pass the path to an appropriate + scratch directory in scratch. 
*/ +void mkdirs(const char *base, const char *path, char **denylist, + const char *scratch) { char *basec, *component, *next, *nextc, *pathw, *saveptr; char *denylist_null[] = { NULL }; @@ -434,6 +549,7 @@ void mkdirs(const char *base, const char *path, char **denylist) saveptr = NULL; // avoid warning (#1048; see also strtok_r(3)) component = strtok_r(pathw, "/", &saveptr); nextc = basec; + next = NULL; while (component != NULL) { next = cat(nextc, "/"); next = cat(next, component); // canonical except for last component @@ -458,7 +574,12 @@ void mkdirs(const char *base, const char *path, char **denylist) Ze (path_subdir_p(denylist[i], next), "can't mkdir: %s under existing bind-mount %s", next, denylist[i]); - Zf (mkdir(next, 0777), "can't mkdir: %s", next); + if (mkdir(next, 0755)) { + if (scratch && (errno == EACCES || errno == EPERM)) + mkdir_overmount(next, scratch); + else + Tf (0, "can't mkdir: %s", next); + } nextc = next; // canonical b/c we just created last component as dir TRACE("mkdirs: created: %s", nextc) } @@ -557,6 +678,21 @@ bool path_exists(const char *path, struct stat *statbuf, bool follow_symlink) return false; } +/* Concatenate paths a and b, then return the result. */ +char *path_join(const char *a, const char *b) +{ + char *ret; + + T_ (a != NULL); + T_ (strlen(a) > 0); + T_ (b != NULL); + T_ (strlen(b) > 0); + + T_ (asprintf(&ret, "%s/%s", a, b) == strlen(a) + strlen(b) + 1); + + return ret; +} + /* Return the mount flags of the file system containing path, suitable for passing to mount(2). diff --git a/bin/ch_misc.h b/bin/ch_misc.h index 52a76ec6b..e42b7a818 100644 --- a/bin/ch_misc.h +++ b/bin/ch_misc.h @@ -5,9 +5,10 @@ libraries that ch_core requires. 
*/ #define _GNU_SOURCE +#include #include -#include #include +#include /** Macros **/ @@ -117,6 +118,9 @@ char *argv_to_string(char **argv); int buf_strings_count(char *str, size_t s); bool buf_zero_p(void *buf, size_t size); char *cat(const char *a, const char *b); +int dir_ls(const char *path, struct dirent ***namelist); +int dir_ls_count(const char *path); +int dir_ls_filter(const struct dirent *e); struct env_var *env_file_read(const char *path, int delim); void env_set(const char *name, const char *value, const bool expand); void env_unset(const char *glob); @@ -124,12 +128,14 @@ struct env_var env_var_parse(const char *line, const char *path, size_t lineno); void list_append(void **ar, void *new, size_t size); void *list_new(size_t size, size_t ct); void log_ids(const char *func, int line); -void mkdirs(const char *base, const char *path, char **denylist); +void mkdirs(const char *base, const char *path, char **denylist, + const char *scratch); void msg(enum log_level level, const char *file, int line, int errno_, const char *fmt, ...); noreturn void msg_fatal(const char *file, int line, int errno_, const char *fmt, ...); bool path_exists(const char *path, struct stat *statbuf, bool follow_symlink); +char *path_join(const char *a, const char *b); unsigned long path_mount_flags(const char *path); void path_split(const char *path, char **dir, char **base); bool path_subdir_p(const char *base, const char *path); From cd9d0b2cf203cdafbb37edf77579d54335e06986 Mon Sep 17 00:00:00 2001 From: Reid Priedhorsky Date: Tue, 19 Dec 2023 13:51:14 -0700 Subject: [PATCH 10/19] add test --- examples/chtest/Build | 6 ++-- test/run/ch-run_misc.bats | 63 ++++++++++++++++++++++++++++++++++++++- 2 files changed, 65 insertions(+), 4 deletions(-) diff --git a/examples/chtest/Build b/examples/chtest/Build index b55121b82..b612c6691 100755 --- a/examples/chtest/Build +++ b/examples/chtest/Build @@ -91,9 +91,9 @@ chmod -R u+rw,ug-s img ## Install our test stuff. 
-# Sentinel file for --bind test -echo "tmpfs and host home are not overmounted" \ - > img/home/overmount-me +# Fixtures for --bind tests +mkdir img/home/directory-in-home +touch img/home/file-in-home # Test programs. cp -r "$srcdir" img/test diff --git a/test/run/ch-run_misc.bats b/test/run/ch-run_misc.bats index 75cd1aae5..4b8f3ba91 100644 --- a/test/run/ch-run_misc.bats +++ b/test/run/ch-run_misc.bats @@ -74,6 +74,7 @@ EOF [[ $output = 'CH_RUNNING=Weird Al Yankovic' ]] } + @test "\$HOME" { [[ $CH_TEST_BUILDER != 'none' ]] || skip 'image builder required' LC_ALL=C @@ -109,7 +110,8 @@ EOF echo "$output" [[ $status -eq 0 ]] cat < limit 15, skipping extras'* ]] + + # --home + run ch-run --home "$img" -- ls -1 /home + echo "$output" + [[ $status -eq 0 ]] + [[ $(echo "$output" | wc -l) -eq 3 ]] + [[ $output = *directory-in-home* ]] + [[ $output = *file-in-home* ]] + [[ $output = *"$USER"* ]] + + rm-img +} + + @test 'ch-run --bind errors' { scope quick [[ $CH_TEST_PACK_FMT == squash-mount ]] || skip 'squash-mount format only' @@ -794,6 +850,7 @@ EOF diff -u <(echo "$output_expected") <(echo "$output") } + @test 'ch-run: internal SquashFUSE mounting' { scope standard [[ $CH_TEST_PACK_FMT == squash-mount ]] || skip 'squash-mount format only' @@ -830,6 +887,7 @@ EOF rm -Rf --one-file-system "$img" } + @test 'ch-run: internal SquashFUSE errors' { scope standard [[ $CH_TEST_PACK_FMT == squash-mount ]] || skip 'squash-mount format only' @@ -873,6 +931,7 @@ EOF rm "$sq_tmp" } + @test 'broken image errors' { scope standard img=${BATS_TMPDIR}/broken-image @@ -1020,6 +1079,7 @@ EOF [[ $output = *"GID ${gid_bad} not found; using dummy info"* ]] } + @test 'syslog' { # This test depends on a fairly specific syslog configuration, so just do # it on GitHub Actions. 
@@ -1034,6 +1094,7 @@ EOF echo "$text" | grep -F "$expected" } + @test 'reprint warnings' { run ch-run --warnings=0 [[ $status -eq 0 ]] From 85c591529e5e943ac705aeebd2bf2b0c867499e5 Mon Sep 17 00:00:00 2001 From: Reid Priedhorsky Date: Tue, 19 Dec 2023 16:09:20 -0700 Subject: [PATCH 11/19] actually build on whatever version of GCC CI has --- bin/ch_misc.c | 1 + 1 file changed, 1 insertion(+) diff --git a/bin/ch_misc.c b/bin/ch_misc.c index 7ecac0eb7..120c6609c 100644 --- a/bin/ch_misc.c +++ b/bin/ch_misc.c @@ -3,6 +3,7 @@ #define _GNU_SOURCE #include #include +#include #include #include #include From b73ab5ceb28bee22f5cc2a9d327887f9e97c8534 Mon Sep 17 00:00:00 2001 From: Reid Priedhorsky Date: Wed, 20 Dec 2023 08:51:58 -0700 Subject: [PATCH 12/19] fix tests? --- packaging/fedora/build | 3 +-- test/run/ch-run_misc.bats | 11 +++++------ 2 files changed, 6 insertions(+), 8 deletions(-) diff --git a/packaging/fedora/build b/packaging/fedora/build index bfdf979c0..cf7db875c 100755 --- a/packaging/fedora/build +++ b/packaging/fedora/build @@ -97,8 +97,7 @@ rpmbuild root: %(rpmbuild)s # Build tarball. print("# building source tarball") - # FIXME: Why does this need --home? 
- cmd(CH_RUN, "--home", "-b", "%s:/mnt/0" % git_tmp, "-c", "/mnt/0", args.image, + cmd(CH_RUN, "-b", "%s:/mnt/0" % git_tmp, "-c", "/mnt/0", args.image, "--", "./autogen.sh") cmd(CH_RUN, "-b", "%s:/mnt/0" % git_tmp, "-c", "/mnt/0", args.image, "--", "./configure") diff --git a/test/run/ch-run_misc.bats b/test/run/ch-run_misc.bats index 4b8f3ba91..9b473ba3d 100644 --- a/test/run/ch-run_misc.bats +++ b/test/run/ch-run_misc.bats @@ -12,7 +12,7 @@ setup () { demand-overlayfs () { - ch-run -W "$ch_timg" -- true || pedantic_fail 'no unpriv overlayfs' + ch-run -W "$ch_timg" -- true || skip 'no unpriv overlayfs' } @@ -77,6 +77,7 @@ EOF @test "\$HOME" { [[ $CH_TEST_BUILDER != 'none' ]] || skip 'image builder required' + demand-overlayfs LC_ALL=C scope quick @@ -109,11 +110,9 @@ EOF run ch-run --home "$ch_timg" -- ls -1 /home echo "$output" [[ $status -eq 0 ]] - cat < Date: Thu, 21 Dec 2023 15:59:56 -0700 Subject: [PATCH 13/19] fix tests? --- packaging/fedora/build | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/packaging/fedora/build b/packaging/fedora/build index cf7db875c..6738ad851 100755 --- a/packaging/fedora/build +++ b/packaging/fedora/build @@ -72,7 +72,8 @@ rpmbuild root: %(rpmbuild)s # Create rpmbuild root rpm_sources = args.rpmbuild + '/SOURCES' rpm_specs = args.rpmbuild + '/SPECS' - for d in (rpm_sources, rpm_specs): + rpm_pips = args.rpmbuild + '/pip' + for d in (rpm_sources, rpm_specs, rpm_pips): print("# mkdir -p %s" % d) try: os.makedirs(d) @@ -97,8 +98,13 @@ rpmbuild root: %(rpmbuild)s # Build tarball. print("# building source tarball") - cmd(CH_RUN, "-b", "%s:/mnt/0" % git_tmp, "-c", "/mnt/0", args.image, - "--", "./autogen.sh") + # pip3 expects $HOME, here /root, to be writeable for its cache. If not, + # the warning is that the directory “is not owned by the current user”, + # which is not true. 
The warning further claims that “*caching* wheels has + # been disabled”, but in fact wheels are disabled entirely, so autogen.sh + # fails with “embedded Lark is broken”. Thanks pip! + cmd(CH_RUN, "-b", "%s:/mnt/0" % git_tmp, "-b", "%s:/root" % rpm_pips, + "-c", "/mnt/0", args.image, "--", "./autogen.sh") cmd(CH_RUN, "-b", "%s:/mnt/0" % git_tmp, "-c", "/mnt/0", args.image, "--", "./configure") cmd(CH_RUN, "-b", "%s:/mnt/0" % git_tmp, "-c", "/mnt/0", args.image, From 581ed2caead6f1399bae0d829659b18c34ef243f Mon Sep 17 00:00:00 2001 From: Reid Priedhorsky Date: Fri, 22 Dec 2023 09:33:51 -0700 Subject: [PATCH 14/19] more docs [skip ci] --- doc/ch-run.rst | 21 ++++++++------ doc/faq.rst | 77 +++++++++++++++++++++++++++++++++++++++++++++++++- 2 files changed, 88 insertions(+), 10 deletions(-) diff --git a/doc/ch-run.rst b/doc/ch-run.rst index 557535e62..91369f32e 100644 --- a/doc/ch-run.rst +++ b/doc/ch-run.rst @@ -29,16 +29,18 @@ mounting SquashFS images with FUSE. not specified is to use the same path as the host; i.e., the default is :code:`--bind=SRC:SRC`. Can be repeated. - If :code:`--write` or :code:`--write-fake` are given and :code:`DST` does - not exist, it will be created as an empty directory. However, :code:`DST` - must be entirely within the image itself; :code:`DST` cannot enter a + With a read-only image (the default), :code:`DST` must exist. However, if + :code:`--write` or :code:`--write-fake` are given, :code:`DST` will be + created as an empty directory (possibly with the tmpfs overmount trick + described in :ref:`faq_mkdir-ro`). In this case, :code:`DST` must be + entirely within the image itself, i.e., :code:`DST` cannot enter a previous bind mount. For example, :code:`--bind /foo:/tmp/foo` will fail because :code:`/tmp` is shared with the host via bind-mount (unless :code:`$TMPDIR` is set to something else or :code:`--private-tmp` is given). - Most images do have ten directories :code:`/mnt/[0-9]` already available - as mount points. 
+ Most images have ten directories :code:`/mnt/[0-9]` already available as + mount points. Symlinks in :code:`DST` are followed, and absolute links can have surprising behavior. Bind-mounting happens after namespace setup but @@ -67,8 +69,8 @@ mounting SquashFS images with FUSE. :code:`--home` Bind-mount your host home directory (i.e., :code:`$HOME`) at guest - :code:`/home/$USER` (hiding any image content that exists at that path) - Implies :code:`--write-fake`. + :code:`/home/$USER`, hiding any existing image content at that path. + Implies :code:`--write-fake` so the mount point can be created if needed. :code:`-j`, :code:`--join` Use the same container (namespaces) as peer :code:`ch-run` invocations. @@ -149,8 +151,9 @@ mounting SquashFS images with FUSE. specification acceptable to :code:`tmpfs`, e.g. :code:`4m` for 4MiB or :code:`50%` for half of physical memory. If this option is specified without :code:`SIZE`, the default is :code:`12%`. Note (1) this limit is a - maximum — only actually stored files consume virtual memory, and - (2) :code:`SIZE` larger than memory can be requested without error. + maximum — only actually stored files consume virtual memory — and + (2) :code:`SIZE` larger than memory can be requested without error (the + failure happens later if the actual contents become too large). This requires kernel support and there are some caveats. See section “:ref:`ch-run_overlay`” below for details. diff --git a/doc/faq.rst b/doc/faq.rst index 6fda2ebd7..ffc4ac65b 100644 --- a/doc/faq.rst +++ b/doc/faq.rst @@ -333,6 +333,81 @@ References: * http://lxr.free-electrons.com/source/kernel/capability.c?v=4.2#L442 * http://lxr.free-electrons.com/source/fs/namei.c?v=4.2#L328 +.. _faq_mkdir-ro: + +:code:`--bind` creates mount points within un-writeable directories! 
+-------------------------------------------------------------------- + +Consider this image:: + + $ ls /var/tmp/image + bin dev home media opt root sbin sys usr + ch etc lib mnt proc run srv tmp var + $ ls -ld /var/tmp/image/mnt + drwxr-xr-x 18 root root 360 Dec 20 16:23 /var/tmp/image/mnt + $ ls /var/tmp/image/mnt + 0 1 2 3 4 5 6 7 8 9 a b c d e f + +That is, :code:`/mnt` is owned by root, un-writeable by us even considering +the prior question, and contains sixteen subdirectories. Indeed, we cannot +create a new directory there:: + + $ mkdir /var/tmp/image/mnt/foo + mkdir: cannot create directory ‘/var/tmp/image/mnt/foo’: Permission denied + +Recall that bind-mounting to a path that does not exist in a read-only image +fails:: + + $ ls -R /tmp/foo + /tmp/foo: + file-in-foo + $ ch-run -b /tmp/foo:/mnt/foo /var/tmp/image -- ls /mnt + ch-run[40498]: error: can't mkdir: /var/tmp/image/mnt/foo: Read-only file system (ch_misc.c:582 30) + +That’s fine; we’ll just use :code:`--write-fake` to create a writeable overlay +on the container. Then we can make any mount points we need. Right? + +:: + + $ ch-run -W /var/tmp/image -- mkdir /bar + $ ch-run -W /var/tmp/image -- mkdir /mnt/foo + mkdir: can't create directory '/mnt/foo': Permission denied + +Wait — why could we create a subdirectory of (container path) :code:`/` but +not :code:`/mnt`? This is because the latter, which is at host path +:code:`/var/tmp/image/mnt`, is not writeable by us, despite the overlaid +writeable tmpfs. + +Despite this, we can in fact use paths that do not yet exist for bind-mount destinations:: + + $ ch-run -W -b /tmp/foo:/mnt/foo /var/tmp/image -- ls /mnt + ch-run[40751]: warning: mkdir overmount: 16 entries > limit 15, skipping extras: /mnt/merged/mnt (ch_misc.c:474) + 1 3 5 7 9 b d f + 2 4 6 8 a c e foo + +What’s happening is bind-mount trickery. 
:code:`ch-run` creates a side +directory on the overlaid tmpfs, bind-mounts the existing contents of (host +path) :code:`/var/tmp/images/mnt` onto newly-created mount points in this new +directory (up to a limit, hence the warning and :code:`0` is missing), and +then bind-mounts this new (writeable!) directory on top of +:code:`/var/tmp/images/mnt`. *Now* we can +:code:`mkdir("/var/tmp/images/mnt/foo")`. + +This is visible by examining :code:`/proc/mounts`:: + + $ ch-run -W -b /tmp/foo:/mnt/foo /var/tmp/image -- cat /proc/mounts | fgrep /mnt + ch-run[81642]: warning: mkdir overmount: 16 entries > limit 15, skipping extras: /mnt/merged/mnt (ch_misc.c:474) + none / overlay rw,relatime,lowerdir=/var/tmp/image,upperdir=/mnt/upper,workdir=/mnt/work,volatile,userxattr 0 0 + none /mnt tmpfs rw,relatime,size=3943804k,uid=1000,gid=1000,inode64 0 0 + none /mnt/f overlay rw,relatime,lowerdir=/var/tmp/image,upperdir=/mnt/upper,workdir=/mnt/work,volatile,userxattr 0 0 + none /mnt/e overlay rw,relatime,lowerdir=/var/tmp/image,upperdir=/mnt/upper,workdir=/mnt/work,volatile,userxattr 0 0 + [...] + none /mnt/1 overlay rw,relatime,lowerdir=/var/tmp/image,upperdir=/mnt/upper,workdir=/mnt/work,volatile,userxattr 0 0 + tmpfs /mnt/foo tmpfs rw,relatime,size=8388608k,inode64 0 0 + +(The overlaid tmpfs is mounted on *host* :code:`/mnt` during container +assembly, which is why it appears in mount options.) + Why does :code:`ping` not work? ------------------------------- @@ -1247,4 +1322,4 @@ Notes: :code:`git(1)` invocations). .. LocalWords: CAs SY Gutmann AUTH rHsFFqwwqh MrieaQ Za loc mpihello mvo du -.. LocalWords: VirtualSize linuxcontainers jour uk lxd rwxr xr qq qqq +.. 
LocalWords: VirtualSize linuxcontainers jour uk lxd rwxr xr qq qqq drwxr From c73a35adf10b83394483eb17802e423894ddb8fe Mon Sep 17 00:00:00 2001 From: Reid Priedhorsky Date: Fri, 22 Dec 2023 10:59:06 -0700 Subject: [PATCH 15/19] reverse sense of test --- bin/ch_misc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bin/ch_misc.c b/bin/ch_misc.c index 120c6609c..4a8d640c1 100644 --- a/bin/ch_misc.c +++ b/bin/ch_misc.c @@ -483,7 +483,7 @@ void mkdir_overmount(const char *path, const char *scratch) // Linux should always have the d_type field (if not, this won’t // compile), but on some common filesystems (e.g. NFS?) it does not // return a meaningful value, so we have to fall back to lstat(2). - if (entries[i]->d_type == DT_UNKNOWN) + if (entries[i]->d_type != DT_UNKNOWN) st.st_mode = DTTOIF(entries[i]->d_type); else Zf (lstat(src, &st), "can't stat", src); From 749c675314e8755e57f6acfdfcb254eaa4f83588 Mon Sep 17 00:00:00 2001 From: Reid Priedhorsky Date: Fri, 22 Dec 2023 11:12:31 -0700 Subject: [PATCH 16/19] docs tweak --- doc/faq.rst | 9 +++------ doc/install.rst | 2 +- packaging/fedora/build | 4 ++-- 3 files changed, 6 insertions(+), 9 deletions(-) diff --git a/doc/faq.rst b/doc/faq.rst index ffc4ac65b..217449e0a 100644 --- a/doc/faq.rst +++ b/doc/faq.rst @@ -358,9 +358,6 @@ create a new directory there:: Recall that bind-mounting to a path that does not exist in a read-only image fails:: - $ ls -R /tmp/foo - /tmp/foo: - file-in-foo $ ch-run -b /tmp/foo:/mnt/foo /var/tmp/image -- ls /mnt ch-run[40498]: error: can't mkdir: /var/tmp/image/mnt/foo: Read-only file system (ch_misc.c:582 30) @@ -375,8 +372,8 @@ on the container. Then we can make any mount points we need. Right? Wait — why could we create a subdirectory of (container path) :code:`/` but not :code:`/mnt`? This is because the latter, which is at host path -:code:`/var/tmp/image/mnt`, is not writeable by us, despite the overlaid -writeable tmpfs. 
+:code:`/var/tmp/image/mnt`, is not writeable by us: the overlayfs propagates +the directory’s no-write permissions. Despite this, we can in fact use paths that do not yet exist for bind-mount destinations:: @@ -387,7 +384,7 @@ Despite this, we can in fact use paths that do not yet exist for bind-mount dest What’s happening is bind-mount trickery. :code:`ch-run` creates a side directory on the overlaid tmpfs, bind-mounts the existing contents of (host -path) :code:`/var/tmp/images/mnt` onto newly-created mount points in this new +path) :code:`/var/tmp/images/mnt` to newly-created mount points in this new directory (up to a limit, hence the warning and :code:`0` is missing), and then bind-mounts this new (writeable!) directory on top of :code:`/var/tmp/images/mnt`. *Now* we can diff --git a/doc/install.rst b/doc/install.rst index 9cbacbf2d..6aa276eeb 100644 --- a/doc/install.rst +++ b/doc/install.rst @@ -200,7 +200,7 @@ is you and does not meet your needs, please get in touch with us and we will help. Avoid potentially troublesome informational tests: :code:`--disable-impolite-checks` -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ :code:`configure` performs a lot of checks that do not inform decisions but are simply informational, for the report at the end. These checks replicate diff --git a/packaging/fedora/build b/packaging/fedora/build index 6738ad851..fb75e67cc 100755 --- a/packaging/fedora/build +++ b/packaging/fedora/build @@ -101,8 +101,8 @@ rpmbuild root: %(rpmbuild)s # pip3 expects $HOME, here /root, to be writeable for its cache. If not, # the warning is that the directory “is not owned by the current user”, # which is not true. The warning further claims that “*caching* wheels has - # been disabled”, but in fact wheels are disabled entirely, so autogen.sh - # fails with “embedded Lark is broken”. 
Thanks pip! + # been disabled” (emphasis added), but in fact wheels are disabled + # entirely, so autogen.sh fails with “embedded Lark is broken”. Thanks pip! cmd(CH_RUN, "-b", "%s:/mnt/0" % git_tmp, "-b", "%s:/root" % rpm_pips, "-c", "/mnt/0", args.image, "--", "./autogen.sh") cmd(CH_RUN, "-b", "%s:/mnt/0" % git_tmp, "-c", "/mnt/0", args.image, From f8555ca2d6e993824a1482b675538635973cb03e Mon Sep 17 00:00:00 2001 From: Reid Priedhorsky Date: Fri, 22 Dec 2023 11:26:05 -0700 Subject: [PATCH 17/19] fix tests? --- test/run/ch-run_misc.bats | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/test/run/ch-run_misc.bats b/test/run/ch-run_misc.bats index 9b473ba3d..24e85910d 100644 --- a/test/run/ch-run_misc.bats +++ b/test/run/ch-run_misc.bats @@ -349,13 +349,13 @@ EOF run ch-run -b "${bind1_dir}:/.." "$ch_timg" -- /bin/true echo "$output" [[ $status -eq 1 ]] - [[ $output = *"can't bind: /var/tmp/${USER}.ch not subdirectory of /var/tmp/${USER}.ch/mnt"* ]] + [[ $output = *"can't bind: "*"/${USER}.ch not subdirectory of "*"/${USER}.ch/mnt"* ]] # destination climbs out of image, does not exist run ch-run -b "${bind1_dir}:/../doesnotexist/a" "$ch_timg" -- /bin/true echo "$output" [[ $status -eq 1 ]] - [[ $output = *"can't mkdir: /var/tmp/${USER}.ch/doesnotexist not subdirectory of /var/tmp/${USER}.ch/mnt"* ]] + [[ $output = *"can't mkdir: "*"/${USER}.ch/doesnotexist not subdirectory of "*"/${USER}.ch/mnt"* ]] [[ ! 
-e ${ch_imgdir}/doesnotexist ]] # source does not exist @@ -368,7 +368,7 @@ EOF run ch-run -b "${bind1_dir}:/doesnotexist" "$ch_timg" -- /bin/true echo "$output" [[ $status -eq 1 ]] - [[ $output = *"can't mkdir: /var/tmp/${USER}.ch/mnt/doesnotexist: Read-only file system"* ]] + [[ $output = *"can't mkdir: "*"/${USER}.ch/mnt/doesnotexist: Read-only file system"* ]] # neither source nor destination exist run ch-run -b /doesnotexist-out:/doesnotexist-in "$ch_timg" -- /bin/true @@ -387,13 +387,13 @@ EOF "$ch_timg" -- /bin/true echo "$output" [[ $status -eq 1 ]] - [[ $output = *"can't mkdir: /var/tmp/${USER}.ch/mnt/doesnotexist: Read-only file system"* ]] + [[ $output = *"can't mkdir: "*"/${USER}.ch/mnt/doesnotexist: Read-only file system"* ]] # destination is broken symlink run ch-run -b "${bind1_dir}:/mnt/link-b0rken-abs" "$ch_timg" -- /bin/true echo "$output" [[ $status -eq 1 ]] - [[ $output = *"can't mkdir: symlink not relative: /var/tmp/${USER}.ch/mnt/mnt/link-b0rken-abs"* ]] + [[ $output = *"can't mkdir: symlink not relative: "*"/${USER}.ch/mnt/mnt/link-b0rken-abs"* ]] # destination is absolute symlink outside image run ch-run -b "${bind1_dir}:/mnt/link-bad-abs" "$ch_timg" -- /bin/true @@ -411,20 +411,20 @@ EOF run ch-run -b "${bind1_dir}:/proc/doesnotexist" "$ch_timg" -- /bin/true echo "$output" [[ $status -eq 1 ]] - [[ $output = *"can't mkdir: /var/tmp/${USER}.ch/mnt/proc/doesnotexist under existing bind-mount /var/tmp/${USER}.ch/mnt/proc "* ]] + [[ $output = *"can't mkdir: "*"/${USER}.ch/mnt/proc/doesnotexist under existing bind-mount "*"/${USER}.ch/mnt/proc "* ]] # mkdir(2) under existing bind-mount, user-supplied, first level run ch-run -b "${bind1_dir}:/mnt/0" \ -b "${bind2_dir}:/mnt/0/foo" "$ch_timg" -- /bin/true echo "$output" [[ $status -eq 1 ]] - [[ $output = *"can't mkdir: /var/tmp/${USER}.ch/mnt/mnt/0/foo under existing bind-mount /var/tmp/${USER}.ch/mnt/mnt/0 "* ]] + [[ $output = *"can't mkdir: "*"/${USER}.ch/mnt/mnt/0/foo under existing bind-mount 
"*"/${USER}.ch/mnt/mnt/0 "* ]] # mkdir(2) under existing bind-mount, default, 2nd level run ch-run -b "${bind1_dir}:/proc/sys/doesnotexist" "$ch_timg" -- /bin/true echo "$output" [[ $status -eq 1 ]] - [[ $output = *"can't mkdir: /var/tmp/${USER}.ch/mnt/proc/sys/doesnotexist under existing bind-mount /var/tmp/${USER}.ch/mnt/proc "* ]] + [[ $output = *"can't mkdir: "*"/${USER}.ch/mnt/proc/sys/doesnotexist under existing bind-mount "*"/${USER}.ch/mnt/proc "* ]] } From 1b775bd32775673d4f091d784b36cdc9551d940c Mon Sep 17 00:00:00 2001 From: Reid Priedhorsky Date: Thu, 4 Jan 2024 15:11:13 -0700 Subject: [PATCH 18/19] symlink ranch instead of overmount farm --- bin/ch_misc.c | 78 ++++++++++++++------------------------- test/run/ch-run_misc.bats | 15 ++++---- 2 files changed, 34 insertions(+), 59 deletions(-) diff --git a/bin/ch_misc.c b/bin/ch_misc.c index 4a8d640c1..e71ae1b3b 100644 --- a/bin/ch_misc.c +++ b/bin/ch_misc.c @@ -22,13 +22,6 @@ /** Macros **/ -/* When making a directory writeable with mkdirs_overlay(), this is the - maximum number of entries to bind-mount. It seems Linux can handle a very - large number of mounts [1] but I don’t want to explode /proc/mounts beyond - comprehensibility. - [1]: https://serverfault.com/questions/102588 */ -#define MKDIRS_OVERMOUNT_ENTRY_MAX 15 - /* FNM_EXTMATCH is a GNU extension to support extended globs in fnmatch(3). If not available, define as 0 to ignore this flag. 
*/ #ifndef HAVE_FNM_EXTMATCH @@ -65,7 +58,7 @@ size_t warnings_offset = 0; /** Function prototypes (private) **/ -void mkdir_overlay(const char *path, const char *overscratch); +void mkdir_overmount(const char *path, const char *scratch); void msgv(enum log_level level, const char *file, int line, int errno_, const char *fmt, va_list ap); @@ -450,69 +443,52 @@ void log_ids(const char *func, int line) } /* Create the directory at path, despite its parent not allowing write access, - by overmounting a new, writeable directory with the existing contents of - the old directory bind-mounted in. The new directory lives initially in - scratch, which must not be used for any other purpose. No cleanup is done - here, so a disposable tmpfs is best. If anything goes wrong, exit with an - error message. */ + by overmounting a new, writeable directory atop it. We preserve the old + contents by bind-mounting the old directory as a subdirectory, then setting + up a symlink ranch. + + The new directory lives initially in scratch, which must not be used for + any other purpose. No cleanup is done here, so a disposable tmpfs is best. + If anything goes wrong, exit with an error message. 
*/ void mkdir_overmount(const char *path, const char *scratch) { - char *parent, *path2, *over; + char *parent, *path2, *over, *path_dst; + char *orig_dir = ".orig"; // resisted calling this .weirdal int entry_ct; struct dirent **entries; - VERBOSE("making writeable via overmount trick: %s", path); + VERBOSE("making writeable via symlink ranch: %s", path); path2 = strdup(path); parent = dirname(path2); T_ (1 <= asprintf(&over, "%s/%d", scratch, dir_ls_count(scratch) + 1)); + path_dst = path_join(over, orig_dir); - // bind-mount existing contents + // bind-mounts Z_ (mkdir(over, 0755)); - entry_ct = dir_ls(parent, &entries); + Z_ (mkdir(path_dst, 0755)); + Zf (mount(parent, path_dst, NULL, MS_REC|MS_BIND, NULL), + "can't bind-mount: %s -> %s", path, path_dst); + Zf (mount(over, parent, NULL, MS_REC|MS_BIND, NULL), + "can't bind-mount: %s- > %s", over, parent); + + // symlink ranch + entry_ct = dir_ls(path_dst, &entries); DEBUG("existing entries: %d", entry_ct); - if (entry_ct > MKDIRS_OVERMOUNT_ENTRY_MAX) - WARNING("mkdir overmount: %d entries > limit %d, skipping extras: %s", - entry_ct, MKDIRS_OVERMOUNT_ENTRY_MAX, parent); for (int i = 0; i < entry_ct; i++) { - if (i < MKDIRS_OVERMOUNT_ENTRY_MAX) { - char * src = path_join(parent, entries[i]->d_name); - char * dst = path_join(over, entries[i]->d_name); - struct stat st; - DEBUG("bind-mount %d: %s -> %s", i, src, dst); - - // Linux should always have the d_type field (if not, this won’t - // compile), but on some common filesystems (e.g. NFS?) it does not - // return a meaningful value, so we have to fall back to lstat(2). - if (entries[i]->d_type != DT_UNKNOWN) - st.st_mode = DTTOIF(entries[i]->d_type); - else - Zf (lstat(src, &st), "can't stat", src); - - // Create the mount point. 
- if (S_ISDIR(st.st_mode)) { - Z_ (mkdir(dst, 0755)); - } else { - // FIXME: not actually tested with non-regular-files - int fd = open(dst, O_WRONLY|O_CREAT|O_EXCL, 0600); - Zf (fd == -1, "can't open: %s", dst); - Zf (close(fd), "can't close: %s", dst); - } + char * src = path_join(parent, entries[i]->d_name); + char * dst = path_join(orig_dir, entries[i]->d_name); - Zf (mount(src, dst, NULL, MS_REC|MS_BIND, NULL), - "can't bind-mount: %s -> %s", src, dst); + Zf (symlink(dst, src), "can't symlink: %s -> %s", src, dst); - free(src); - free(dst); - } + free(src); + free(dst); free(entries[i]); } free(entries); - DEBUG("overmounting: %s -> %s", over, parent); - Zf (mount(over, parent, NULL, MS_REC|MS_BIND, NULL), - "can't bind-mount: %s- > %s", over, parent); Zf (mkdir(path, 0755), "can't mkdir even after overmount: %s", path); + free(path_dst); free(over); free(path2); } diff --git a/test/run/ch-run_misc.bats b/test/run/ch-run_misc.bats index 24e85910d..e998e4cb6 100644 --- a/test/run/ch-run_misc.bats +++ b/test/run/ch-run_misc.bats @@ -259,7 +259,8 @@ EOF rm-img () { # Remove existing fixture, avoiding “sudo rm -Rf” b/c it’s too scary. 
- sudo rm -f "$img"/foo/* + sudo rm -f "$img"/foo/file-in-foo + sudo rmdir "$img"/foo/directory-in-foo || true sudo rmdir "$img"/foo || true sudo rm -f "$img"/home/file-in-home sudo rmdir "$img"/home/directory-in-home || true @@ -270,10 +271,9 @@ EOF rm-img ch-convert "$ch_tardir"/chtest.* "$img" ls -l "$img" - mkdir -m 755 "$img"/foo - for i in {1..16}; do # MKDIRS_OVERMOUNT_ENTRY_MAX + 1 - touch "${img}/foo/${i}" - done + mkdir "$img"/foo + touch "$img"/foo/file-in-foo + mkdir "$img"/foo/directory-in-foo sudo chown root:root "$img"/foo "$img"/home sudo chmod 755 "$img"/foo "$img"/home ls -ld "$img"/foo "$img"/home @@ -286,13 +286,12 @@ EOF ls -l "$src" # --bind - run ch-run -W -b "$src":/foo/bar "$img" -- ls -xw 78 /foo /foo/bar + run ch-run -W -b "$src":/foo/bar "$img" -- ls -lahR /foo echo "$output" [[ $status -eq 0 ]] - [[ $output = *'warning: mkdir overmount: 16 entries > limit 15, skipping extras'* ]] # --home - run ch-run --home "$img" -- ls -1 /home + run ch-run --home "$img" -- ls -lah /home echo "$output" [[ $status -eq 0 ]] [[ $(echo "$output" | wc -l) -eq 3 ]] From 547e1082d9d0294847c9b389ca61a3865cb7491f Mon Sep 17 00:00:00 2001 From: Reid Priedhorsky Date: Fri, 5 Jan 2024 10:24:18 -0700 Subject: [PATCH 19/19] update docs [skip ci] --- doc/faq.rst | 91 ++++++++++++++++++++++++++++------------------------- 1 file changed, 49 insertions(+), 42 deletions(-) diff --git a/doc/faq.rst b/doc/faq.rst index 217449e0a..41e7831b3 100644 --- a/doc/faq.rst +++ b/doc/faq.rst @@ -344,66 +344,72 @@ Consider this image:: bin dev home media opt root sbin sys usr ch etc lib mnt proc run srv tmp var $ ls -ld /var/tmp/image/mnt - drwxr-xr-x 18 root root 360 Dec 20 16:23 /var/tmp/image/mnt + drwxr-xr-x 4 root root 80 Jan 5 09:52 /var/tmp/image/mnt $ ls /var/tmp/image/mnt - 0 1 2 3 4 5 6 7 8 9 a b c d e f + bar foo That is, :code:`/mnt` is owned by root, un-writeable by us even considering -the prior question, and contains sixteen subdirectories. 
Indeed, we cannot -create a new directory there:: +the prior question, and contains two subdirectories. Indeed, we cannot create +a new directory there:: - $ mkdir /var/tmp/image/mnt/foo - mkdir: cannot create directory ‘/var/tmp/image/mnt/foo’: Permission denied + $ mkdir /var/tmp/image/mnt/baz + mkdir: cannot create directory ‘/var/tmp/image/mnt/baz’: Permission denied Recall that bind-mounting to a path that does not exist in a read-only image fails:: - $ ch-run -b /tmp/foo:/mnt/foo /var/tmp/image -- ls /mnt - ch-run[40498]: error: can't mkdir: /var/tmp/image/mnt/foo: Read-only file system (ch_misc.c:582 30) + $ ch-run -b /tmp/baz:/mnt/baz /var/tmp/image -- ls /mnt + ch-run[40498]: error: can't mkdir: /var/tmp/image/mnt/baz: Read-only file system (ch_misc.c:582 30) That’s fine; we’ll just use :code:`--write-fake` to create a writeable overlay on the container. Then we can make any mount points we need. Right? :: - $ ch-run -W /var/tmp/image -- mkdir /bar - $ ch-run -W /var/tmp/image -- mkdir /mnt/foo - mkdir: can't create directory '/mnt/foo': Permission denied + $ ch-run -W /var/tmp/image -- mkdir /qux # succeeds + $ ch-run -W /var/tmp/image -- mkdir /mnt/baz # fails + mkdir: can't create directory '/mnt/baz': Permission denied Wait — why could we create a subdirectory of (container path) :code:`/` but -not :code:`/mnt`? This is because the latter, which is at host path -:code:`/var/tmp/image/mnt`, is not writeable by us: the overlayfs propagates -the directory’s no-write permissions. - -Despite this, we can in fact use paths that do not yet exist for bind-mount destinations:: - - $ ch-run -W -b /tmp/foo:/mnt/foo /var/tmp/image -- ls /mnt - ch-run[40751]: warning: mkdir overmount: 16 entries > limit 15, skipping extras: /mnt/merged/mnt (ch_misc.c:474) - 1 3 5 7 9 b d f - 2 4 6 8 a c e foo - -What’s happening is bind-mount trickery. 
:code:`ch-run` creates a side -directory on the overlaid tmpfs, bind-mounts the existing contents of (host -path) :code:`/var/tmp/images/mnt` to newly-created mount points in this new -directory (up to a limit, hence the warning and :code:`0` is missing), and -then bind-mounts this new (writeable!) directory on top of -:code:`/var/tmp/images/mnt`. *Now* we can -:code:`mkdir("/var/tmp/images/mnt/foo")`. - -This is visible by examining :code:`/proc/mounts`:: - - $ ch-run -W -b /tmp/foo:/mnt/foo /var/tmp/image -- cat /proc/mounts | fgrep /mnt - ch-run[81642]: warning: mkdir overmount: 16 entries > limit 15, skipping extras: /mnt/merged/mnt (ch_misc.c:474) - none / overlay rw,relatime,lowerdir=/var/tmp/image,upperdir=/mnt/upper,workdir=/mnt/work,volatile,userxattr 0 0 +not a subdirectory of :code:`/mnt`? This is because the latter, which is at +host path :code:`/var/tmp/image/mnt`, is not writeable by us: the overlayfs +propagates the directory’s no-write permissions. Despite this, we can in fact +use paths that do not yet exist for bind-mount destinations:: + + $ ch-run -W -b /tmp/baz:/mnt/baz /var/tmp/image -- ls /mnt + bar baz foo + +What’s happening is bind-mount trickery and a symlink ranch. :code:`ch-run` +creates a new directory on the overlaid tmpfs, bind-mounts the old (host path) +:code:`/var/tmp/image/mnt` to a subdirectory of it, symlinks the old +contents, and finally overmounts the old, un-writeable directory with the new +one:: + + $ ch-run -W -b /tmp/baz:/mnt/baz /var/tmp/image -- ls -la /mnt + drwxr-x--- 4 reidpr reidpr 120 Jan 5 17:11 . + drwx------ 1 reidpr reidpr 40 Jan 5 17:11 ..
+ drwxr-xr-x 4 nobody nogroup 80 Jan 5 16:52 .orig + lrwxrwxrwx 1 reidpr reidpr 9 Jan 5 17:11 bar -> .orig/bar + drwxr-x--- 2 reidpr reidpr 40 Jan 3 23:49 baz + lrwxrwxrwx 1 reidpr reidpr 9 Jan 5 17:11 foo -> .orig/foo + $ ch-run -W -b /tmp/baz:/mnt/baz /var/tmp/image -- cat /proc/mounts | fgrep ' /mnt' none /mnt tmpfs rw,relatime,size=3943804k,uid=1000,gid=1000,inode64 0 0 - none /mnt/f overlay rw,relatime,lowerdir=/var/tmp/image,upperdir=/mnt/upper,workdir=/mnt/work,volatile,userxattr 0 0 - none /mnt/e overlay rw,relatime,lowerdir=/var/tmp/image,upperdir=/mnt/upper,workdir=/mnt/work,volatile,userxattr 0 0 - [...] - none /mnt/1 overlay rw,relatime,lowerdir=/var/tmp/image,upperdir=/mnt/upper,workdir=/mnt/work,volatile,userxattr 0 0 - tmpfs /mnt/foo tmpfs rw,relatime,size=8388608k,inode64 0 0 + none /mnt/.orig overlay rw,relatime,lowerdir=/var/tmp/image,upperdir=/mnt/upper,workdir=/mnt/work,volatile,userxattr 0 0 + tmpfs /mnt/baz tmpfs rw,relatime,size=8388608k,inode64 0 0 + +This new directory is writeable, and :code:`mkdir(2)` succeeds. (The overlaid +tmpfs is mounted on *host* :code:`/mnt` during container assembly, which is +why it appears in mount options.) + +There are differences from the original directory, of course. Most notably: + + * The ranched symlinks can be deleted by the user within the container, + contrary to the old directory’s read-only permissions. + + * The contents of the “ranched” directory become symlinks rather than their + original file type. -(The overlaid tmpfs is mounted on *host* :code:`/mnt` during container -assembly, which is why it appears in mount options.) +Software that cares about these things may break. Why does :code:`ping` not work? ------------------------------- @@ -1320,3 +1326,4 @@ Notes: .. LocalWords: CAs SY Gutmann AUTH rHsFFqwwqh MrieaQ Za loc mpihello mvo du .. LocalWords: VirtualSize linuxcontainers jour uk lxd rwxr xr qq qqq drwxr +.. LocalWords: drwx