All checks were successful
Test / Create distribution (push) Successful in 32s
Test / Sandbox (push) Successful in 2m2s
Test / Hakurei (push) Successful in 2m57s
Test / Hpkg (push) Successful in 3m54s
Test / Sandbox (race detector) (push) Successful in 4m6s
Test / Hakurei (race detector) (push) Successful in 4m51s
Test / Flake checks (push) Successful in 1m22s
There are significant limitations to using the overlay mount, and the implementation in the kernel is quite quirky. For now the Op is quite robust, however a higher level interface for it has not been decided yet. Signed-off-by: Ophestra <cat@gensokyo.uk>
233 lines
7.5 KiB
Go
233 lines
7.5 KiB
Go
package container
|
|
|
|
import (
|
|
"errors"
|
|
"fmt"
|
|
"os"
|
|
"path/filepath"
|
|
"strings"
|
|
. "syscall"
|
|
|
|
"hakurei.app/container/vfs"
|
|
)
|
|
|
|
/*
|
|
Holding CAP_SYS_ADMIN within the user namespace that owns a process's mount namespace
|
|
allows that process to create bind mounts and mount the following types of filesystems:
|
|
- /proc (since Linux 3.8)
|
|
- /sys (since Linux 3.8)
|
|
- devpts (since Linux 3.9)
|
|
- tmpfs(5) (since Linux 3.9)
|
|
- ramfs (since Linux 3.9)
|
|
- mqueue (since Linux 3.9)
|
|
- bpf (since Linux 4.4)
|
|
- overlayfs (since Linux 5.11)
|
|
*/
|
|
|
|
const (
	// zeroString is a zero value string, it represents NULL when passed to mount.
	zeroString = ""

	// SourceNone is used when the source value is ignored,
	// such as when remounting.
	SourceNone = "none"
	// SourceProc is used when mounting proc.
	// Note that any source value is allowed when fstype is [FstypeProc].
	SourceProc = "proc"
	// SourceDevpts is used when mounting devpts.
	// Note that any source value is allowed when fstype is [FstypeDevpts].
	SourceDevpts = "devpts"
	// SourceMqueue is used when mounting mqueue.
	// Note that any source value is allowed when fstype is [FstypeMqueue].
	SourceMqueue = "mqueue"
	// SourceOverlay is used when mounting overlay.
	// Note that any source value is allowed when fstype is [FstypeOverlay].
	SourceOverlay = "overlay"

	// SourceTmpfsRootfs is used when mounting the tmpfs instance backing the intermediate root.
	SourceTmpfsRootfs = "rootfs"
	// SourceTmpfsDevtmpfs is used when mounting tmpfs representing a subset of host devtmpfs.
	SourceTmpfsDevtmpfs = "devtmpfs"
	// SourceTmpfsEphemeral is used when mounting a writable instance of tmpfs.
	SourceTmpfsEphemeral = "ephemeral"
	// SourceTmpfsReadonly is used when mounting a readonly instance of tmpfs.
	SourceTmpfsReadonly = "readonly"

	// FstypeNULL is used when the fstype value is ignored,
	// such as when bind mounting or remounting.
	FstypeNULL = zeroString
	// FstypeProc represents the proc pseudo-filesystem.
	// A fully visible instance of proc must be available in the mount namespace for proc to be mounted.
	// This filesystem type is usually mounted on [FHSProc].
	FstypeProc = "proc"
	// FstypeDevpts represents the devpts pseudo-filesystem.
	// This filesystem type is usually mounted on /dev/pts.
	FstypeDevpts = "devpts"
	// FstypeTmpfs represents the tmpfs filesystem.
	// This filesystem type can be mounted anywhere in the container filesystem.
	FstypeTmpfs = "tmpfs"
	// FstypeMqueue represents the mqueue pseudo-filesystem.
	// This filesystem type is usually mounted on /dev/mqueue.
	FstypeMqueue = "mqueue"
	// FstypeOverlay represents the overlay pseudo-filesystem.
	// This filesystem type can be mounted anywhere in the container filesystem.
	FstypeOverlay = "overlay"

	// OptionOverlayLowerdir represents the lowerdir option of the overlay pseudo-filesystem.
	// A lowerdir can be on any filesystem and does not need to be on a writable filesystem.
	OptionOverlayLowerdir = "lowerdir"
	// OptionOverlayUpperdir represents the upperdir option of the overlay pseudo-filesystem.
	// The upperdir is normally on a writable filesystem.
	OptionOverlayUpperdir = "upperdir"
	// OptionOverlayWorkdir represents the workdir option of the overlay pseudo-filesystem.
	// The workdir needs to be an empty directory on the same filesystem as upperdir.
	OptionOverlayWorkdir = "workdir"
	// OptionOverlayUserxattr represents the userxattr option of the overlay pseudo-filesystem.
	// It selects the "user.overlay." xattr namespace instead of "trusted.overlay.".
	OptionOverlayUserxattr = "userxattr"

	// SpecialOverlayEscape is the escape string for overlay mount options.
	SpecialOverlayEscape = `\`
	// SpecialOverlayOption is the separator string between overlay mount options.
	SpecialOverlayOption = ","
	// SpecialOverlayPath is the separator string between overlay paths.
	SpecialOverlayPath = ":"
)
|
|
|
|
// bindMount mounts source on target and recursively applies flags if MS_REC is set.
|
|
func (p *procPaths) bindMount(source, target string, flags uintptr, eq bool) error {
|
|
if eq {
|
|
msg.Verbosef("resolved %q flags %#x", target, flags)
|
|
} else {
|
|
msg.Verbosef("resolved %q on %q flags %#x", source, target, flags)
|
|
}
|
|
|
|
if err := Mount(source, target, FstypeNULL, MS_SILENT|MS_BIND|flags&MS_REC, zeroString); err != nil {
|
|
return wrapErrSuffix(err,
|
|
fmt.Sprintf("cannot mount %q on %q:", source, target))
|
|
}
|
|
|
|
return p.remount(target, flags)
|
|
}
|
|
|
|
// remount applies flags on target, recursively if MS_REC is set.
|
|
func (p *procPaths) remount(target string, flags uintptr) error {
|
|
var targetFinal string
|
|
if v, err := filepath.EvalSymlinks(target); err != nil {
|
|
return wrapErrSelf(err)
|
|
} else {
|
|
targetFinal = v
|
|
if targetFinal != target {
|
|
msg.Verbosef("target resolves to %q", targetFinal)
|
|
}
|
|
}
|
|
|
|
// final target path according to the kernel through proc
|
|
var targetKFinal string
|
|
{
|
|
var destFd int
|
|
if err := IgnoringEINTR(func() (err error) {
|
|
destFd, err = Open(targetFinal, O_PATH|O_CLOEXEC, 0)
|
|
return
|
|
}); err != nil {
|
|
return wrapErrSuffix(err,
|
|
fmt.Sprintf("cannot open %q:", targetFinal))
|
|
}
|
|
if v, err := os.Readlink(p.fd(destFd)); err != nil {
|
|
return wrapErrSelf(err)
|
|
} else if err = Close(destFd); err != nil {
|
|
return wrapErrSuffix(err,
|
|
fmt.Sprintf("cannot close %q:", targetFinal))
|
|
} else {
|
|
targetKFinal = v
|
|
}
|
|
}
|
|
|
|
mf := MS_NOSUID | flags&MS_NODEV | flags&MS_RDONLY
|
|
return hostProc.mountinfo(func(d *vfs.MountInfoDecoder) error {
|
|
n, err := d.Unfold(targetKFinal)
|
|
if err != nil {
|
|
if errors.Is(err, ESTALE) {
|
|
return msg.WrapErr(err,
|
|
fmt.Sprintf("mount point %q never appeared in mountinfo", targetKFinal))
|
|
}
|
|
return wrapErrSuffix(err,
|
|
"cannot unfold mount hierarchy:")
|
|
}
|
|
|
|
if err = remountWithFlags(n, mf); err != nil {
|
|
return err
|
|
}
|
|
if flags&MS_REC == 0 {
|
|
return nil
|
|
}
|
|
|
|
for cur := range n.Collective() {
|
|
err = remountWithFlags(cur, mf)
|
|
if err != nil && !errors.Is(err, EACCES) {
|
|
return err
|
|
}
|
|
}
|
|
|
|
return nil
|
|
})
|
|
}
|
|
|
|
// remountWithFlags remounts mount point described by [vfs.MountInfoNode].
|
|
func remountWithFlags(n *vfs.MountInfoNode, mf uintptr) error {
|
|
kf, unmatched := n.Flags()
|
|
if len(unmatched) != 0 {
|
|
msg.Verbosef("unmatched vfs options: %q", unmatched)
|
|
}
|
|
|
|
if kf&mf != mf {
|
|
return wrapErrSuffix(
|
|
Mount(SourceNone, n.Clean, FstypeNULL, MS_SILENT|MS_BIND|MS_REMOUNT|kf|mf, zeroString),
|
|
fmt.Sprintf("cannot remount %q:", n.Clean))
|
|
}
|
|
return nil
|
|
}
|
|
|
|
// mountTmpfs mounts tmpfs on target;
|
|
// callers who wish to mount to sysroot must pass the return value of toSysroot.
|
|
func mountTmpfs(fsname, target string, flags uintptr, size int, perm os.FileMode) error {
|
|
if err := os.MkdirAll(target, parentPerm(perm)); err != nil {
|
|
return wrapErrSelf(err)
|
|
}
|
|
opt := fmt.Sprintf("mode=%#o", perm)
|
|
if size > 0 {
|
|
opt += fmt.Sprintf(",size=%d", size)
|
|
}
|
|
return wrapErrSuffix(
|
|
Mount(fsname, target, FstypeTmpfs, flags, opt),
|
|
fmt.Sprintf("cannot mount tmpfs on %q:", target))
|
|
}
|
|
|
|
// parentPerm derives the mode used for parent directories from perm.
// It starts from 0755 and drops the group r-x bits when perm grants the group
// nothing, and the other r-x bits when perm grants others nothing.
// The owner bits are always rwx.
func parentPerm(perm os.FileMode) os.FileMode {
	parent := os.FileMode(0755)
	if perm&0070 == 0 {
		// group has no access to the leaf, so deny it on the parents as well
		parent &^= 0050
	}
	if perm&0007 == 0 {
		// same for others
		parent &^= 0005
	}
	return parent
}
|
|
|
|
// escapeOverlayDataSegment escapes a string for formatting into the data argument of an overlay mount call.
|
|
func escapeOverlayDataSegment(s string) string {
|
|
if s == zeroString {
|
|
return zeroString
|
|
}
|
|
|
|
if f := strings.SplitN(s, "\x00", 2); len(f) > 0 {
|
|
s = f[0]
|
|
}
|
|
|
|
return strings.NewReplacer(
|
|
SpecialOverlayEscape, SpecialOverlayEscape+SpecialOverlayEscape,
|
|
SpecialOverlayOption, SpecialOverlayEscape+SpecialOverlayOption,
|
|
SpecialOverlayPath, SpecialOverlayEscape+SpecialOverlayPath,
|
|
).Replace(s)
|
|
}
|