Ophestra e99d7affb0
All checks were successful
Test / Flake checks (push) Successful in 1m26s
Test / Create distribution (push) Successful in 33s
Test / Sandbox (push) Successful in 1m59s
Test / Hakurei (push) Successful in 2m58s
Test / Hpkg (push) Successful in 3m45s
Test / Sandbox (race detector) (push) Successful in 4m11s
Test / Hakurei (race detector) (push) Successful in 4m47s
container: use absolute for pathname
This is simultaneously more efficient and less error-prone. This change caused minor API changes in multiple other packages.

Signed-off-by: Ophestra <cat@gensokyo.uk>
2025-08-11 04:56:42 +09:00

644 lines
18 KiB
Go

package container
import (
"encoding/gob"
"fmt"
"math"
"os"
"path"
"path/filepath"
"slices"
"strings"
. "syscall"
"unsafe"
)
const (
// intermediate root file name pattern for [MountOverlayOp.Upper];
// remains after apply returns
intermediatePatternOverlayUpper = "overlay.upper.*"
// intermediate root file name pattern for [MountOverlayOp.Work];
// remains after apply returns
intermediatePatternOverlayWork = "overlay.work.*"
// intermediate root file name pattern for [TmpfileOp]
intermediatePatternTmpfile = "tmp.*"
)
type (
Ops []Op
// Op is a generic setup step ran inside the container init.
// Implementations of this interface are sent as a stream of gobs.
Op interface {
// early is called in host root.
early(params *Params) error
// apply is called in intermediate root.
apply(params *Params) error
prefix() string
Is(op Op) bool
fmt.Stringer
}
)
// Grow grows the slice Ops points to using [slices.Grow].
func (f *Ops) Grow(n int) { *f = slices.Grow(*f, n) }
func init() { gob.Register(new(RemountOp)) }
// Remount appends an [Op] that applies [RemountOp.Flags] on container path [RemountOp.Target].
func (f *Ops) Remount(target *Absolute, flags uintptr) *Ops {
*f = append(*f, &RemountOp{target, flags})
return f
}
type RemountOp struct {
Target *Absolute
Flags uintptr
}
func (*RemountOp) early(*Params) error { return nil }
func (r *RemountOp) apply(*Params) error {
if r.Target == nil {
return EBADE
}
return wrapErrSuffix(hostProc.remount(toSysroot(r.Target.String()), r.Flags),
fmt.Sprintf("cannot remount %q:", r.Target))
}
func (r *RemountOp) Is(op Op) bool { vr, ok := op.(*RemountOp); return ok && *r == *vr }
func (*RemountOp) prefix() string { return "remounting" }
func (r *RemountOp) String() string { return fmt.Sprintf("%q flags %#x", r.Target, r.Flags) }
func init() { gob.Register(new(BindMountOp)) }
// Bind appends an [Op] that bind mounts host path [BindMountOp.Source] on container path [BindMountOp.Target].
func (f *Ops) Bind(source, target *Absolute, flags int) *Ops {
*f = append(*f, &BindMountOp{nil, source, target, flags})
return f
}
type BindMountOp struct {
sourceFinal, Source, Target *Absolute
Flags int
}
const (
// BindOptional skips nonexistent host paths.
BindOptional = 1 << iota
// BindWritable mounts filesystem read-write.
BindWritable
// BindDevice allows access to devices (special files) on this filesystem.
BindDevice
)
func (b *BindMountOp) early(*Params) error {
if b.Source == nil || b.Target == nil {
return EBADE
}
if pathname, err := filepath.EvalSymlinks(b.Source.String()); err != nil {
if os.IsNotExist(err) && b.Flags&BindOptional != 0 {
// leave sourceFinal as nil
return nil
}
return wrapErrSelf(err)
} else {
b.sourceFinal, err = NewAbs(pathname)
return err
}
}
func (b *BindMountOp) apply(*Params) error {
if b.sourceFinal == nil {
if b.Flags&BindOptional == 0 {
// unreachable
return EBADE
}
return nil
}
source := toHost(b.sourceFinal.String())
target := toSysroot(b.Target.String())
// this perm value emulates bwrap behaviour as it clears bits from 0755 based on
// op->perms which is never set for any bind setup op so always results in 0700
if fi, err := os.Stat(source); err != nil {
return wrapErrSelf(err)
} else if fi.IsDir() {
if err = os.MkdirAll(target, 0700); err != nil {
return wrapErrSelf(err)
}
} else if err = ensureFile(target, 0444, 0700); err != nil {
return err
}
var flags uintptr = MS_REC
if b.Flags&BindWritable == 0 {
flags |= MS_RDONLY
}
if b.Flags&BindDevice == 0 {
flags |= MS_NODEV
}
return hostProc.bindMount(source, target, flags, b.sourceFinal == b.Target)
}
func (b *BindMountOp) Is(op Op) bool { vb, ok := op.(*BindMountOp); return ok && *b == *vb }
func (*BindMountOp) prefix() string { return "mounting" }
func (b *BindMountOp) String() string {
if b.Source == b.Target {
return fmt.Sprintf("%q flags %#x", b.Source, b.Flags)
}
return fmt.Sprintf("%q on %q flags %#x", b.Source, b.Target, b.Flags)
}
func init() { gob.Register(new(MountProcOp)) }
// Proc appends an [Op] that mounts a private instance of proc.
func (f *Ops) Proc(target *Absolute) *Ops {
*f = append(*f, &MountProcOp{target})
return f
}
type MountProcOp struct {
Target *Absolute
}
func (p *MountProcOp) early(*Params) error { return nil }
func (p *MountProcOp) apply(params *Params) error {
if p.Target == nil {
return EBADE
}
target := toSysroot(p.Target.String())
if err := os.MkdirAll(target, params.ParentPerm); err != nil {
return wrapErrSelf(err)
}
return wrapErrSuffix(Mount(SourceProc, target, FstypeProc, MS_NOSUID|MS_NOEXEC|MS_NODEV, zeroString),
fmt.Sprintf("cannot mount proc on %q:", p.Target.String()))
}
func (p *MountProcOp) Is(op Op) bool {
vp, ok := op.(*MountProcOp)
return ok && ((p == nil && vp == nil) || p == vp)
}
func (*MountProcOp) prefix() string { return "mounting" }
func (p *MountProcOp) String() string { return fmt.Sprintf("proc on %q", p.Target) }
func init() { gob.Register(new(MountDevOp)) }
// Dev appends an [Op] that mounts a subset of host /dev.
func (f *Ops) Dev(target *Absolute, mqueue bool) *Ops {
*f = append(*f, &MountDevOp{target, mqueue, false})
return f
}
// DevWritable appends an [Op] that mounts a writable subset of host /dev.
// There is usually no good reason to write to /dev, so this should always be followed by a [RemountOp].
func (f *Ops) DevWritable(target *Absolute, mqueue bool) *Ops {
*f = append(*f, &MountDevOp{target, mqueue, true})
return f
}
type MountDevOp struct {
Target *Absolute
Mqueue bool
Write bool
}
func (d *MountDevOp) early(*Params) error { return nil }
func (d *MountDevOp) apply(params *Params) error {
if d.Target == nil {
return EBADE
}
target := toSysroot(d.Target.String())
if err := mountTmpfs(SourceTmpfsDevtmpfs, target, MS_NOSUID|MS_NODEV, 0, params.ParentPerm); err != nil {
return err
}
for _, name := range []string{"null", "zero", "full", "random", "urandom", "tty"} {
targetPath := path.Join(target, name)
if err := ensureFile(targetPath, 0444, params.ParentPerm); err != nil {
return err
}
if err := hostProc.bindMount(
toHost(FHSDev+name),
targetPath,
0,
true,
); err != nil {
return err
}
}
for i, name := range []string{"stdin", "stdout", "stderr"} {
if err := os.Symlink(
FHSProc+"self/fd/"+string(rune(i+'0')),
path.Join(target, name),
); err != nil {
return wrapErrSelf(err)
}
}
for _, pair := range [][2]string{
{FHSProc + "self/fd", "fd"},
{FHSProc + "kcore", "core"},
{"pts/ptmx", "ptmx"},
} {
if err := os.Symlink(pair[0], path.Join(target, pair[1])); err != nil {
return wrapErrSelf(err)
}
}
devPtsPath := path.Join(target, "pts")
for _, name := range []string{path.Join(target, "shm"), devPtsPath} {
if err := os.Mkdir(name, params.ParentPerm); err != nil {
return wrapErrSelf(err)
}
}
if err := Mount(SourceDevpts, devPtsPath, FstypeDevpts, MS_NOSUID|MS_NOEXEC,
"newinstance,ptmxmode=0666,mode=620"); err != nil {
return wrapErrSuffix(err,
fmt.Sprintf("cannot mount devpts on %q:", devPtsPath))
}
if params.RetainSession {
var buf [8]byte
if _, _, errno := Syscall(SYS_IOCTL, 1, TIOCGWINSZ, uintptr(unsafe.Pointer(&buf[0]))); errno == 0 {
consolePath := path.Join(target, "console")
if err := ensureFile(consolePath, 0444, params.ParentPerm); err != nil {
return err
}
if name, err := os.Readlink(hostProc.stdout()); err != nil {
return wrapErrSelf(err)
} else if err = hostProc.bindMount(
toHost(name),
consolePath,
0,
false,
); err != nil {
return err
}
}
}
if d.Mqueue {
mqueueTarget := path.Join(target, "mqueue")
if err := os.Mkdir(mqueueTarget, params.ParentPerm); err != nil {
return wrapErrSelf(err)
}
if err := Mount(SourceMqueue, mqueueTarget, FstypeMqueue, MS_NOSUID|MS_NOEXEC|MS_NODEV, zeroString); err != nil {
return wrapErrSuffix(err, "cannot mount mqueue:")
}
}
if d.Write {
return nil
}
return wrapErrSuffix(hostProc.remount(target, MS_RDONLY),
fmt.Sprintf("cannot remount %q:", target))
}
func (d *MountDevOp) Is(op Op) bool { vd, ok := op.(*MountDevOp); return ok && *d == *vd }
func (*MountDevOp) prefix() string { return "mounting" }
func (d *MountDevOp) String() string {
if d.Mqueue {
return fmt.Sprintf("dev on %q with mqueue", d.Target)
}
return fmt.Sprintf("dev on %q", d.Target)
}
func init() { gob.Register(new(MountTmpfsOp)) }
// Tmpfs appends an [Op] that mounts tmpfs on container path [MountTmpfsOp.Path].
func (f *Ops) Tmpfs(target *Absolute, size int, perm os.FileMode) *Ops {
*f = append(*f, &MountTmpfsOp{SourceTmpfsEphemeral, target, MS_NOSUID | MS_NODEV, size, perm})
return f
}
// Readonly appends an [Op] that mounts read-only tmpfs on container path [MountTmpfsOp.Path].
func (f *Ops) Readonly(target *Absolute, perm os.FileMode) *Ops {
*f = append(*f, &MountTmpfsOp{SourceTmpfsReadonly, target, MS_RDONLY | MS_NOSUID | MS_NODEV, 0, perm})
return f
}
type MountTmpfsOp struct {
FSName string
Path *Absolute
Flags uintptr
Size int
Perm os.FileMode
}
func (t *MountTmpfsOp) early(*Params) error { return nil }
func (t *MountTmpfsOp) apply(*Params) error {
if t.Path == nil {
return EBADE
}
if t.Size < 0 || t.Size > math.MaxUint>>1 {
return msg.WrapErr(EBADE, fmt.Sprintf("size %d out of bounds", t.Size))
}
return mountTmpfs(t.FSName, toSysroot(t.Path.String()), t.Flags, t.Size, t.Perm)
}
func (t *MountTmpfsOp) Is(op Op) bool { vt, ok := op.(*MountTmpfsOp); return ok && *t == *vt }
func (*MountTmpfsOp) prefix() string { return "mounting" }
func (t *MountTmpfsOp) String() string { return fmt.Sprintf("tmpfs on %q size %d", t.Path, t.Size) }
func init() { gob.Register(new(MountOverlayOp)) }
// Overlay appends an [Op] that mounts the overlay pseudo filesystem on [MountOverlayOp.Target].
func (f *Ops) Overlay(target, state, work *Absolute, layers ...*Absolute) *Ops {
*f = append(*f, &MountOverlayOp{
Target: target,
Lower: layers,
Upper: state,
Work: work,
})
return f
}
// OverlayEphemeral appends an [Op] that mounts the overlay pseudo filesystem on [MountOverlayOp.Target]
// with an ephemeral upperdir and workdir.
func (f *Ops) OverlayEphemeral(target *Absolute, layers ...*Absolute) *Ops {
return f.Overlay(target, AbsFHSRoot, nil, layers...)
}
// OverlayReadonly appends an [Op] that mounts the overlay pseudo filesystem readonly on [MountOverlayOp.Target]
func (f *Ops) OverlayReadonly(target *Absolute, layers ...*Absolute) *Ops {
return f.Overlay(target, nil, nil, layers...)
}
type MountOverlayOp struct {
Target *Absolute
// Any filesystem, does not need to be on a writable filesystem.
Lower []*Absolute
// formatted for [OptionOverlayLowerdir], resolved, prefixed and escaped during early
lower []string
// The upperdir is normally on a writable filesystem.
//
// If Work is nil and Upper holds the special value [FHSRoot],
// an ephemeral upperdir and workdir will be set up.
//
// If both Work and Upper are empty strings, upperdir and workdir is omitted and the overlay is mounted readonly.
Upper *Absolute
// formatted for [OptionOverlayUpperdir], resolved, prefixed and escaped during early
upper string
// The workdir needs to be an empty directory on the same filesystem as upperdir.
Work *Absolute
// formatted for [OptionOverlayWorkdir], resolved, prefixed and escaped during early
work string
ephemeral bool
}
func (o *MountOverlayOp) early(*Params) error {
if o.Work == nil && o.Upper != nil {
switch o.Upper.String() {
case FHSRoot: // ephemeral
o.ephemeral = true // intermediate root not yet available
default:
return msg.WrapErr(EINVAL, fmt.Sprintf("upperdir has unexpected value %q", o.Upper))
}
}
// readonly handled in apply
if !o.ephemeral {
if o.Upper != o.Work && (o.Upper == nil || o.Work == nil) {
// unreachable
return msg.WrapErr(ENOTRECOVERABLE, "impossible overlay state reached")
}
if o.Upper != nil {
if v, err := filepath.EvalSymlinks(o.Upper.String()); err != nil {
return wrapErrSelf(err)
} else {
o.upper = escapeOverlayDataSegment(toHost(v))
}
}
if o.Work != nil {
if v, err := filepath.EvalSymlinks(o.Work.String()); err != nil {
return wrapErrSelf(err)
} else {
o.work = escapeOverlayDataSegment(toHost(v))
}
}
}
o.lower = make([]string, len(o.Lower))
for i, a := range o.Lower {
if a == nil {
return EBADE
}
if v, err := filepath.EvalSymlinks(a.String()); err != nil {
return wrapErrSelf(err)
} else {
o.lower[i] = escapeOverlayDataSegment(toHost(v))
}
}
return nil
}
func (o *MountOverlayOp) apply(params *Params) error {
if o.Target == nil {
return EBADE
}
target := toSysroot(o.Target.String())
if err := os.MkdirAll(target, params.ParentPerm); err != nil {
return wrapErrSelf(err)
}
if o.ephemeral {
var err error
// these directories are created internally, therefore early (absolute, symlink, prefix, escape) is bypassed
if o.upper, err = os.MkdirTemp(FHSRoot, intermediatePatternOverlayUpper); err != nil {
return wrapErrSelf(err)
}
if o.work, err = os.MkdirTemp(FHSRoot, intermediatePatternOverlayWork); err != nil {
return wrapErrSelf(err)
}
}
options := make([]string, 0, 4)
if o.upper == zeroString && o.work == zeroString { // readonly
if len(o.Lower) < 2 {
return msg.WrapErr(EINVAL, "readonly overlay requires at least two lowerdir")
}
// "upperdir=" and "workdir=" may be omitted. In that case the overlay will be read-only
} else {
if len(o.Lower) == 0 {
return msg.WrapErr(EINVAL, "overlay requires at least one lowerdir")
}
options = append(options,
OptionOverlayUpperdir+"="+o.upper,
OptionOverlayWorkdir+"="+o.work)
}
options = append(options,
OptionOverlayLowerdir+"="+strings.Join(o.lower, SpecialOverlayPath),
OptionOverlayUserxattr)
return wrapErrSuffix(Mount(SourceOverlay, target, FstypeOverlay, 0, strings.Join(options, SpecialOverlayOption)),
fmt.Sprintf("cannot mount overlay on %q:", o.Target))
}
func (o *MountOverlayOp) Is(op Op) bool {
vo, ok := op.(*MountOverlayOp)
return ok &&
o.Target == vo.Target &&
slices.Equal(o.Lower, vo.Lower) &&
o.Upper == vo.Upper &&
o.Work == vo.Work
}
func (*MountOverlayOp) prefix() string { return "mounting" }
func (o *MountOverlayOp) String() string {
return fmt.Sprintf("overlay on %q with %d layers", o.Target, len(o.Lower))
}
func init() { gob.Register(new(SymlinkOp)) }
// Link appends an [Op] that creates a symlink in the container filesystem.
func (f *Ops) Link(target *Absolute, linkName string, dereference bool) *Ops {
*f = append(*f, &SymlinkOp{target, linkName, dereference})
return f
}
type SymlinkOp struct {
Target *Absolute
// LinkName is an arbitrary uninterpreted pathname.
LinkName string
// Dereference causes LinkName to be dereferenced during early.
Dereference bool
}
func (l *SymlinkOp) early(*Params) error {
if l.Dereference {
if !isAbs(l.LinkName) {
return msg.WrapErr(EBADE, fmt.Sprintf("path %q is not absolute", l.LinkName))
}
if name, err := os.Readlink(l.LinkName); err != nil {
return wrapErrSelf(err)
} else {
l.LinkName = name
}
}
return nil
}
func (l *SymlinkOp) apply(params *Params) error {
if l.Target == nil {
return EBADE
}
target := toSysroot(l.Target.String())
if err := os.MkdirAll(path.Dir(target), params.ParentPerm); err != nil {
return wrapErrSelf(err)
}
if err := os.Symlink(l.LinkName, target); err != nil {
return wrapErrSelf(err)
}
return nil
}
func (l *SymlinkOp) Is(op Op) bool { vl, ok := op.(*SymlinkOp); return ok && *l == *vl }
func (*SymlinkOp) prefix() string { return "creating" }
func (l *SymlinkOp) String() string {
return fmt.Sprintf("symlink on %q linkname %q", l.Target, l.LinkName)
}
func init() { gob.Register(new(MkdirOp)) }
// Mkdir appends an [Op] that creates a directory in the container filesystem.
func (f *Ops) Mkdir(name *Absolute, perm os.FileMode) *Ops {
*f = append(*f, &MkdirOp{name, perm})
return f
}
type MkdirOp struct {
Path *Absolute
Perm os.FileMode
}
func (m *MkdirOp) early(*Params) error { return nil }
func (m *MkdirOp) apply(*Params) error {
if m.Path == nil {
return EBADE
}
return wrapErrSelf(os.MkdirAll(toSysroot(m.Path.String()), m.Perm))
}
func (m *MkdirOp) Is(op Op) bool { vm, ok := op.(*MkdirOp); return ok && m == vm }
func (*MkdirOp) prefix() string { return "creating" }
func (m *MkdirOp) String() string { return fmt.Sprintf("directory %q perm %s", m.Path, m.Perm) }
func init() { gob.Register(new(TmpfileOp)) }
// Place appends an [Op] that places a file in container path [TmpfileOp.Path] containing [TmpfileOp.Data].
func (f *Ops) Place(name *Absolute, data []byte) *Ops {
*f = append(*f, &TmpfileOp{name, data})
return f
}
// PlaceP is like Place but writes the address of [TmpfileOp.Data] to the pointer dataP points to.
func (f *Ops) PlaceP(name *Absolute, dataP **[]byte) *Ops {
t := &TmpfileOp{Path: name}
*dataP = &t.Data
*f = append(*f, t)
return f
}
type TmpfileOp struct {
Path *Absolute
Data []byte
}
func (t *TmpfileOp) early(*Params) error { return nil }
func (t *TmpfileOp) apply(params *Params) error {
if t.Path == nil {
return EBADE
}
var tmpPath string
if f, err := os.CreateTemp(FHSRoot, intermediatePatternTmpfile); err != nil {
return wrapErrSelf(err)
} else if _, err = f.Write(t.Data); err != nil {
return wrapErrSuffix(err,
"cannot write to intermediate file:")
} else if err = f.Close(); err != nil {
return wrapErrSuffix(err,
"cannot close intermediate file:")
} else {
tmpPath = f.Name()
}
target := toSysroot(t.Path.String())
if err := ensureFile(target, 0444, params.ParentPerm); err != nil {
return err
} else if err = hostProc.bindMount(
tmpPath,
target,
MS_RDONLY|MS_NODEV,
false,
); err != nil {
return err
} else if err = os.Remove(tmpPath); err != nil {
return wrapErrSelf(err)
}
return nil
}
func (t *TmpfileOp) Is(op Op) bool {
vt, ok := op.(*TmpfileOp)
return ok && t.Path == vt.Path && slices.Equal(t.Data, vt.Data)
}
func (*TmpfileOp) prefix() string { return "placing" }
func (t *TmpfileOp) String() string {
return fmt.Sprintf("tmpfile %q (%d bytes)", t.Path, len(t.Data))
}