From cd9b534d6b1d432d3c997ccc5b14f630afb9dda6 Mon Sep 17 00:00:00 2001 From: Ophestra Date: Sat, 28 Feb 2026 20:18:30 +0900 Subject: [PATCH] container: improve documentation This change removes inconsistencies collected over time in this package. Signed-off-by: Ophestra --- container/autoetc.go | 6 +- container/autoroot.go | 6 +- container/capability.go | 12 +++- container/check/overlay.go | 3 +- container/container.go | 104 +++++++++++++++++++++++--------- container/dispatcher.go | 3 +- container/errors.go | 3 +- container/fhs/fhs.go | 21 ++++--- container/init.go | 32 +++++----- container/initbind.go | 8 ++- container/initdaemon.go | 3 +- container/initdev.go | 4 +- container/initmkdir.go | 2 +- container/initoverlay.go | 35 +++++++---- container/initplace.go | 2 +- container/initproc.go | 2 +- container/initremount.go | 2 +- container/landlock.go | 24 ++++++-- container/path.go | 5 +- container/seccomp/libseccomp.go | 18 +++--- container/vfs/mangle.go | 2 + container/vfs/mountinfo.go | 20 ++++-- container/vfs/unfold.go | 2 + 23 files changed, 222 insertions(+), 97 deletions(-) diff --git a/container/autoetc.go b/container/autoetc.go index 3fd9727..a2be68b 100644 --- a/container/autoetc.go +++ b/container/autoetc.go @@ -10,8 +10,7 @@ import ( func init() { gob.Register(new(AutoEtcOp)) } -// Etc appends an [Op] that expands host /etc into a toplevel symlink mirror with /etc semantics. -// This is not a generic setup op. It is implemented here to reduce ipc overhead. +// Etc is a helper for appending [AutoEtcOp] to [Ops]. func (f *Ops) Etc(host *check.Absolute, prefix string) *Ops { e := &AutoEtcOp{prefix} f.Mkdir(fhs.AbsEtc, 0755) @@ -20,6 +19,9 @@ func (f *Ops) Etc(host *check.Absolute, prefix string) *Ops { return f } +// AutoEtcOp expands host /etc into a toplevel symlink mirror with /etc semantics. +// +// This is not a generic setup op. It is implemented here to reduce ipc overhead. 
type AutoEtcOp struct{ Prefix string } func (e *AutoEtcOp) Valid() bool { return e != nil } diff --git a/container/autoroot.go b/container/autoroot.go index 09e059d..82ca9e2 100644 --- a/container/autoroot.go +++ b/container/autoroot.go @@ -11,13 +11,15 @@ import ( func init() { gob.Register(new(AutoRootOp)) } -// Root appends an [Op] that expands a directory into a toplevel bind mount mirror on container root. -// This is not a generic setup op. It is implemented here to reduce ipc overhead. +// Root is a helper for appending [AutoRootOp] to [Ops]. func (f *Ops) Root(host *check.Absolute, flags int) *Ops { *f = append(*f, &AutoRootOp{host, flags, nil}) return f } +// AutoRootOp expands a directory into a toplevel bind mount mirror on container root. +// +// This is not a generic setup op. It is implemented here to reduce ipc overhead. type AutoRootOp struct { Host *check.Absolute // passed through to bindMount diff --git a/container/capability.go b/container/capability.go index 9f526e6..3ea9d02 100644 --- a/container/capability.go +++ b/container/capability.go @@ -50,10 +50,16 @@ func capset(hdrp *capHeader, datap *[2]capData) error { } // capBoundingSetDrop drops a capability from the calling thread's capability bounding set. -func capBoundingSetDrop(cap uintptr) error { return Prctl(syscall.PR_CAPBSET_DROP, cap, 0) } +func capBoundingSetDrop(cap uintptr) error { + return Prctl(syscall.PR_CAPBSET_DROP, cap, 0) +} // capAmbientClearAll clears the ambient capability set of the calling thread. -func capAmbientClearAll() error { return Prctl(PR_CAP_AMBIENT, PR_CAP_AMBIENT_CLEAR_ALL, 0) } +func capAmbientClearAll() error { + return Prctl(PR_CAP_AMBIENT, PR_CAP_AMBIENT_CLEAR_ALL, 0) +} // capAmbientRaise adds to the ambient capability set of the calling thread. 
-func capAmbientRaise(cap uintptr) error { return Prctl(PR_CAP_AMBIENT, PR_CAP_AMBIENT_RAISE, cap) } +func capAmbientRaise(cap uintptr) error { + return Prctl(PR_CAP_AMBIENT, PR_CAP_AMBIENT_RAISE, cap) +} diff --git a/container/check/overlay.go b/container/check/overlay.go index ab907fb..fbbcd1e 100644 --- a/container/check/overlay.go +++ b/container/check/overlay.go @@ -11,7 +11,8 @@ const ( SpecialOverlayPath = ":" ) -// EscapeOverlayDataSegment escapes a string for formatting into the data argument of an overlay mount call. +// EscapeOverlayDataSegment escapes a string for formatting into the data +// argument of an overlay mount system call. func EscapeOverlayDataSegment(s string) string { if s == "" { return "" diff --git a/container/container.go b/container/container.go index 2a6363a..a645771 100644 --- a/container/container.go +++ b/container/container.go @@ -1,4 +1,5 @@ -// Package container implements unprivileged Linux containers with built-in support for syscall filtering. +// Package container implements unprivileged Linux containers with built-in +// support for syscall filtering. package container import ( @@ -42,22 +43,25 @@ type ( SchedPolicy int // Cgroup fd, nil to disable. Cgroup *int - // ExtraFiles passed through to initial process in the container, - // with behaviour identical to its [exec.Cmd] counterpart. + // ExtraFiles passed through to initial process in the container, with + // behaviour identical to its [exec.Cmd] counterpart. ExtraFiles []*os.File - // param pipe for shim and init + // Write end of a pipe connected to the init to deliver [Params]. setup *os.File - // cancels cmd + // Cancels the context passed to the underlying cmd. cancel context.CancelFunc - // closed after Wait returns + // Closed after Wait returns. Keeps the spawning thread alive. wait chan struct{} Stdin io.Reader Stdout io.Writer Stderr io.Writer - Cancel func(cmd *exec.Cmd) error + // Custom cancellation behaviour for the underlying [exec.Cmd]. 
Must + // deliver [CancelSignal] before returning. + Cancel func(cmd *exec.Cmd) error + // Copied to the underlying [exec.Cmd]. WaitDelay time.Duration cmd *exec.Cmd @@ -286,7 +290,11 @@ func (p *Container) Start() error { // place setup pipe before user supplied extra files, this is later restored by init if fd, f, err := Setup(&p.cmd.ExtraFiles); err != nil { - return &StartError{true, "set up params stream", err, false, false} + return &StartError{ + Fatal: true, + Step: "set up params stream", + Err: err, + } } else { p.setup = f p.cmd.Env = []string{setupEnv + "=" + strconv.Itoa(fd)} @@ -298,10 +306,16 @@ func (p *Container) Start() error { runtime.LockOSThread() p.wait = make(chan struct{}) - done <- func() error { // setup depending on per-thread state must happen here - // PR_SET_NO_NEW_PRIVS: depends on per-thread state but acts on all processes created from that thread + // setup depending on per-thread state must happen here + done <- func() error { + // PR_SET_NO_NEW_PRIVS: thread-directed but acts on all processes + // created from the calling thread if err := SetNoNewPrivs(); err != nil { - return &StartError{true, "prctl(PR_SET_NO_NEW_PRIVS)", err, false, false} + return &StartError{ + Fatal: true, + Step: "prctl(PR_SET_NO_NEW_PRIVS)", + Err: err, + } } // landlock: depends on per-thread state but acts on a process group @@ -313,28 +327,40 @@ func (p *Container) Start() error { if abi, err := LandlockGetABI(); err != nil { if p.HostAbstract { - // landlock can be skipped here as it restricts access to resources - // already covered by namespaces (pid) + // landlock can be skipped here as it restricts access + // to resources already covered by namespaces (pid) goto landlockOut } - return &StartError{false, "get landlock ABI", err, false, false} + return &StartError{Step: "get landlock ABI", Err: err} } else if abi < 6 { if p.HostAbstract { // see above comment goto landlockOut } - return &StartError{false, "kernel version too old for 
LANDLOCK_SCOPE_ABSTRACT_UNIX_SOCKET", ENOSYS, true, false} + return &StartError{ + Step: "kernel too old for LANDLOCK_SCOPE_ABSTRACT_UNIX_SOCKET", + Err: ENOSYS, + Origin: true, + } } else { p.msg.Verbosef("landlock abi version %d", abi) } if rulesetFd, err := rulesetAttr.Create(0); err != nil { - return &StartError{true, "create landlock ruleset", err, false, false} + return &StartError{ + Fatal: true, + Step: "create landlock ruleset", + Err: err, + } } else { p.msg.Verbosef("enforcing landlock ruleset %s", rulesetAttr) if err = LandlockRestrictSelf(rulesetFd, 0); err != nil { _ = Close(rulesetFd) - return &StartError{true, "enforce landlock ruleset", err, false, false} + return &StartError{ + Fatal: true, + Step: "enforce landlock ruleset", + Err: err, + } } if err = Close(rulesetFd); err != nil { p.msg.Verbosef("cannot close landlock ruleset: %v", err) @@ -346,7 +372,7 @@ func (p *Container) Start() error { } // sched_setscheduler: thread-directed but acts on all processes - // created from that thread + // created from the calling thread if p.SchedPolicy > 0 { p.msg.Verbosef("setting scheduling policy %d", p.SchedPolicy) if err := schedSetscheduler( @@ -364,7 +390,11 @@ func (p *Container) Start() error { p.msg.Verbose("starting container init") if err := p.cmd.Start(); err != nil { - return &StartError{false, "start container init", err, false, true} + return &StartError{ + Step: "start container init", + Err: err, + Passthrough: true, + } } return nil }() @@ -376,6 +406,7 @@ func (p *Container) Start() error { } // Serve serves [Container.Params] to the container init. +// // Serve must only be called once. 
func (p *Container) Serve() error { if p.setup == nil { @@ -385,12 +416,21 @@ func (p *Container) Serve() error { setup := p.setup p.setup = nil if err := setup.SetDeadline(time.Now().Add(initSetupTimeout)); err != nil { - return &StartError{true, "set init pipe deadline", err, false, true} + return &StartError{ + Fatal: true, + Step: "set init pipe deadline", + Err: err, + Passthrough: true, + } } if p.Path == nil { p.cancel() - return &StartError{false, "invalid executable pathname", EINVAL, true, false} + return &StartError{ + Step: "invalid executable pathname", + Err: EINVAL, + Origin: true, + } } // do not transmit nil @@ -415,7 +455,8 @@ func (p *Container) Serve() error { return err } -// Wait waits for the container init process to exit and releases any resources associated with the [Container]. +// Wait blocks until the container init process exits and releases any +// resources associated with the [Container]. func (p *Container) Wait() error { if p.cmd == nil || p.cmd.Process == nil { return EINVAL @@ -460,11 +501,13 @@ func (p *Container) StderrPipe() (r io.ReadCloser, err error) { } func (p *Container) String() string { - return fmt.Sprintf("argv: %q, filter: %v, rules: %d, flags: %#x, presets: %#x", - p.Args, !p.SeccompDisable, len(p.SeccompRules), int(p.SeccompFlags), int(p.SeccompPresets)) + return fmt.Sprintf( + "argv: %q, filter: %v, rules: %d, flags: %#x, presets: %#x", + p.Args, !p.SeccompDisable, len(p.SeccompRules), int(p.SeccompFlags), int(p.SeccompPresets), + ) } -// ProcessState returns the address to os.ProcessState held by the underlying [exec.Cmd]. +// ProcessState returns the address of os.ProcessState held by the underlying [exec.Cmd]. func (p *Container) ProcessState() *os.ProcessState { if p.cmd == nil { return nil @@ -472,7 +515,8 @@ func (p *Container) ProcessState() *os.ProcessState { return p.cmd.ProcessState } -// New returns the address to a new instance of [Container] that requires further initialisation before use. 
+// New returns the address of a new instance of [Container]. This value requires +// further initialisation before use. func New(ctx context.Context, msg message.Msg) *Container { if msg == nil { msg = message.New(nil) @@ -486,7 +530,13 @@ func New(ctx context.Context, msg message.Msg) *Container { } // NewCommand calls [New] and initialises the [Params.Path] and [Params.Args] fields. -func NewCommand(ctx context.Context, msg message.Msg, pathname *check.Absolute, name string, args ...string) *Container { +func NewCommand( + ctx context.Context, + msg message.Msg, + pathname *check.Absolute, + name string, + args ...string, +) *Container { z := New(ctx, msg) z.Path = pathname z.Args = append([]string{name}, args...) diff --git a/container/dispatcher.go b/container/dispatcher.go index 7231d56..23786b5 100644 --- a/container/dispatcher.go +++ b/container/dispatcher.go @@ -21,7 +21,8 @@ type osFile interface { fs.File } -// syscallDispatcher provides methods that make state-dependent system calls as part of their behaviour. +// syscallDispatcher provides methods that make state-dependent system calls as +// part of their behaviour. type syscallDispatcher interface { // new starts a goroutine with a new instance of syscallDispatcher. // A syscallDispatcher must never be used in any goroutine other than the one owning it, diff --git a/container/errors.go b/container/errors.go index d7651aa..150dcd1 100644 --- a/container/errors.go +++ b/container/errors.go @@ -43,7 +43,8 @@ func messageFromError(err error) (m string, ok bool) { } // messagePrefix checks and prefixes the error message of a non-pointer error. -// While this is usable for pointer errors, such use should be avoided as nil check is omitted. +// While this is usable for pointer errors, such use should be avoided as nil +// check is omitted. 
func messagePrefix[T error](prefix string, err error) (string, bool) { var targetError T if errors.As(err, &targetError) { diff --git a/container/fhs/fhs.go b/container/fhs/fhs.go index 7e0d36c..e0c1d86 100644 --- a/container/fhs/fhs.go +++ b/container/fhs/fhs.go @@ -9,7 +9,8 @@ const ( // Tmp points to the place for small temporary files. Tmp = "/tmp/" - // Run points to a "tmpfs" file system for system packages to place runtime data, socket files, and similar. + // Run points to a "tmpfs" file system for system packages to place runtime + // data, socket files, and similar. Run = "/run/" // RunUser points to a directory containing per-user runtime directories, // each usually individually mounted "tmpfs" instances. @@ -17,10 +18,12 @@ const ( // Usr points to vendor-supplied operating system resources. Usr = "/usr/" - // UsrBin points to binaries and executables for user commands that shall appear in the $PATH search path. + // UsrBin points to binaries and executables for user commands that shall + // appear in the $PATH search path. UsrBin = Usr + "bin/" - // Var points to persistent, variable system data. Writable during normal system operation. + // Var points to persistent, variable system data. Writable during normal + // system operation. Var = "/var/" // VarLib points to persistent system data. VarLib = Var + "lib/" @@ -29,12 +32,16 @@ const ( // Dev points to the root directory for device nodes. Dev = "/dev/" - // DevShm is the place for POSIX shared memory segments, as created via shm_open(3). + // DevShm is the place for POSIX shared memory segments, as created via + // shm_open(3). DevShm = "/dev/shm/" - // Proc points to a virtual kernel file system exposing the process list and other functionality. + // Proc points to a virtual kernel file system exposing the process list and + // other functionality. Proc = "/proc/" - // ProcSys points to a hierarchy below /proc/ that exposes a number of kernel tunables. 
+ // ProcSys points to a hierarchy below /proc/ that exposes a number of + // kernel tunables. ProcSys = Proc + "sys/" - // Sys points to a virtual kernel file system exposing discovered devices and other functionality. + // Sys points to a virtual kernel file system exposing discovered devices + // and other functionality. Sys = "/sys/" ) diff --git a/container/init.go b/container/init.go index 5516c42..8fbf757 100644 --- a/container/init.go +++ b/container/init.go @@ -33,12 +33,12 @@ const ( - This path is only accessible by init and root: The container init sets SUID_DUMP_DISABLE and terminates if that fails. - It should be noted that none of this should become relevant at any point since the resulting - intermediate root tmpfs should be effectively anonymous. */ + It should be noted that none of this should become relevant at any point + since the resulting intermediate root tmpfs should be effectively anonymous. */ intermediateHostPath = fhs.Proc + "self/fd" - // setupEnv is the name of the environment variable holding the string representation of - // the read end file descriptor of the setup params pipe. + // setupEnv is the name of the environment variable holding the string + // representation of the read end file descriptor of the setup params pipe. setupEnv = "HAKUREI_SETUP" // exitUnexpectedWait4 is the exit code if wait4 returns an unexpected errno. @@ -59,7 +59,8 @@ type ( // late is called right before starting the initial process. late(state *setupState, k syscallDispatcher) error - // prefix returns a log message prefix, and whether this Op prints no identifying message on its own. + // prefix returns a log message prefix, and whether this Op prints no + // identifying message on its own. prefix() (string, bool) Is(op Op) bool @@ -71,9 +72,11 @@ type ( setupState struct { nonrepeatable uintptr - // Whether early reaping has concluded. Must only be accessed in the wait4 loop. + // Whether early reaping has concluded. 
Must only be accessed in the + // wait4 loop. processConcluded bool - // Process to syscall.WaitStatus populated in the wait4 loop. Freed after early reaping concludes. + // Process to syscall.WaitStatus populated in the wait4 loop. Freed + // after early reaping concludes. process map[int]WaitStatus // Synchronises access to process. processMu sync.RWMutex @@ -216,9 +219,10 @@ func initEntrypoint(k syscallDispatcher, msg message.Msg) { defer cancel() /* early is called right before pivot_root into intermediate root; - this step is mostly for gathering information that would otherwise be difficult to obtain - via library functions after pivot_root, and implementations are expected to avoid changing - the state of the mount namespace */ + this step is mostly for gathering information that would otherwise be + difficult to obtain via library functions after pivot_root, and + implementations are expected to avoid changing the state of the mount + namespace */ for i, op := range *params.Ops { if op == nil || !op.Valid() { k.fatalf(msg, "invalid op at index %d", i) @@ -258,10 +262,10 @@ func initEntrypoint(k syscallDispatcher, msg message.Msg) { k.fatalf(msg, "cannot enter intermediate root: %v", err) } - /* apply is called right after pivot_root and entering the new root; - this step sets up the container filesystem, and implementations are expected to keep the host root - and sysroot mount points intact but otherwise can do whatever they need to; - chdir is allowed but discouraged */ + /* apply is called right after pivot_root and entering the new root. This + step sets up the container filesystem, and implementations are expected to + keep the host root and sysroot mount points intact but otherwise can do + whatever they need to. Calling chdir is allowed but discouraged. 
*/ for i, op := range *params.Ops { // ops already checked during early setup if prefix, ok := op.prefix(); ok { diff --git a/container/initbind.go b/container/initbind.go index f91d508..7f240a3 100644 --- a/container/initbind.go +++ b/container/initbind.go @@ -12,14 +12,16 @@ import ( func init() { gob.Register(new(BindMountOp)) } -// Bind appends an [Op] that bind mounts host path [BindMountOp.Source] on container path [BindMountOp.Target]. +// Bind is a helper for appending [BindMountOp] to [Ops]. func (f *Ops) Bind(source, target *check.Absolute, flags int) *Ops { *f = append(*f, &BindMountOp{nil, source, target, flags}) return f } -// BindMountOp bind mounts host path Source on container path Target. -// Note that Flags uses bits declared in this package and should not be set with constants in [syscall]. +// BindMountOp creates a bind mount from host path Source to container path Target. +// +// Note that Flags uses bits declared in the [std] package and should not be set +// with constants in [syscall]. type BindMountOp struct { sourceFinal, Source, Target *check.Absolute diff --git a/container/initdaemon.go b/container/initdaemon.go index 0e631b2..1e58e2a 100644 --- a/container/initdaemon.go +++ b/container/initdaemon.go @@ -24,8 +24,7 @@ const ( daemonTimeout = 5 * time.Second ) -// Daemon appends an [Op] that starts a daemon in the container and blocks until -// [DaemonOp.Target] appears. +// Daemon is a helper for appending [DaemonOp] to [Ops]. func (f *Ops) Daemon(target, path *check.Absolute, args ...string) *Ops { *f = append(*f, &DaemonOp{target, path, args}) return f diff --git a/container/initdev.go b/container/initdev.go index 50df680..1ce722c 100644 --- a/container/initdev.go +++ b/container/initdev.go @@ -19,7 +19,9 @@ func (f *Ops) Dev(target *check.Absolute, mqueue bool) *Ops { } // DevWritable appends an [Op] that mounts a writable subset of host /dev. 
-// There is usually no good reason to write to /dev, so this should always be followed by a [RemountOp]. +// +// There is usually no good reason to write to /dev, so this should always be +// followed by a [RemountOp]. func (f *Ops) DevWritable(target *check.Absolute, mqueue bool) *Ops { *f = append(*f, &MountDevOp{target, mqueue, true}) return f diff --git a/container/initmkdir.go b/container/initmkdir.go index d3c4bbb..d43c8b8 100644 --- a/container/initmkdir.go +++ b/container/initmkdir.go @@ -10,7 +10,7 @@ import ( func init() { gob.Register(new(MkdirOp)) } -// Mkdir appends an [Op] that creates a directory in the container filesystem. +// Mkdir is a helper for appending [MkdirOp] to [Ops]. func (f *Ops) Mkdir(name *check.Absolute, perm os.FileMode) *Ops { *f = append(*f, &MkdirOp{name, perm}) return f diff --git a/container/initoverlay.go b/container/initoverlay.go index 0c31f8f..2fec636 100644 --- a/container/initoverlay.go +++ b/container/initoverlay.go @@ -54,8 +54,11 @@ func (e *OverlayArgumentError) Error() string { } } -// Overlay appends an [Op] that mounts the overlay pseudo filesystem on [MountOverlayOp.Target]. -func (f *Ops) Overlay(target, state, work *check.Absolute, layers ...*check.Absolute) *Ops { +// Overlay is a helper for appending [MountOverlayOp] to [Ops]. +func (f *Ops) Overlay( + target, state, work *check.Absolute, + layers ...*check.Absolute, +) *Ops { *f = append(*f, &MountOverlayOp{ Target: target, Lower: layers, @@ -65,13 +68,12 @@ func (f *Ops) Overlay(target, state, work *check.Absolute, layers ...*check.Abso return f } -// OverlayEphemeral appends an [Op] that mounts the overlay pseudo filesystem on [MountOverlayOp.Target] -// with an ephemeral upperdir and workdir. +// OverlayEphemeral appends a [MountOverlayOp] with an ephemeral upperdir and workdir. func (f *Ops) OverlayEphemeral(target *check.Absolute, layers ...*check.Absolute) *Ops { return f.Overlay(target, fhs.AbsRoot, nil, layers...) 
} -// OverlayReadonly appends an [Op] that mounts the overlay pseudo filesystem readonly on [MountOverlayOp.Target] +// OverlayReadonly appends a readonly [MountOverlayOp]. func (f *Ops) OverlayReadonly(target *check.Absolute, layers ...*check.Absolute) *Ops { return f.Overlay(target, nil, nil, layers...) } @@ -82,25 +84,34 @@ type MountOverlayOp struct { // Any filesystem, does not need to be on a writable filesystem. Lower []*check.Absolute - // formatted for [OptionOverlayLowerdir], resolved, prefixed and escaped during early + // Formatted for [OptionOverlayLowerdir]. + // + // Resolved, prefixed and escaped during early. lower []string + // The upperdir is normally on a writable filesystem. // - // If Work is nil and Upper holds the special value [fhs.AbsRoot], - // an ephemeral upperdir and workdir will be set up. + // If Work is nil and Upper holds the special value [fhs.AbsRoot], an + // ephemeral upperdir and workdir will be set up. // - // If both Work and Upper are nil, upperdir and workdir is omitted and the overlay is mounted readonly. + // If both Work and Upper are nil, upperdir and workdir is omitted and the + // overlay is mounted readonly. Upper *check.Absolute - // formatted for [OptionOverlayUpperdir], resolved, prefixed and escaped during early + // Formatted for [OptionOverlayUpperdir]. + // + // Resolved, prefixed and escaped during early. upper string + // The workdir needs to be an empty directory on the same filesystem as upperdir. Work *check.Absolute - // formatted for [OptionOverlayWorkdir], resolved, prefixed and escaped during early + // Formatted for [OptionOverlayWorkdir]. + // + // Resolved, prefixed and escaped during early. work string ephemeral bool - // used internally for mounting to the intermediate root + // Used internally for mounting to the intermediate root. 
noPrefix bool } diff --git a/container/initplace.go b/container/initplace.go index 9974c58..d0567ff 100644 --- a/container/initplace.go +++ b/container/initplace.go @@ -16,7 +16,7 @@ const ( func init() { gob.Register(new(TmpfileOp)) } -// Place appends an [Op] that places a file in container path [TmpfileOp.Path] containing [TmpfileOp.Data]. +// Place is a helper for appending [TmpfileOp] to [Ops]. func (f *Ops) Place(name *check.Absolute, data []byte) *Ops { *f = append(*f, &TmpfileOp{name, data}) return f diff --git a/container/initproc.go b/container/initproc.go index 3ecb270..42b519d 100644 --- a/container/initproc.go +++ b/container/initproc.go @@ -10,7 +10,7 @@ import ( func init() { gob.Register(new(MountProcOp)) } -// Proc appends an [Op] that mounts a private instance of proc. +// Proc is a helper for appending [MountProcOp] to [Ops]. func (f *Ops) Proc(target *check.Absolute) *Ops { *f = append(*f, &MountProcOp{target}) return f diff --git a/container/initremount.go b/container/initremount.go index b6d1697..6482291 100644 --- a/container/initremount.go +++ b/container/initremount.go @@ -9,7 +9,7 @@ import ( func init() { gob.Register(new(RemountOp)) } -// Remount appends an [Op] that applies [RemountOp.Flags] on container path [RemountOp.Target]. +// Remount is a helper for appending [RemountOp] to [Ops]. func (f *Ops) Remount(target *check.Absolute, flags uintptr) *Ops { *f = append(*f, &RemountOp{target, flags}) return f diff --git a/container/landlock.go b/container/landlock.go index 7f0a821..acaab58 100644 --- a/container/landlock.go +++ b/container/landlock.go @@ -38,6 +38,7 @@ const ( _LANDLOCK_ACCESS_FS_DELIM ) +// String returns a space-separated string of [LandlockAccessFS] flags. func (f LandlockAccessFS) String() string { switch f { case LANDLOCK_ACCESS_FS_EXECUTE: @@ -116,6 +117,7 @@ const ( _LANDLOCK_ACCESS_NET_DELIM ) +// String returns a space-separated string of [LandlockAccessNet] flags. 
func (f LandlockAccessNet) String() string { switch f { case LANDLOCK_ACCESS_NET_BIND_TCP: @@ -152,6 +154,7 @@ const ( _LANDLOCK_SCOPE_DELIM ) +// String returns a space-separated string of [LandlockScope] flags. func (f LandlockScope) String() string { switch f { case LANDLOCK_SCOPE_ABSTRACT_UNIX_SOCKET: @@ -184,10 +187,12 @@ type RulesetAttr struct { HandledAccessFS LandlockAccessFS // Bitmask of handled network actions. HandledAccessNet LandlockAccessNet - // Bitmask of scopes restricting a Landlock domain from accessing outside resources (e.g. IPCs). + // Bitmask of scopes restricting a Landlock domain from accessing outside + // resources (e.g. IPCs). Scoped LandlockScope } +// String returns a user-facing description of [RulesetAttr]. func (rulesetAttr *RulesetAttr) String() string { if rulesetAttr == nil { return "NULL" @@ -208,6 +213,7 @@ func (rulesetAttr *RulesetAttr) String() string { return strings.Join(elems, ", ") } +// Create loads the ruleset into the kernel. func (rulesetAttr *RulesetAttr) Create(flags uintptr) (fd int, err error) { var pointer, size uintptr // NULL needed for abi version @@ -216,10 +222,13 @@ func (rulesetAttr *RulesetAttr) Create(flags uintptr) (fd int, err error) { size = unsafe.Sizeof(*rulesetAttr) } - rulesetFd, _, errno := syscall.Syscall(std.SYS_LANDLOCK_CREATE_RULESET, pointer, size, flags) + rulesetFd, _, errno := syscall.Syscall( + std.SYS_LANDLOCK_CREATE_RULESET, + pointer, size, + flags, + ) fd = int(rulesetFd) err = errno - if fd < 0 { return } @@ -230,12 +239,19 @@ func (rulesetAttr *RulesetAttr) Create(flags uintptr) (fd int, err error) { return fd, nil } +// LandlockGetABI returns the ABI version supported by the kernel. func LandlockGetABI() (int, error) { return (*RulesetAttr)(nil).Create(LANDLOCK_CREATE_RULESET_VERSION) } +// LandlockRestrictSelf applies a loaded ruleset to the calling thread. 
func LandlockRestrictSelf(rulesetFd int, flags uintptr) error { - r, _, errno := syscall.Syscall(std.SYS_LANDLOCK_RESTRICT_SELF, uintptr(rulesetFd), flags, 0) + r, _, errno := syscall.Syscall( + std.SYS_LANDLOCK_RESTRICT_SELF, + uintptr(rulesetFd), + flags, + 0, + ) if r != 0 { return errno } diff --git a/container/path.go b/container/path.go index bc0000a..965ad49 100644 --- a/container/path.go +++ b/container/path.go @@ -15,7 +15,10 @@ import ( const ( // Nonexistent is a path that cannot exist. - // /proc is chosen because a system with covered /proc is unsupported by this package. + // + // This path can never be presented by the kernel if proc is mounted on + // /proc/. This can only exist if parts of /proc/ are covered, or proc is not + // mounted at all. Neither configuration is supported by this package. Nonexistent = fhs.Proc + "nonexistent" hostPath = fhs.Root + hostDir diff --git a/container/seccomp/libseccomp.go b/container/seccomp/libseccomp.go index 4684be2..afb1848 100644 --- a/container/seccomp/libseccomp.go +++ b/container/seccomp/libseccomp.go @@ -88,18 +88,22 @@ var resPrefix = [...]string{ 7: "seccomp_load failed", } -// cbAllocateBuffer is the function signature for the function handle passed to hakurei_export_filter -// which allocates the buffer that the resulting bpf program is copied into, and writes its slice header -// to a value held by the caller. +// cbAllocateBuffer is the function signature for the function handle passed to +// hakurei_scmp_make_filter which allocates the buffer that the resulting bpf +// program is copied into, and writes its slice header to a value held by the caller. type cbAllocateBuffer = func(len C.size_t) (buf unsafe.Pointer) +// hakurei_scmp_allocate allocates a buffer of specified size known to the +// runtime through a callback passed in a [cgo.Handle]. 
+// //export hakurei_scmp_allocate func hakurei_scmp_allocate(f C.uintptr_t, len C.size_t) (buf unsafe.Pointer) { return cgo.Handle(f).Value().(cbAllocateBuffer)(len) } -// makeFilter generates a bpf program from a slice of [std.NativeRule] and writes the resulting byte slice to p. -// The filter is installed to the current process if p is nil. +// makeFilter generates a bpf program from a slice of [std.NativeRule] and +// writes the resulting byte slice to p. The filter is installed to the current +// process if p is nil. func makeFilter(rules []std.NativeRule, flags ExportFlag, p *[]byte) error { if len(rules) == 0 { return ErrInvalidRules @@ -170,8 +174,8 @@ func Export(rules []std.NativeRule, flags ExportFlag) (data []byte, err error) { return } -// Load generates a bpf program from a slice of [std.NativeRule] and enforces it on the current process. -// Errors returned by libseccomp is wrapped in [LibraryError]. +// Load generates a bpf program from a slice of [std.NativeRule] and enforces it +// on the current process. Errors returned by libseccomp is wrapped in [LibraryError]. func Load(rules []std.NativeRule, flags ExportFlag) error { return makeFilter(rules, flags, nil) } type ( diff --git a/container/vfs/mangle.go b/container/vfs/mangle.go index 83aba58..af7a046 100644 --- a/container/vfs/mangle.go +++ b/container/vfs/mangle.go @@ -2,6 +2,8 @@ package vfs import "strings" +// Unmangle reverses mangling of strings done by the kernel. Its behaviour is +// consistent with the equivalent function in util-linux. func Unmangle(s string) string { if !strings.ContainsRune(s, '\\') { return s diff --git a/container/vfs/mountinfo.go b/container/vfs/mountinfo.go index 5f9f225..77f6a35 100644 --- a/container/vfs/mountinfo.go +++ b/container/vfs/mountinfo.go @@ -24,6 +24,7 @@ var ( ErrMountInfoSep = errors.New("bad optional fields separator") ) +// A DecoderError describes a nonrecoverable error decoding a mountinfo stream. 
type DecoderError struct { Op string Line int @@ -51,7 +52,8 @@ func (e *DecoderError) Error() string { } type ( - // A MountInfoDecoder reads and decodes proc_pid_mountinfo(5) entries from an input stream. + // A MountInfoDecoder reads and decodes proc_pid_mountinfo(5) entries from + // an input stream. MountInfoDecoder struct { s *bufio.Scanner m *MountInfo @@ -72,13 +74,16 @@ type ( MountInfoEntry struct { // mount ID: a unique ID for the mount (may be reused after umount(2)). ID int `json:"id"` - // parent ID: the ID of the parent mount (or of self for the root of this mount namespace's mount tree). + // parent ID: the ID of the parent mount (or of self for the root of + // this mount namespace's mount tree). Parent int `json:"parent"` // major:minor: the value of st_dev for files on this filesystem (see stat(2)). Devno DevT `json:"devno"` - // root: the pathname of the directory in the filesystem which forms the root of this mount. + // root: the pathname of the directory in the filesystem which forms the + // root of this mount. Root string `json:"root"` - // mount point: the pathname of the mount point relative to the process's root directory. + // mount point: the pathname of the mount point relative to the + // process's root directory. Target string `json:"target"` // mount options: per-mount options (see mount(2)). VfsOptstr string `json:"vfs_optstr"` @@ -126,7 +131,8 @@ func (e *MountInfoEntry) Flags() (flags uintptr, unmatched []string) { // NewMountInfoDecoder returns a new decoder that reads from r. // -// The decoder introduces its own buffering and may read data from r beyond the mountinfo entries requested. +// The decoder introduces its own buffering and may read data from r beyond the +// mountinfo entries requested. 
func NewMountInfoDecoder(r io.Reader) *MountInfoDecoder { return &MountInfoDecoder{s: bufio.NewScanner(r)} } @@ -271,6 +277,8 @@ func parseMountInfoLine(s string, ent *MountInfoEntry) error { return nil } +// EqualWithIgnore compares two [MountInfoEntry] values, ignoring fields that +// compare equal to ignore. func (e *MountInfoEntry) EqualWithIgnore(want *MountInfoEntry, ignore string) bool { return (e.ID == want.ID || want.ID == -1) && (e.Parent == want.Parent || want.Parent == -1) && @@ -284,6 +292,8 @@ func (e *MountInfoEntry) EqualWithIgnore(want *MountInfoEntry, ignore string) bo (e.FsOptstr == want.FsOptstr || want.FsOptstr == ignore) } +// String returns a user-facing representation of a [MountInfoEntry]. It fits +// roughly into the mountinfo format, but without mangling. func (e *MountInfoEntry) String() string { return fmt.Sprintf("%d %d %d:%d %s %s %s %s %s %s %s", e.ID, e.Parent, e.Devno[0], e.Devno[1], e.Root, e.Target, e.VfsOptstr, diff --git a/container/vfs/unfold.go b/container/vfs/unfold.go index d63f0a9..37c4cef 100644 --- a/container/vfs/unfold.go +++ b/container/vfs/unfold.go @@ -6,6 +6,7 @@ import ( "strings" ) +// UnfoldTargetError is a pathname that never appeared in a mount hierarchy. type UnfoldTargetError string func (e UnfoldTargetError) Error() string { @@ -27,6 +28,7 @@ func (n *MountInfoNode) Collective() iter.Seq[*MountInfoNode] { return func(yield func(*MountInfoNode) bool) { n.visit(yield) } } +// visit recursively visits all visible mountinfo nodes. func (n *MountInfoNode) visit(yield func(*MountInfoNode) bool) bool { if !n.Covered && !yield(n) { return false