container: improve documentation
All checks were successful
Test / Create distribution (push) Successful in 1m16s
Test / Sandbox (push) Successful in 3m2s
Test / Hakurei (push) Successful in 4m4s
Test / ShareFS (push) Successful in 4m17s
Test / Hpkg (push) Successful in 4m49s
Test / Sandbox (race detector) (push) Successful in 5m22s
Test / Hakurei (race detector) (push) Successful in 6m30s
Test / Flake checks (push) Successful in 1m48s

This change removes inconsistencies collected over time in this package.

Signed-off-by: Ophestra <cat@gensokyo.uk>
This commit is contained in:
2026-02-28 20:18:30 +09:00
parent 84e6922f30
commit cd9b534d6b
23 changed files with 222 additions and 97 deletions

View File

@@ -10,8 +10,7 @@ import (
func init() { gob.Register(new(AutoEtcOp)) } func init() { gob.Register(new(AutoEtcOp)) }
// Etc appends an [Op] that expands host /etc into a toplevel symlink mirror with /etc semantics. // Etc is a helper for appending [AutoEtcOp] to [Ops].
// This is not a generic setup op. It is implemented here to reduce ipc overhead.
func (f *Ops) Etc(host *check.Absolute, prefix string) *Ops { func (f *Ops) Etc(host *check.Absolute, prefix string) *Ops {
e := &AutoEtcOp{prefix} e := &AutoEtcOp{prefix}
f.Mkdir(fhs.AbsEtc, 0755) f.Mkdir(fhs.AbsEtc, 0755)
@@ -20,6 +19,9 @@ func (f *Ops) Etc(host *check.Absolute, prefix string) *Ops {
return f return f
} }
// AutoEtcOp expands host /etc into a toplevel symlink mirror with /etc semantics.
//
// This is not a generic setup op. It is implemented here to reduce ipc overhead.
type AutoEtcOp struct{ Prefix string } type AutoEtcOp struct{ Prefix string }
func (e *AutoEtcOp) Valid() bool { return e != nil } func (e *AutoEtcOp) Valid() bool { return e != nil }

View File

@@ -11,13 +11,15 @@ import (
func init() { gob.Register(new(AutoRootOp)) } func init() { gob.Register(new(AutoRootOp)) }
// Root appends an [Op] that expands a directory into a toplevel bind mount mirror on container root. // Root is a helper for appending [AutoRootOp] to [Ops].
// This is not a generic setup op. It is implemented here to reduce ipc overhead.
func (f *Ops) Root(host *check.Absolute, flags int) *Ops { func (f *Ops) Root(host *check.Absolute, flags int) *Ops {
*f = append(*f, &AutoRootOp{host, flags, nil}) *f = append(*f, &AutoRootOp{host, flags, nil})
return f return f
} }
// AutoRootOp expands a directory into a toplevel bind mount mirror on container root.
//
// This is not a generic setup op. It is implemented here to reduce ipc overhead.
type AutoRootOp struct { type AutoRootOp struct {
Host *check.Absolute Host *check.Absolute
// passed through to bindMount // passed through to bindMount

View File

@@ -50,10 +50,16 @@ func capset(hdrp *capHeader, datap *[2]capData) error {
} }
// capBoundingSetDrop drops a capability from the calling thread's capability bounding set. // capBoundingSetDrop drops a capability from the calling thread's capability bounding set.
func capBoundingSetDrop(cap uintptr) error { return Prctl(syscall.PR_CAPBSET_DROP, cap, 0) } func capBoundingSetDrop(cap uintptr) error {
return Prctl(syscall.PR_CAPBSET_DROP, cap, 0)
}
// capAmbientClearAll clears the ambient capability set of the calling thread. // capAmbientClearAll clears the ambient capability set of the calling thread.
func capAmbientClearAll() error { return Prctl(PR_CAP_AMBIENT, PR_CAP_AMBIENT_CLEAR_ALL, 0) } func capAmbientClearAll() error {
return Prctl(PR_CAP_AMBIENT, PR_CAP_AMBIENT_CLEAR_ALL, 0)
}
// capAmbientRaise adds to the ambient capability set of the calling thread. // capAmbientRaise adds to the ambient capability set of the calling thread.
func capAmbientRaise(cap uintptr) error { return Prctl(PR_CAP_AMBIENT, PR_CAP_AMBIENT_RAISE, cap) } func capAmbientRaise(cap uintptr) error {
return Prctl(PR_CAP_AMBIENT, PR_CAP_AMBIENT_RAISE, cap)
}

View File

@@ -11,7 +11,8 @@ const (
SpecialOverlayPath = ":" SpecialOverlayPath = ":"
) )
// EscapeOverlayDataSegment escapes a string for formatting into the data argument of an overlay mount call. // EscapeOverlayDataSegment escapes a string for formatting into the data
// argument of an overlay mount system call.
func EscapeOverlayDataSegment(s string) string { func EscapeOverlayDataSegment(s string) string {
if s == "" { if s == "" {
return "" return ""

View File

@@ -1,4 +1,5 @@
// Package container implements unprivileged Linux containers with built-in support for syscall filtering. // Package container implements unprivileged Linux containers with built-in
// support for syscall filtering.
package container package container
import ( import (
@@ -42,22 +43,25 @@ type (
SchedPolicy int SchedPolicy int
// Cgroup fd, nil to disable. // Cgroup fd, nil to disable.
Cgroup *int Cgroup *int
// ExtraFiles passed through to initial process in the container, // ExtraFiles passed through to initial process in the container, with
// with behaviour identical to its [exec.Cmd] counterpart. // behaviour identical to its [exec.Cmd] counterpart.
ExtraFiles []*os.File ExtraFiles []*os.File
// param pipe for shim and init // Write end of a pipe connected to the init to deliver [Params].
setup *os.File setup *os.File
// cancels cmd // Cancels the context passed to the underlying cmd.
cancel context.CancelFunc cancel context.CancelFunc
// closed after Wait returns // Closed after Wait returns. Keeps the spawning thread alive.
wait chan struct{} wait chan struct{}
Stdin io.Reader Stdin io.Reader
Stdout io.Writer Stdout io.Writer
Stderr io.Writer Stderr io.Writer
// Custom cancellation behaviour for the underlying [exec.Cmd]. Must
// deliver [CancelSignal] before returning.
Cancel func(cmd *exec.Cmd) error Cancel func(cmd *exec.Cmd) error
// Copied to the underlying [exec.Cmd].
WaitDelay time.Duration WaitDelay time.Duration
cmd *exec.Cmd cmd *exec.Cmd
@@ -286,7 +290,11 @@ func (p *Container) Start() error {
// place setup pipe before user supplied extra files, this is later restored by init // place setup pipe before user supplied extra files, this is later restored by init
if fd, f, err := Setup(&p.cmd.ExtraFiles); err != nil { if fd, f, err := Setup(&p.cmd.ExtraFiles); err != nil {
return &StartError{true, "set up params stream", err, false, false} return &StartError{
Fatal: true,
Step: "set up params stream",
Err: err,
}
} else { } else {
p.setup = f p.setup = f
p.cmd.Env = []string{setupEnv + "=" + strconv.Itoa(fd)} p.cmd.Env = []string{setupEnv + "=" + strconv.Itoa(fd)}
@@ -298,10 +306,16 @@ func (p *Container) Start() error {
runtime.LockOSThread() runtime.LockOSThread()
p.wait = make(chan struct{}) p.wait = make(chan struct{})
done <- func() error { // setup depending on per-thread state must happen here // setup depending on per-thread state must happen here
// PR_SET_NO_NEW_PRIVS: depends on per-thread state but acts on all processes created from that thread done <- func() error {
// PR_SET_NO_NEW_PRIVS: thread-directed but acts on all processes
// created from the calling thread
if err := SetNoNewPrivs(); err != nil { if err := SetNoNewPrivs(); err != nil {
return &StartError{true, "prctl(PR_SET_NO_NEW_PRIVS)", err, false, false} return &StartError{
Fatal: true,
Step: "prctl(PR_SET_NO_NEW_PRIVS)",
Err: err,
}
} }
// landlock: depends on per-thread state but acts on a process group // landlock: depends on per-thread state but acts on a process group
@@ -313,28 +327,40 @@ func (p *Container) Start() error {
if abi, err := LandlockGetABI(); err != nil { if abi, err := LandlockGetABI(); err != nil {
if p.HostAbstract { if p.HostAbstract {
// landlock can be skipped here as it restricts access to resources // landlock can be skipped here as it restricts access
// already covered by namespaces (pid) // to resources already covered by namespaces (pid)
goto landlockOut goto landlockOut
} }
return &StartError{false, "get landlock ABI", err, false, false} return &StartError{Step: "get landlock ABI", Err: err}
} else if abi < 6 { } else if abi < 6 {
if p.HostAbstract { if p.HostAbstract {
// see above comment // see above comment
goto landlockOut goto landlockOut
} }
return &StartError{false, "kernel version too old for LANDLOCK_SCOPE_ABSTRACT_UNIX_SOCKET", ENOSYS, true, false} return &StartError{
Step: "kernel too old for LANDLOCK_SCOPE_ABSTRACT_UNIX_SOCKET",
Err: ENOSYS,
Origin: true,
}
} else { } else {
p.msg.Verbosef("landlock abi version %d", abi) p.msg.Verbosef("landlock abi version %d", abi)
} }
if rulesetFd, err := rulesetAttr.Create(0); err != nil { if rulesetFd, err := rulesetAttr.Create(0); err != nil {
return &StartError{true, "create landlock ruleset", err, false, false} return &StartError{
Fatal: true,
Step: "create landlock ruleset",
Err: err,
}
} else { } else {
p.msg.Verbosef("enforcing landlock ruleset %s", rulesetAttr) p.msg.Verbosef("enforcing landlock ruleset %s", rulesetAttr)
if err = LandlockRestrictSelf(rulesetFd, 0); err != nil { if err = LandlockRestrictSelf(rulesetFd, 0); err != nil {
_ = Close(rulesetFd) _ = Close(rulesetFd)
return &StartError{true, "enforce landlock ruleset", err, false, false} return &StartError{
Fatal: true,
Step: "enforce landlock ruleset",
Err: err,
}
} }
if err = Close(rulesetFd); err != nil { if err = Close(rulesetFd); err != nil {
p.msg.Verbosef("cannot close landlock ruleset: %v", err) p.msg.Verbosef("cannot close landlock ruleset: %v", err)
@@ -346,7 +372,7 @@ func (p *Container) Start() error {
} }
// sched_setscheduler: thread-directed but acts on all processes // sched_setscheduler: thread-directed but acts on all processes
// created from that thread // created from the calling thread
if p.SchedPolicy > 0 { if p.SchedPolicy > 0 {
p.msg.Verbosef("setting scheduling policy %d", p.SchedPolicy) p.msg.Verbosef("setting scheduling policy %d", p.SchedPolicy)
if err := schedSetscheduler( if err := schedSetscheduler(
@@ -364,7 +390,11 @@ func (p *Container) Start() error {
p.msg.Verbose("starting container init") p.msg.Verbose("starting container init")
if err := p.cmd.Start(); err != nil { if err := p.cmd.Start(); err != nil {
return &StartError{false, "start container init", err, false, true} return &StartError{
Step: "start container init",
Err: err,
Passthrough: true,
}
} }
return nil return nil
}() }()
@@ -376,6 +406,7 @@ func (p *Container) Start() error {
} }
// Serve serves [Container.Params] to the container init. // Serve serves [Container.Params] to the container init.
//
// Serve must only be called once. // Serve must only be called once.
func (p *Container) Serve() error { func (p *Container) Serve() error {
if p.setup == nil { if p.setup == nil {
@@ -385,12 +416,21 @@ func (p *Container) Serve() error {
setup := p.setup setup := p.setup
p.setup = nil p.setup = nil
if err := setup.SetDeadline(time.Now().Add(initSetupTimeout)); err != nil { if err := setup.SetDeadline(time.Now().Add(initSetupTimeout)); err != nil {
return &StartError{true, "set init pipe deadline", err, false, true} return &StartError{
Fatal: true,
Step: "set init pipe deadline",
Err: err,
Passthrough: true,
}
} }
if p.Path == nil { if p.Path == nil {
p.cancel() p.cancel()
return &StartError{false, "invalid executable pathname", EINVAL, true, false} return &StartError{
Step: "invalid executable pathname",
Err: EINVAL,
Origin: true,
}
} }
// do not transmit nil // do not transmit nil
@@ -415,7 +455,8 @@ func (p *Container) Serve() error {
return err return err
} }
// Wait waits for the container init process to exit and releases any resources associated with the [Container]. // Wait blocks until the container init process exits and releases any
// resources associated with the [Container].
func (p *Container) Wait() error { func (p *Container) Wait() error {
if p.cmd == nil || p.cmd.Process == nil { if p.cmd == nil || p.cmd.Process == nil {
return EINVAL return EINVAL
@@ -460,11 +501,13 @@ func (p *Container) StderrPipe() (r io.ReadCloser, err error) {
} }
func (p *Container) String() string { func (p *Container) String() string {
return fmt.Sprintf("argv: %q, filter: %v, rules: %d, flags: %#x, presets: %#x", return fmt.Sprintf(
p.Args, !p.SeccompDisable, len(p.SeccompRules), int(p.SeccompFlags), int(p.SeccompPresets)) "argv: %q, filter: %v, rules: %d, flags: %#x, presets: %#x",
p.Args, !p.SeccompDisable, len(p.SeccompRules), int(p.SeccompFlags), int(p.SeccompPresets),
)
} }
// ProcessState returns the address to os.ProcessState held by the underlying [exec.Cmd]. // ProcessState returns the address of os.ProcessState held by the underlying [exec.Cmd].
func (p *Container) ProcessState() *os.ProcessState { func (p *Container) ProcessState() *os.ProcessState {
if p.cmd == nil { if p.cmd == nil {
return nil return nil
@@ -472,7 +515,8 @@ func (p *Container) ProcessState() *os.ProcessState {
return p.cmd.ProcessState return p.cmd.ProcessState
} }
// New returns the address to a new instance of [Container] that requires further initialisation before use. // New returns the address of a new instance of [Container]. This value requires
// further initialisation before use.
func New(ctx context.Context, msg message.Msg) *Container { func New(ctx context.Context, msg message.Msg) *Container {
if msg == nil { if msg == nil {
msg = message.New(nil) msg = message.New(nil)
@@ -486,7 +530,13 @@ func New(ctx context.Context, msg message.Msg) *Container {
} }
// NewCommand calls [New] and initialises the [Params.Path] and [Params.Args] fields. // NewCommand calls [New] and initialises the [Params.Path] and [Params.Args] fields.
func NewCommand(ctx context.Context, msg message.Msg, pathname *check.Absolute, name string, args ...string) *Container { func NewCommand(
ctx context.Context,
msg message.Msg,
pathname *check.Absolute,
name string,
args ...string,
) *Container {
z := New(ctx, msg) z := New(ctx, msg)
z.Path = pathname z.Path = pathname
z.Args = append([]string{name}, args...) z.Args = append([]string{name}, args...)

View File

@@ -21,7 +21,8 @@ type osFile interface {
fs.File fs.File
} }
// syscallDispatcher provides methods that make state-dependent system calls as part of their behaviour. // syscallDispatcher provides methods that make state-dependent system calls as
// part of their behaviour.
type syscallDispatcher interface { type syscallDispatcher interface {
// new starts a goroutine with a new instance of syscallDispatcher. // new starts a goroutine with a new instance of syscallDispatcher.
// A syscallDispatcher must never be used in any goroutine other than the one owning it, // A syscallDispatcher must never be used in any goroutine other than the one owning it,

View File

@@ -43,7 +43,8 @@ func messageFromError(err error) (m string, ok bool) {
} }
// messagePrefix checks and prefixes the error message of a non-pointer error. // messagePrefix checks and prefixes the error message of a non-pointer error.
// While this is usable for pointer errors, such use should be avoided as nil check is omitted. // While this is usable for pointer errors, such use should be avoided as nil
// check is omitted.
func messagePrefix[T error](prefix string, err error) (string, bool) { func messagePrefix[T error](prefix string, err error) (string, bool) {
var targetError T var targetError T
if errors.As(err, &targetError) { if errors.As(err, &targetError) {

View File

@@ -9,7 +9,8 @@ const (
// Tmp points to the place for small temporary files. // Tmp points to the place for small temporary files.
Tmp = "/tmp/" Tmp = "/tmp/"
// Run points to a "tmpfs" file system for system packages to place runtime data, socket files, and similar. // Run points to a "tmpfs" file system for system packages to place runtime
// data, socket files, and similar.
Run = "/run/" Run = "/run/"
// RunUser points to a directory containing per-user runtime directories, // RunUser points to a directory containing per-user runtime directories,
// each usually individually mounted "tmpfs" instances. // each usually individually mounted "tmpfs" instances.
@@ -17,10 +18,12 @@ const (
// Usr points to vendor-supplied operating system resources. // Usr points to vendor-supplied operating system resources.
Usr = "/usr/" Usr = "/usr/"
// UsrBin points to binaries and executables for user commands that shall appear in the $PATH search path. // UsrBin points to binaries and executables for user commands that shall
// appear in the $PATH search path.
UsrBin = Usr + "bin/" UsrBin = Usr + "bin/"
// Var points to persistent, variable system data. Writable during normal system operation. // Var points to persistent, variable system data. Writable during normal
// system operation.
Var = "/var/" Var = "/var/"
// VarLib points to persistent system data. // VarLib points to persistent system data.
VarLib = Var + "lib/" VarLib = Var + "lib/"
@@ -29,12 +32,16 @@ const (
// Dev points to the root directory for device nodes. // Dev points to the root directory for device nodes.
Dev = "/dev/" Dev = "/dev/"
// DevShm is the place for POSIX shared memory segments, as created via shm_open(3). // DevShm is the place for POSIX shared memory segments, as created via
// shm_open(3).
DevShm = "/dev/shm/" DevShm = "/dev/shm/"
// Proc points to a virtual kernel file system exposing the process list and other functionality. // Proc points to a virtual kernel file system exposing the process list and
// other functionality.
Proc = "/proc/" Proc = "/proc/"
// ProcSys points to a hierarchy below /proc/ that exposes a number of kernel tunables. // ProcSys points to a hierarchy below /proc/ that exposes a number of
// kernel tunables.
ProcSys = Proc + "sys/" ProcSys = Proc + "sys/"
// Sys points to a virtual kernel file system exposing discovered devices and other functionality. // Sys points to a virtual kernel file system exposing discovered devices
// and other functionality.
Sys = "/sys/" Sys = "/sys/"
) )

View File

@@ -33,12 +33,12 @@ const (
- This path is only accessible by init and root: - This path is only accessible by init and root:
The container init sets SUID_DUMP_DISABLE and terminates if that fails. The container init sets SUID_DUMP_DISABLE and terminates if that fails.
It should be noted that none of this should become relevant at any point since the resulting It should be noted that none of this should become relevant at any point
intermediate root tmpfs should be effectively anonymous. */ since the resulting intermediate root tmpfs should be effectively anonymous. */
intermediateHostPath = fhs.Proc + "self/fd" intermediateHostPath = fhs.Proc + "self/fd"
// setupEnv is the name of the environment variable holding the string representation of // setupEnv is the name of the environment variable holding the string
// the read end file descriptor of the setup params pipe. // representation of the read end file descriptor of the setup params pipe.
setupEnv = "HAKUREI_SETUP" setupEnv = "HAKUREI_SETUP"
// exitUnexpectedWait4 is the exit code if wait4 returns an unexpected errno. // exitUnexpectedWait4 is the exit code if wait4 returns an unexpected errno.
@@ -59,7 +59,8 @@ type (
// late is called right before starting the initial process. // late is called right before starting the initial process.
late(state *setupState, k syscallDispatcher) error late(state *setupState, k syscallDispatcher) error
// prefix returns a log message prefix, and whether this Op prints no identifying message on its own. // prefix returns a log message prefix, and whether this Op prints no
// identifying message on its own.
prefix() (string, bool) prefix() (string, bool)
Is(op Op) bool Is(op Op) bool
@@ -71,9 +72,11 @@ type (
setupState struct { setupState struct {
nonrepeatable uintptr nonrepeatable uintptr
// Whether early reaping has concluded. Must only be accessed in the wait4 loop. // Whether early reaping has concluded. Must only be accessed in the
// wait4 loop.
processConcluded bool processConcluded bool
// Process to syscall.WaitStatus populated in the wait4 loop. Freed after early reaping concludes. // Process to syscall.WaitStatus populated in the wait4 loop. Freed
// after early reaping concludes.
process map[int]WaitStatus process map[int]WaitStatus
// Synchronises access to process. // Synchronises access to process.
processMu sync.RWMutex processMu sync.RWMutex
@@ -216,9 +219,10 @@ func initEntrypoint(k syscallDispatcher, msg message.Msg) {
defer cancel() defer cancel()
/* early is called right before pivot_root into intermediate root; /* early is called right before pivot_root into intermediate root;
this step is mostly for gathering information that would otherwise be difficult to obtain this step is mostly for gathering information that would otherwise be
via library functions after pivot_root, and implementations are expected to avoid changing difficult to obtain via library functions after pivot_root, and
the state of the mount namespace */ implementations are expected to avoid changing the state of the mount
namespace */
for i, op := range *params.Ops { for i, op := range *params.Ops {
if op == nil || !op.Valid() { if op == nil || !op.Valid() {
k.fatalf(msg, "invalid op at index %d", i) k.fatalf(msg, "invalid op at index %d", i)
@@ -258,10 +262,10 @@ func initEntrypoint(k syscallDispatcher, msg message.Msg) {
k.fatalf(msg, "cannot enter intermediate root: %v", err) k.fatalf(msg, "cannot enter intermediate root: %v", err)
} }
/* apply is called right after pivot_root and entering the new root; /* apply is called right after pivot_root and entering the new root. This
this step sets up the container filesystem, and implementations are expected to keep the host root step sets up the container filesystem, and implementations are expected to
and sysroot mount points intact but otherwise can do whatever they need to; keep the host root and sysroot mount points intact but otherwise can do
chdir is allowed but discouraged */ whatever they need to. Calling chdir is allowed but discouraged. */
for i, op := range *params.Ops { for i, op := range *params.Ops {
// ops already checked during early setup // ops already checked during early setup
if prefix, ok := op.prefix(); ok { if prefix, ok := op.prefix(); ok {

View File

@@ -12,14 +12,16 @@ import (
func init() { gob.Register(new(BindMountOp)) } func init() { gob.Register(new(BindMountOp)) }
// Bind appends an [Op] that bind mounts host path [BindMountOp.Source] on container path [BindMountOp.Target]. // Bind is a helper for appending [BindMountOp] to [Ops].
func (f *Ops) Bind(source, target *check.Absolute, flags int) *Ops { func (f *Ops) Bind(source, target *check.Absolute, flags int) *Ops {
*f = append(*f, &BindMountOp{nil, source, target, flags}) *f = append(*f, &BindMountOp{nil, source, target, flags})
return f return f
} }
// BindMountOp bind mounts host path Source on container path Target. // BindMountOp creates a bind mount from host path Source to container path Target.
// Note that Flags uses bits declared in this package and should not be set with constants in [syscall]. //
// Note that Flags uses bits declared in the [std] package and should not be set
// with constants in [syscall].
type BindMountOp struct { type BindMountOp struct {
sourceFinal, Source, Target *check.Absolute sourceFinal, Source, Target *check.Absolute

View File

@@ -24,8 +24,7 @@ const (
daemonTimeout = 5 * time.Second daemonTimeout = 5 * time.Second
) )
// Daemon appends an [Op] that starts a daemon in the container and blocks until // Daemon is a helper for appending [DaemonOp] to [Ops].
// [DaemonOp.Target] appears.
func (f *Ops) Daemon(target, path *check.Absolute, args ...string) *Ops { func (f *Ops) Daemon(target, path *check.Absolute, args ...string) *Ops {
*f = append(*f, &DaemonOp{target, path, args}) *f = append(*f, &DaemonOp{target, path, args})
return f return f

View File

@@ -19,7 +19,9 @@ func (f *Ops) Dev(target *check.Absolute, mqueue bool) *Ops {
} }
// DevWritable appends an [Op] that mounts a writable subset of host /dev. // DevWritable appends an [Op] that mounts a writable subset of host /dev.
// There is usually no good reason to write to /dev, so this should always be followed by a [RemountOp]. //
// There is usually no good reason to write to /dev, so this should always be
// followed by a [RemountOp].
func (f *Ops) DevWritable(target *check.Absolute, mqueue bool) *Ops { func (f *Ops) DevWritable(target *check.Absolute, mqueue bool) *Ops {
*f = append(*f, &MountDevOp{target, mqueue, true}) *f = append(*f, &MountDevOp{target, mqueue, true})
return f return f

View File

@@ -10,7 +10,7 @@ import (
func init() { gob.Register(new(MkdirOp)) } func init() { gob.Register(new(MkdirOp)) }
// Mkdir appends an [Op] that creates a directory in the container filesystem. // Mkdir is a helper for appending [MkdirOp] to [Ops].
func (f *Ops) Mkdir(name *check.Absolute, perm os.FileMode) *Ops { func (f *Ops) Mkdir(name *check.Absolute, perm os.FileMode) *Ops {
*f = append(*f, &MkdirOp{name, perm}) *f = append(*f, &MkdirOp{name, perm})
return f return f

View File

@@ -54,8 +54,11 @@ func (e *OverlayArgumentError) Error() string {
} }
} }
// Overlay appends an [Op] that mounts the overlay pseudo filesystem on [MountOverlayOp.Target]. // Overlay is a helper for appending [MountOverlayOp] to [Ops].
func (f *Ops) Overlay(target, state, work *check.Absolute, layers ...*check.Absolute) *Ops { func (f *Ops) Overlay(
target, state, work *check.Absolute,
layers ...*check.Absolute,
) *Ops {
*f = append(*f, &MountOverlayOp{ *f = append(*f, &MountOverlayOp{
Target: target, Target: target,
Lower: layers, Lower: layers,
@@ -65,13 +68,12 @@ func (f *Ops) Overlay(target, state, work *check.Absolute, layers ...*check.Abso
return f return f
} }
// OverlayEphemeral appends an [Op] that mounts the overlay pseudo filesystem on [MountOverlayOp.Target] // OverlayEphemeral appends a [MountOverlayOp] with an ephemeral upperdir and workdir.
// with an ephemeral upperdir and workdir.
func (f *Ops) OverlayEphemeral(target *check.Absolute, layers ...*check.Absolute) *Ops { func (f *Ops) OverlayEphemeral(target *check.Absolute, layers ...*check.Absolute) *Ops {
return f.Overlay(target, fhs.AbsRoot, nil, layers...) return f.Overlay(target, fhs.AbsRoot, nil, layers...)
} }
// OverlayReadonly appends an [Op] that mounts the overlay pseudo filesystem readonly on [MountOverlayOp.Target] // OverlayReadonly appends a readonly [MountOverlayOp].
func (f *Ops) OverlayReadonly(target *check.Absolute, layers ...*check.Absolute) *Ops { func (f *Ops) OverlayReadonly(target *check.Absolute, layers ...*check.Absolute) *Ops {
return f.Overlay(target, nil, nil, layers...) return f.Overlay(target, nil, nil, layers...)
} }
@@ -82,25 +84,34 @@ type MountOverlayOp struct {
// Any filesystem, does not need to be on a writable filesystem. // Any filesystem, does not need to be on a writable filesystem.
Lower []*check.Absolute Lower []*check.Absolute
// formatted for [OptionOverlayLowerdir], resolved, prefixed and escaped during early // Formatted for [OptionOverlayLowerdir].
//
// Resolved, prefixed and escaped during early.
lower []string lower []string
// The upperdir is normally on a writable filesystem. // The upperdir is normally on a writable filesystem.
// //
// If Work is nil and Upper holds the special value [fhs.AbsRoot], // If Work is nil and Upper holds the special value [fhs.AbsRoot], an
// an ephemeral upperdir and workdir will be set up. // ephemeral upperdir and workdir will be set up.
// //
// If both Work and Upper are nil, upperdir and workdir is omitted and the overlay is mounted readonly. // If both Work and Upper are nil, upperdir and workdir are omitted and the
// overlay is mounted readonly.
Upper *check.Absolute Upper *check.Absolute
// formatted for [OptionOverlayUpperdir], resolved, prefixed and escaped during early // Formatted for [OptionOverlayUpperdir].
//
// Resolved, prefixed and escaped during early.
upper string upper string
// The workdir needs to be an empty directory on the same filesystem as upperdir. // The workdir needs to be an empty directory on the same filesystem as upperdir.
Work *check.Absolute Work *check.Absolute
// formatted for [OptionOverlayWorkdir], resolved, prefixed and escaped during early // Formatted for [OptionOverlayWorkdir].
//
// Resolved, prefixed and escaped during early.
work string work string
ephemeral bool ephemeral bool
// used internally for mounting to the intermediate root // Used internally for mounting to the intermediate root.
noPrefix bool noPrefix bool
} }

View File

@@ -16,7 +16,7 @@ const (
func init() { gob.Register(new(TmpfileOp)) } func init() { gob.Register(new(TmpfileOp)) }
// Place appends an [Op] that places a file in container path [TmpfileOp.Path] containing [TmpfileOp.Data]. // Place is a helper for appending [TmpfileOp] to [Ops].
func (f *Ops) Place(name *check.Absolute, data []byte) *Ops { func (f *Ops) Place(name *check.Absolute, data []byte) *Ops {
*f = append(*f, &TmpfileOp{name, data}) *f = append(*f, &TmpfileOp{name, data})
return f return f

View File

@@ -10,7 +10,7 @@ import (
func init() { gob.Register(new(MountProcOp)) } func init() { gob.Register(new(MountProcOp)) }
// Proc appends an [Op] that mounts a private instance of proc. // Proc is a helper for appending [MountProcOp] to [Ops].
func (f *Ops) Proc(target *check.Absolute) *Ops { func (f *Ops) Proc(target *check.Absolute) *Ops {
*f = append(*f, &MountProcOp{target}) *f = append(*f, &MountProcOp{target})
return f return f

View File

@@ -9,7 +9,7 @@ import (
func init() { gob.Register(new(RemountOp)) } func init() { gob.Register(new(RemountOp)) }
// Remount appends an [Op] that applies [RemountOp.Flags] on container path [RemountOp.Target]. // Remount is a helper for appending [RemountOp] to [Ops].
func (f *Ops) Remount(target *check.Absolute, flags uintptr) *Ops { func (f *Ops) Remount(target *check.Absolute, flags uintptr) *Ops {
*f = append(*f, &RemountOp{target, flags}) *f = append(*f, &RemountOp{target, flags})
return f return f

View File

@@ -38,6 +38,7 @@ const (
_LANDLOCK_ACCESS_FS_DELIM _LANDLOCK_ACCESS_FS_DELIM
) )
// String returns a space-separated string of [LandlockAccessFS] flags.
func (f LandlockAccessFS) String() string { func (f LandlockAccessFS) String() string {
switch f { switch f {
case LANDLOCK_ACCESS_FS_EXECUTE: case LANDLOCK_ACCESS_FS_EXECUTE:
@@ -116,6 +117,7 @@ const (
_LANDLOCK_ACCESS_NET_DELIM _LANDLOCK_ACCESS_NET_DELIM
) )
// String returns a space-separated string of [LandlockAccessNet] flags.
func (f LandlockAccessNet) String() string { func (f LandlockAccessNet) String() string {
switch f { switch f {
case LANDLOCK_ACCESS_NET_BIND_TCP: case LANDLOCK_ACCESS_NET_BIND_TCP:
@@ -152,6 +154,7 @@ const (
_LANDLOCK_SCOPE_DELIM _LANDLOCK_SCOPE_DELIM
) )
// String returns a space-separated string of [LandlockScope] flags.
func (f LandlockScope) String() string { func (f LandlockScope) String() string {
switch f { switch f {
case LANDLOCK_SCOPE_ABSTRACT_UNIX_SOCKET: case LANDLOCK_SCOPE_ABSTRACT_UNIX_SOCKET:
@@ -184,10 +187,12 @@ type RulesetAttr struct {
HandledAccessFS LandlockAccessFS HandledAccessFS LandlockAccessFS
// Bitmask of handled network actions. // Bitmask of handled network actions.
HandledAccessNet LandlockAccessNet HandledAccessNet LandlockAccessNet
// Bitmask of scopes restricting a Landlock domain from accessing outside resources (e.g. IPCs). // Bitmask of scopes restricting a Landlock domain from accessing outside
// resources (e.g. IPCs).
Scoped LandlockScope Scoped LandlockScope
} }
// String returns a user-facing description of [RulesetAttr].
func (rulesetAttr *RulesetAttr) String() string { func (rulesetAttr *RulesetAttr) String() string {
if rulesetAttr == nil { if rulesetAttr == nil {
return "NULL" return "NULL"
@@ -208,6 +213,7 @@ func (rulesetAttr *RulesetAttr) String() string {
return strings.Join(elems, ", ") return strings.Join(elems, ", ")
} }
// Create loads the ruleset into the kernel.
func (rulesetAttr *RulesetAttr) Create(flags uintptr) (fd int, err error) { func (rulesetAttr *RulesetAttr) Create(flags uintptr) (fd int, err error) {
var pointer, size uintptr var pointer, size uintptr
// NULL needed for abi version // NULL needed for abi version
@@ -216,10 +222,13 @@ func (rulesetAttr *RulesetAttr) Create(flags uintptr) (fd int, err error) {
size = unsafe.Sizeof(*rulesetAttr) size = unsafe.Sizeof(*rulesetAttr)
} }
rulesetFd, _, errno := syscall.Syscall(std.SYS_LANDLOCK_CREATE_RULESET, pointer, size, flags) rulesetFd, _, errno := syscall.Syscall(
std.SYS_LANDLOCK_CREATE_RULESET,
pointer, size,
flags,
)
fd = int(rulesetFd) fd = int(rulesetFd)
err = errno err = errno
if fd < 0 { if fd < 0 {
return return
} }
@@ -230,12 +239,19 @@ func (rulesetAttr *RulesetAttr) Create(flags uintptr) (fd int, err error) {
return fd, nil return fd, nil
} }
// LandlockGetABI returns the ABI version supported by the kernel.
func LandlockGetABI() (int, error) { func LandlockGetABI() (int, error) {
return (*RulesetAttr)(nil).Create(LANDLOCK_CREATE_RULESET_VERSION) return (*RulesetAttr)(nil).Create(LANDLOCK_CREATE_RULESET_VERSION)
} }
// LandlockRestrictSelf applies a loaded ruleset to the calling thread.
func LandlockRestrictSelf(rulesetFd int, flags uintptr) error { func LandlockRestrictSelf(rulesetFd int, flags uintptr) error {
r, _, errno := syscall.Syscall(std.SYS_LANDLOCK_RESTRICT_SELF, uintptr(rulesetFd), flags, 0) r, _, errno := syscall.Syscall(
std.SYS_LANDLOCK_RESTRICT_SELF,
uintptr(rulesetFd),
flags,
0,
)
if r != 0 { if r != 0 {
return errno return errno
} }

View File

@@ -15,7 +15,10 @@ import (
const ( const (
// Nonexistent is a path that cannot exist. // Nonexistent is a path that cannot exist.
// /proc is chosen because a system with covered /proc is unsupported by this package. //
// This path can never be presented by the kernel if proc is mounted on
// /proc/. This can only exist if parts of /proc/ are covered, or proc is not
// mounted at all. Neither configuration is supported by this package.
Nonexistent = fhs.Proc + "nonexistent" Nonexistent = fhs.Proc + "nonexistent"
hostPath = fhs.Root + hostDir hostPath = fhs.Root + hostDir

View File

@@ -88,18 +88,22 @@ var resPrefix = [...]string{
7: "seccomp_load failed", 7: "seccomp_load failed",
} }
// cbAllocateBuffer is the function signature for the function handle passed to hakurei_export_filter // cbAllocateBuffer is the function signature for the function handle passed to
// which allocates the buffer that the resulting bpf program is copied into, and writes its slice header // hakurei_scmp_make_filter which allocates the buffer that the resulting bpf
// to a value held by the caller. // program is copied into, and writes its slice header to a value held by the caller.
type cbAllocateBuffer = func(len C.size_t) (buf unsafe.Pointer) type cbAllocateBuffer = func(len C.size_t) (buf unsafe.Pointer)
// hakurei_scmp_allocate allocates a buffer of specified size known to the
// runtime through a callback passed in a [cgo.Handle].
//
//export hakurei_scmp_allocate //export hakurei_scmp_allocate
func hakurei_scmp_allocate(f C.uintptr_t, len C.size_t) (buf unsafe.Pointer) { func hakurei_scmp_allocate(f C.uintptr_t, len C.size_t) (buf unsafe.Pointer) {
return cgo.Handle(f).Value().(cbAllocateBuffer)(len) return cgo.Handle(f).Value().(cbAllocateBuffer)(len)
} }
// makeFilter generates a bpf program from a slice of [std.NativeRule] and writes the resulting byte slice to p. // makeFilter generates a bpf program from a slice of [std.NativeRule] and
// The filter is installed to the current process if p is nil. // writes the resulting byte slice to p. The filter is installed to the current
// process if p is nil.
func makeFilter(rules []std.NativeRule, flags ExportFlag, p *[]byte) error { func makeFilter(rules []std.NativeRule, flags ExportFlag, p *[]byte) error {
if len(rules) == 0 { if len(rules) == 0 {
return ErrInvalidRules return ErrInvalidRules
@@ -170,8 +174,8 @@ func Export(rules []std.NativeRule, flags ExportFlag) (data []byte, err error) {
return return
} }
// Load generates a bpf program from a slice of [std.NativeRule] and enforces it on the current process. // Load generates a bpf program from a slice of [std.NativeRule] and enforces it
// Errors returned by libseccomp are wrapped in [LibraryError]. // on the current process. Errors returned by libseccomp are wrapped in [LibraryError].
func Load(rules []std.NativeRule, flags ExportFlag) error { return makeFilter(rules, flags, nil) } func Load(rules []std.NativeRule, flags ExportFlag) error { return makeFilter(rules, flags, nil) }
type ( type (

View File

@@ -2,6 +2,8 @@ package vfs
import "strings" import "strings"
// Unmangle reverses mangling of strings done by the kernel. Its behaviour is
// consistent with the equivalent function in util-linux.
func Unmangle(s string) string { func Unmangle(s string) string {
if !strings.ContainsRune(s, '\\') { if !strings.ContainsRune(s, '\\') {
return s return s

View File

@@ -24,6 +24,7 @@ var (
ErrMountInfoSep = errors.New("bad optional fields separator") ErrMountInfoSep = errors.New("bad optional fields separator")
) )
// A DecoderError describes a nonrecoverable error decoding a mountinfo stream.
type DecoderError struct { type DecoderError struct {
Op string Op string
Line int Line int
@@ -51,7 +52,8 @@ func (e *DecoderError) Error() string {
} }
type ( type (
// A MountInfoDecoder reads and decodes proc_pid_mountinfo(5) entries from an input stream. // A MountInfoDecoder reads and decodes proc_pid_mountinfo(5) entries from
// an input stream.
MountInfoDecoder struct { MountInfoDecoder struct {
s *bufio.Scanner s *bufio.Scanner
m *MountInfo m *MountInfo
@@ -72,13 +74,16 @@ type (
MountInfoEntry struct { MountInfoEntry struct {
// mount ID: a unique ID for the mount (may be reused after umount(2)). // mount ID: a unique ID for the mount (may be reused after umount(2)).
ID int `json:"id"` ID int `json:"id"`
// parent ID: the ID of the parent mount (or of self for the root of this mount namespace's mount tree). // parent ID: the ID of the parent mount (or of self for the root of
// this mount namespace's mount tree).
Parent int `json:"parent"` Parent int `json:"parent"`
// major:minor: the value of st_dev for files on this filesystem (see stat(2)). // major:minor: the value of st_dev for files on this filesystem (see stat(2)).
Devno DevT `json:"devno"` Devno DevT `json:"devno"`
// root: the pathname of the directory in the filesystem which forms the root of this mount. // root: the pathname of the directory in the filesystem which forms the
// root of this mount.
Root string `json:"root"` Root string `json:"root"`
// mount point: the pathname of the mount point relative to the process's root directory. // mount point: the pathname of the mount point relative to the
// process's root directory.
Target string `json:"target"` Target string `json:"target"`
// mount options: per-mount options (see mount(2)). // mount options: per-mount options (see mount(2)).
VfsOptstr string `json:"vfs_optstr"` VfsOptstr string `json:"vfs_optstr"`
@@ -126,7 +131,8 @@ func (e *MountInfoEntry) Flags() (flags uintptr, unmatched []string) {
// NewMountInfoDecoder returns a new decoder that reads from r. // NewMountInfoDecoder returns a new decoder that reads from r.
// //
// The decoder introduces its own buffering and may read data from r beyond the mountinfo entries requested. // The decoder introduces its own buffering and may read data from r beyond the
// mountinfo entries requested.
func NewMountInfoDecoder(r io.Reader) *MountInfoDecoder { func NewMountInfoDecoder(r io.Reader) *MountInfoDecoder {
return &MountInfoDecoder{s: bufio.NewScanner(r)} return &MountInfoDecoder{s: bufio.NewScanner(r)}
} }
@@ -271,6 +277,8 @@ func parseMountInfoLine(s string, ent *MountInfoEntry) error {
return nil return nil
} }
// EqualWithIgnore compares two [MountInfoEntry] values, ignoring fields that
// compare equal to ignore.
func (e *MountInfoEntry) EqualWithIgnore(want *MountInfoEntry, ignore string) bool { func (e *MountInfoEntry) EqualWithIgnore(want *MountInfoEntry, ignore string) bool {
return (e.ID == want.ID || want.ID == -1) && return (e.ID == want.ID || want.ID == -1) &&
(e.Parent == want.Parent || want.Parent == -1) && (e.Parent == want.Parent || want.Parent == -1) &&
@@ -284,6 +292,8 @@ func (e *MountInfoEntry) EqualWithIgnore(want *MountInfoEntry, ignore string) bo
(e.FsOptstr == want.FsOptstr || want.FsOptstr == ignore) (e.FsOptstr == want.FsOptstr || want.FsOptstr == ignore)
} }
// String returns a user-facing representation of a [MountInfoEntry]. It fits
// roughly into the mountinfo format, but without mangling.
func (e *MountInfoEntry) String() string { func (e *MountInfoEntry) String() string {
return fmt.Sprintf("%d %d %d:%d %s %s %s %s %s %s %s", return fmt.Sprintf("%d %d %d:%d %s %s %s %s %s %s %s",
e.ID, e.Parent, e.Devno[0], e.Devno[1], e.Root, e.Target, e.VfsOptstr, e.ID, e.Parent, e.Devno[0], e.Devno[1], e.Root, e.Target, e.VfsOptstr,

View File

@@ -6,6 +6,7 @@ import (
"strings" "strings"
) )
// UnfoldTargetError is a pathname that never appeared in a mount hierarchy.
type UnfoldTargetError string type UnfoldTargetError string
func (e UnfoldTargetError) Error() string { func (e UnfoldTargetError) Error() string {
@@ -27,6 +28,7 @@ func (n *MountInfoNode) Collective() iter.Seq[*MountInfoNode] {
return func(yield func(*MountInfoNode) bool) { n.visit(yield) } return func(yield func(*MountInfoNode) bool) { n.visit(yield) }
} }
// visit recursively visits all visible mountinfo nodes.
func (n *MountInfoNode) visit(yield func(*MountInfoNode) bool) bool { func (n *MountInfoNode) visit(yield func(*MountInfoNode) bool) bool {
if !n.Covered && !yield(n) { if !n.Covered && !yield(n) {
return false return false