container: improve documentation
All checks were successful
Test / Create distribution (push) Successful in 1m16s
Test / Sandbox (push) Successful in 3m2s
Test / Hakurei (push) Successful in 4m4s
Test / ShareFS (push) Successful in 4m17s
Test / Hpkg (push) Successful in 4m49s
Test / Sandbox (race detector) (push) Successful in 5m22s
Test / Hakurei (race detector) (push) Successful in 6m30s
Test / Flake checks (push) Successful in 1m48s

This change removes inconsistencies collected over time in this package.

Signed-off-by: Ophestra <cat@gensokyo.uk>
This commit is contained in:
2026-02-28 20:18:30 +09:00
parent 84e6922f30
commit cd9b534d6b
23 changed files with 222 additions and 97 deletions

View File

@@ -10,8 +10,7 @@ import (
func init() { gob.Register(new(AutoEtcOp)) }
// Etc appends an [Op] that expands host /etc into a toplevel symlink mirror with /etc semantics.
// This is not a generic setup op. It is implemented here to reduce ipc overhead.
// Etc is a helper for appending [AutoEtcOp] to [Ops].
func (f *Ops) Etc(host *check.Absolute, prefix string) *Ops {
e := &AutoEtcOp{prefix}
f.Mkdir(fhs.AbsEtc, 0755)
@@ -20,6 +19,9 @@ func (f *Ops) Etc(host *check.Absolute, prefix string) *Ops {
return f
}
// AutoEtcOp expands host /etc into a toplevel symlink mirror with /etc semantics.
//
// This is not a generic setup op. It is implemented here to reduce ipc overhead.
type AutoEtcOp struct{ Prefix string }
func (e *AutoEtcOp) Valid() bool { return e != nil }

View File

@@ -11,13 +11,15 @@ import (
func init() { gob.Register(new(AutoRootOp)) }
// Root appends an [Op] that expands a directory into a toplevel bind mount mirror on container root.
// This is not a generic setup op. It is implemented here to reduce ipc overhead.
// Root is a helper for appending [AutoRootOp] to [Ops].
func (f *Ops) Root(host *check.Absolute, flags int) *Ops {
*f = append(*f, &AutoRootOp{host, flags, nil})
return f
}
// AutoRootOp expands a directory into a toplevel bind mount mirror on container root.
//
// This is not a generic setup op. It is implemented here to reduce ipc overhead.
type AutoRootOp struct {
Host *check.Absolute
// passed through to bindMount

View File

@@ -50,10 +50,16 @@ func capset(hdrp *capHeader, datap *[2]capData) error {
}
// capBoundingSetDrop drops a capability from the calling thread's capability bounding set.
func capBoundingSetDrop(cap uintptr) error { return Prctl(syscall.PR_CAPBSET_DROP, cap, 0) }
func capBoundingSetDrop(cap uintptr) error {
return Prctl(syscall.PR_CAPBSET_DROP, cap, 0)
}
// capAmbientClearAll clears the ambient capability set of the calling thread.
func capAmbientClearAll() error { return Prctl(PR_CAP_AMBIENT, PR_CAP_AMBIENT_CLEAR_ALL, 0) }
func capAmbientClearAll() error {
return Prctl(PR_CAP_AMBIENT, PR_CAP_AMBIENT_CLEAR_ALL, 0)
}
// capAmbientRaise adds to the ambient capability set of the calling thread.
func capAmbientRaise(cap uintptr) error { return Prctl(PR_CAP_AMBIENT, PR_CAP_AMBIENT_RAISE, cap) }
func capAmbientRaise(cap uintptr) error {
return Prctl(PR_CAP_AMBIENT, PR_CAP_AMBIENT_RAISE, cap)
}

View File

@@ -11,7 +11,8 @@ const (
SpecialOverlayPath = ":"
)
// EscapeOverlayDataSegment escapes a string for formatting into the data argument of an overlay mount call.
// EscapeOverlayDataSegment escapes a string for formatting into the data
// argument of an overlay mount system call.
func EscapeOverlayDataSegment(s string) string {
if s == "" {
return ""

View File

@@ -1,4 +1,5 @@
// Package container implements unprivileged Linux containers with built-in support for syscall filtering.
// Package container implements unprivileged Linux containers with built-in
// support for syscall filtering.
package container
import (
@@ -42,22 +43,25 @@ type (
SchedPolicy int
// Cgroup fd, nil to disable.
Cgroup *int
// ExtraFiles passed through to initial process in the container,
// with behaviour identical to its [exec.Cmd] counterpart.
// ExtraFiles passed through to initial process in the container, with
// behaviour identical to its [exec.Cmd] counterpart.
ExtraFiles []*os.File
// param pipe for shim and init
// Write end of a pipe connected to the init to deliver [Params].
setup *os.File
// cancels cmd
// Cancels the context passed to the underlying cmd.
cancel context.CancelFunc
// closed after Wait returns
// Closed after Wait returns. Keeps the spawning thread alive.
wait chan struct{}
Stdin io.Reader
Stdout io.Writer
Stderr io.Writer
Cancel func(cmd *exec.Cmd) error
// Custom cancellation behaviour for the underlying [exec.Cmd]. Must
// deliver [CancelSignal] before returning.
Cancel func(cmd *exec.Cmd) error
// Copied to the underlying [exec.Cmd].
WaitDelay time.Duration
cmd *exec.Cmd
@@ -286,7 +290,11 @@ func (p *Container) Start() error {
// place setup pipe before user supplied extra files, this is later restored by init
if fd, f, err := Setup(&p.cmd.ExtraFiles); err != nil {
return &StartError{true, "set up params stream", err, false, false}
return &StartError{
Fatal: true,
Step: "set up params stream",
Err: err,
}
} else {
p.setup = f
p.cmd.Env = []string{setupEnv + "=" + strconv.Itoa(fd)}
@@ -298,10 +306,16 @@ func (p *Container) Start() error {
runtime.LockOSThread()
p.wait = make(chan struct{})
done <- func() error { // setup depending on per-thread state must happen here
// PR_SET_NO_NEW_PRIVS: depends on per-thread state but acts on all processes created from that thread
// setup depending on per-thread state must happen here
done <- func() error {
// PR_SET_NO_NEW_PRIVS: thread-directed but acts on all processes
// created from the calling thread
if err := SetNoNewPrivs(); err != nil {
return &StartError{true, "prctl(PR_SET_NO_NEW_PRIVS)", err, false, false}
return &StartError{
Fatal: true,
Step: "prctl(PR_SET_NO_NEW_PRIVS)",
Err: err,
}
}
// landlock: depends on per-thread state but acts on a process group
@@ -313,28 +327,40 @@ func (p *Container) Start() error {
if abi, err := LandlockGetABI(); err != nil {
if p.HostAbstract {
// landlock can be skipped here as it restricts access to resources
// already covered by namespaces (pid)
// landlock can be skipped here as it restricts access
// to resources already covered by namespaces (pid)
goto landlockOut
}
return &StartError{false, "get landlock ABI", err, false, false}
return &StartError{Step: "get landlock ABI", Err: err}
} else if abi < 6 {
if p.HostAbstract {
// see above comment
goto landlockOut
}
return &StartError{false, "kernel version too old for LANDLOCK_SCOPE_ABSTRACT_UNIX_SOCKET", ENOSYS, true, false}
return &StartError{
Step: "kernel too old for LANDLOCK_SCOPE_ABSTRACT_UNIX_SOCKET",
Err: ENOSYS,
Origin: true,
}
} else {
p.msg.Verbosef("landlock abi version %d", abi)
}
if rulesetFd, err := rulesetAttr.Create(0); err != nil {
return &StartError{true, "create landlock ruleset", err, false, false}
return &StartError{
Fatal: true,
Step: "create landlock ruleset",
Err: err,
}
} else {
p.msg.Verbosef("enforcing landlock ruleset %s", rulesetAttr)
if err = LandlockRestrictSelf(rulesetFd, 0); err != nil {
_ = Close(rulesetFd)
return &StartError{true, "enforce landlock ruleset", err, false, false}
return &StartError{
Fatal: true,
Step: "enforce landlock ruleset",
Err: err,
}
}
if err = Close(rulesetFd); err != nil {
p.msg.Verbosef("cannot close landlock ruleset: %v", err)
@@ -346,7 +372,7 @@ func (p *Container) Start() error {
}
// sched_setscheduler: thread-directed but acts on all processes
// created from that thread
// created from the calling thread
if p.SchedPolicy > 0 {
p.msg.Verbosef("setting scheduling policy %d", p.SchedPolicy)
if err := schedSetscheduler(
@@ -364,7 +390,11 @@ func (p *Container) Start() error {
p.msg.Verbose("starting container init")
if err := p.cmd.Start(); err != nil {
return &StartError{false, "start container init", err, false, true}
return &StartError{
Step: "start container init",
Err: err,
Passthrough: true,
}
}
return nil
}()
@@ -376,6 +406,7 @@ func (p *Container) Start() error {
}
// Serve serves [Container.Params] to the container init.
//
// Serve must only be called once.
func (p *Container) Serve() error {
if p.setup == nil {
@@ -385,12 +416,21 @@ func (p *Container) Serve() error {
setup := p.setup
p.setup = nil
if err := setup.SetDeadline(time.Now().Add(initSetupTimeout)); err != nil {
return &StartError{true, "set init pipe deadline", err, false, true}
return &StartError{
Fatal: true,
Step: "set init pipe deadline",
Err: err,
Passthrough: true,
}
}
if p.Path == nil {
p.cancel()
return &StartError{false, "invalid executable pathname", EINVAL, true, false}
return &StartError{
Step: "invalid executable pathname",
Err: EINVAL,
Origin: true,
}
}
// do not transmit nil
@@ -415,7 +455,8 @@ func (p *Container) Serve() error {
return err
}
// Wait waits for the container init process to exit and releases any resources associated with the [Container].
// Wait blocks until the container init process exits, then releases any
// resources associated with the [Container].
func (p *Container) Wait() error {
if p.cmd == nil || p.cmd.Process == nil {
return EINVAL
@@ -460,11 +501,13 @@ func (p *Container) StderrPipe() (r io.ReadCloser, err error) {
}
func (p *Container) String() string {
return fmt.Sprintf("argv: %q, filter: %v, rules: %d, flags: %#x, presets: %#x",
p.Args, !p.SeccompDisable, len(p.SeccompRules), int(p.SeccompFlags), int(p.SeccompPresets))
return fmt.Sprintf(
"argv: %q, filter: %v, rules: %d, flags: %#x, presets: %#x",
p.Args, !p.SeccompDisable, len(p.SeccompRules), int(p.SeccompFlags), int(p.SeccompPresets),
)
}
// ProcessState returns the address to os.ProcessState held by the underlying [exec.Cmd].
// ProcessState returns the address of os.ProcessState held by the underlying [exec.Cmd].
func (p *Container) ProcessState() *os.ProcessState {
if p.cmd == nil {
return nil
@@ -472,7 +515,8 @@ func (p *Container) ProcessState() *os.ProcessState {
return p.cmd.ProcessState
}
// New returns the address to a new instance of [Container] that requires further initialisation before use.
// New returns the address of a new instance of [Container]. This value requires
// further initialisation before use.
func New(ctx context.Context, msg message.Msg) *Container {
if msg == nil {
msg = message.New(nil)
@@ -486,7 +530,13 @@ func New(ctx context.Context, msg message.Msg) *Container {
}
// NewCommand calls [New] and initialises the [Params.Path] and [Params.Args] fields.
func NewCommand(ctx context.Context, msg message.Msg, pathname *check.Absolute, name string, args ...string) *Container {
func NewCommand(
ctx context.Context,
msg message.Msg,
pathname *check.Absolute,
name string,
args ...string,
) *Container {
z := New(ctx, msg)
z.Path = pathname
z.Args = append([]string{name}, args...)

View File

@@ -21,7 +21,8 @@ type osFile interface {
fs.File
}
// syscallDispatcher provides methods that make state-dependent system calls as part of their behaviour.
// syscallDispatcher provides methods that make state-dependent system calls as
// part of their behaviour.
type syscallDispatcher interface {
// new starts a goroutine with a new instance of syscallDispatcher.
// A syscallDispatcher must never be used in any goroutine other than the one owning it,

View File

@@ -43,7 +43,8 @@ func messageFromError(err error) (m string, ok bool) {
}
// messagePrefix checks and prefixes the error message of a non-pointer error.
// While this is usable for pointer errors, such use should be avoided as nil check is omitted.
// While this is usable for pointer errors, such use should be avoided as nil
// check is omitted.
func messagePrefix[T error](prefix string, err error) (string, bool) {
var targetError T
if errors.As(err, &targetError) {

View File

@@ -9,7 +9,8 @@ const (
// Tmp points to the place for small temporary files.
Tmp = "/tmp/"
// Run points to a "tmpfs" file system for system packages to place runtime data, socket files, and similar.
// Run points to a "tmpfs" file system for system packages to place runtime
// data, socket files, and similar.
Run = "/run/"
// RunUser points to a directory containing per-user runtime directories,
// each usually individually mounted "tmpfs" instances.
@@ -17,10 +18,12 @@ const (
// Usr points to vendor-supplied operating system resources.
Usr = "/usr/"
// UsrBin points to binaries and executables for user commands that shall appear in the $PATH search path.
// UsrBin points to binaries and executables for user commands that shall
// appear in the $PATH search path.
UsrBin = Usr + "bin/"
// Var points to persistent, variable system data. Writable during normal system operation.
// Var points to persistent, variable system data. Writable during normal
// system operation.
Var = "/var/"
// VarLib points to persistent system data.
VarLib = Var + "lib/"
@@ -29,12 +32,16 @@ const (
// Dev points to the root directory for device nodes.
Dev = "/dev/"
// DevShm is the place for POSIX shared memory segments, as created via shm_open(3).
// DevShm is the place for POSIX shared memory segments, as created via
// shm_open(3).
DevShm = "/dev/shm/"
// Proc points to a virtual kernel file system exposing the process list and other functionality.
// Proc points to a virtual kernel file system exposing the process list and
// other functionality.
Proc = "/proc/"
// ProcSys points to a hierarchy below /proc/ that exposes a number of kernel tunables.
// ProcSys points to a hierarchy below /proc/ that exposes a number of
// kernel tunables.
ProcSys = Proc + "sys/"
// Sys points to a virtual kernel file system exposing discovered devices and other functionality.
// Sys points to a virtual kernel file system exposing discovered devices
// and other functionality.
Sys = "/sys/"
)

View File

@@ -33,12 +33,12 @@ const (
- This path is only accessible by init and root:
The container init sets SUID_DUMP_DISABLE and terminates if that fails.
It should be noted that none of this should become relevant at any point since the resulting
intermediate root tmpfs should be effectively anonymous. */
It should be noted that none of this should become relevant at any point
since the resulting intermediate root tmpfs should be effectively anonymous. */
intermediateHostPath = fhs.Proc + "self/fd"
// setupEnv is the name of the environment variable holding the string representation of
// the read end file descriptor of the setup params pipe.
// setupEnv is the name of the environment variable holding the string
// representation of the read end file descriptor of the setup params pipe.
setupEnv = "HAKUREI_SETUP"
// exitUnexpectedWait4 is the exit code if wait4 returns an unexpected errno.
@@ -59,7 +59,8 @@ type (
// late is called right before starting the initial process.
late(state *setupState, k syscallDispatcher) error
// prefix returns a log message prefix, and whether this Op prints no identifying message on its own.
// prefix returns a log message prefix, and whether this Op prints no
// identifying message on its own.
prefix() (string, bool)
Is(op Op) bool
@@ -71,9 +72,11 @@ type (
setupState struct {
nonrepeatable uintptr
// Whether early reaping has concluded. Must only be accessed in the wait4 loop.
// Whether early reaping has concluded. Must only be accessed in the
// wait4 loop.
processConcluded bool
// Process to syscall.WaitStatus populated in the wait4 loop. Freed after early reaping concludes.
// Process to syscall.WaitStatus populated in the wait4 loop. Freed
// after early reaping concludes.
process map[int]WaitStatus
// Synchronises access to process.
processMu sync.RWMutex
@@ -216,9 +219,10 @@ func initEntrypoint(k syscallDispatcher, msg message.Msg) {
defer cancel()
/* early is called right before pivot_root into intermediate root;
this step is mostly for gathering information that would otherwise be difficult to obtain
via library functions after pivot_root, and implementations are expected to avoid changing
the state of the mount namespace */
this step is mostly for gathering information that would otherwise be
difficult to obtain via library functions after pivot_root, and
implementations are expected to avoid changing the state of the mount
namespace */
for i, op := range *params.Ops {
if op == nil || !op.Valid() {
k.fatalf(msg, "invalid op at index %d", i)
@@ -258,10 +262,10 @@ func initEntrypoint(k syscallDispatcher, msg message.Msg) {
k.fatalf(msg, "cannot enter intermediate root: %v", err)
}
/* apply is called right after pivot_root and entering the new root;
this step sets up the container filesystem, and implementations are expected to keep the host root
and sysroot mount points intact but otherwise can do whatever they need to;
chdir is allowed but discouraged */
/* apply is called right after pivot_root and entering the new root. This
step sets up the container filesystem, and implementations are expected to
keep the host root and sysroot mount points intact but otherwise can do
whatever they need to. Calling chdir is allowed but discouraged. */
for i, op := range *params.Ops {
// ops already checked during early setup
if prefix, ok := op.prefix(); ok {

View File

@@ -12,14 +12,16 @@ import (
func init() { gob.Register(new(BindMountOp)) }
// Bind appends an [Op] that bind mounts host path [BindMountOp.Source] on container path [BindMountOp.Target].
// Bind is a helper for appending [BindMountOp] to [Ops].
func (f *Ops) Bind(source, target *check.Absolute, flags int) *Ops {
*f = append(*f, &BindMountOp{nil, source, target, flags})
return f
}
// BindMountOp bind mounts host path Source on container path Target.
// Note that Flags uses bits declared in this package and should not be set with constants in [syscall].
// BindMountOp creates a bind mount from host path Source to container path Target.
//
// Note that Flags uses bits declared in the [std] package and should not be set
// with constants in [syscall].
type BindMountOp struct {
sourceFinal, Source, Target *check.Absolute

View File

@@ -24,8 +24,7 @@ const (
daemonTimeout = 5 * time.Second
)
// Daemon appends an [Op] that starts a daemon in the container and blocks until
// [DaemonOp.Target] appears.
// Daemon is a helper for appending [DaemonOp] to [Ops].
func (f *Ops) Daemon(target, path *check.Absolute, args ...string) *Ops {
*f = append(*f, &DaemonOp{target, path, args})
return f

View File

@@ -19,7 +19,9 @@ func (f *Ops) Dev(target *check.Absolute, mqueue bool) *Ops {
}
// DevWritable appends an [Op] that mounts a writable subset of host /dev.
// There is usually no good reason to write to /dev, so this should always be followed by a [RemountOp].
//
// There is usually no good reason to write to /dev, so this should always be
// followed by a [RemountOp].
func (f *Ops) DevWritable(target *check.Absolute, mqueue bool) *Ops {
*f = append(*f, &MountDevOp{target, mqueue, true})
return f

View File

@@ -10,7 +10,7 @@ import (
func init() { gob.Register(new(MkdirOp)) }
// Mkdir appends an [Op] that creates a directory in the container filesystem.
// Mkdir is a helper for appending [MkdirOp] to [Ops].
func (f *Ops) Mkdir(name *check.Absolute, perm os.FileMode) *Ops {
*f = append(*f, &MkdirOp{name, perm})
return f

View File

@@ -54,8 +54,11 @@ func (e *OverlayArgumentError) Error() string {
}
}
// Overlay appends an [Op] that mounts the overlay pseudo filesystem on [MountOverlayOp.Target].
func (f *Ops) Overlay(target, state, work *check.Absolute, layers ...*check.Absolute) *Ops {
// Overlay is a helper for appending [MountOverlayOp] to [Ops].
func (f *Ops) Overlay(
target, state, work *check.Absolute,
layers ...*check.Absolute,
) *Ops {
*f = append(*f, &MountOverlayOp{
Target: target,
Lower: layers,
@@ -65,13 +68,12 @@ func (f *Ops) Overlay(target, state, work *check.Absolute, layers ...*check.Abso
return f
}
// OverlayEphemeral appends an [Op] that mounts the overlay pseudo filesystem on [MountOverlayOp.Target]
// with an ephemeral upperdir and workdir.
// OverlayEphemeral appends a [MountOverlayOp] with an ephemeral upperdir and workdir.
func (f *Ops) OverlayEphemeral(target *check.Absolute, layers ...*check.Absolute) *Ops {
return f.Overlay(target, fhs.AbsRoot, nil, layers...)
}
// OverlayReadonly appends an [Op] that mounts the overlay pseudo filesystem readonly on [MountOverlayOp.Target]
// OverlayReadonly appends a readonly [MountOverlayOp].
func (f *Ops) OverlayReadonly(target *check.Absolute, layers ...*check.Absolute) *Ops {
return f.Overlay(target, nil, nil, layers...)
}
@@ -82,25 +84,34 @@ type MountOverlayOp struct {
// Any filesystem, does not need to be on a writable filesystem.
Lower []*check.Absolute
// formatted for [OptionOverlayLowerdir], resolved, prefixed and escaped during early
// Formatted for [OptionOverlayLowerdir].
//
// Resolved, prefixed and escaped during early.
lower []string
// The upperdir is normally on a writable filesystem.
//
// If Work is nil and Upper holds the special value [fhs.AbsRoot],
// an ephemeral upperdir and workdir will be set up.
// If Work is nil and Upper holds the special value [fhs.AbsRoot], an
// ephemeral upperdir and workdir will be set up.
//
// If both Work and Upper are nil, upperdir and workdir is omitted and the overlay is mounted readonly.
// If both Work and Upper are nil, upperdir and workdir are omitted and the
// overlay is mounted readonly.
Upper *check.Absolute
// formatted for [OptionOverlayUpperdir], resolved, prefixed and escaped during early
// Formatted for [OptionOverlayUpperdir].
//
// Resolved, prefixed and escaped during early.
upper string
// The workdir needs to be an empty directory on the same filesystem as upperdir.
Work *check.Absolute
// formatted for [OptionOverlayWorkdir], resolved, prefixed and escaped during early
// Formatted for [OptionOverlayWorkdir].
//
// Resolved, prefixed and escaped during early.
work string
ephemeral bool
// used internally for mounting to the intermediate root
// Used internally for mounting to the intermediate root.
noPrefix bool
}

View File

@@ -16,7 +16,7 @@ const (
func init() { gob.Register(new(TmpfileOp)) }
// Place appends an [Op] that places a file in container path [TmpfileOp.Path] containing [TmpfileOp.Data].
// Place is a helper for appending [TmpfileOp] to [Ops].
func (f *Ops) Place(name *check.Absolute, data []byte) *Ops {
*f = append(*f, &TmpfileOp{name, data})
return f

View File

@@ -10,7 +10,7 @@ import (
func init() { gob.Register(new(MountProcOp)) }
// Proc appends an [Op] that mounts a private instance of proc.
// Proc is a helper for appending [MountProcOp] to [Ops].
func (f *Ops) Proc(target *check.Absolute) *Ops {
*f = append(*f, &MountProcOp{target})
return f

View File

@@ -9,7 +9,7 @@ import (
func init() { gob.Register(new(RemountOp)) }
// Remount appends an [Op] that applies [RemountOp.Flags] on container path [RemountOp.Target].
// Remount is a helper for appending [RemountOp] to [Ops].
func (f *Ops) Remount(target *check.Absolute, flags uintptr) *Ops {
*f = append(*f, &RemountOp{target, flags})
return f

View File

@@ -38,6 +38,7 @@ const (
_LANDLOCK_ACCESS_FS_DELIM
)
// String returns a space-separated string of [LandlockAccessFS] flags.
func (f LandlockAccessFS) String() string {
switch f {
case LANDLOCK_ACCESS_FS_EXECUTE:
@@ -116,6 +117,7 @@ const (
_LANDLOCK_ACCESS_NET_DELIM
)
// String returns a space-separated string of [LandlockAccessNet] flags.
func (f LandlockAccessNet) String() string {
switch f {
case LANDLOCK_ACCESS_NET_BIND_TCP:
@@ -152,6 +154,7 @@ const (
_LANDLOCK_SCOPE_DELIM
)
// String returns a space-separated string of [LandlockScope] flags.
func (f LandlockScope) String() string {
switch f {
case LANDLOCK_SCOPE_ABSTRACT_UNIX_SOCKET:
@@ -184,10 +187,12 @@ type RulesetAttr struct {
HandledAccessFS LandlockAccessFS
// Bitmask of handled network actions.
HandledAccessNet LandlockAccessNet
// Bitmask of scopes restricting a Landlock domain from accessing outside resources (e.g. IPCs).
// Bitmask of scopes restricting a Landlock domain from accessing outside
// resources (e.g. IPCs).
Scoped LandlockScope
}
// String returns a user-facing description of [RulesetAttr].
func (rulesetAttr *RulesetAttr) String() string {
if rulesetAttr == nil {
return "NULL"
@@ -208,6 +213,7 @@ func (rulesetAttr *RulesetAttr) String() string {
return strings.Join(elems, ", ")
}
// Create loads the ruleset into the kernel.
func (rulesetAttr *RulesetAttr) Create(flags uintptr) (fd int, err error) {
var pointer, size uintptr
// NULL needed for abi version
@@ -216,10 +222,13 @@ func (rulesetAttr *RulesetAttr) Create(flags uintptr) (fd int, err error) {
size = unsafe.Sizeof(*rulesetAttr)
}
rulesetFd, _, errno := syscall.Syscall(std.SYS_LANDLOCK_CREATE_RULESET, pointer, size, flags)
rulesetFd, _, errno := syscall.Syscall(
std.SYS_LANDLOCK_CREATE_RULESET,
pointer, size,
flags,
)
fd = int(rulesetFd)
err = errno
if fd < 0 {
return
}
@@ -230,12 +239,19 @@ func (rulesetAttr *RulesetAttr) Create(flags uintptr) (fd int, err error) {
return fd, nil
}
// LandlockGetABI returns the ABI version supported by the kernel.
func LandlockGetABI() (int, error) {
return (*RulesetAttr)(nil).Create(LANDLOCK_CREATE_RULESET_VERSION)
}
// LandlockRestrictSelf applies a loaded ruleset to the calling thread.
func LandlockRestrictSelf(rulesetFd int, flags uintptr) error {
r, _, errno := syscall.Syscall(std.SYS_LANDLOCK_RESTRICT_SELF, uintptr(rulesetFd), flags, 0)
r, _, errno := syscall.Syscall(
std.SYS_LANDLOCK_RESTRICT_SELF,
uintptr(rulesetFd),
flags,
0,
)
if r != 0 {
return errno
}

View File

@@ -15,7 +15,10 @@ import (
const (
// Nonexistent is a path that cannot exist.
// /proc is chosen because a system with covered /proc is unsupported by this package.
//
// This path can never be presented by the kernel if proc is mounted on
// /proc/. This can only exist if parts of /proc/ are covered, or proc is not
// mounted at all. Neither configuration is supported by this package.
Nonexistent = fhs.Proc + "nonexistent"
hostPath = fhs.Root + hostDir

View File

@@ -88,18 +88,22 @@ var resPrefix = [...]string{
7: "seccomp_load failed",
}
// cbAllocateBuffer is the function signature for the function handle passed to hakurei_export_filter
// which allocates the buffer that the resulting bpf program is copied into, and writes its slice header
// to a value held by the caller.
// cbAllocateBuffer is the function signature for the function handle passed to
// hakurei_scmp_make_filter which allocates the buffer that the resulting bpf
// program is copied into, and writes its slice header to a value held by the caller.
type cbAllocateBuffer = func(len C.size_t) (buf unsafe.Pointer)
// hakurei_scmp_allocate allocates a buffer of specified size known to the
// runtime through a callback passed in a [cgo.Handle].
//
//export hakurei_scmp_allocate
func hakurei_scmp_allocate(f C.uintptr_t, len C.size_t) (buf unsafe.Pointer) {
return cgo.Handle(f).Value().(cbAllocateBuffer)(len)
}
// makeFilter generates a bpf program from a slice of [std.NativeRule] and writes the resulting byte slice to p.
// The filter is installed to the current process if p is nil.
// makeFilter generates a bpf program from a slice of [std.NativeRule] and
// writes the resulting byte slice to p. The filter is installed to the current
// process if p is nil.
func makeFilter(rules []std.NativeRule, flags ExportFlag, p *[]byte) error {
if len(rules) == 0 {
return ErrInvalidRules
@@ -170,8 +174,8 @@ func Export(rules []std.NativeRule, flags ExportFlag) (data []byte, err error) {
return
}
// Load generates a bpf program from a slice of [std.NativeRule] and enforces it on the current process.
// Errors returned by libseccomp is wrapped in [LibraryError].
// Load generates a bpf program from a slice of [std.NativeRule] and enforces it
// on the current process. Errors returned by libseccomp are wrapped in [LibraryError].
func Load(rules []std.NativeRule, flags ExportFlag) error { return makeFilter(rules, flags, nil) }
type (

View File

@@ -2,6 +2,8 @@ package vfs
import "strings"
// Unmangle reverses mangling of strings done by the kernel. Its behaviour is
// consistent with the equivalent function in util-linux.
func Unmangle(s string) string {
if !strings.ContainsRune(s, '\\') {
return s

View File

@@ -24,6 +24,7 @@ var (
ErrMountInfoSep = errors.New("bad optional fields separator")
)
// A DecoderError describes a nonrecoverable error decoding a mountinfo stream.
type DecoderError struct {
Op string
Line int
@@ -51,7 +52,8 @@ func (e *DecoderError) Error() string {
}
type (
// A MountInfoDecoder reads and decodes proc_pid_mountinfo(5) entries from an input stream.
// A MountInfoDecoder reads and decodes proc_pid_mountinfo(5) entries from
// an input stream.
MountInfoDecoder struct {
s *bufio.Scanner
m *MountInfo
@@ -72,13 +74,16 @@ type (
MountInfoEntry struct {
// mount ID: a unique ID for the mount (may be reused after umount(2)).
ID int `json:"id"`
// parent ID: the ID of the parent mount (or of self for the root of this mount namespace's mount tree).
// parent ID: the ID of the parent mount (or of self for the root of
// this mount namespace's mount tree).
Parent int `json:"parent"`
// major:minor: the value of st_dev for files on this filesystem (see stat(2)).
Devno DevT `json:"devno"`
// root: the pathname of the directory in the filesystem which forms the root of this mount.
// root: the pathname of the directory in the filesystem which forms the
// root of this mount.
Root string `json:"root"`
// mount point: the pathname of the mount point relative to the process's root directory.
// mount point: the pathname of the mount point relative to the
// process's root directory.
Target string `json:"target"`
// mount options: per-mount options (see mount(2)).
VfsOptstr string `json:"vfs_optstr"`
@@ -126,7 +131,8 @@ func (e *MountInfoEntry) Flags() (flags uintptr, unmatched []string) {
// NewMountInfoDecoder returns a new decoder that reads from r.
//
// The decoder introduces its own buffering and may read data from r beyond the mountinfo entries requested.
// The decoder introduces its own buffering and may read data from r beyond the
// mountinfo entries requested.
func NewMountInfoDecoder(r io.Reader) *MountInfoDecoder {
return &MountInfoDecoder{s: bufio.NewScanner(r)}
}
@@ -271,6 +277,8 @@ func parseMountInfoLine(s string, ent *MountInfoEntry) error {
return nil
}
// EqualWithIgnore compares two [MountInfoEntry] values, ignoring fields that
// compare equal to ignore.
func (e *MountInfoEntry) EqualWithIgnore(want *MountInfoEntry, ignore string) bool {
return (e.ID == want.ID || want.ID == -1) &&
(e.Parent == want.Parent || want.Parent == -1) &&
@@ -284,6 +292,8 @@ func (e *MountInfoEntry) EqualWithIgnore(want *MountInfoEntry, ignore string) bo
(e.FsOptstr == want.FsOptstr || want.FsOptstr == ignore)
}
// String returns a user-facing representation of a [MountInfoEntry]. It fits
// roughly into the mountinfo format, but without mangling.
func (e *MountInfoEntry) String() string {
return fmt.Sprintf("%d %d %d:%d %s %s %s %s %s %s %s",
e.ID, e.Parent, e.Devno[0], e.Devno[1], e.Root, e.Target, e.VfsOptstr,

View File

@@ -6,6 +6,7 @@ import (
"strings"
)
// UnfoldTargetError is a pathname that never appeared in a mount hierarchy.
type UnfoldTargetError string
func (e UnfoldTargetError) Error() string {
@@ -27,6 +28,7 @@ func (n *MountInfoNode) Collective() iter.Seq[*MountInfoNode] {
return func(yield func(*MountInfoNode) bool) { n.visit(yield) }
}
// visit recursively visits all visible mountinfo nodes.
func (n *MountInfoNode) visit(yield func(*MountInfoNode) bool) bool {
if !n.Covered && !yield(n) {
return false