container: improve documentation
All checks were successful
Test / Create distribution (push) Successful in 1m16s
Test / Sandbox (push) Successful in 3m2s
Test / Hakurei (push) Successful in 4m4s
Test / ShareFS (push) Successful in 4m17s
Test / Hpkg (push) Successful in 4m49s
Test / Sandbox (race detector) (push) Successful in 5m22s
Test / Hakurei (race detector) (push) Successful in 6m30s
Test / Flake checks (push) Successful in 1m48s
All checks were successful
Test / Create distribution (push) Successful in 1m16s
Test / Sandbox (push) Successful in 3m2s
Test / Hakurei (push) Successful in 4m4s
Test / ShareFS (push) Successful in 4m17s
Test / Hpkg (push) Successful in 4m49s
Test / Sandbox (race detector) (push) Successful in 5m22s
Test / Hakurei (race detector) (push) Successful in 6m30s
Test / Flake checks (push) Successful in 1m48s
This change removes inconsistencies collected over time in this package. Signed-off-by: Ophestra <cat@gensokyo.uk>
This commit is contained in:
@@ -10,8 +10,7 @@ import (
|
||||
|
||||
func init() { gob.Register(new(AutoEtcOp)) }
|
||||
|
||||
// Etc appends an [Op] that expands host /etc into a toplevel symlink mirror with /etc semantics.
|
||||
// This is not a generic setup op. It is implemented here to reduce ipc overhead.
|
||||
// Etc is a helper for appending [AutoEtcOp] to [Ops].
|
||||
func (f *Ops) Etc(host *check.Absolute, prefix string) *Ops {
|
||||
e := &AutoEtcOp{prefix}
|
||||
f.Mkdir(fhs.AbsEtc, 0755)
|
||||
@@ -20,6 +19,9 @@ func (f *Ops) Etc(host *check.Absolute, prefix string) *Ops {
|
||||
return f
|
||||
}
|
||||
|
||||
// AutoEtcOp expands host /etc into a toplevel symlink mirror with /etc semantics.
|
||||
//
|
||||
// This is not a generic setup op. It is implemented here to reduce ipc overhead.
|
||||
type AutoEtcOp struct{ Prefix string }
|
||||
|
||||
func (e *AutoEtcOp) Valid() bool { return e != nil }
|
||||
|
||||
@@ -11,13 +11,15 @@ import (
|
||||
|
||||
func init() { gob.Register(new(AutoRootOp)) }
|
||||
|
||||
// Root appends an [Op] that expands a directory into a toplevel bind mount mirror on container root.
|
||||
// This is not a generic setup op. It is implemented here to reduce ipc overhead.
|
||||
// Root is a helper for appending [AutoRootOp] to [Ops].
|
||||
func (f *Ops) Root(host *check.Absolute, flags int) *Ops {
|
||||
*f = append(*f, &AutoRootOp{host, flags, nil})
|
||||
return f
|
||||
}
|
||||
|
||||
// AutoRootOp expands a directory into a toplevel bind mount mirror on container root.
|
||||
//
|
||||
// This is not a generic setup op. It is implemented here to reduce ipc overhead.
|
||||
type AutoRootOp struct {
|
||||
Host *check.Absolute
|
||||
// passed through to bindMount
|
||||
|
||||
@@ -50,10 +50,16 @@ func capset(hdrp *capHeader, datap *[2]capData) error {
|
||||
}
|
||||
|
||||
// capBoundingSetDrop drops a capability from the calling thread's capability bounding set.
|
||||
func capBoundingSetDrop(cap uintptr) error { return Prctl(syscall.PR_CAPBSET_DROP, cap, 0) }
|
||||
func capBoundingSetDrop(cap uintptr) error {
|
||||
return Prctl(syscall.PR_CAPBSET_DROP, cap, 0)
|
||||
}
|
||||
|
||||
// capAmbientClearAll clears the ambient capability set of the calling thread.
|
||||
func capAmbientClearAll() error { return Prctl(PR_CAP_AMBIENT, PR_CAP_AMBIENT_CLEAR_ALL, 0) }
|
||||
func capAmbientClearAll() error {
|
||||
return Prctl(PR_CAP_AMBIENT, PR_CAP_AMBIENT_CLEAR_ALL, 0)
|
||||
}
|
||||
|
||||
// capAmbientRaise adds to the ambient capability set of the calling thread.
|
||||
func capAmbientRaise(cap uintptr) error { return Prctl(PR_CAP_AMBIENT, PR_CAP_AMBIENT_RAISE, cap) }
|
||||
func capAmbientRaise(cap uintptr) error {
|
||||
return Prctl(PR_CAP_AMBIENT, PR_CAP_AMBIENT_RAISE, cap)
|
||||
}
|
||||
|
||||
@@ -11,7 +11,8 @@ const (
|
||||
SpecialOverlayPath = ":"
|
||||
)
|
||||
|
||||
// EscapeOverlayDataSegment escapes a string for formatting into the data argument of an overlay mount call.
|
||||
// EscapeOverlayDataSegment escapes a string for formatting into the data
|
||||
// argument of an overlay mount system call.
|
||||
func EscapeOverlayDataSegment(s string) string {
|
||||
if s == "" {
|
||||
return ""
|
||||
|
||||
@@ -1,4 +1,5 @@
|
||||
// Package container implements unprivileged Linux containers with built-in support for syscall filtering.
|
||||
// Package container implements unprivileged Linux containers with built-in
|
||||
// support for syscall filtering.
|
||||
package container
|
||||
|
||||
import (
|
||||
@@ -42,22 +43,25 @@ type (
|
||||
SchedPolicy int
|
||||
// Cgroup fd, nil to disable.
|
||||
Cgroup *int
|
||||
// ExtraFiles passed through to initial process in the container,
|
||||
// with behaviour identical to its [exec.Cmd] counterpart.
|
||||
// ExtraFiles passed through to initial process in the container, with
|
||||
// behaviour identical to its [exec.Cmd] counterpart.
|
||||
ExtraFiles []*os.File
|
||||
|
||||
// param pipe for shim and init
|
||||
// Write end of a pipe connected to the init to deliver [Params].
|
||||
setup *os.File
|
||||
// cancels cmd
|
||||
// Cancels the context passed to the underlying cmd.
|
||||
cancel context.CancelFunc
|
||||
// closed after Wait returns
|
||||
// Closed after Wait returns. Keeps the spawning thread alive.
|
||||
wait chan struct{}
|
||||
|
||||
Stdin io.Reader
|
||||
Stdout io.Writer
|
||||
Stderr io.Writer
|
||||
|
||||
Cancel func(cmd *exec.Cmd) error
|
||||
// Custom cancellation behaviour for the underlying [exec.Cmd]. Must
|
||||
// deliver [CancelSignal] before returning.
|
||||
Cancel func(cmd *exec.Cmd) error
|
||||
// Copied to the underlying [exec.Cmd].
|
||||
WaitDelay time.Duration
|
||||
|
||||
cmd *exec.Cmd
|
||||
@@ -286,7 +290,11 @@ func (p *Container) Start() error {
|
||||
|
||||
// place setup pipe before user supplied extra files, this is later restored by init
|
||||
if fd, f, err := Setup(&p.cmd.ExtraFiles); err != nil {
|
||||
return &StartError{true, "set up params stream", err, false, false}
|
||||
return &StartError{
|
||||
Fatal: true,
|
||||
Step: "set up params stream",
|
||||
Err: err,
|
||||
}
|
||||
} else {
|
||||
p.setup = f
|
||||
p.cmd.Env = []string{setupEnv + "=" + strconv.Itoa(fd)}
|
||||
@@ -298,10 +306,16 @@ func (p *Container) Start() error {
|
||||
runtime.LockOSThread()
|
||||
p.wait = make(chan struct{})
|
||||
|
||||
done <- func() error { // setup depending on per-thread state must happen here
|
||||
// PR_SET_NO_NEW_PRIVS: depends on per-thread state but acts on all processes created from that thread
|
||||
// setup depending on per-thread state must happen here
|
||||
done <- func() error {
|
||||
// PR_SET_NO_NEW_PRIVS: thread-directed but acts on all processes
|
||||
// created from the calling thread
|
||||
if err := SetNoNewPrivs(); err != nil {
|
||||
return &StartError{true, "prctl(PR_SET_NO_NEW_PRIVS)", err, false, false}
|
||||
return &StartError{
|
||||
Fatal: true,
|
||||
Step: "prctl(PR_SET_NO_NEW_PRIVS)",
|
||||
Err: err,
|
||||
}
|
||||
}
|
||||
|
||||
// landlock: depends on per-thread state but acts on a process group
|
||||
@@ -313,28 +327,40 @@ func (p *Container) Start() error {
|
||||
|
||||
if abi, err := LandlockGetABI(); err != nil {
|
||||
if p.HostAbstract {
|
||||
// landlock can be skipped here as it restricts access to resources
|
||||
// already covered by namespaces (pid)
|
||||
// landlock can be skipped here as it restricts access
|
||||
// to resources already covered by namespaces (pid)
|
||||
goto landlockOut
|
||||
}
|
||||
return &StartError{false, "get landlock ABI", err, false, false}
|
||||
return &StartError{Step: "get landlock ABI", Err: err}
|
||||
} else if abi < 6 {
|
||||
if p.HostAbstract {
|
||||
// see above comment
|
||||
goto landlockOut
|
||||
}
|
||||
return &StartError{false, "kernel version too old for LANDLOCK_SCOPE_ABSTRACT_UNIX_SOCKET", ENOSYS, true, false}
|
||||
return &StartError{
|
||||
Step: "kernel too old for LANDLOCK_SCOPE_ABSTRACT_UNIX_SOCKET",
|
||||
Err: ENOSYS,
|
||||
Origin: true,
|
||||
}
|
||||
} else {
|
||||
p.msg.Verbosef("landlock abi version %d", abi)
|
||||
}
|
||||
|
||||
if rulesetFd, err := rulesetAttr.Create(0); err != nil {
|
||||
return &StartError{true, "create landlock ruleset", err, false, false}
|
||||
return &StartError{
|
||||
Fatal: true,
|
||||
Step: "create landlock ruleset",
|
||||
Err: err,
|
||||
}
|
||||
} else {
|
||||
p.msg.Verbosef("enforcing landlock ruleset %s", rulesetAttr)
|
||||
if err = LandlockRestrictSelf(rulesetFd, 0); err != nil {
|
||||
_ = Close(rulesetFd)
|
||||
return &StartError{true, "enforce landlock ruleset", err, false, false}
|
||||
return &StartError{
|
||||
Fatal: true,
|
||||
Step: "enforce landlock ruleset",
|
||||
Err: err,
|
||||
}
|
||||
}
|
||||
if err = Close(rulesetFd); err != nil {
|
||||
p.msg.Verbosef("cannot close landlock ruleset: %v", err)
|
||||
@@ -346,7 +372,7 @@ func (p *Container) Start() error {
|
||||
}
|
||||
|
||||
// sched_setscheduler: thread-directed but acts on all processes
|
||||
// created from that thread
|
||||
// created from the calling thread
|
||||
if p.SchedPolicy > 0 {
|
||||
p.msg.Verbosef("setting scheduling policy %d", p.SchedPolicy)
|
||||
if err := schedSetscheduler(
|
||||
@@ -364,7 +390,11 @@ func (p *Container) Start() error {
|
||||
|
||||
p.msg.Verbose("starting container init")
|
||||
if err := p.cmd.Start(); err != nil {
|
||||
return &StartError{false, "start container init", err, false, true}
|
||||
return &StartError{
|
||||
Step: "start container init",
|
||||
Err: err,
|
||||
Passthrough: true,
|
||||
}
|
||||
}
|
||||
return nil
|
||||
}()
|
||||
@@ -376,6 +406,7 @@ func (p *Container) Start() error {
|
||||
}
|
||||
|
||||
// Serve serves [Container.Params] to the container init.
|
||||
//
|
||||
// Serve must only be called once.
|
||||
func (p *Container) Serve() error {
|
||||
if p.setup == nil {
|
||||
@@ -385,12 +416,21 @@ func (p *Container) Serve() error {
|
||||
setup := p.setup
|
||||
p.setup = nil
|
||||
if err := setup.SetDeadline(time.Now().Add(initSetupTimeout)); err != nil {
|
||||
return &StartError{true, "set init pipe deadline", err, false, true}
|
||||
return &StartError{
|
||||
Fatal: true,
|
||||
Step: "set init pipe deadline",
|
||||
Err: err,
|
||||
Passthrough: true,
|
||||
}
|
||||
}
|
||||
|
||||
if p.Path == nil {
|
||||
p.cancel()
|
||||
return &StartError{false, "invalid executable pathname", EINVAL, true, false}
|
||||
return &StartError{
|
||||
Step: "invalid executable pathname",
|
||||
Err: EINVAL,
|
||||
Origin: true,
|
||||
}
|
||||
}
|
||||
|
||||
// do not transmit nil
|
||||
@@ -415,7 +455,8 @@ func (p *Container) Serve() error {
|
||||
return err
|
||||
}
|
||||
|
||||
// Wait waits for the container init process to exit and releases any resources associated with the [Container].
|
||||
// Wait blocks until the container init process to exit and releases any
|
||||
// resources associated with the [Container].
|
||||
func (p *Container) Wait() error {
|
||||
if p.cmd == nil || p.cmd.Process == nil {
|
||||
return EINVAL
|
||||
@@ -460,11 +501,13 @@ func (p *Container) StderrPipe() (r io.ReadCloser, err error) {
|
||||
}
|
||||
|
||||
func (p *Container) String() string {
|
||||
return fmt.Sprintf("argv: %q, filter: %v, rules: %d, flags: %#x, presets: %#x",
|
||||
p.Args, !p.SeccompDisable, len(p.SeccompRules), int(p.SeccompFlags), int(p.SeccompPresets))
|
||||
return fmt.Sprintf(
|
||||
"argv: %q, filter: %v, rules: %d, flags: %#x, presets: %#x",
|
||||
p.Args, !p.SeccompDisable, len(p.SeccompRules), int(p.SeccompFlags), int(p.SeccompPresets),
|
||||
)
|
||||
}
|
||||
|
||||
// ProcessState returns the address to os.ProcessState held by the underlying [exec.Cmd].
|
||||
// ProcessState returns the address of os.ProcessState held by the underlying [exec.Cmd].
|
||||
func (p *Container) ProcessState() *os.ProcessState {
|
||||
if p.cmd == nil {
|
||||
return nil
|
||||
@@ -472,7 +515,8 @@ func (p *Container) ProcessState() *os.ProcessState {
|
||||
return p.cmd.ProcessState
|
||||
}
|
||||
|
||||
// New returns the address to a new instance of [Container] that requires further initialisation before use.
|
||||
// New returns the address to a new instance of [Container]. This value requires
|
||||
// further initialisation before use.
|
||||
func New(ctx context.Context, msg message.Msg) *Container {
|
||||
if msg == nil {
|
||||
msg = message.New(nil)
|
||||
@@ -486,7 +530,13 @@ func New(ctx context.Context, msg message.Msg) *Container {
|
||||
}
|
||||
|
||||
// NewCommand calls [New] and initialises the [Params.Path] and [Params.Args] fields.
|
||||
func NewCommand(ctx context.Context, msg message.Msg, pathname *check.Absolute, name string, args ...string) *Container {
|
||||
func NewCommand(
|
||||
ctx context.Context,
|
||||
msg message.Msg,
|
||||
pathname *check.Absolute,
|
||||
name string,
|
||||
args ...string,
|
||||
) *Container {
|
||||
z := New(ctx, msg)
|
||||
z.Path = pathname
|
||||
z.Args = append([]string{name}, args...)
|
||||
|
||||
@@ -21,7 +21,8 @@ type osFile interface {
|
||||
fs.File
|
||||
}
|
||||
|
||||
// syscallDispatcher provides methods that make state-dependent system calls as part of their behaviour.
|
||||
// syscallDispatcher provides methods that make state-dependent system calls as
|
||||
// part of their behaviour.
|
||||
type syscallDispatcher interface {
|
||||
// new starts a goroutine with a new instance of syscallDispatcher.
|
||||
// A syscallDispatcher must never be used in any goroutine other than the one owning it,
|
||||
|
||||
@@ -43,7 +43,8 @@ func messageFromError(err error) (m string, ok bool) {
|
||||
}
|
||||
|
||||
// messagePrefix checks and prefixes the error message of a non-pointer error.
|
||||
// While this is usable for pointer errors, such use should be avoided as nil check is omitted.
|
||||
// While this is usable for pointer errors, such use should be avoided as nil
|
||||
// check is omitted.
|
||||
func messagePrefix[T error](prefix string, err error) (string, bool) {
|
||||
var targetError T
|
||||
if errors.As(err, &targetError) {
|
||||
|
||||
@@ -9,7 +9,8 @@ const (
|
||||
// Tmp points to the place for small temporary files.
|
||||
Tmp = "/tmp/"
|
||||
|
||||
// Run points to a "tmpfs" file system for system packages to place runtime data, socket files, and similar.
|
||||
// Run points to a "tmpfs" file system for system packages to place runtime
|
||||
// data, socket files, and similar.
|
||||
Run = "/run/"
|
||||
// RunUser points to a directory containing per-user runtime directories,
|
||||
// each usually individually mounted "tmpfs" instances.
|
||||
@@ -17,10 +18,12 @@ const (
|
||||
|
||||
// Usr points to vendor-supplied operating system resources.
|
||||
Usr = "/usr/"
|
||||
// UsrBin points to binaries and executables for user commands that shall appear in the $PATH search path.
|
||||
// UsrBin points to binaries and executables for user commands that shall
|
||||
// appear in the $PATH search path.
|
||||
UsrBin = Usr + "bin/"
|
||||
|
||||
// Var points to persistent, variable system data. Writable during normal system operation.
|
||||
// Var points to persistent, variable system data. Writable during normal
|
||||
// system operation.
|
||||
Var = "/var/"
|
||||
// VarLib points to persistent system data.
|
||||
VarLib = Var + "lib/"
|
||||
@@ -29,12 +32,16 @@ const (
|
||||
|
||||
// Dev points to the root directory for device nodes.
|
||||
Dev = "/dev/"
|
||||
// DevShm is the place for POSIX shared memory segments, as created via shm_open(3).
|
||||
// DevShm is the place for POSIX shared memory segments, as created via
|
||||
// shm_open(3).
|
||||
DevShm = "/dev/shm/"
|
||||
// Proc points to a virtual kernel file system exposing the process list and other functionality.
|
||||
// Proc points to a virtual kernel file system exposing the process list and
|
||||
// other functionality.
|
||||
Proc = "/proc/"
|
||||
// ProcSys points to a hierarchy below /proc/ that exposes a number of kernel tunables.
|
||||
// ProcSys points to a hierarchy below /proc/ that exposes a number of
|
||||
// kernel tunables.
|
||||
ProcSys = Proc + "sys/"
|
||||
// Sys points to a virtual kernel file system exposing discovered devices and other functionality.
|
||||
// Sys points to a virtual kernel file system exposing discovered devices
|
||||
// and other functionality.
|
||||
Sys = "/sys/"
|
||||
)
|
||||
|
||||
@@ -33,12 +33,12 @@ const (
|
||||
- This path is only accessible by init and root:
|
||||
The container init sets SUID_DUMP_DISABLE and terminates if that fails.
|
||||
|
||||
It should be noted that none of this should become relevant at any point since the resulting
|
||||
intermediate root tmpfs should be effectively anonymous. */
|
||||
It should be noted that none of this should become relevant at any point
|
||||
since the resulting intermediate root tmpfs should be effectively anonymous. */
|
||||
intermediateHostPath = fhs.Proc + "self/fd"
|
||||
|
||||
// setupEnv is the name of the environment variable holding the string representation of
|
||||
// the read end file descriptor of the setup params pipe.
|
||||
// setupEnv is the name of the environment variable holding the string
|
||||
// representation of the read end file descriptor of the setup params pipe.
|
||||
setupEnv = "HAKUREI_SETUP"
|
||||
|
||||
// exitUnexpectedWait4 is the exit code if wait4 returns an unexpected errno.
|
||||
@@ -59,7 +59,8 @@ type (
|
||||
// late is called right before starting the initial process.
|
||||
late(state *setupState, k syscallDispatcher) error
|
||||
|
||||
// prefix returns a log message prefix, and whether this Op prints no identifying message on its own.
|
||||
// prefix returns a log message prefix, and whether this Op prints no
|
||||
// identifying message on its own.
|
||||
prefix() (string, bool)
|
||||
|
||||
Is(op Op) bool
|
||||
@@ -71,9 +72,11 @@ type (
|
||||
setupState struct {
|
||||
nonrepeatable uintptr
|
||||
|
||||
// Whether early reaping has concluded. Must only be accessed in the wait4 loop.
|
||||
// Whether early reaping has concluded. Must only be accessed in the
|
||||
// wait4 loop.
|
||||
processConcluded bool
|
||||
// Process to syscall.WaitStatus populated in the wait4 loop. Freed after early reaping concludes.
|
||||
// Process to syscall.WaitStatus populated in the wait4 loop. Freed
|
||||
// after early reaping concludes.
|
||||
process map[int]WaitStatus
|
||||
// Synchronises access to process.
|
||||
processMu sync.RWMutex
|
||||
@@ -216,9 +219,10 @@ func initEntrypoint(k syscallDispatcher, msg message.Msg) {
|
||||
defer cancel()
|
||||
|
||||
/* early is called right before pivot_root into intermediate root;
|
||||
this step is mostly for gathering information that would otherwise be difficult to obtain
|
||||
via library functions after pivot_root, and implementations are expected to avoid changing
|
||||
the state of the mount namespace */
|
||||
this step is mostly for gathering information that would otherwise be
|
||||
difficult to obtain via library functions after pivot_root, and
|
||||
implementations are expected to avoid changing the state of the mount
|
||||
namespace */
|
||||
for i, op := range *params.Ops {
|
||||
if op == nil || !op.Valid() {
|
||||
k.fatalf(msg, "invalid op at index %d", i)
|
||||
@@ -258,10 +262,10 @@ func initEntrypoint(k syscallDispatcher, msg message.Msg) {
|
||||
k.fatalf(msg, "cannot enter intermediate root: %v", err)
|
||||
}
|
||||
|
||||
/* apply is called right after pivot_root and entering the new root;
|
||||
this step sets up the container filesystem, and implementations are expected to keep the host root
|
||||
and sysroot mount points intact but otherwise can do whatever they need to;
|
||||
chdir is allowed but discouraged */
|
||||
/* apply is called right after pivot_root and entering the new root. This
|
||||
step sets up the container filesystem, and implementations are expected to
|
||||
keep the host root and sysroot mount points intact but otherwise can do
|
||||
whatever they need to. Calling chdir is allowed but discouraged. */
|
||||
for i, op := range *params.Ops {
|
||||
// ops already checked during early setup
|
||||
if prefix, ok := op.prefix(); ok {
|
||||
|
||||
@@ -12,14 +12,16 @@ import (
|
||||
|
||||
func init() { gob.Register(new(BindMountOp)) }
|
||||
|
||||
// Bind appends an [Op] that bind mounts host path [BindMountOp.Source] on container path [BindMountOp.Target].
|
||||
// Bind is a helper for appending [BindMountOp] to [Ops].
|
||||
func (f *Ops) Bind(source, target *check.Absolute, flags int) *Ops {
|
||||
*f = append(*f, &BindMountOp{nil, source, target, flags})
|
||||
return f
|
||||
}
|
||||
|
||||
// BindMountOp bind mounts host path Source on container path Target.
|
||||
// Note that Flags uses bits declared in this package and should not be set with constants in [syscall].
|
||||
// BindMountOp creates a bind mount from host path Source to container path Target.
|
||||
//
|
||||
// Note that Flags uses bits declared in the [std] package and should not be set
|
||||
// with constants in [syscall].
|
||||
type BindMountOp struct {
|
||||
sourceFinal, Source, Target *check.Absolute
|
||||
|
||||
|
||||
@@ -24,8 +24,7 @@ const (
|
||||
daemonTimeout = 5 * time.Second
|
||||
)
|
||||
|
||||
// Daemon appends an [Op] that starts a daemon in the container and blocks until
|
||||
// [DaemonOp.Target] appears.
|
||||
// Daemon is a helper for appending [DaemonOp] to [Ops].
|
||||
func (f *Ops) Daemon(target, path *check.Absolute, args ...string) *Ops {
|
||||
*f = append(*f, &DaemonOp{target, path, args})
|
||||
return f
|
||||
|
||||
@@ -19,7 +19,9 @@ func (f *Ops) Dev(target *check.Absolute, mqueue bool) *Ops {
|
||||
}
|
||||
|
||||
// DevWritable appends an [Op] that mounts a writable subset of host /dev.
|
||||
// There is usually no good reason to write to /dev, so this should always be followed by a [RemountOp].
|
||||
//
|
||||
// There is usually no good reason to write to /dev, so this should always be
|
||||
// followed by a [RemountOp].
|
||||
func (f *Ops) DevWritable(target *check.Absolute, mqueue bool) *Ops {
|
||||
*f = append(*f, &MountDevOp{target, mqueue, true})
|
||||
return f
|
||||
|
||||
@@ -10,7 +10,7 @@ import (
|
||||
|
||||
func init() { gob.Register(new(MkdirOp)) }
|
||||
|
||||
// Mkdir appends an [Op] that creates a directory in the container filesystem.
|
||||
// Mkdir is a helper for appending [MkdirOp] to [Ops].
|
||||
func (f *Ops) Mkdir(name *check.Absolute, perm os.FileMode) *Ops {
|
||||
*f = append(*f, &MkdirOp{name, perm})
|
||||
return f
|
||||
|
||||
@@ -54,8 +54,11 @@ func (e *OverlayArgumentError) Error() string {
|
||||
}
|
||||
}
|
||||
|
||||
// Overlay appends an [Op] that mounts the overlay pseudo filesystem on [MountOverlayOp.Target].
|
||||
func (f *Ops) Overlay(target, state, work *check.Absolute, layers ...*check.Absolute) *Ops {
|
||||
// Overlay is a helper for appending [MountOverlayOp] to [Ops].
|
||||
func (f *Ops) Overlay(
|
||||
target, state, work *check.Absolute,
|
||||
layers ...*check.Absolute,
|
||||
) *Ops {
|
||||
*f = append(*f, &MountOverlayOp{
|
||||
Target: target,
|
||||
Lower: layers,
|
||||
@@ -65,13 +68,12 @@ func (f *Ops) Overlay(target, state, work *check.Absolute, layers ...*check.Abso
|
||||
return f
|
||||
}
|
||||
|
||||
// OverlayEphemeral appends an [Op] that mounts the overlay pseudo filesystem on [MountOverlayOp.Target]
|
||||
// with an ephemeral upperdir and workdir.
|
||||
// OverlayEphemeral appends a [MountOverlayOp] with an ephemeral upperdir and workdir.
|
||||
func (f *Ops) OverlayEphemeral(target *check.Absolute, layers ...*check.Absolute) *Ops {
|
||||
return f.Overlay(target, fhs.AbsRoot, nil, layers...)
|
||||
}
|
||||
|
||||
// OverlayReadonly appends an [Op] that mounts the overlay pseudo filesystem readonly on [MountOverlayOp.Target]
|
||||
// OverlayReadonly appends a readonly [MountOverlayOp].
|
||||
func (f *Ops) OverlayReadonly(target *check.Absolute, layers ...*check.Absolute) *Ops {
|
||||
return f.Overlay(target, nil, nil, layers...)
|
||||
}
|
||||
@@ -82,25 +84,34 @@ type MountOverlayOp struct {
|
||||
|
||||
// Any filesystem, does not need to be on a writable filesystem.
|
||||
Lower []*check.Absolute
|
||||
// formatted for [OptionOverlayLowerdir], resolved, prefixed and escaped during early
|
||||
// Formatted for [OptionOverlayLowerdir].
|
||||
//
|
||||
// Resolved, prefixed and escaped during early.
|
||||
lower []string
|
||||
|
||||
// The upperdir is normally on a writable filesystem.
|
||||
//
|
||||
// If Work is nil and Upper holds the special value [fhs.AbsRoot],
|
||||
// an ephemeral upperdir and workdir will be set up.
|
||||
// If Work is nil and Upper holds the special value [fhs.AbsRoot], an
|
||||
// ephemeral upperdir and workdir will be set up.
|
||||
//
|
||||
// If both Work and Upper are nil, upperdir and workdir is omitted and the overlay is mounted readonly.
|
||||
// If both Work and Upper are nil, upperdir and workdir is omitted and the
|
||||
// overlay is mounted readonly.
|
||||
Upper *check.Absolute
|
||||
// formatted for [OptionOverlayUpperdir], resolved, prefixed and escaped during early
|
||||
// Formatted for [OptionOverlayUpperdir].
|
||||
//
|
||||
// Resolved, prefixed and escaped during early.
|
||||
upper string
|
||||
|
||||
// The workdir needs to be an empty directory on the same filesystem as upperdir.
|
||||
Work *check.Absolute
|
||||
// formatted for [OptionOverlayWorkdir], resolved, prefixed and escaped during early
|
||||
// Formatted for [OptionOverlayWorkdir].
|
||||
//
|
||||
// Resolved, prefixed and escaped during early.
|
||||
work string
|
||||
|
||||
ephemeral bool
|
||||
|
||||
// used internally for mounting to the intermediate root
|
||||
// Used internally for mounting to the intermediate root.
|
||||
noPrefix bool
|
||||
}
|
||||
|
||||
|
||||
@@ -16,7 +16,7 @@ const (
|
||||
|
||||
func init() { gob.Register(new(TmpfileOp)) }
|
||||
|
||||
// Place appends an [Op] that places a file in container path [TmpfileOp.Path] containing [TmpfileOp.Data].
|
||||
// Place is a helper for appending [TmpfileOp] to [Ops].
|
||||
func (f *Ops) Place(name *check.Absolute, data []byte) *Ops {
|
||||
*f = append(*f, &TmpfileOp{name, data})
|
||||
return f
|
||||
|
||||
@@ -10,7 +10,7 @@ import (
|
||||
|
||||
func init() { gob.Register(new(MountProcOp)) }
|
||||
|
||||
// Proc appends an [Op] that mounts a private instance of proc.
|
||||
// Proc is a helper for appending [MountProcOp] to [Ops].
|
||||
func (f *Ops) Proc(target *check.Absolute) *Ops {
|
||||
*f = append(*f, &MountProcOp{target})
|
||||
return f
|
||||
|
||||
@@ -9,7 +9,7 @@ import (
|
||||
|
||||
func init() { gob.Register(new(RemountOp)) }
|
||||
|
||||
// Remount appends an [Op] that applies [RemountOp.Flags] on container path [RemountOp.Target].
|
||||
// Remount is a helper for appending [RemountOp] to [Ops].
|
||||
func (f *Ops) Remount(target *check.Absolute, flags uintptr) *Ops {
|
||||
*f = append(*f, &RemountOp{target, flags})
|
||||
return f
|
||||
|
||||
@@ -38,6 +38,7 @@ const (
|
||||
_LANDLOCK_ACCESS_FS_DELIM
|
||||
)
|
||||
|
||||
// String returns a space-separated string of [LandlockAccessFS] flags.
|
||||
func (f LandlockAccessFS) String() string {
|
||||
switch f {
|
||||
case LANDLOCK_ACCESS_FS_EXECUTE:
|
||||
@@ -116,6 +117,7 @@ const (
|
||||
_LANDLOCK_ACCESS_NET_DELIM
|
||||
)
|
||||
|
||||
// String returns a space-separated string of [LandlockAccessNet] flags.
|
||||
func (f LandlockAccessNet) String() string {
|
||||
switch f {
|
||||
case LANDLOCK_ACCESS_NET_BIND_TCP:
|
||||
@@ -152,6 +154,7 @@ const (
|
||||
_LANDLOCK_SCOPE_DELIM
|
||||
)
|
||||
|
||||
// String returns a space-separated string of [LandlockScope] flags.
|
||||
func (f LandlockScope) String() string {
|
||||
switch f {
|
||||
case LANDLOCK_SCOPE_ABSTRACT_UNIX_SOCKET:
|
||||
@@ -184,10 +187,12 @@ type RulesetAttr struct {
|
||||
HandledAccessFS LandlockAccessFS
|
||||
// Bitmask of handled network actions.
|
||||
HandledAccessNet LandlockAccessNet
|
||||
// Bitmask of scopes restricting a Landlock domain from accessing outside resources (e.g. IPCs).
|
||||
// Bitmask of scopes restricting a Landlock domain from accessing outside
|
||||
// resources (e.g. IPCs).
|
||||
Scoped LandlockScope
|
||||
}
|
||||
|
||||
// String returns a user-facing description of [RulesetAttr].
|
||||
func (rulesetAttr *RulesetAttr) String() string {
|
||||
if rulesetAttr == nil {
|
||||
return "NULL"
|
||||
@@ -208,6 +213,7 @@ func (rulesetAttr *RulesetAttr) String() string {
|
||||
return strings.Join(elems, ", ")
|
||||
}
|
||||
|
||||
// Create loads the ruleset into the kernel.
|
||||
func (rulesetAttr *RulesetAttr) Create(flags uintptr) (fd int, err error) {
|
||||
var pointer, size uintptr
|
||||
// NULL needed for abi version
|
||||
@@ -216,10 +222,13 @@ func (rulesetAttr *RulesetAttr) Create(flags uintptr) (fd int, err error) {
|
||||
size = unsafe.Sizeof(*rulesetAttr)
|
||||
}
|
||||
|
||||
rulesetFd, _, errno := syscall.Syscall(std.SYS_LANDLOCK_CREATE_RULESET, pointer, size, flags)
|
||||
rulesetFd, _, errno := syscall.Syscall(
|
||||
std.SYS_LANDLOCK_CREATE_RULESET,
|
||||
pointer, size,
|
||||
flags,
|
||||
)
|
||||
fd = int(rulesetFd)
|
||||
err = errno
|
||||
|
||||
if fd < 0 {
|
||||
return
|
||||
}
|
||||
@@ -230,12 +239,19 @@ func (rulesetAttr *RulesetAttr) Create(flags uintptr) (fd int, err error) {
|
||||
return fd, nil
|
||||
}
|
||||
|
||||
// LandlockGetABI returns the ABI version supported by the kernel.
|
||||
func LandlockGetABI() (int, error) {
|
||||
return (*RulesetAttr)(nil).Create(LANDLOCK_CREATE_RULESET_VERSION)
|
||||
}
|
||||
|
||||
// LandlockRestrictSelf applies a loaded ruleset to the calling thread.
|
||||
func LandlockRestrictSelf(rulesetFd int, flags uintptr) error {
|
||||
r, _, errno := syscall.Syscall(std.SYS_LANDLOCK_RESTRICT_SELF, uintptr(rulesetFd), flags, 0)
|
||||
r, _, errno := syscall.Syscall(
|
||||
std.SYS_LANDLOCK_RESTRICT_SELF,
|
||||
uintptr(rulesetFd),
|
||||
flags,
|
||||
0,
|
||||
)
|
||||
if r != 0 {
|
||||
return errno
|
||||
}
|
||||
|
||||
@@ -15,7 +15,10 @@ import (
|
||||
|
||||
const (
|
||||
// Nonexistent is a path that cannot exist.
|
||||
// /proc is chosen because a system with covered /proc is unsupported by this package.
|
||||
//
|
||||
// This path can never be presented by the kernel if proc is mounted on
|
||||
// /proc/. This can only exist if parts of /proc/ is covered, or proc is not
|
||||
// mounted at all. Neither configuration is supported by this package.
|
||||
Nonexistent = fhs.Proc + "nonexistent"
|
||||
|
||||
hostPath = fhs.Root + hostDir
|
||||
|
||||
@@ -88,18 +88,22 @@ var resPrefix = [...]string{
|
||||
7: "seccomp_load failed",
|
||||
}
|
||||
|
||||
// cbAllocateBuffer is the function signature for the function handle passed to hakurei_export_filter
|
||||
// which allocates the buffer that the resulting bpf program is copied into, and writes its slice header
|
||||
// to a value held by the caller.
|
||||
// cbAllocateBuffer is the function signature for the function handle passed to
|
||||
// hakurei_scmp_make_filter which allocates the buffer that the resulting bpf
|
||||
// program is copied into, and writes its slice header to a value held by the caller.
|
||||
type cbAllocateBuffer = func(len C.size_t) (buf unsafe.Pointer)
|
||||
|
||||
// hakurei_scmp_allocate allocates a buffer of specified size known to the
|
||||
// runtime through a callback passed in a [cgo.Handle].
|
||||
//
|
||||
//export hakurei_scmp_allocate
|
||||
func hakurei_scmp_allocate(f C.uintptr_t, len C.size_t) (buf unsafe.Pointer) {
|
||||
return cgo.Handle(f).Value().(cbAllocateBuffer)(len)
|
||||
}
|
||||
|
||||
// makeFilter generates a bpf program from a slice of [std.NativeRule] and writes the resulting byte slice to p.
|
||||
// The filter is installed to the current process if p is nil.
|
||||
// makeFilter generates a bpf program from a slice of [std.NativeRule] and
|
||||
// writes the resulting byte slice to p. The filter is installed to the current
|
||||
// process if p is nil.
|
||||
func makeFilter(rules []std.NativeRule, flags ExportFlag, p *[]byte) error {
|
||||
if len(rules) == 0 {
|
||||
return ErrInvalidRules
|
||||
@@ -170,8 +174,8 @@ func Export(rules []std.NativeRule, flags ExportFlag) (data []byte, err error) {
|
||||
return
|
||||
}
|
||||
|
||||
// Load generates a bpf program from a slice of [std.NativeRule] and enforces it on the current process.
|
||||
// Errors returned by libseccomp is wrapped in [LibraryError].
|
||||
// Load generates a bpf program from a slice of [std.NativeRule] and enforces it
|
||||
// on the current process. Errors returned by libseccomp is wrapped in [LibraryError].
|
||||
func Load(rules []std.NativeRule, flags ExportFlag) error { return makeFilter(rules, flags, nil) }
|
||||
|
||||
type (
|
||||
|
||||
@@ -2,6 +2,8 @@ package vfs
|
||||
|
||||
import "strings"
|
||||
|
||||
// Unmangle reverses mangling of strings done by the kernel. Its behaviour is
|
||||
// consistent with the equivalent function in util-linux.
|
||||
func Unmangle(s string) string {
|
||||
if !strings.ContainsRune(s, '\\') {
|
||||
return s
|
||||
|
||||
@@ -24,6 +24,7 @@ var (
|
||||
ErrMountInfoSep = errors.New("bad optional fields separator")
|
||||
)
|
||||
|
||||
// A DecoderError describes a nonrecoverable error decoding a mountinfo stream.
|
||||
type DecoderError struct {
|
||||
Op string
|
||||
Line int
|
||||
@@ -51,7 +52,8 @@ func (e *DecoderError) Error() string {
|
||||
}
|
||||
|
||||
type (
|
||||
// A MountInfoDecoder reads and decodes proc_pid_mountinfo(5) entries from an input stream.
|
||||
// A MountInfoDecoder reads and decodes proc_pid_mountinfo(5) entries from
|
||||
// an input stream.
|
||||
MountInfoDecoder struct {
|
||||
s *bufio.Scanner
|
||||
m *MountInfo
|
||||
@@ -72,13 +74,16 @@ type (
|
||||
MountInfoEntry struct {
|
||||
// mount ID: a unique ID for the mount (may be reused after umount(2)).
|
||||
ID int `json:"id"`
|
||||
// parent ID: the ID of the parent mount (or of self for the root of this mount namespace's mount tree).
|
||||
// parent ID: the ID of the parent mount (or of self for the root of
|
||||
// this mount namespace's mount tree).
|
||||
Parent int `json:"parent"`
|
||||
// major:minor: the value of st_dev for files on this filesystem (see stat(2)).
|
||||
Devno DevT `json:"devno"`
|
||||
// root: the pathname of the directory in the filesystem which forms the root of this mount.
|
||||
// root: the pathname of the directory in the filesystem which forms the
|
||||
// root of this mount.
|
||||
Root string `json:"root"`
|
||||
// mount point: the pathname of the mount point relative to the process's root directory.
|
||||
// mount point: the pathname of the mount point relative to the
|
||||
// process's root directory.
|
||||
Target string `json:"target"`
|
||||
// mount options: per-mount options (see mount(2)).
|
||||
VfsOptstr string `json:"vfs_optstr"`
|
||||
@@ -126,7 +131,8 @@ func (e *MountInfoEntry) Flags() (flags uintptr, unmatched []string) {
|
||||
|
||||
// NewMountInfoDecoder returns a new decoder that reads from r.
|
||||
//
|
||||
// The decoder introduces its own buffering and may read data from r beyond the mountinfo entries requested.
|
||||
// The decoder introduces its own buffering and may read data from r beyond the
|
||||
// mountinfo entries requested.
|
||||
func NewMountInfoDecoder(r io.Reader) *MountInfoDecoder {
|
||||
return &MountInfoDecoder{s: bufio.NewScanner(r)}
|
||||
}
|
||||
@@ -271,6 +277,8 @@ func parseMountInfoLine(s string, ent *MountInfoEntry) error {
|
||||
return nil
|
||||
}
|
||||
|
||||
// EqualWithIgnore compares to [MountInfoEntry] values, ignoring fields that
|
||||
// compare equal to ignore.
|
||||
func (e *MountInfoEntry) EqualWithIgnore(want *MountInfoEntry, ignore string) bool {
|
||||
return (e.ID == want.ID || want.ID == -1) &&
|
||||
(e.Parent == want.Parent || want.Parent == -1) &&
|
||||
@@ -284,6 +292,8 @@ func (e *MountInfoEntry) EqualWithIgnore(want *MountInfoEntry, ignore string) bo
|
||||
(e.FsOptstr == want.FsOptstr || want.FsOptstr == ignore)
|
||||
}
|
||||
|
||||
// String returns a user-facing representation of a [MountInfoEntry]. It fits
|
||||
// roughly into the mountinfo format, but without mangling.
|
||||
func (e *MountInfoEntry) String() string {
|
||||
return fmt.Sprintf("%d %d %d:%d %s %s %s %s %s %s %s",
|
||||
e.ID, e.Parent, e.Devno[0], e.Devno[1], e.Root, e.Target, e.VfsOptstr,
|
||||
|
||||
@@ -6,6 +6,7 @@ import (
|
||||
"strings"
|
||||
)
|
||||
|
||||
// UnfoldTargetError is a pathname that never appeared in a mount hierarchy.
|
||||
type UnfoldTargetError string
|
||||
|
||||
func (e UnfoldTargetError) Error() string {
|
||||
@@ -27,6 +28,7 @@ func (n *MountInfoNode) Collective() iter.Seq[*MountInfoNode] {
|
||||
return func(yield func(*MountInfoNode) bool) { n.visit(yield) }
|
||||
}
|
||||
|
||||
// visit recursively visits all visible mountinfo nodes.
|
||||
func (n *MountInfoNode) visit(yield func(*MountInfoNode) bool) bool {
|
||||
if !n.Covered && !yield(n) {
|
||||
return false
|
||||
|
||||
Reference in New Issue
Block a user