hakurei/container/container.go
Ophestra e94acc424c
All checks were successful
Test / Create distribution (push) Successful in 32s
Test / Sandbox (push) Successful in 2m19s
Test / Hakurei (push) Successful in 3m9s
Test / Hpkg (push) Successful in 3m53s
Test / Sandbox (race detector) (push) Successful in 4m2s
Test / Hakurei (race detector) (push) Successful in 4m43s
Test / Flake checks (push) Successful in 1m23s
container/comp: rename from bits
This package will also hold syscall lookup tables for seccomp.

Signed-off-by: Ophestra <cat@gensokyo.uk>
2025-10-21 20:54:03 +09:00

419 lines
11 KiB
Go

// Package container implements unprivileged Linux containers with built-in support for syscall filtering.
package container
import (
"context"
"encoding/gob"
"errors"
"fmt"
"io"
"os"
"os/exec"
"runtime"
"strconv"
. "syscall"
"time"
"hakurei.app/container/check"
"hakurei.app/container/comp"
"hakurei.app/container/fhs"
"hakurei.app/container/seccomp"
"hakurei.app/message"
)
const (
// CancelSignal is the signal expected by container init on context cancel.
// A custom [Container.Cancel] function must eventually deliver this signal.
CancelSignal = SIGTERM
)
type (
// Container represents a container environment being prepared or run.
// None of [Container] methods are safe for concurrent use.
Container struct {
// Cgroup fd, nil to disable.
Cgroup *int
// ExtraFiles passed through to initial process in the container,
// with behaviour identical to its [exec.Cmd] counterpart.
ExtraFiles []*os.File
// param encoder for shim and init
setup *gob.Encoder
// cancels cmd
cancel context.CancelFunc
// closed after Wait returns
wait chan struct{}
Stdin io.Reader
Stdout io.Writer
Stderr io.Writer
Cancel func(cmd *exec.Cmd) error
WaitDelay time.Duration
cmd *exec.Cmd
ctx context.Context
msg message.Msg
Params
}
// Params holds container configuration and is safe to serialise.
Params struct {
// Working directory in the container.
Dir *check.Absolute
// Initial process environment.
Env []string
// Pathname of initial process in the container.
Path *check.Absolute
// Initial process argv.
Args []string
// Deliver SIGINT to the initial process on context cancellation.
ForwardCancel bool
// Time to wait for processes lingering after the initial process terminates.
AdoptWaitDelay time.Duration
// Mapped Uid in user namespace.
Uid int
// Mapped Gid in user namespace.
Gid int
// Hostname value in UTS namespace.
Hostname string
// Sequential container setup ops.
*Ops
// Seccomp system call filter rules.
SeccompRules []seccomp.NativeRule
// Extra seccomp flags.
SeccompFlags seccomp.ExportFlag
// Seccomp presets. Has no effect unless SeccompRules is zero-length.
SeccompPresets comp.FilterPreset
// Do not load seccomp program.
SeccompDisable bool
// Permission bits of newly created parent directories.
// The zero value is interpreted as 0755.
ParentPerm os.FileMode
// Do not syscall.Setsid.
RetainSession bool
// Do not [syscall.CLONE_NEWNET].
HostNet bool
// Do not [LANDLOCK_SCOPE_ABSTRACT_UNIX_SOCKET].
HostAbstract bool
// Retain CAP_SYS_ADMIN.
Privileged bool
}
)
// A StartError contains additional information on a container startup failure.
type StartError struct {
// Fatal suggests whether this error should be considered fatal for the entire program.
Fatal bool
// Step refers to the part of the setup this error is returned from.
Step string
// Err is the underlying error.
Err error
// Origin is whether this error originated from the [Container.Start] method.
Origin bool
// Passthrough is whether the Error method is passed through to Err.
Passthrough bool
}
func (e *StartError) Unwrap() error { return e.Err }
func (e *StartError) Error() string {
if e.Passthrough {
return e.Err.Error()
}
if e.Origin {
return e.Step
}
{
var syscallError *os.SyscallError
if errors.As(e.Err, &syscallError) && syscallError != nil {
return e.Step + " " + syscallError.Error()
}
}
return e.Step + ": " + e.Err.Error()
}
// Message returns a user-facing error message.
func (e *StartError) Message() string {
if e.Passthrough {
switch {
case errors.As(e.Err, new(*os.PathError)),
errors.As(e.Err, new(*os.SyscallError)):
return "cannot " + e.Err.Error()
default:
return e.Err.Error()
}
}
if e.Origin {
return e.Step
}
return "cannot " + e.Error()
}
// Start starts the container init. The init process blocks until Serve is called.
func (p *Container) Start() error {
if p == nil || p.cmd == nil ||
p.Ops == nil || len(*p.Ops) == 0 {
return errors.New("container: starting an invalid container")
}
if p.cmd.Process != nil {
return errors.New("container: already started")
}
// map to overflow id to work around ownership checks
if p.Uid < 1 {
p.Uid = OverflowUid(p.msg)
}
if p.Gid < 1 {
p.Gid = OverflowGid(p.msg)
}
if !p.RetainSession {
p.SeccompPresets |= comp.PresetDenyTTY
}
if p.AdoptWaitDelay == 0 {
p.AdoptWaitDelay = 5 * time.Second
}
// to allow disabling this behaviour
if p.AdoptWaitDelay < 0 {
p.AdoptWaitDelay = 0
}
if p.cmd.Stdin == nil {
p.cmd.Stdin = p.Stdin
}
if p.cmd.Stdout == nil {
p.cmd.Stdout = p.Stdout
}
if p.cmd.Stderr == nil {
p.cmd.Stderr = p.Stderr
}
p.cmd.Args = []string{initName}
p.cmd.WaitDelay = p.WaitDelay
if p.Cancel != nil {
p.cmd.Cancel = func() error { return p.Cancel(p.cmd) }
} else {
p.cmd.Cancel = func() error { return p.cmd.Process.Signal(CancelSignal) }
}
p.cmd.Dir = fhs.Root
p.cmd.SysProcAttr = &SysProcAttr{
Setsid: !p.RetainSession,
Pdeathsig: SIGKILL,
Cloneflags: CLONE_NEWUSER | CLONE_NEWPID | CLONE_NEWNS |
CLONE_NEWIPC | CLONE_NEWUTS | CLONE_NEWCGROUP,
AmbientCaps: []uintptr{
// general container setup
CAP_SYS_ADMIN,
// drop capabilities
CAP_SETPCAP,
// overlay access to upperdir and workdir
CAP_DAC_OVERRIDE,
},
UseCgroupFD: p.Cgroup != nil,
}
if p.cmd.SysProcAttr.UseCgroupFD {
p.cmd.SysProcAttr.CgroupFD = *p.Cgroup
}
if !p.HostNet {
p.cmd.SysProcAttr.Cloneflags |= CLONE_NEWNET
}
// place setup pipe before user supplied extra files, this is later restored by init
if fd, e, err := Setup(&p.cmd.ExtraFiles); err != nil {
return &StartError{true, "set up params stream", err, false, false}
} else {
p.setup = e
p.cmd.Env = []string{setupEnv + "=" + strconv.Itoa(fd)}
}
p.cmd.ExtraFiles = append(p.cmd.ExtraFiles, p.ExtraFiles...)
done := make(chan error, 1)
go func() {
runtime.LockOSThread()
p.wait = make(chan struct{})
done <- func() error { // setup depending on per-thread state must happen here
// PR_SET_NO_NEW_PRIVS: depends on per-thread state but acts on all processes created from that thread
if err := SetNoNewPrivs(); err != nil {
return &StartError{true, "prctl(PR_SET_NO_NEW_PRIVS)", err, false, false}
}
// landlock: depends on per-thread state but acts on a process group
{
rulesetAttr := &RulesetAttr{Scoped: LANDLOCK_SCOPE_SIGNAL}
if !p.HostAbstract {
rulesetAttr.Scoped |= LANDLOCK_SCOPE_ABSTRACT_UNIX_SOCKET
}
if abi, err := LandlockGetABI(); err != nil {
if p.HostAbstract {
// landlock can be skipped here as it restricts access to resources
// already covered by namespaces (pid)
goto landlockOut
}
return &StartError{false, "get landlock ABI", err, false, false}
} else if abi < 6 {
if p.HostAbstract {
// see above comment
goto landlockOut
}
return &StartError{false, "kernel version too old for LANDLOCK_SCOPE_ABSTRACT_UNIX_SOCKET", ENOSYS, true, false}
} else {
p.msg.Verbosef("landlock abi version %d", abi)
}
if rulesetFd, err := rulesetAttr.Create(0); err != nil {
return &StartError{true, "create landlock ruleset", err, false, false}
} else {
p.msg.Verbosef("enforcing landlock ruleset %s", rulesetAttr)
if err = LandlockRestrictSelf(rulesetFd, 0); err != nil {
_ = Close(rulesetFd)
return &StartError{true, "enforce landlock ruleset", err, false, false}
}
if err = Close(rulesetFd); err != nil {
p.msg.Verbosef("cannot close landlock ruleset: %v", err)
// not fatal
}
}
landlockOut:
}
p.msg.Verbose("starting container init")
if err := p.cmd.Start(); err != nil {
return &StartError{false, "start container init", err, false, true}
}
return nil
}()
// keep this thread alive until Wait returns for cancel
<-p.wait
}()
return <-done
}
// Serve serves [Container.Params] to the container init.
// Serve must only be called once.
func (p *Container) Serve() error {
if p.setup == nil {
panic("invalid serve")
}
setup := p.setup
p.setup = nil
if p.Path == nil {
p.cancel()
return &StartError{false, "invalid executable pathname", EINVAL, true, false}
}
// do not transmit nil
if p.Dir == nil {
p.Dir = fhs.AbsRoot
}
if p.SeccompRules == nil {
p.SeccompRules = make([]seccomp.NativeRule, 0)
}
err := setup.Encode(
&initParams{
p.Params,
Getuid(),
Getgid(),
len(p.ExtraFiles),
p.msg.IsVerbose(),
},
)
if err != nil {
p.cancel()
}
return err
}
// Wait waits for the container init process to exit and releases any resources associated with the [Container].
func (p *Container) Wait() error {
if p.cmd == nil || p.cmd.Process == nil {
return EINVAL
}
err := p.cmd.Wait()
p.cancel()
if p.wait != nil && err == nil {
close(p.wait)
}
return err
}
// StdinPipe calls the [exec.Cmd] method with the same name.
func (p *Container) StdinPipe() (w io.WriteCloser, err error) {
if p.Stdin != nil {
return nil, errors.New("container: Stdin already set")
}
w, err = p.cmd.StdinPipe()
p.Stdin = p.cmd.Stdin
return
}
// StdoutPipe calls the [exec.Cmd] method with the same name.
func (p *Container) StdoutPipe() (r io.ReadCloser, err error) {
if p.Stdout != nil {
return nil, errors.New("container: Stdout already set")
}
r, err = p.cmd.StdoutPipe()
p.Stdout = p.cmd.Stdout
return
}
// StderrPipe calls the [exec.Cmd] method with the same name.
func (p *Container) StderrPipe() (r io.ReadCloser, err error) {
if p.Stderr != nil {
return nil, errors.New("container: Stderr already set")
}
r, err = p.cmd.StderrPipe()
p.Stderr = p.cmd.Stderr
return
}
func (p *Container) String() string {
return fmt.Sprintf("argv: %q, filter: %v, rules: %d, flags: %#x, presets: %#x",
p.Args, !p.SeccompDisable, len(p.SeccompRules), int(p.SeccompFlags), int(p.SeccompPresets))
}
// ProcessState returns the address to os.ProcessState held by the underlying [exec.Cmd].
func (p *Container) ProcessState() *os.ProcessState {
if p.cmd == nil {
return nil
}
return p.cmd.ProcessState
}
// New returns the address to a new instance of [Container] that requires further initialisation before use.
func New(ctx context.Context, msg message.Msg) *Container {
if msg == nil {
msg = message.NewMsg(nil)
}
p := &Container{ctx: ctx, msg: msg, Params: Params{Ops: new(Ops)}}
c, cancel := context.WithCancel(ctx)
p.cancel = cancel
p.cmd = exec.CommandContext(c, MustExecutable(msg))
return p
}
// NewCommand calls [New] and initialises the [Params.Path] and [Params.Args] fields.
func NewCommand(ctx context.Context, msg message.Msg, pathname *check.Absolute, name string, args ...string) *Container {
z := New(ctx, msg)
z.Path = pathname
z.Args = append([]string{name}, args...)
return z
}