All checks were successful
Test / Create distribution (push) Successful in 1m20s
Test / Sandbox (push) Successful in 3m10s
Test / Hakurei (push) Successful in 4m24s
Test / ShareFS (push) Successful in 4m34s
Test / Sandbox (race detector) (push) Successful in 5m46s
Test / Hakurei (race detector) (push) Successful in 6m53s
Test / Flake checks (push) Successful in 1m30s
The API forces use of finalizer to close the read end of the setup pipe, which is no longer considered acceptable. Exporting this as part of package container also imposes unnecessary maintenance burden. Signed-off-by: Ophestra <cat@gensokyo.uk>
577 lines
14 KiB
Go
577 lines
14 KiB
Go
// Package container implements unprivileged Linux containers with built-in
|
|
// support for syscall filtering.
|
|
package container
|
|
|
|
import (
|
|
"context"
|
|
"encoding/gob"
|
|
"errors"
|
|
"fmt"
|
|
"io"
|
|
"os"
|
|
"os/exec"
|
|
"runtime"
|
|
"strconv"
|
|
"sync"
|
|
. "syscall"
|
|
"time"
|
|
|
|
"hakurei.app/check"
|
|
"hakurei.app/container/seccomp"
|
|
"hakurei.app/container/std"
|
|
"hakurei.app/ext"
|
|
"hakurei.app/fhs"
|
|
"hakurei.app/message"
|
|
)
|
|
|
|
const (
|
|
// CancelSignal is the signal expected by container init on context cancel.
|
|
// A custom [Container.Cancel] function must eventually deliver this signal.
|
|
CancelSignal = SIGUSR2
|
|
|
|
// Timeout for writing initParams to Container.setup.
|
|
initSetupTimeout = 5 * time.Second
|
|
)
|
|
|
|
type (
|
|
// Container represents a container environment being prepared or run.
|
|
// None of [Container] methods are safe for concurrent use.
|
|
Container struct {
|
|
// Whether the container init should stay alive after its parent terminates.
|
|
AllowOrphan bool
|
|
// Whether to set SchedPolicy and SchedPriority via sched_setscheduler(2).
|
|
SetScheduler bool
|
|
// Scheduling policy to set via sched_setscheduler(2).
|
|
SchedPolicy ext.SchedPolicy
|
|
// Scheduling priority to set via sched_setscheduler(2). The zero value
|
|
// implies the minimum value supported by the current SchedPolicy.
|
|
SchedPriority ext.Int
|
|
// Cgroup fd, nil to disable.
|
|
Cgroup *int
|
|
// ExtraFiles passed through to initial process in the container, with
|
|
// behaviour identical to its [exec.Cmd] counterpart.
|
|
ExtraFiles []*os.File
|
|
|
|
// Write end of a pipe connected to the init to deliver [Params].
|
|
setup [2]*os.File
|
|
// Cancels the context passed to the underlying cmd.
|
|
cancel context.CancelFunc
|
|
// Closed after Wait returns. Keeps the spawning thread alive.
|
|
wait chan struct{}
|
|
|
|
Stdin io.Reader
|
|
Stdout io.Writer
|
|
Stderr io.Writer
|
|
|
|
// Custom cancellation behaviour for the underlying [exec.Cmd]. Must
|
|
// deliver [CancelSignal] before returning.
|
|
Cancel func(cmd *exec.Cmd) error
|
|
// Copied to the underlying [exec.Cmd].
|
|
WaitDelay time.Duration
|
|
|
|
cmd *exec.Cmd
|
|
ctx context.Context
|
|
msg message.Msg
|
|
Params
|
|
}
|
|
|
|
// Params holds container configuration and is safe to serialise.
|
|
Params struct {
|
|
// Working directory in the container.
|
|
Dir *check.Absolute
|
|
// Initial process environment.
|
|
Env []string
|
|
// Pathname of initial process in the container.
|
|
Path *check.Absolute
|
|
// Initial process argv.
|
|
Args []string
|
|
// Deliver SIGINT to the initial process on context cancellation.
|
|
ForwardCancel bool
|
|
// Time to wait for processes lingering after the initial process terminates.
|
|
AdoptWaitDelay time.Duration
|
|
|
|
// Mapped Uid in user namespace.
|
|
Uid int
|
|
// Mapped Gid in user namespace.
|
|
Gid int
|
|
// Hostname value in UTS namespace.
|
|
Hostname string
|
|
// Sequential container setup ops.
|
|
*Ops
|
|
|
|
// Seccomp system call filter rules.
|
|
SeccompRules []std.NativeRule
|
|
// Extra seccomp flags.
|
|
SeccompFlags seccomp.ExportFlag
|
|
// Seccomp presets. Has no effect unless SeccompRules is zero-length.
|
|
SeccompPresets std.FilterPreset
|
|
// Do not load seccomp program.
|
|
SeccompDisable bool
|
|
|
|
// Permission bits of newly created parent directories.
|
|
// The zero value is interpreted as 0755.
|
|
ParentPerm os.FileMode
|
|
// Do not syscall.Setsid.
|
|
RetainSession bool
|
|
// Do not [syscall.CLONE_NEWNET].
|
|
HostNet bool
|
|
// Do not [LANDLOCK_SCOPE_ABSTRACT_UNIX_SOCKET].
|
|
HostAbstract bool
|
|
// Retain CAP_SYS_ADMIN.
|
|
Privileged bool
|
|
}
|
|
)
|
|
|
|
// A StartError contains additional information on a container startup failure.
|
|
type StartError struct {
|
|
// Fatal suggests whether this error should be considered fatal for the entire program.
|
|
Fatal bool
|
|
// Step refers to the part of the setup this error is returned from.
|
|
Step string
|
|
// Err is the underlying error.
|
|
Err error
|
|
// Origin is whether this error originated from the [Container.Start] method.
|
|
Origin bool
|
|
// Passthrough is whether the Error method is passed through to Err.
|
|
Passthrough bool
|
|
}
|
|
|
|
func (e *StartError) Unwrap() error { return e.Err }
|
|
func (e *StartError) Error() string {
|
|
if e.Passthrough {
|
|
return e.Err.Error()
|
|
}
|
|
if e.Origin {
|
|
return e.Step
|
|
}
|
|
|
|
{
|
|
var syscallError *os.SyscallError
|
|
if errors.As(e.Err, &syscallError) && syscallError != nil {
|
|
return e.Step + " " + syscallError.Error()
|
|
}
|
|
}
|
|
|
|
return e.Step + ": " + e.Err.Error()
|
|
}
|
|
|
|
// Message returns a user-facing error message.
|
|
func (e *StartError) Message() string {
|
|
if e.Passthrough {
|
|
var (
|
|
numError *strconv.NumError
|
|
)
|
|
|
|
switch {
|
|
case errors.As(e.Err, new(*os.PathError)),
|
|
errors.As(e.Err, new(*os.SyscallError)):
|
|
return "cannot " + e.Err.Error()
|
|
|
|
case errors.As(e.Err, &numError) && numError != nil:
|
|
return "cannot parse " + strconv.Quote(numError.Num) + ": " + numError.Err.Error()
|
|
|
|
default:
|
|
return e.Err.Error()
|
|
}
|
|
}
|
|
if e.Origin {
|
|
return e.Step
|
|
}
|
|
return "cannot " + e.Error()
|
|
}
|
|
|
|
// for ensureCloseOnExec
|
|
var (
|
|
closeOnExecOnce sync.Once
|
|
closeOnExecErr error
|
|
)
|
|
|
|
// ensureCloseOnExec ensures all currently open file descriptors have the
|
|
// syscall.FD_CLOEXEC flag set.
|
|
//
|
|
// This is only ran once as it is intended to handle files left open by the
|
|
// parent, and any file opened on this side should already have
|
|
// syscall.FD_CLOEXEC set.
|
|
func ensureCloseOnExec() error {
|
|
closeOnExecOnce.Do(func() { closeOnExecErr = doCloseOnExec() })
|
|
|
|
if closeOnExecErr == nil {
|
|
return nil
|
|
}
|
|
return &StartError{
|
|
Fatal: true,
|
|
Step: "set FD_CLOEXEC on all open files",
|
|
Err: closeOnExecErr,
|
|
Passthrough: true,
|
|
}
|
|
}
|
|
|
|
// Start starts the container init. The init process blocks until Serve is called.
|
|
func (p *Container) Start() error {
|
|
if p == nil || p.cmd == nil ||
|
|
p.Ops == nil || len(*p.Ops) == 0 {
|
|
return errors.New("container: starting an invalid container")
|
|
}
|
|
if p.cmd.Process != nil {
|
|
return errors.New("container: already started")
|
|
}
|
|
|
|
if err := ensureCloseOnExec(); err != nil {
|
|
return err
|
|
}
|
|
|
|
// map to overflow id to work around ownership checks
|
|
if p.Uid < 1 {
|
|
p.Uid = OverflowUid(p.msg)
|
|
}
|
|
if p.Gid < 1 {
|
|
p.Gid = OverflowGid(p.msg)
|
|
}
|
|
|
|
if !p.RetainSession {
|
|
p.SeccompPresets |= std.PresetDenyTTY
|
|
}
|
|
|
|
if p.AdoptWaitDelay == 0 {
|
|
p.AdoptWaitDelay = 5 * time.Second
|
|
}
|
|
// to allow disabling this behaviour
|
|
if p.AdoptWaitDelay < 0 {
|
|
p.AdoptWaitDelay = 0
|
|
}
|
|
|
|
if p.cmd.Stdin == nil {
|
|
p.cmd.Stdin = p.Stdin
|
|
}
|
|
if p.cmd.Stdout == nil {
|
|
p.cmd.Stdout = p.Stdout
|
|
}
|
|
if p.cmd.Stderr == nil {
|
|
p.cmd.Stderr = p.Stderr
|
|
}
|
|
|
|
p.cmd.Args = []string{initName}
|
|
p.cmd.WaitDelay = p.WaitDelay
|
|
if p.Cancel != nil {
|
|
p.cmd.Cancel = func() error { return p.Cancel(p.cmd) }
|
|
} else {
|
|
p.cmd.Cancel = func() error { return p.cmd.Process.Signal(CancelSignal) }
|
|
}
|
|
p.cmd.Dir = fhs.Root
|
|
p.cmd.SysProcAttr = &SysProcAttr{
|
|
Setsid: !p.RetainSession,
|
|
Cloneflags: CLONE_NEWUSER | CLONE_NEWPID | CLONE_NEWNS |
|
|
CLONE_NEWIPC | CLONE_NEWUTS | CLONE_NEWCGROUP,
|
|
|
|
AmbientCaps: []uintptr{
|
|
// general container setup
|
|
CAP_SYS_ADMIN,
|
|
// drop capabilities
|
|
CAP_SETPCAP,
|
|
// bring up loopback interface
|
|
CAP_NET_ADMIN,
|
|
// overlay access to upperdir and workdir
|
|
CAP_DAC_OVERRIDE,
|
|
},
|
|
|
|
UseCgroupFD: p.Cgroup != nil,
|
|
}
|
|
if !p.AllowOrphan {
|
|
p.cmd.SysProcAttr.Pdeathsig = SIGKILL
|
|
}
|
|
if p.cmd.SysProcAttr.UseCgroupFD {
|
|
p.cmd.SysProcAttr.CgroupFD = *p.Cgroup
|
|
}
|
|
if !p.HostNet {
|
|
p.cmd.SysProcAttr.Cloneflags |= CLONE_NEWNET
|
|
}
|
|
|
|
// place setup pipe before user supplied extra files, this is later restored by init
|
|
if r, w, err := os.Pipe(); err != nil {
|
|
return &StartError{
|
|
Fatal: true,
|
|
Step: "set up params stream",
|
|
Err: err,
|
|
}
|
|
} else {
|
|
fd := 3 + len(p.cmd.ExtraFiles)
|
|
p.cmd.ExtraFiles = append(p.cmd.ExtraFiles, r)
|
|
p.setup[0], p.setup[1] = r, w
|
|
p.cmd.Env = []string{setupEnv + "=" + strconv.Itoa(fd)}
|
|
}
|
|
p.cmd.ExtraFiles = append(p.cmd.ExtraFiles, p.ExtraFiles...)
|
|
|
|
done := make(chan error, 1)
|
|
go func() {
|
|
runtime.LockOSThread()
|
|
p.wait = make(chan struct{})
|
|
|
|
// setup depending on per-thread state must happen here
|
|
done <- func() error {
|
|
// PR_SET_NO_NEW_PRIVS: thread-directed but acts on all processes
|
|
// created from the calling thread
|
|
if err := SetNoNewPrivs(); err != nil {
|
|
return &StartError{
|
|
Fatal: true,
|
|
Step: "prctl(PR_SET_NO_NEW_PRIVS)",
|
|
Err: err,
|
|
}
|
|
}
|
|
|
|
// landlock: depends on per-thread state but acts on a process group
|
|
{
|
|
rulesetAttr := &RulesetAttr{Scoped: LANDLOCK_SCOPE_SIGNAL}
|
|
if !p.HostAbstract {
|
|
rulesetAttr.Scoped |= LANDLOCK_SCOPE_ABSTRACT_UNIX_SOCKET
|
|
}
|
|
|
|
if abi, err := LandlockGetABI(); err != nil {
|
|
if p.HostAbstract || !p.HostNet {
|
|
// landlock can be skipped here as it restricts access
|
|
// to resources already covered by namespaces (pid, net)
|
|
goto landlockOut
|
|
}
|
|
return &StartError{Step: "get landlock ABI", Err: err}
|
|
} else if abi < 6 {
|
|
if p.HostAbstract {
|
|
// see above comment
|
|
goto landlockOut
|
|
}
|
|
return &StartError{
|
|
Step: "kernel too old for LANDLOCK_SCOPE_ABSTRACT_UNIX_SOCKET",
|
|
Err: ENOSYS,
|
|
Origin: true,
|
|
}
|
|
} else {
|
|
p.msg.Verbosef("landlock abi version %d", abi)
|
|
}
|
|
|
|
if rulesetFd, err := rulesetAttr.Create(0); err != nil {
|
|
return &StartError{
|
|
Fatal: true,
|
|
Step: "create landlock ruleset",
|
|
Err: err,
|
|
}
|
|
} else {
|
|
p.msg.Verbosef("enforcing landlock ruleset %s", rulesetAttr)
|
|
if err = LandlockRestrictSelf(rulesetFd, 0); err != nil {
|
|
_ = Close(rulesetFd)
|
|
return &StartError{
|
|
Fatal: true,
|
|
Step: "enforce landlock ruleset",
|
|
Err: err,
|
|
}
|
|
}
|
|
if err = Close(rulesetFd); err != nil {
|
|
p.msg.Verbosef("cannot close landlock ruleset: %v", err)
|
|
// not fatal
|
|
}
|
|
}
|
|
|
|
landlockOut:
|
|
}
|
|
|
|
// sched_setscheduler: thread-directed but acts on all processes
|
|
// created from the calling thread
|
|
if p.SetScheduler {
|
|
if p.SchedPolicy < 0 || p.SchedPolicy > ext.SCHED_LAST {
|
|
return &StartError{
|
|
Fatal: false,
|
|
Step: "set scheduling policy",
|
|
Err: EINVAL,
|
|
}
|
|
}
|
|
|
|
var param schedParam
|
|
if priority, err := p.SchedPolicy.GetPriorityMin(); err != nil {
|
|
return &StartError{
|
|
Fatal: true,
|
|
Step: "get minimum priority",
|
|
Err: err,
|
|
}
|
|
} else {
|
|
param.priority = max(priority, p.SchedPriority)
|
|
}
|
|
|
|
p.msg.Verbosef(
|
|
"setting scheduling policy %s priority %d",
|
|
p.SchedPolicy, param.priority,
|
|
)
|
|
if err := schedSetscheduler(
|
|
0, // calling thread
|
|
p.SchedPolicy,
|
|
¶m,
|
|
); err != nil {
|
|
return &StartError{
|
|
Fatal: true,
|
|
Step: "set scheduling policy",
|
|
Err: err,
|
|
}
|
|
}
|
|
}
|
|
|
|
p.msg.Verbose("starting container init")
|
|
if err := p.cmd.Start(); err != nil {
|
|
return &StartError{
|
|
Step: "start container init",
|
|
Err: err,
|
|
Passthrough: true,
|
|
}
|
|
}
|
|
return nil
|
|
}()
|
|
|
|
// keep this thread alive until Wait returns for cancel
|
|
<-p.wait
|
|
}()
|
|
return <-done
|
|
}
|
|
|
|
// Serve serves [Container.Params] to the container init.
|
|
//
|
|
// Serve must only be called once.
|
|
func (p *Container) Serve() (err error) {
|
|
if p.setup[0] == nil || p.setup[1] == nil {
|
|
panic("invalid serve")
|
|
}
|
|
defer func() {
|
|
if closeErr := p.setup[1].Close(); err == nil {
|
|
err = closeErr
|
|
}
|
|
|
|
if err != nil {
|
|
p.cancel()
|
|
}
|
|
p.setup[0], p.setup[1] = nil, nil
|
|
}()
|
|
if err = p.setup[0].Close(); err != nil {
|
|
return &StartError{
|
|
Fatal: true,
|
|
Step: "close read end of init pipe",
|
|
Err: err,
|
|
Passthrough: true,
|
|
}
|
|
}
|
|
|
|
if err = p.setup[1].SetDeadline(time.Now().Add(initSetupTimeout)); err != nil {
|
|
return &StartError{
|
|
Fatal: true,
|
|
Step: "set init pipe deadline",
|
|
Err: err,
|
|
Passthrough: true,
|
|
}
|
|
}
|
|
|
|
if p.Path == nil {
|
|
return &StartError{
|
|
Step: "invalid executable pathname",
|
|
Err: EINVAL,
|
|
Origin: true,
|
|
}
|
|
}
|
|
|
|
// do not transmit nil
|
|
if p.Dir == nil {
|
|
p.Dir = fhs.AbsRoot
|
|
}
|
|
if p.SeccompRules == nil {
|
|
p.SeccompRules = make([]std.NativeRule, 0)
|
|
}
|
|
|
|
return gob.NewEncoder(p.setup[1]).Encode(&initParams{
|
|
p.Params,
|
|
Getuid(),
|
|
Getgid(),
|
|
len(p.ExtraFiles),
|
|
p.msg.IsVerbose(),
|
|
})
|
|
}
|
|
|
|
// Wait blocks until the container init process to exit and releases any
|
|
// resources associated with the [Container].
|
|
func (p *Container) Wait() error {
|
|
if p.cmd == nil || p.cmd.Process == nil {
|
|
return EINVAL
|
|
}
|
|
|
|
err := p.cmd.Wait()
|
|
p.cancel()
|
|
if p.wait != nil && err == nil {
|
|
close(p.wait)
|
|
}
|
|
return err
|
|
}
|
|
|
|
// StdinPipe calls the [exec.Cmd] method with the same name.
|
|
func (p *Container) StdinPipe() (w io.WriteCloser, err error) {
|
|
if p.Stdin != nil {
|
|
return nil, errors.New("container: Stdin already set")
|
|
}
|
|
w, err = p.cmd.StdinPipe()
|
|
p.Stdin = p.cmd.Stdin
|
|
return
|
|
}
|
|
|
|
// StdoutPipe calls the [exec.Cmd] method with the same name.
|
|
func (p *Container) StdoutPipe() (r io.ReadCloser, err error) {
|
|
if p.Stdout != nil {
|
|
return nil, errors.New("container: Stdout already set")
|
|
}
|
|
r, err = p.cmd.StdoutPipe()
|
|
p.Stdout = p.cmd.Stdout
|
|
return
|
|
}
|
|
|
|
// StderrPipe calls the [exec.Cmd] method with the same name.
|
|
func (p *Container) StderrPipe() (r io.ReadCloser, err error) {
|
|
if p.Stderr != nil {
|
|
return nil, errors.New("container: Stderr already set")
|
|
}
|
|
r, err = p.cmd.StderrPipe()
|
|
p.Stderr = p.cmd.Stderr
|
|
return
|
|
}
|
|
|
|
func (p *Container) String() string {
|
|
return fmt.Sprintf(
|
|
"argv: %q, filter: %v, rules: %d, flags: %#x, presets: %#x",
|
|
p.Args, !p.SeccompDisable, len(p.SeccompRules), int(p.SeccompFlags), int(p.SeccompPresets),
|
|
)
|
|
}
|
|
|
|
// ProcessState returns the address of os.ProcessState held by the underlying [exec.Cmd].
|
|
func (p *Container) ProcessState() *os.ProcessState {
|
|
if p.cmd == nil {
|
|
return nil
|
|
}
|
|
return p.cmd.ProcessState
|
|
}
|
|
|
|
// New returns the address to a new instance of [Container]. This value requires
|
|
// further initialisation before use.
|
|
func New(ctx context.Context, msg message.Msg) *Container {
|
|
if msg == nil {
|
|
msg = message.New(nil)
|
|
}
|
|
|
|
p := &Container{ctx: ctx, msg: msg, Params: Params{Ops: new(Ops)}}
|
|
c, cancel := context.WithCancel(ctx)
|
|
p.cancel = cancel
|
|
p.cmd = exec.CommandContext(c, fhs.ProcSelfExe)
|
|
return p
|
|
}
|
|
|
|
// NewCommand calls [New] and initialises the [Params.Path] and [Params.Args] fields.
|
|
func NewCommand(
|
|
ctx context.Context,
|
|
msg message.Msg,
|
|
pathname *check.Absolute,
|
|
name string,
|
|
args ...string,
|
|
) *Container {
|
|
z := New(ctx, msg)
|
|
z.Path = pathname
|
|
z.Args = append([]string{name}, args...)
|
|
return z
|
|
}
|