Files
hakurei/container/init.go
Ophestra cd9b534d6b
All checks were successful
Test / Create distribution (push) Successful in 1m16s
Test / Sandbox (push) Successful in 3m2s
Test / Hakurei (push) Successful in 4m4s
Test / ShareFS (push) Successful in 4m17s
Test / Hpkg (push) Successful in 4m49s
Test / Sandbox (race detector) (push) Successful in 5m22s
Test / Hakurei (race detector) (push) Successful in 6m30s
Test / Flake checks (push) Successful in 1m48s
container: improve documentation
This change removes inconsistencies collected over time in this package.

Signed-off-by: Ophestra <cat@gensokyo.uk>
2026-02-28 20:18:30 +09:00

572 lines
16 KiB
Go

package container
import (
"context"
"errors"
"fmt"
"log"
"os"
"os/exec"
"path"
"slices"
"strconv"
"sync"
"sync/atomic"
. "syscall"
"time"
"hakurei.app/container/fhs"
"hakurei.app/container/seccomp"
"hakurei.app/message"
)
const (
/* intermediateHostPath is the pathname of the intermediate tmpfs mount point.
This path might seem like a weird choice, however there are many good reasons to use it:
- The contents of this path is never exposed to the container:
The tmpfs root established here effectively becomes anonymous after pivot_root.
- It is safe to assume this path exists and is a directory:
This program will not work correctly without a proper /proc and neither will most others.
- This path belongs to the container init:
The container init is not any more privileged or trusted than the rest of the container.
- This path is only accessible by init and root:
The container init sets SUID_DUMP_DISABLE and terminates if that fails.
It should be noted that none of this should become relevant at any point
since the resulting intermediate root tmpfs should be effectively anonymous. */
intermediateHostPath = fhs.Proc + "self/fd"
// setupEnv is the name of the environment variable holding the string
// representation of the read end file descriptor of the setup params pipe.
setupEnv = "HAKUREI_SETUP"
// exitUnexpectedWait4 is the exit code if wait4 returns an unexpected errno.
exitUnexpectedWait4 = 2
)
type (
// Ops is a collection of [Op].
Ops []Op
// Op is a generic setup step ran inside the container init.
// Implementations of this interface are sent as a stream of gobs.
Op interface {
// early is called in host root.
early(state *setupState, k syscallDispatcher) error
// apply is called in intermediate root.
apply(state *setupState, k syscallDispatcher) error
// late is called right before starting the initial process.
late(state *setupState, k syscallDispatcher) error
// prefix returns a log message prefix, and whether this Op prints no
// identifying message on its own.
prefix() (string, bool)
Is(op Op) bool
Valid() bool
fmt.Stringer
}
// setupState persists context between Ops.
setupState struct {
nonrepeatable uintptr
// Whether early reaping has concluded. Must only be accessed in the
// wait4 loop.
processConcluded bool
// Process to syscall.WaitStatus populated in the wait4 loop. Freed
// after early reaping concludes.
process map[int]WaitStatus
// Synchronises access to process.
processMu sync.RWMutex
*Params
context.Context
message.Msg
}
)
// terminated returns whether the specified pid has been reaped, and its
// syscall.WaitStatus if it had. This is only usable by [Op].
func (state *setupState) terminated(pid int) (wstatus WaitStatus, ok bool) {
state.processMu.RLock()
wstatus, ok = state.process[pid]
state.processMu.RUnlock()
return
}
// Grow grows the slice Ops points to using [slices.Grow].
func (f *Ops) Grow(n int) { *f = slices.Grow(*f, n) }
const (
nrAutoEtc = 1 << iota
nrAutoRoot
)
// OpRepeatError is returned applying a repeated nonrepeatable [Op].
type OpRepeatError string
func (e OpRepeatError) Error() string { return string(e) + " is not repeatable" }
// OpStateError indicates an impossible internal state has been reached in an [Op].
type OpStateError string
func (o OpStateError) Error() string { return "impossible " + string(o) + " state reached" }
// initParams are params passed from parent.
type initParams struct {
Params
HostUid, HostGid int
// extra files count
Count int
// verbosity pass through
Verbose bool
}
// Init is called by [TryArgv0] if the current process is the container init.
func Init(msg message.Msg) { initEntrypoint(direct{}, msg) }
func initEntrypoint(k syscallDispatcher, msg message.Msg) {
k.lockOSThread()
if msg == nil {
panic("attempting to call initEntrypoint with nil msg")
}
if k.getpid() != 1 {
k.fatal(msg, "this process must run as pid 1")
}
if err := k.setPtracer(0); err != nil {
msg.Verbosef("cannot enable ptrace protection via Yama LSM: %v", err)
// not fatal: this program has no additional privileges at initial program start
}
var (
params initParams
closeSetup func() error
setupFd uintptr
offsetSetup int
)
if f, err := k.receive(setupEnv, &params, &setupFd); err != nil {
if errors.Is(err, EBADF) {
k.fatal(msg, "invalid setup descriptor")
}
if errors.Is(err, ErrReceiveEnv) {
k.fatal(msg, setupEnv+" not set")
}
k.fatalf(msg, "cannot decode init setup payload: %v", err)
} else {
if params.Ops == nil {
k.fatal(msg, "invalid setup parameters")
}
if params.ParentPerm == 0 {
params.ParentPerm = 0755
}
msg.SwapVerbose(params.Verbose)
msg.Verbose("received setup parameters")
closeSetup = f
offsetSetup = int(setupFd + 1)
}
if !params.HostNet {
k.mustLoopback(msg)
}
// write uid/gid map here so parent does not need to set dumpable
if err := k.setDumpable(SUID_DUMP_USER); err != nil {
k.fatalf(msg, "cannot set SUID_DUMP_USER: %v", err)
}
if err := k.writeFile(fhs.Proc+"self/uid_map",
append([]byte{}, strconv.Itoa(params.Uid)+" "+strconv.Itoa(params.HostUid)+" 1\n"...),
0); err != nil {
k.fatalf(msg, "%v", err)
}
if err := k.writeFile(fhs.Proc+"self/setgroups",
[]byte("deny\n"),
0); err != nil && !os.IsNotExist(err) {
k.fatalf(msg, "%v", err)
}
if err := k.writeFile(fhs.Proc+"self/gid_map",
append([]byte{}, strconv.Itoa(params.Gid)+" "+strconv.Itoa(params.HostGid)+" 1\n"...),
0); err != nil {
k.fatalf(msg, "%v", err)
}
if err := k.setDumpable(SUID_DUMP_DISABLE); err != nil {
k.fatalf(msg, "cannot set SUID_DUMP_DISABLE: %v", err)
}
oldmask := k.umask(0)
if params.Hostname != "" {
if err := k.sethostname([]byte(params.Hostname)); err != nil {
k.fatalf(msg, "cannot set hostname: %v", err)
}
}
// cache sysctl before pivot_root
lastcap := k.lastcap(msg)
if err := k.mount(zeroString, fhs.Root, zeroString, MS_SILENT|MS_SLAVE|MS_REC, zeroString); err != nil {
k.fatalf(msg, "cannot make / rslave: %v", optionalErrorUnwrap(err))
}
ctx, cancel := context.WithCancel(context.Background())
state := &setupState{process: make(map[int]WaitStatus), Params: &params.Params, Msg: msg, Context: ctx}
defer cancel()
/* early is called right before pivot_root into intermediate root;
this step is mostly for gathering information that would otherwise be
difficult to obtain via library functions after pivot_root, and
implementations are expected to avoid changing the state of the mount
namespace */
for i, op := range *params.Ops {
if op == nil || !op.Valid() {
k.fatalf(msg, "invalid op at index %d", i)
}
if err := op.early(state, k); err != nil {
if m, ok := messageFromError(err); ok {
k.fatal(msg, m)
} else {
k.fatalf(msg, "cannot prepare op at index %d: %v", i, err)
}
}
}
if err := k.mount(SourceTmpfsRootfs, intermediateHostPath, FstypeTmpfs, MS_NODEV|MS_NOSUID, zeroString); err != nil {
k.fatalf(msg, "cannot mount intermediate root: %v", optionalErrorUnwrap(err))
}
if err := k.chdir(intermediateHostPath); err != nil {
k.fatalf(msg, "cannot enter intermediate host path: %v", err)
}
if err := k.mkdir(sysrootDir, 0755); err != nil {
k.fatalf(msg, "%v", err)
}
if err := k.mount(sysrootDir, sysrootDir, zeroString, MS_SILENT|MS_BIND|MS_REC, zeroString); err != nil {
k.fatalf(msg, "cannot bind sysroot: %v", optionalErrorUnwrap(err))
}
if err := k.mkdir(hostDir, 0755); err != nil {
k.fatalf(msg, "%v", err)
}
// pivot_root uncovers intermediateHostPath in hostDir
if err := k.pivotRoot(intermediateHostPath, hostDir); err != nil {
k.fatalf(msg, "cannot pivot into intermediate root: %v", err)
}
if err := k.chdir(fhs.Root); err != nil {
k.fatalf(msg, "cannot enter intermediate root: %v", err)
}
/* apply is called right after pivot_root and entering the new root. This
step sets up the container filesystem, and implementations are expected to
keep the host root and sysroot mount points intact but otherwise can do
whatever they need to. Calling chdir is allowed but discouraged. */
for i, op := range *params.Ops {
// ops already checked during early setup
if prefix, ok := op.prefix(); ok {
msg.Verbosef("%s %s", prefix, op)
}
if err := op.apply(state, k); err != nil {
if m, ok := messageFromError(err); ok {
k.fatal(msg, m)
} else {
k.fatalf(msg, "cannot apply op at index %d: %v", i, err)
}
}
}
// setup requiring host root complete at this point
if err := k.mount(hostDir, hostDir, zeroString, MS_SILENT|MS_REC|MS_PRIVATE, zeroString); err != nil {
k.fatalf(msg, "cannot make host root rprivate: %v", optionalErrorUnwrap(err))
}
if err := k.unmount(hostDir, MNT_DETACH); err != nil {
k.fatalf(msg, "cannot unmount host root: %v", err)
}
{
var fd int
if err := IgnoringEINTR(func() (err error) {
fd, err = k.open(fhs.Root, O_DIRECTORY|O_RDONLY, 0)
return
}); err != nil {
k.fatalf(msg, "cannot open intermediate root: %v", err)
}
if err := k.chdir(sysrootPath); err != nil {
k.fatalf(msg, "cannot enter sysroot: %v", err)
}
if err := k.pivotRoot(".", "."); err != nil {
k.fatalf(msg, "cannot pivot into sysroot: %v", err)
}
if err := k.fchdir(fd); err != nil {
k.fatalf(msg, "cannot re-enter intermediate root: %v", err)
}
if err := k.unmount(".", MNT_DETACH); err != nil {
k.fatalf(msg, "cannot unmount intermediate root: %v", err)
}
if err := k.chdir(fhs.Root); err != nil {
k.fatalf(msg, "cannot enter root: %v", err)
}
if err := k.close(fd); err != nil {
k.fatalf(msg, "cannot close intermediate root: %v", err)
}
}
if err := k.capAmbientClearAll(); err != nil {
k.fatalf(msg, "cannot clear the ambient capability set: %v", err)
}
for i := uintptr(0); i <= lastcap; i++ {
if params.Privileged && i == CAP_SYS_ADMIN {
continue
}
if err := k.capBoundingSetDrop(i); err != nil {
k.fatalf(msg, "cannot drop capability from bounding set: %v", err)
}
}
var keep [2]uint32
if params.Privileged {
keep[capToIndex(CAP_SYS_ADMIN)] |= capToMask(CAP_SYS_ADMIN)
if err := k.capAmbientRaise(CAP_SYS_ADMIN); err != nil {
k.fatalf(msg, "cannot raise CAP_SYS_ADMIN: %v", err)
}
}
if err := k.capset(
&capHeader{_LINUX_CAPABILITY_VERSION_3, 0},
&[2]capData{{0, keep[0], keep[0]}, {0, keep[1], keep[1]}},
); err != nil {
k.fatalf(msg, "cannot capset: %v", err)
}
if !params.SeccompDisable {
rules := params.SeccompRules
if len(rules) == 0 { // non-empty rules slice always overrides presets
msg.Verbosef("resolving presets %#x", params.SeccompPresets)
rules = seccomp.Preset(params.SeccompPresets, params.SeccompFlags)
}
if err := k.seccompLoad(rules, params.SeccompFlags); err != nil {
// this also indirectly asserts PR_SET_NO_NEW_PRIVS
k.fatalf(msg, "cannot load syscall filter: %v", err)
}
msg.Verbosef("%d filter rules loaded", len(rules))
} else {
msg.Verbose("syscall filter not configured")
}
extraFiles := make([]*os.File, params.Count)
for i := range extraFiles {
// setup fd is placed before all extra files
extraFiles[i] = k.newFile(uintptr(offsetSetup+i), "extra file "+strconv.Itoa(i))
}
k.umask(oldmask)
// winfo represents an exited process from wait4.
type winfo struct {
wpid int
wstatus WaitStatus
}
// info is closed as the wait4 thread terminates
// when there are no longer any processes left to reap
info := make(chan winfo, 1)
// whether initial process has started
var initialProcessStarted atomic.Bool
k.new(func(k syscallDispatcher) {
k.lockOSThread()
wait4:
var (
err error
wpid = -2
wstatus WaitStatus
// whether initial process has started
started bool
)
// keep going until no child process is left
for wpid != -1 {
if err != nil {
break
}
if wpid != -2 {
if !state.processConcluded {
state.processMu.Lock()
if state.process == nil {
// early reaping has already concluded at this point
state.processConcluded = true
info <- winfo{wpid, wstatus}
} else {
// initial process has not yet been created, and the
// info channel is not yet being received from
state.process[wpid] = wstatus
}
state.processMu.Unlock()
} else {
info <- winfo{wpid, wstatus}
}
}
if !started {
started = initialProcessStarted.Load()
}
err = EINTR
for errors.Is(err, EINTR) {
wpid, err = k.wait4(-1, &wstatus, 0, nil)
}
}
if !errors.Is(err, ECHILD) {
k.printf(msg, "unexpected wait4 response: %v", err)
} else if !started {
// initial process has not yet been reached and all daemons
// terminated or none were started in the first place
time.Sleep(500 * time.Microsecond)
goto wait4
}
close(info)
})
// called right before startup of initial process, all state changes to the
// current process is prohibited during late
for i, op := range *params.Ops {
// ops already checked during early setup
if err := op.late(state, k); err != nil {
if m, ok := messageFromError(err); ok {
k.fatal(msg, m)
} else if errors.Is(err, context.DeadlineExceeded) {
k.fatalf(msg, "%s deadline exceeded", op.String())
} else {
k.fatalf(msg, "cannot complete op at index %d: %v", i, err)
}
}
}
// early reaping has concluded, this must happen before initial process is created
state.processMu.Lock()
state.process = nil
state.processMu.Unlock()
if err := closeSetup(); err != nil {
k.fatalf(msg, "cannot close setup pipe: %v", err)
}
cmd := exec.Command(params.Path.String())
cmd.Stdin, cmd.Stdout, cmd.Stderr = os.Stdin, os.Stdout, os.Stderr
cmd.Args = params.Args
cmd.Env = params.Env
cmd.ExtraFiles = extraFiles
cmd.Dir = params.Dir.String()
msg.Verbosef("starting initial process %s", params.Path)
if err := k.start(cmd); err != nil {
k.fatalf(msg, "%v", err)
}
initialProcessStarted.Store(true)
// handle signals to dump withheld messages
sig := make(chan os.Signal, 2)
k.notify(sig, CancelSignal,
os.Interrupt, SIGTERM, SIGQUIT)
// closed after residualProcessTimeout has elapsed after initial process death
timeout := make(chan struct{})
r := exitUnexpectedWait4
for {
select {
case s := <-sig:
if s == CancelSignal && params.ForwardCancel && cmd.Process != nil {
msg.Verbose("forwarding context cancellation")
if err := k.signal(cmd, os.Interrupt); err != nil {
k.printf(msg, "cannot forward cancellation: %v", err)
}
continue
}
if s == SIGTERM || s == SIGQUIT {
msg.Verbosef("got %s, forwarding to initial process", s.String())
if err := k.signal(cmd, s); err != nil {
k.printf(msg, "cannot forward signal: %v", err)
}
continue
}
msg.Verbosef("got %s", s.String())
msg.BeforeExit()
k.exit(0)
case w, ok := <-info:
if !ok {
msg.BeforeExit()
k.exit(r)
continue // unreachable
}
if w.wpid == cmd.Process.Pid {
// cancel Op context early
cancel()
// start timeout early
go func() { time.Sleep(params.AdoptWaitDelay); close(timeout) }()
// close initial process files; this also keeps them alive
for _, f := range extraFiles {
if err := f.Close(); err != nil {
msg.Verbose(err.Error())
}
}
switch {
case w.wstatus.Exited():
r = w.wstatus.ExitStatus()
msg.Verbosef("initial process exited with code %d", w.wstatus.ExitStatus())
case w.wstatus.Signaled():
r = 128 + int(w.wstatus.Signal())
msg.Verbosef("initial process exited with signal %s", w.wstatus.Signal())
default:
r = 255
msg.Verbosef("initial process exited with status %#x", w.wstatus)
}
}
case <-timeout:
k.printf(msg, "timeout exceeded waiting for lingering processes")
msg.BeforeExit()
k.exit(r)
}
}
}
// initName is the prefix used by log.std in the init process.
const initName = "init"
// TryArgv0 calls [Init] if the last element of argv0 is "init".
// If a nil msg is passed, the system logger is used instead.
func TryArgv0(msg message.Msg) {
if msg == nil {
log.SetPrefix(initName + ": ")
log.SetFlags(0)
msg = message.New(log.Default())
}
if len(os.Args) > 0 && path.Base(os.Args[0]) == initName {
Init(msg)
msg.BeforeExit()
os.Exit(0)
}
}