All checks were successful
		
		
	
	Test / Create distribution (pull_request) Successful in 33s
				
			Test / Sandbox (pull_request) Successful in 2m10s
				
			Test / Hpkg (pull_request) Successful in 4m1s
				
			Test / Sandbox (race detector) (pull_request) Successful in 4m19s
				
			Test / Hakurei (pull_request) Successful in 4m55s
				
			Test / Hakurei (race detector) (pull_request) Successful in 5m0s
				
			Test / Create distribution (push) Successful in 27s
				
			Test / Sandbox (race detector) (push) Successful in 44s
				
			Test / Sandbox (push) Successful in 44s
				
			Test / Hakurei (push) Successful in 47s
				
			Test / Hakurei (race detector) (push) Successful in 47s
				
			Test / Hpkg (push) Successful in 45s
				
			Test / Flake checks (pull_request) Successful in 1m47s
				
			Test / Flake checks (push) Successful in 1m36s
				
			
		
			
				
	
	
		
			325 lines
		
	
	
		
			8.2 KiB
		
	
	
	
		
			Go
		
	
	
	
	
	
			
		
		
	
	
			325 lines
		
	
	
		
			8.2 KiB
		
	
	
	
		
			Go
		
	
	
	
	
	
| // Package container implements unprivileged Linux containers with built-in support for syscall filtering.
 | |
| package container
 | |
| 
 | |
| import (
 | |
| 	"context"
 | |
| 	"encoding/gob"
 | |
| 	"errors"
 | |
| 	"fmt"
 | |
| 	"io"
 | |
| 	"os"
 | |
| 	"os/exec"
 | |
| 	"runtime"
 | |
| 	"strconv"
 | |
| 	. "syscall"
 | |
| 	"time"
 | |
| 
 | |
| 	"hakurei.app/container/seccomp"
 | |
| )
 | |
| 
 | |
// CancelSignal is the signal that container init expects to receive when the
// context is cancelled. Any custom [Container.Cancel] function must eventually
// deliver this signal.
const CancelSignal = SIGTERM
 | |
| 
 | |
| type (
 | |
| 	// Container represents a container environment being prepared or run.
 | |
| 	// None of [Container] methods are safe for concurrent use.
 | |
| 	Container struct {
 | |
| 		// Cgroup fd, nil to disable.
 | |
| 		Cgroup *int
 | |
| 		// ExtraFiles passed through to initial process in the container,
 | |
| 		// with behaviour identical to its [exec.Cmd] counterpart.
 | |
| 		ExtraFiles []*os.File
 | |
| 
 | |
| 		// param encoder for shim and init
 | |
| 		setup *gob.Encoder
 | |
| 		// cancels cmd
 | |
| 		cancel context.CancelFunc
 | |
| 		// closed after Wait returns
 | |
| 		wait chan struct{}
 | |
| 
 | |
| 		Stdin  io.Reader
 | |
| 		Stdout io.Writer
 | |
| 		Stderr io.Writer
 | |
| 
 | |
| 		Cancel    func(cmd *exec.Cmd) error
 | |
| 		WaitDelay time.Duration
 | |
| 
 | |
| 		cmd *exec.Cmd
 | |
| 		ctx context.Context
 | |
| 		Params
 | |
| 	}
 | |
| 
 | |
| 	// Params holds container configuration and is safe to serialise.
 | |
| 	Params struct {
 | |
| 		// Working directory in the container.
 | |
| 		Dir *Absolute
 | |
| 		// Initial process environment.
 | |
| 		Env []string
 | |
| 		// Pathname of initial process in the container.
 | |
| 		Path *Absolute
 | |
| 		// Initial process argv.
 | |
| 		Args []string
 | |
| 		// Deliver SIGINT to the initial process on context cancellation.
 | |
| 		ForwardCancel bool
 | |
| 		// time to wait for linger processes after death of initial process
 | |
| 		AdoptWaitDelay time.Duration
 | |
| 
 | |
| 		// Mapped Uid in user namespace.
 | |
| 		Uid int
 | |
| 		// Mapped Gid in user namespace.
 | |
| 		Gid int
 | |
| 		// Hostname value in UTS namespace.
 | |
| 		Hostname string
 | |
| 		// Sequential container setup ops.
 | |
| 		*Ops
 | |
| 
 | |
| 		// Seccomp system call filter rules.
 | |
| 		SeccompRules []seccomp.NativeRule
 | |
| 		// Extra seccomp flags.
 | |
| 		SeccompFlags seccomp.ExportFlag
 | |
| 		// Seccomp presets. Has no effect unless SeccompRules is zero-length.
 | |
| 		SeccompPresets seccomp.FilterPreset
 | |
| 		// Do not load seccomp program.
 | |
| 		SeccompDisable bool
 | |
| 
 | |
| 		// Permission bits of newly created parent directories.
 | |
| 		// The zero value is interpreted as 0755.
 | |
| 		ParentPerm os.FileMode
 | |
| 		// Do not syscall.Setsid.
 | |
| 		RetainSession bool
 | |
| 		// Do not [syscall.CLONE_NEWNET].
 | |
| 		HostNet bool
 | |
| 		// Do not [LANDLOCK_SCOPE_ABSTRACT_UNIX_SOCKET].
 | |
| 		HostAbstract bool
 | |
| 		// Retain CAP_SYS_ADMIN.
 | |
| 		Privileged bool
 | |
| 	}
 | |
| )
 | |
| 
 | |
| // Start starts the container init. The init process blocks until Serve is called.
 | |
| func (p *Container) Start() error {
 | |
| 	if p.cmd != nil {
 | |
| 		return errors.New("container: already started")
 | |
| 	}
 | |
| 	if p.Ops == nil || len(*p.Ops) == 0 {
 | |
| 		return errors.New("container: starting an empty container")
 | |
| 	}
 | |
| 
 | |
| 	ctx, cancel := context.WithCancel(p.ctx)
 | |
| 	p.cancel = cancel
 | |
| 
 | |
| 	// map to overflow id to work around ownership checks
 | |
| 	if p.Uid < 1 {
 | |
| 		p.Uid = OverflowUid()
 | |
| 	}
 | |
| 	if p.Gid < 1 {
 | |
| 		p.Gid = OverflowGid()
 | |
| 	}
 | |
| 
 | |
| 	if !p.RetainSession {
 | |
| 		p.SeccompPresets |= seccomp.PresetDenyTTY
 | |
| 	}
 | |
| 
 | |
| 	if p.AdoptWaitDelay == 0 {
 | |
| 		p.AdoptWaitDelay = 5 * time.Second
 | |
| 	}
 | |
| 	// to allow disabling this behaviour
 | |
| 	if p.AdoptWaitDelay < 0 {
 | |
| 		p.AdoptWaitDelay = 0
 | |
| 	}
 | |
| 
 | |
| 	p.cmd = exec.CommandContext(ctx, MustExecutable())
 | |
| 	p.cmd.Args = []string{initName}
 | |
| 	p.cmd.Stdin, p.cmd.Stdout, p.cmd.Stderr = p.Stdin, p.Stdout, p.Stderr
 | |
| 	p.cmd.WaitDelay = p.WaitDelay
 | |
| 	if p.Cancel != nil {
 | |
| 		p.cmd.Cancel = func() error { return p.Cancel(p.cmd) }
 | |
| 	} else {
 | |
| 		p.cmd.Cancel = func() error { return p.cmd.Process.Signal(CancelSignal) }
 | |
| 	}
 | |
| 	p.cmd.Dir = FHSRoot
 | |
| 	p.cmd.SysProcAttr = &SysProcAttr{
 | |
| 		Setsid:    !p.RetainSession,
 | |
| 		Pdeathsig: SIGKILL,
 | |
| 		Cloneflags: CLONE_NEWUSER | CLONE_NEWPID | CLONE_NEWNS |
 | |
| 			CLONE_NEWIPC | CLONE_NEWUTS | CLONE_NEWCGROUP,
 | |
| 
 | |
| 		AmbientCaps: []uintptr{
 | |
| 			// general container setup
 | |
| 			CAP_SYS_ADMIN,
 | |
| 			// drop capabilities
 | |
| 			CAP_SETPCAP,
 | |
| 			// overlay access to upperdir and workdir
 | |
| 			CAP_DAC_OVERRIDE,
 | |
| 		},
 | |
| 
 | |
| 		UseCgroupFD: p.Cgroup != nil,
 | |
| 	}
 | |
| 	if p.cmd.SysProcAttr.UseCgroupFD {
 | |
| 		p.cmd.SysProcAttr.CgroupFD = *p.Cgroup
 | |
| 	}
 | |
| 	if !p.HostNet {
 | |
| 		p.cmd.SysProcAttr.Cloneflags |= CLONE_NEWNET
 | |
| 	}
 | |
| 
 | |
| 	// place setup pipe before user supplied extra files, this is later restored by init
 | |
| 	if fd, e, err := Setup(&p.cmd.ExtraFiles); err != nil {
 | |
| 		return wrapErrSuffix(err,
 | |
| 			"cannot create shim setup pipe:")
 | |
| 	} else {
 | |
| 		p.setup = e
 | |
| 		p.cmd.Env = []string{setupEnv + "=" + strconv.Itoa(fd)}
 | |
| 	}
 | |
| 	p.cmd.ExtraFiles = append(p.cmd.ExtraFiles, p.ExtraFiles...)
 | |
| 
 | |
| 	done := make(chan error, 1)
 | |
| 	go func() {
 | |
| 		runtime.LockOSThread()
 | |
| 		p.wait = make(chan struct{})
 | |
| 
 | |
| 		done <- func() error { // setup depending on per-thread state must happen here
 | |
| 			// PR_SET_NO_NEW_PRIVS: depends on per-thread state but acts on all processes created from that thread
 | |
| 			if err := SetNoNewPrivs(); err != nil {
 | |
| 				return wrapErrSuffix(err,
 | |
| 					"prctl(PR_SET_NO_NEW_PRIVS):")
 | |
| 			}
 | |
| 
 | |
| 			// landlock: depends on per-thread state but acts on a process group
 | |
| 			{
 | |
| 				rulesetAttr := &RulesetAttr{Scoped: LANDLOCK_SCOPE_SIGNAL}
 | |
| 				if !p.HostAbstract {
 | |
| 					rulesetAttr.Scoped |= LANDLOCK_SCOPE_ABSTRACT_UNIX_SOCKET
 | |
| 				}
 | |
| 
 | |
| 				if abi, err := LandlockGetABI(); err != nil {
 | |
| 					if p.HostAbstract {
 | |
| 						// landlock can be skipped here as it restricts access to resources
 | |
| 						// already covered by namespaces (pid)
 | |
| 						goto landlockOut
 | |
| 					}
 | |
| 					return wrapErrSuffix(err,
 | |
| 						"landlock does not appear to be enabled:")
 | |
| 				} else if abi < 6 {
 | |
| 					if p.HostAbstract {
 | |
| 						// see above comment
 | |
| 						goto landlockOut
 | |
| 					}
 | |
| 					return msg.WrapErr(ENOSYS,
 | |
| 						"kernel version too old for LANDLOCK_SCOPE_ABSTRACT_UNIX_SOCKET")
 | |
| 				} else {
 | |
| 					msg.Verbosef("landlock abi version %d", abi)
 | |
| 				}
 | |
| 
 | |
| 				if rulesetFd, err := rulesetAttr.Create(0); err != nil {
 | |
| 					return wrapErrSuffix(err,
 | |
| 						"cannot create landlock ruleset:")
 | |
| 				} else {
 | |
| 					msg.Verbosef("enforcing landlock ruleset %s", rulesetAttr)
 | |
| 					if err = LandlockRestrictSelf(rulesetFd, 0); err != nil {
 | |
| 						_ = Close(rulesetFd)
 | |
| 						return wrapErrSuffix(err,
 | |
| 							"cannot enforce landlock ruleset:")
 | |
| 					}
 | |
| 					if err = Close(rulesetFd); err != nil {
 | |
| 						msg.Verbosef("cannot close landlock ruleset: %v", err)
 | |
| 						// not fatal
 | |
| 					}
 | |
| 				}
 | |
| 
 | |
| 			landlockOut:
 | |
| 			}
 | |
| 
 | |
| 			msg.Verbose("starting container init")
 | |
| 			if err := p.cmd.Start(); err != nil {
 | |
| 				return msg.WrapErr(err, err.Error())
 | |
| 			}
 | |
| 			return nil
 | |
| 		}()
 | |
| 
 | |
| 		// keep this thread alive until Wait returns for cancel
 | |
| 		<-p.wait
 | |
| 	}()
 | |
| 	return <-done
 | |
| }
 | |
| 
 | |
| // Serve serves [Container.Params] to the container init.
 | |
| // Serve must only be called once.
 | |
| func (p *Container) Serve() error {
 | |
| 	if p.setup == nil {
 | |
| 		panic("invalid serve")
 | |
| 	}
 | |
| 
 | |
| 	setup := p.setup
 | |
| 	p.setup = nil
 | |
| 
 | |
| 	if p.Path == nil {
 | |
| 		p.cancel()
 | |
| 		return msg.WrapErr(EINVAL, "invalid executable pathname")
 | |
| 	}
 | |
| 
 | |
| 	// do not transmit nil
 | |
| 	if p.Dir == nil {
 | |
| 		p.Dir = AbsFHSRoot
 | |
| 	}
 | |
| 	if p.SeccompRules == nil {
 | |
| 		p.SeccompRules = make([]seccomp.NativeRule, 0)
 | |
| 	}
 | |
| 
 | |
| 	err := setup.Encode(
 | |
| 		&initParams{
 | |
| 			p.Params,
 | |
| 			Getuid(),
 | |
| 			Getgid(),
 | |
| 			len(p.ExtraFiles),
 | |
| 			msg.IsVerbose(),
 | |
| 		},
 | |
| 	)
 | |
| 	if err != nil {
 | |
| 		p.cancel()
 | |
| 	}
 | |
| 	return err
 | |
| }
 | |
| 
 | |
| // Wait waits for the container init process to exit and releases any resources associated with the [Container].
 | |
| func (p *Container) Wait() error {
 | |
| 	if p.cmd == nil {
 | |
| 		return EINVAL
 | |
| 	}
 | |
| 
 | |
| 	err := p.cmd.Wait()
 | |
| 	p.cancel()
 | |
| 	if p.wait != nil && err == nil {
 | |
| 		close(p.wait)
 | |
| 	}
 | |
| 	return err
 | |
| }
 | |
| 
 | |
| func (p *Container) String() string {
 | |
| 	return fmt.Sprintf("argv: %q, filter: %v, rules: %d, flags: %#x, presets: %#x",
 | |
| 		p.Args, !p.SeccompDisable, len(p.SeccompRules), int(p.SeccompFlags), int(p.SeccompPresets))
 | |
| }
 | |
| 
 | |
| // ProcessState returns the address to os.ProcessState held by the underlying [exec.Cmd].
 | |
| func (p *Container) ProcessState() *os.ProcessState {
 | |
| 	if p.cmd == nil {
 | |
| 		return nil
 | |
| 	}
 | |
| 	return p.cmd.ProcessState
 | |
| }
 | |
| 
 | |
| // New returns the address to a new instance of [Container] that requires further initialisation before use.
 | |
| func New(ctx context.Context) *Container {
 | |
| 	return &Container{ctx: ctx, Params: Params{Ops: new(Ops)}}
 | |
| }
 | |
| 
 | |
| // NewCommand calls [New] and initialises the [Params.Path] and [Params.Args] fields.
 | |
| func NewCommand(ctx context.Context, pathname *Absolute, name string, args ...string) *Container {
 | |
| 	z := New(ctx)
 | |
| 	z.Path = pathname
 | |
| 	z.Args = append([]string{name}, args...)
 | |
| 	return z
 | |
| }
 |