hakurei: move container toplevel

Signed-off-by: Ophestra <cat@gensokyo.uk>
This commit is contained in:
2025-07-02 21:23:55 +09:00
parent 255b77d91d
commit a1d98823f8
33 changed files with 144 additions and 142 deletions

View File

@@ -1,229 +0,0 @@
// Package sandbox implements unprivileged Linux container with hardening options useful for creating application sandboxes.
package sandbox
import (
"context"
"encoding/gob"
"errors"
"fmt"
"io"
"os"
"os/exec"
"path"
"strconv"
. "syscall"
"time"
"git.gensokyo.uk/security/hakurei/sandbox/seccomp"
)
type (
// Container represents a container environment being prepared or run.
// None of [Container] methods are safe for concurrent use.
Container struct {
// Name of initial process in the container.
name string
// Cgroup fd, nil to disable.
Cgroup *int
// ExtraFiles passed through to initial process in the container,
// with behaviour identical to its [exec.Cmd] counterpart.
ExtraFiles []*os.File
// Custom [exec.Cmd] initialisation function.
CommandContext func(ctx context.Context) (cmd *exec.Cmd)
// param encoder for shim and init
setup *gob.Encoder
// cancels cmd
cancel context.CancelFunc
Stdin io.Reader
Stdout io.Writer
Stderr io.Writer
Cancel func(cmd *exec.Cmd) error
WaitDelay time.Duration
cmd *exec.Cmd
ctx context.Context
Params
}
// Params holds container configuration and is safe to serialise.
Params struct {
// Working directory in the container.
Dir string
// Initial process environment.
Env []string
// Absolute path of initial process in the container. Overrides name.
Path string
// Initial process argv.
Args []string
// Mapped Uid in user namespace.
Uid int
// Mapped Gid in user namespace.
Gid int
// Hostname value in UTS namespace.
Hostname string
// Sequential container setup ops.
*Ops
// Seccomp system call filter rules.
SeccompRules []seccomp.NativeRule
// Extra seccomp flags.
SeccompFlags seccomp.ExportFlag
// Seccomp presets. Has no effect unless SeccompRules is zero-length.
SeccompPresets seccomp.FilterPreset
// Do not load seccomp program.
SeccompDisable bool
// Permission bits of newly created parent directories.
// The zero value is interpreted as 0755.
ParentPerm os.FileMode
// Do not syscall.Setsid.
RetainSession bool
// Do not [syscall.CLONE_NEWNET].
HostNet bool
// Retain CAP_SYS_ADMIN.
Privileged bool
}
)
func (p *Container) Start() error {
if p.cmd != nil {
return errors.New("sandbox: already started")
}
if p.Ops == nil || len(*p.Ops) == 0 {
return errors.New("sandbox: starting an empty container")
}
ctx, cancel := context.WithCancel(p.ctx)
p.cancel = cancel
var cloneFlags uintptr = CLONE_NEWIPC | CLONE_NEWUTS | CLONE_NEWCGROUP
if !p.HostNet {
cloneFlags |= CLONE_NEWNET
}
// map to overflow id to work around ownership checks
if p.Uid < 1 {
p.Uid = OverflowUid()
}
if p.Gid < 1 {
p.Gid = OverflowGid()
}
if !p.RetainSession {
p.SeccompPresets |= seccomp.PresetDenyTTY
}
if p.CommandContext != nil {
p.cmd = p.CommandContext(ctx)
} else {
p.cmd = exec.CommandContext(ctx, MustExecutable())
p.cmd.Args = []string{"init"}
}
p.cmd.Stdin, p.cmd.Stdout, p.cmd.Stderr = p.Stdin, p.Stdout, p.Stderr
p.cmd.WaitDelay = p.WaitDelay
if p.Cancel != nil {
p.cmd.Cancel = func() error { return p.Cancel(p.cmd) }
} else {
p.cmd.Cancel = func() error { return p.cmd.Process.Signal(SIGTERM) }
}
p.cmd.Dir = "/"
p.cmd.SysProcAttr = &SysProcAttr{
Setsid: !p.RetainSession,
Pdeathsig: SIGKILL,
Cloneflags: cloneFlags | CLONE_NEWUSER | CLONE_NEWPID | CLONE_NEWNS,
// remain privileged for setup
AmbientCaps: []uintptr{CAP_SYS_ADMIN, CAP_SETPCAP},
UseCgroupFD: p.Cgroup != nil,
}
if p.cmd.SysProcAttr.UseCgroupFD {
p.cmd.SysProcAttr.CgroupFD = *p.Cgroup
}
// place setup pipe before user supplied extra files, this is later restored by init
if fd, e, err := Setup(&p.cmd.ExtraFiles); err != nil {
return wrapErrSuffix(err,
"cannot create shim setup pipe:")
} else {
p.setup = e
p.cmd.Env = []string{setupEnv + "=" + strconv.Itoa(fd)}
}
p.cmd.ExtraFiles = append(p.cmd.ExtraFiles, p.ExtraFiles...)
msg.Verbose("starting container init")
if err := p.cmd.Start(); err != nil {
return msg.WrapErr(err, err.Error())
}
return nil
}
func (p *Container) Serve() error {
if p.setup == nil {
panic("invalid serve")
}
setup := p.setup
p.setup = nil
if p.Path != "" && !path.IsAbs(p.Path) {
p.cancel()
return msg.WrapErr(EINVAL,
fmt.Sprintf("invalid executable path %q", p.Path))
}
if p.Path == "" {
if p.name == "" {
p.Path = os.Getenv("SHELL")
if !path.IsAbs(p.Path) {
p.cancel()
return msg.WrapErr(EBADE,
"no command specified and $SHELL is invalid")
}
p.name = path.Base(p.Path)
} else if path.IsAbs(p.name) {
p.Path = p.name
} else if v, err := exec.LookPath(p.name); err != nil {
p.cancel()
return msg.WrapErr(err, err.Error())
} else {
p.Path = v
}
}
if p.SeccompRules == nil {
// do not transmit nil
p.SeccompRules = make([]seccomp.NativeRule, 0)
}
err := setup.Encode(
&initParams{
p.Params,
Getuid(),
Getgid(),
len(p.ExtraFiles),
msg.IsVerbose(),
},
)
if err != nil {
p.cancel()
}
return err
}
func (p *Container) Wait() error { defer p.cancel(); return p.cmd.Wait() }
func (p *Container) String() string {
return fmt.Sprintf("argv: %q, filter: %v, rules: %d, flags: %#x, presets: %#x",
p.Args, !p.SeccompDisable, len(p.SeccompRules), int(p.SeccompFlags), int(p.SeccompPresets))
}
func New(ctx context.Context, name string, args ...string) *Container {
return &Container{name: name, ctx: ctx,
Params: Params{Args: append([]string{name}, args...), Dir: "/", Ops: new(Ops)},
}
}

View File

@@ -1,281 +0,0 @@
package sandbox_test
import (
"bytes"
"context"
"encoding/gob"
"log"
"os"
"os/exec"
"strings"
"syscall"
"testing"
"time"
"git.gensokyo.uk/security/hakurei/hst"
"git.gensokyo.uk/security/hakurei/internal"
"git.gensokyo.uk/security/hakurei/internal/hlog"
"git.gensokyo.uk/security/hakurei/ldd"
"git.gensokyo.uk/security/hakurei/sandbox"
"git.gensokyo.uk/security/hakurei/sandbox/seccomp"
"git.gensokyo.uk/security/hakurei/sandbox/vfs"
)
const (
ignore = "\x00"
ignoreV = -1
)
func TestContainer(t *testing.T) {
{
oldVerbose := hlog.Load()
oldOutput := sandbox.GetOutput()
internal.InstallOutput(true)
t.Cleanup(func() { hlog.Store(oldVerbose) })
t.Cleanup(func() { sandbox.SetOutput(oldOutput) })
}
testCases := []struct {
name string
filter bool
session bool
net bool
ops *sandbox.Ops
mnt []*vfs.MountInfoEntry
host string
rules []seccomp.NativeRule
flags seccomp.ExportFlag
presets seccomp.FilterPreset
}{
{"minimal", true, false, false,
new(sandbox.Ops), nil, "test-minimal",
nil, 0, seccomp.PresetStrict},
{"allow", true, true, true,
new(sandbox.Ops), nil, "test-minimal",
nil, 0, seccomp.PresetExt | seccomp.PresetDenyDevel},
{"no filter", false, true, true,
new(sandbox.Ops), nil, "test-no-filter",
nil, 0, seccomp.PresetExt},
{"custom rules", true, true, true,
new(sandbox.Ops), nil, "test-no-filter",
[]seccomp.NativeRule{
{seccomp.ScmpSyscall(syscall.SYS_SETUID), seccomp.ScmpErrno(syscall.EPERM), nil},
}, 0, seccomp.PresetExt},
{"tmpfs", true, false, false,
new(sandbox.Ops).
Tmpfs(hst.Tmp, 0, 0755),
[]*vfs.MountInfoEntry{
e("/", hst.Tmp, "rw,nosuid,nodev,relatime", "tmpfs", "tmpfs", ignore),
}, "test-tmpfs",
nil, 0, seccomp.PresetStrict},
{"dev", true, true /* go test output is not a tty */, false,
new(sandbox.Ops).
Dev("/dev").
Mqueue("/dev/mqueue"),
[]*vfs.MountInfoEntry{
e("/", "/dev", "rw,nosuid,nodev,relatime", "tmpfs", "devtmpfs", ignore),
e("/null", "/dev/null", "rw,nosuid", "devtmpfs", "devtmpfs", ignore),
e("/zero", "/dev/zero", "rw,nosuid", "devtmpfs", "devtmpfs", ignore),
e("/full", "/dev/full", "rw,nosuid", "devtmpfs", "devtmpfs", ignore),
e("/random", "/dev/random", "rw,nosuid", "devtmpfs", "devtmpfs", ignore),
e("/urandom", "/dev/urandom", "rw,nosuid", "devtmpfs", "devtmpfs", ignore),
e("/tty", "/dev/tty", "rw,nosuid", "devtmpfs", "devtmpfs", ignore),
e("/", "/dev/pts", "rw,nosuid,noexec,relatime", "devpts", "devpts", "rw,mode=620,ptmxmode=666"),
e("/", "/dev/mqueue", "rw,nosuid,nodev,noexec,relatime", "mqueue", "mqueue", "rw"),
}, "",
nil, 0, seccomp.PresetStrict},
}
for _, tc := range testCases {
t.Run(tc.name, func(t *testing.T) {
ctx, cancel := context.WithTimeout(t.Context(), 5*time.Second)
defer cancel()
container := sandbox.New(ctx, "/usr/bin/sandbox.test", "-test.v",
"-test.run=TestHelperCheckContainer", "--", "check", tc.host)
container.Uid = 1000
container.Gid = 100
container.Hostname = tc.host
container.CommandContext = commandContext
container.Stdout, container.Stderr = os.Stdout, os.Stderr
container.Ops = tc.ops
container.SeccompRules = tc.rules
container.SeccompFlags = tc.flags | seccomp.AllowMultiarch
container.SeccompPresets = tc.presets
container.SeccompDisable = !tc.filter
container.RetainSession = tc.session
container.HostNet = tc.net
if container.Args[5] == "" {
if name, err := os.Hostname(); err != nil {
t.Fatalf("cannot get hostname: %v", err)
} else {
container.Args[5] = name
}
}
container.
Tmpfs("/tmp", 0, 0755).
Bind(os.Args[0], os.Args[0], 0).
Mkdir("/usr/bin", 0755).
Link(os.Args[0], "/usr/bin/sandbox.test").
Place("/etc/hostname", []byte(container.Args[5]))
// in case test has cgo enabled
var libPaths []string
if entries, err := ldd.ExecFilter(ctx,
commandContext,
func(v []byte) []byte {
return bytes.SplitN(v, []byte("TestHelperInit\n"), 2)[1]
}, os.Args[0]); err != nil {
log.Fatalf("ldd: %v", err)
} else {
libPaths = ldd.Path(entries)
}
for _, name := range libPaths {
container.Bind(name, name, 0)
}
// needs /proc to check mountinfo
container.Proc("/proc")
mnt := make([]*vfs.MountInfoEntry, 0, 3+len(libPaths))
mnt = append(mnt, e("/sysroot", "/", "rw,nosuid,nodev,relatime", "tmpfs", "rootfs", ignore))
mnt = append(mnt, tc.mnt...)
mnt = append(mnt,
e("/", "/tmp", "rw,nosuid,nodev,relatime", "tmpfs", "tmpfs", ignore),
e(ignore, os.Args[0], "ro,nosuid,nodev,relatime", ignore, ignore, ignore),
e(ignore, "/etc/hostname", "ro,nosuid,nodev,relatime", "tmpfs", "rootfs", ignore),
)
for _, name := range libPaths {
mnt = append(mnt, e(ignore, name, "ro,nosuid,nodev,relatime", ignore, ignore, ignore))
}
mnt = append(mnt, e("/", "/proc", "rw,nosuid,nodev,noexec,relatime", "proc", "proc", "rw"))
want := new(bytes.Buffer)
if err := gob.NewEncoder(want).Encode(mnt); err != nil {
t.Fatalf("cannot serialise expected mount points: %v", err)
}
container.Stdin = want
if err := container.Start(); err != nil {
hlog.PrintBaseError(err, "start:")
t.Fatalf("cannot start container: %v", err)
} else if err = container.Serve(); err != nil {
hlog.PrintBaseError(err, "serve:")
t.Errorf("cannot serve setup params: %v", err)
}
if err := container.Wait(); err != nil {
hlog.PrintBaseError(err, "wait:")
t.Fatalf("wait: %v", err)
}
})
}
}
func e(root, target, vfsOptstr, fsType, source, fsOptstr string) *vfs.MountInfoEntry {
return &vfs.MountInfoEntry{
ID: ignoreV,
Parent: ignoreV,
Devno: vfs.DevT{ignoreV, ignoreV},
Root: root,
Target: target,
VfsOptstr: vfsOptstr,
OptFields: []string{ignore},
FsType: fsType,
Source: source,
FsOptstr: fsOptstr,
}
}
func TestContainerString(t *testing.T) {
container := sandbox.New(t.Context(), "ldd", "/usr/bin/env")
container.SeccompFlags |= seccomp.AllowMultiarch
container.SeccompRules = seccomp.Preset(
seccomp.PresetExt|seccomp.PresetDenyNS|seccomp.PresetDenyTTY,
container.SeccompFlags)
container.SeccompPresets = seccomp.PresetStrict
want := `argv: ["ldd" "/usr/bin/env"], filter: true, rules: 65, flags: 0x1, presets: 0xf`
if got := container.String(); got != want {
t.Errorf("String: %s, want %s", got, want)
}
}
func TestHelperInit(t *testing.T) {
if len(os.Args) != 5 || os.Args[4] != "init" {
return
}
sandbox.SetOutput(hlog.Output{})
sandbox.Init(hlog.Prepare, internal.InstallOutput)
}
func TestHelperCheckContainer(t *testing.T) {
if len(os.Args) != 6 || os.Args[4] != "check" {
return
}
t.Run("user", func(t *testing.T) {
if uid := syscall.Getuid(); uid != 1000 {
t.Errorf("Getuid: %d, want 1000", uid)
}
if gid := syscall.Getgid(); gid != 100 {
t.Errorf("Getgid: %d, want 100", gid)
}
})
t.Run("hostname", func(t *testing.T) {
if name, err := os.Hostname(); err != nil {
t.Fatalf("cannot get hostname: %v", err)
} else if name != os.Args[5] {
t.Errorf("Hostname: %q, want %q", name, os.Args[5])
}
if p, err := os.ReadFile("/etc/hostname"); err != nil {
t.Fatalf("%v", err)
} else if string(p) != os.Args[5] {
t.Errorf("/etc/hostname: %q, want %q", string(p), os.Args[5])
}
})
t.Run("mount", func(t *testing.T) {
var mnt []*vfs.MountInfoEntry
if err := gob.NewDecoder(os.Stdin).Decode(&mnt); err != nil {
t.Fatalf("cannot receive expected mount points: %v", err)
}
var d *vfs.MountInfoDecoder
if f, err := os.Open("/proc/self/mountinfo"); err != nil {
t.Fatalf("cannot open mountinfo: %v", err)
} else {
d = vfs.NewMountInfoDecoder(f)
}
i := 0
for cur := range d.Entries() {
if i == len(mnt) {
t.Errorf("got more than %d entries", len(mnt))
break
}
// ugly hack but should be reliable and is less likely to false negative than comparing by parsed flags
cur.VfsOptstr = strings.TrimSuffix(cur.VfsOptstr, ",relatime")
cur.VfsOptstr = strings.TrimSuffix(cur.VfsOptstr, ",noatime")
mnt[i].VfsOptstr = strings.TrimSuffix(mnt[i].VfsOptstr, ",relatime")
mnt[i].VfsOptstr = strings.TrimSuffix(mnt[i].VfsOptstr, ",noatime")
if !cur.EqualWithIgnore(mnt[i], "\x00") {
t.Errorf("[FAIL] %s", cur)
} else {
t.Logf("[ OK ] %s", cur)
}
i++
}
if err := d.Err(); err != nil {
t.Errorf("cannot parse mountinfo: %v", err)
}
if i != len(mnt) {
t.Errorf("got %d entries, want %d", i, len(mnt))
}
})
}
func commandContext(ctx context.Context) *exec.Cmd {
return exec.CommandContext(ctx, os.Args[0], "-test.v",
"-test.run=TestHelperInit", "--", "init")
}

View File

@@ -1,26 +0,0 @@
package sandbox
import (
"log"
"os"
"sync"
)
var (
executable string
executableOnce sync.Once
)
func copyExecutable() {
if name, err := os.Executable(); err != nil {
msg.BeforeExit()
log.Fatalf("cannot read executable path: %v", err)
} else {
executable = name
}
}
func MustExecutable() string {
executableOnce.Do(copyExecutable)
return executable
}

View File

@@ -1,17 +0,0 @@
package sandbox_test
import (
"os"
"testing"
"git.gensokyo.uk/security/hakurei/sandbox"
)
func TestExecutable(t *testing.T) {
for i := 0; i < 16; i++ {
if got := sandbox.MustExecutable(); got != os.Args[0] {
t.Errorf("MustExecutable: %q, want %q",
got, os.Args[0])
}
}
}

View File

@@ -1,364 +0,0 @@
package sandbox
import (
"errors"
"fmt"
"log"
"os"
"os/exec"
"os/signal"
"path"
"runtime"
"strconv"
. "syscall"
"time"
"git.gensokyo.uk/security/hakurei/sandbox/seccomp"
)
const (
// time to wait for linger processes after death of initial process
residualProcessTimeout = 5 * time.Second
// intermediate tmpfs mount point
basePath = "/tmp"
// setup params file descriptor
setupEnv = "HAKUREI_SETUP"
)
type initParams struct {
Params
HostUid, HostGid int
// extra files count
Count int
// verbosity pass through
Verbose bool
}
func Init(prepare func(prefix string), setVerbose func(verbose bool)) {
runtime.LockOSThread()
prepare("init")
if os.Getpid() != 1 {
log.Fatal("this process must run as pid 1")
}
var (
params initParams
closeSetup func() error
setupFile *os.File
offsetSetup int
)
if f, err := Receive(setupEnv, &params, &setupFile); err != nil {
if errors.Is(err, ErrInvalid) {
log.Fatal("invalid setup descriptor")
}
if errors.Is(err, ErrNotSet) {
log.Fatal("HAKUREI_SETUP not set")
}
log.Fatalf("cannot decode init setup payload: %v", err)
} else {
if params.Ops == nil {
log.Fatal("invalid setup parameters")
}
if params.ParentPerm == 0 {
params.ParentPerm = 0755
}
setVerbose(params.Verbose)
msg.Verbose("received setup parameters")
closeSetup = f
offsetSetup = int(setupFile.Fd() + 1)
}
// write uid/gid map here so parent does not need to set dumpable
if err := SetDumpable(SUID_DUMP_USER); err != nil {
log.Fatalf("cannot set SUID_DUMP_USER: %s", err)
}
if err := os.WriteFile("/proc/self/uid_map",
append([]byte{}, strconv.Itoa(params.Uid)+" "+strconv.Itoa(params.HostUid)+" 1\n"...),
0); err != nil {
log.Fatalf("%v", err)
}
if err := os.WriteFile("/proc/self/setgroups",
[]byte("deny\n"),
0); err != nil && !os.IsNotExist(err) {
log.Fatalf("%v", err)
}
if err := os.WriteFile("/proc/self/gid_map",
append([]byte{}, strconv.Itoa(params.Gid)+" "+strconv.Itoa(params.HostGid)+" 1\n"...),
0); err != nil {
log.Fatalf("%v", err)
}
if err := SetDumpable(SUID_DUMP_DISABLE); err != nil {
log.Fatalf("cannot set SUID_DUMP_DISABLE: %s", err)
}
oldmask := Umask(0)
if params.Hostname != "" {
if err := Sethostname([]byte(params.Hostname)); err != nil {
log.Fatalf("cannot set hostname: %v", err)
}
}
// cache sysctl before pivot_root
LastCap()
if err := Mount("", "/", "", MS_SILENT|MS_SLAVE|MS_REC, ""); err != nil {
log.Fatalf("cannot make / rslave: %v", err)
}
for i, op := range *params.Ops {
if op == nil {
log.Fatalf("invalid op %d", i)
}
if err := op.early(&params.Params); err != nil {
msg.PrintBaseErr(err,
fmt.Sprintf("cannot prepare op %d:", i))
msg.BeforeExit()
os.Exit(1)
}
}
if err := Mount("rootfs", basePath, "tmpfs", MS_NODEV|MS_NOSUID, ""); err != nil {
log.Fatalf("cannot mount intermediate root: %v", err)
}
if err := os.Chdir(basePath); err != nil {
log.Fatalf("cannot enter base path: %v", err)
}
if err := os.Mkdir(sysrootDir, 0755); err != nil {
log.Fatalf("%v", err)
}
if err := Mount(sysrootDir, sysrootDir, "", MS_SILENT|MS_MGC_VAL|MS_BIND|MS_REC, ""); err != nil {
log.Fatalf("cannot bind sysroot: %v", err)
}
if err := os.Mkdir(hostDir, 0755); err != nil {
log.Fatalf("%v", err)
}
// pivot_root uncovers basePath in hostDir
if err := PivotRoot(basePath, hostDir); err != nil {
log.Fatalf("cannot pivot into intermediate root: %v", err)
}
if err := os.Chdir("/"); err != nil {
log.Fatalf("%v", err)
}
for i, op := range *params.Ops {
// ops already checked during early setup
msg.Verbosef("%s %s", op.prefix(), op)
if err := op.apply(&params.Params); err != nil {
msg.PrintBaseErr(err,
fmt.Sprintf("cannot apply op %d:", i))
msg.BeforeExit()
os.Exit(1)
}
}
// setup requiring host root complete at this point
if err := Mount(hostDir, hostDir, "", MS_SILENT|MS_REC|MS_PRIVATE, ""); err != nil {
log.Fatalf("cannot make host root rprivate: %v", err)
}
if err := Unmount(hostDir, MNT_DETACH); err != nil {
log.Fatalf("cannot unmount host root: %v", err)
}
{
var fd int
if err := IgnoringEINTR(func() (err error) {
fd, err = Open("/", O_DIRECTORY|O_RDONLY, 0)
return
}); err != nil {
log.Fatalf("cannot open intermediate root: %v", err)
}
if err := os.Chdir(sysrootPath); err != nil {
log.Fatalf("%v", err)
}
if err := PivotRoot(".", "."); err != nil {
log.Fatalf("cannot pivot into sysroot: %v", err)
}
if err := Fchdir(fd); err != nil {
log.Fatalf("cannot re-enter intermediate root: %v", err)
}
if err := Unmount(".", MNT_DETACH); err != nil {
log.Fatalf("cannot unmount intemediate root: %v", err)
}
if err := os.Chdir("/"); err != nil {
log.Fatalf("%v", err)
}
if err := Close(fd); err != nil {
log.Fatalf("cannot close intermediate root: %v", err)
}
}
if _, _, errno := Syscall(PR_SET_NO_NEW_PRIVS, 1, 0, 0); errno != 0 {
log.Fatalf("prctl(PR_SET_NO_NEW_PRIVS): %v", errno)
}
if _, _, errno := Syscall(SYS_PRCTL, PR_CAP_AMBIENT, PR_CAP_AMBIENT_CLEAR_ALL, 0); errno != 0 {
log.Fatalf("cannot clear the ambient capability set: %v", errno)
}
for i := uintptr(0); i <= LastCap(); i++ {
if params.Privileged && i == CAP_SYS_ADMIN {
continue
}
if _, _, errno := Syscall(SYS_PRCTL, PR_CAPBSET_DROP, i, 0); errno != 0 {
log.Fatalf("cannot drop capability from bonding set: %v", errno)
}
}
var keep [2]uint32
if params.Privileged {
keep[capToIndex(CAP_SYS_ADMIN)] |= capToMask(CAP_SYS_ADMIN)
if _, _, errno := Syscall(SYS_PRCTL, PR_CAP_AMBIENT, PR_CAP_AMBIENT_RAISE, CAP_SYS_ADMIN); errno != 0 {
log.Fatalf("cannot raise CAP_SYS_ADMIN: %v", errno)
}
}
if err := capset(
&capHeader{_LINUX_CAPABILITY_VERSION_3, 0},
&[2]capData{{0, keep[0], keep[0]}, {0, keep[1], keep[1]}},
); err != nil {
log.Fatalf("cannot capset: %v", err)
}
if !params.SeccompDisable {
rules := params.SeccompRules
if len(rules) == 0 { // non-empty rules slice always overrides presets
msg.Verbosef("resolving presets %#x", params.SeccompPresets)
rules = seccomp.Preset(params.SeccompPresets, params.SeccompFlags)
}
if err := seccomp.Load(rules, params.SeccompFlags); err != nil {
log.Fatalf("cannot load syscall filter: %v", err)
}
msg.Verbosef("%d filter rules loaded", len(rules))
} else {
msg.Verbose("syscall filter not configured")
}
extraFiles := make([]*os.File, params.Count)
for i := range extraFiles {
// setup fd is placed before all extra files
extraFiles[i] = os.NewFile(uintptr(offsetSetup+i), "extra file "+strconv.Itoa(i))
}
Umask(oldmask)
cmd := exec.Command(params.Path)
cmd.Stdin, cmd.Stdout, cmd.Stderr = os.Stdin, os.Stdout, os.Stderr
cmd.Args = params.Args
cmd.Env = params.Env
cmd.ExtraFiles = extraFiles
cmd.Dir = params.Dir
if err := cmd.Start(); err != nil {
log.Fatalf("%v", err)
}
msg.Suspend()
if err := closeSetup(); err != nil {
log.Println("cannot close setup pipe:", err)
// not fatal
}
type winfo struct {
wpid int
wstatus WaitStatus
}
info := make(chan winfo, 1)
done := make(chan struct{})
go func() {
var (
err error
wpid = -2
wstatus WaitStatus
)
// keep going until no child process is left
for wpid != -1 {
if err != nil {
break
}
if wpid != -2 {
info <- winfo{wpid, wstatus}
}
err = EINTR
for errors.Is(err, EINTR) {
wpid, err = Wait4(-1, &wstatus, 0, nil)
}
}
if !errors.Is(err, ECHILD) {
log.Println("unexpected wait4 response:", err)
}
close(done)
}()
// handle signals to dump withheld messages
sig := make(chan os.Signal, 2)
signal.Notify(sig, SIGINT, SIGTERM)
// closed after residualProcessTimeout has elapsed after initial process death
timeout := make(chan struct{})
r := 2
for {
select {
case s := <-sig:
if msg.Resume() {
msg.Verbosef("terminating on %s after process start", s.String())
} else {
msg.Verbosef("terminating on %s", s.String())
}
os.Exit(0)
case w := <-info:
if w.wpid == cmd.Process.Pid {
// initial process exited, output is most likely available again
msg.Resume()
switch {
case w.wstatus.Exited():
r = w.wstatus.ExitStatus()
msg.Verbosef("initial process exited with code %d", w.wstatus.ExitStatus())
case w.wstatus.Signaled():
r = 128 + int(w.wstatus.Signal())
msg.Verbosef("initial process exited with signal %s", w.wstatus.Signal())
default:
r = 255
msg.Verbosef("initial process exited with status %#x", w.wstatus)
}
go func() {
time.Sleep(residualProcessTimeout)
close(timeout)
}()
}
case <-done:
msg.BeforeExit()
os.Exit(r)
case <-timeout:
log.Println("timeout exceeded waiting for lingering processes")
msg.BeforeExit()
os.Exit(r)
}
}
}
// TryArgv0 calls [Init] if the last element of argv0 is "init".
func TryArgv0(v Msg, prepare func(prefix string), setVerbose func(verbose bool)) {
if len(os.Args) > 0 && path.Base(os.Args[0]) == "init" {
msg = v
Init(prepare, setVerbose)
msg.BeforeExit()
os.Exit(0)
}
}

View File

@@ -1,123 +0,0 @@
package sandbox
import (
"errors"
"fmt"
"os"
"path/filepath"
. "syscall"
"git.gensokyo.uk/security/hakurei/sandbox/vfs"
)
func (p *procPaths) bindMount(source, target string, flags uintptr, eq bool) error {
if eq {
msg.Verbosef("resolved %q flags %#x", target, flags)
} else {
msg.Verbosef("resolved %q on %q flags %#x", source, target, flags)
}
if err := Mount(source, target, "", MS_SILENT|MS_BIND|flags&MS_REC, ""); err != nil {
return wrapErrSuffix(err,
fmt.Sprintf("cannot mount %q on %q:", source, target))
}
var targetFinal string
if v, err := filepath.EvalSymlinks(target); err != nil {
return wrapErrSelf(err)
} else {
targetFinal = v
if targetFinal != target {
msg.Verbosef("target resolves to %q", targetFinal)
}
}
// final target path according to the kernel through proc
var targetKFinal string
{
var destFd int
if err := IgnoringEINTR(func() (err error) {
destFd, err = Open(targetFinal, O_PATH|O_CLOEXEC, 0)
return
}); err != nil {
return wrapErrSuffix(err,
fmt.Sprintf("cannot open %q:", targetFinal))
}
if v, err := os.Readlink(p.fd(destFd)); err != nil {
return wrapErrSelf(err)
} else if err = Close(destFd); err != nil {
return wrapErrSuffix(err,
fmt.Sprintf("cannot close %q:", targetFinal))
} else {
targetKFinal = v
}
}
mf := MS_NOSUID | flags&MS_NODEV | flags&MS_RDONLY
return hostProc.mountinfo(func(d *vfs.MountInfoDecoder) error {
n, err := d.Unfold(targetKFinal)
if err != nil {
if errors.Is(err, ESTALE) {
return msg.WrapErr(err,
fmt.Sprintf("mount point %q never appeared in mountinfo", targetKFinal))
}
return wrapErrSuffix(err,
"cannot unfold mount hierarchy:")
}
if err = remountWithFlags(n, mf); err != nil {
return err
}
if flags&MS_REC == 0 {
return nil
}
for cur := range n.Collective() {
err = remountWithFlags(cur, mf)
if err != nil && !errors.Is(err, EACCES) {
return err
}
}
return nil
})
}
func remountWithFlags(n *vfs.MountInfoNode, mf uintptr) error {
kf, unmatched := n.Flags()
if len(unmatched) != 0 {
msg.Verbosef("unmatched vfs options: %q", unmatched)
}
if kf&mf != mf {
return wrapErrSuffix(
Mount("none", n.Clean, "", MS_SILENT|MS_BIND|MS_REMOUNT|kf|mf, ""),
fmt.Sprintf("cannot remount %q:", n.Clean))
}
return nil
}
func mountTmpfs(fsname, name string, size int, perm os.FileMode) error {
target := toSysroot(name)
if err := os.MkdirAll(target, parentPerm(perm)); err != nil {
return wrapErrSelf(err)
}
opt := fmt.Sprintf("mode=%#o", perm)
if size > 0 {
opt += fmt.Sprintf(",size=%d", size)
}
return wrapErrSuffix(
Mount(fsname, target, "tmpfs", MS_NOSUID|MS_NODEV, opt),
fmt.Sprintf("cannot mount tmpfs on %q:", name))
}
func parentPerm(perm os.FileMode) os.FileMode {
pperm := 0755
if perm&0070 == 0 {
pperm &= ^0050
}
if perm&0007 == 0 {
pperm &= ^0005
}
return os.FileMode(pperm)
}

View File

@@ -1,43 +0,0 @@
package sandbox
import (
"log"
"sync/atomic"
)
type Msg interface {
IsVerbose() bool
Verbose(v ...any)
Verbosef(format string, v ...any)
WrapErr(err error, a ...any) error
PrintBaseErr(err error, fallback string)
Suspend()
Resume() bool
BeforeExit()
}
type DefaultMsg struct{ inactive atomic.Bool }
func (msg *DefaultMsg) IsVerbose() bool { return true }
func (msg *DefaultMsg) Verbose(v ...any) {
if !msg.inactive.Load() {
log.Println(v...)
}
}
func (msg *DefaultMsg) Verbosef(format string, v ...any) {
if !msg.inactive.Load() {
log.Printf(format, v...)
}
}
func (msg *DefaultMsg) WrapErr(err error, a ...any) error {
log.Println(a...)
return err
}
func (msg *DefaultMsg) PrintBaseErr(err error, fallback string) { log.Println(fallback, err) }
func (msg *DefaultMsg) Suspend() { msg.inactive.Store(true) }
func (msg *DefaultMsg) Resume() bool { return msg.inactive.CompareAndSwap(true, false) }
func (msg *DefaultMsg) BeforeExit() {}

View File

@@ -1,482 +0,0 @@
package sandbox
import (
"encoding/gob"
"fmt"
"math"
"os"
"path"
"path/filepath"
"slices"
"strings"
. "syscall"
"unsafe"
)
type (
Ops []Op
Op interface {
// early is called in host root.
early(params *Params) error
// apply is called in intermediate root.
apply(params *Params) error
prefix() string
Is(op Op) bool
fmt.Stringer
}
)
func (f *Ops) Grow(n int) { *f = slices.Grow(*f, n) }
func init() { gob.Register(new(BindMountOp)) }
// BindMountOp bind mounts host path Source on container path Target.
type BindMountOp struct {
Source, SourceFinal, Target string
Flags int
}
const (
BindOptional = 1 << iota
BindWritable
BindDevice
)
func (b *BindMountOp) early(*Params) error {
if !path.IsAbs(b.Source) {
return msg.WrapErr(EBADE, fmt.Sprintf("path %q is not absolute", b.Source))
}
if v, err := filepath.EvalSymlinks(b.Source); err != nil {
if os.IsNotExist(err) && b.Flags&BindOptional != 0 {
b.SourceFinal = "\x00"
return nil
}
return wrapErrSelf(err)
} else {
b.SourceFinal = v
return nil
}
}
func (b *BindMountOp) apply(*Params) error {
if b.SourceFinal == "\x00" {
if b.Flags&BindOptional == 0 {
// unreachable
return EBADE
}
return nil
}
if !path.IsAbs(b.SourceFinal) || !path.IsAbs(b.Target) {
return msg.WrapErr(EBADE, "path is not absolute")
}
source := toHost(b.SourceFinal)
target := toSysroot(b.Target)
// this perm value emulates bwrap behaviour as it clears bits from 0755 based on
// op->perms which is never set for any bind setup op so always results in 0700
if fi, err := os.Stat(source); err != nil {
return wrapErrSelf(err)
} else if fi.IsDir() {
if err = os.MkdirAll(target, 0700); err != nil {
return wrapErrSelf(err)
}
} else if err = ensureFile(target, 0444, 0700); err != nil {
return err
}
var flags uintptr = MS_REC
if b.Flags&BindWritable == 0 {
flags |= MS_RDONLY
}
if b.Flags&BindDevice == 0 {
flags |= MS_NODEV
}
return hostProc.bindMount(source, target, flags, b.SourceFinal == b.Target)
}
func (b *BindMountOp) Is(op Op) bool { vb, ok := op.(*BindMountOp); return ok && *b == *vb }
func (*BindMountOp) prefix() string { return "mounting" }
func (b *BindMountOp) String() string {
if b.Source == b.Target {
return fmt.Sprintf("%q flags %#x", b.Source, b.Flags)
}
return fmt.Sprintf("%q on %q flags %#x", b.Source, b.Target, b.Flags&BindWritable)
}
func (f *Ops) Bind(source, target string, flags int) *Ops {
*f = append(*f, &BindMountOp{source, "", target, flags})
return f
}
func init() { gob.Register(new(MountProcOp)) }
// MountProcOp mounts a private instance of proc.
type MountProcOp string
func (p MountProcOp) early(*Params) error { return nil }
func (p MountProcOp) apply(params *Params) error {
v := string(p)
if !path.IsAbs(v) {
return msg.WrapErr(EBADE, fmt.Sprintf("path %q is not absolute", v))
}
target := toSysroot(v)
if err := os.MkdirAll(target, params.ParentPerm); err != nil {
return wrapErrSelf(err)
}
return wrapErrSuffix(Mount("proc", target, "proc", MS_NOSUID|MS_NOEXEC|MS_NODEV, ""),
fmt.Sprintf("cannot mount proc on %q:", v))
}
func (p MountProcOp) Is(op Op) bool { vp, ok := op.(MountProcOp); return ok && p == vp }
func (MountProcOp) prefix() string { return "mounting" }
func (p MountProcOp) String() string { return fmt.Sprintf("proc on %q", string(p)) }
func (f *Ops) Proc(dest string) *Ops {
*f = append(*f, MountProcOp(dest))
return f
}
func init() { gob.Register(new(MountDevOp)) }
// MountDevOp mounts part of host dev.
type MountDevOp string
func (d MountDevOp) early(*Params) error { return nil }
func (d MountDevOp) apply(params *Params) error {
v := string(d)
if !path.IsAbs(v) {
return msg.WrapErr(EBADE, fmt.Sprintf("path %q is not absolute", v))
}
target := toSysroot(v)
if err := mountTmpfs("devtmpfs", v, 0, params.ParentPerm); err != nil {
return err
}
for _, name := range []string{"null", "zero", "full", "random", "urandom", "tty"} {
targetPath := toSysroot(path.Join(v, name))
if err := ensureFile(targetPath, 0444, params.ParentPerm); err != nil {
return err
}
if err := hostProc.bindMount(
toHost("/dev/"+name),
targetPath,
0,
true,
); err != nil {
return err
}
}
for i, name := range []string{"stdin", "stdout", "stderr"} {
if err := os.Symlink(
"/proc/self/fd/"+string(rune(i+'0')),
path.Join(target, name),
); err != nil {
return wrapErrSelf(err)
}
}
for _, pair := range [][2]string{
{"/proc/self/fd", "fd"},
{"/proc/kcore", "core"},
{"pts/ptmx", "ptmx"},
} {
if err := os.Symlink(pair[0], path.Join(target, pair[1])); err != nil {
return wrapErrSelf(err)
}
}
devPtsPath := path.Join(target, "pts")
for _, name := range []string{path.Join(target, "shm"), devPtsPath} {
if err := os.Mkdir(name, params.ParentPerm); err != nil {
return wrapErrSelf(err)
}
}
if err := Mount("devpts", devPtsPath, "devpts", MS_NOSUID|MS_NOEXEC,
"newinstance,ptmxmode=0666,mode=620"); err != nil {
return wrapErrSuffix(err,
fmt.Sprintf("cannot mount devpts on %q:", devPtsPath))
}
if params.RetainSession {
var buf [8]byte
if _, _, errno := Syscall(SYS_IOCTL, 1, TIOCGWINSZ, uintptr(unsafe.Pointer(&buf[0]))); errno == 0 {
consolePath := toSysroot(path.Join(v, "console"))
if err := ensureFile(consolePath, 0444, params.ParentPerm); err != nil {
return err
}
if name, err := os.Readlink(hostProc.stdout()); err != nil {
return wrapErrSelf(err)
} else if err = hostProc.bindMount(
toHost(name),
consolePath,
0,
false,
); err != nil {
return err
}
}
}
return nil
}
func (d MountDevOp) Is(op Op) bool { vd, ok := op.(MountDevOp); return ok && d == vd }
func (MountDevOp) prefix() string { return "mounting" }
func (d MountDevOp) String() string { return fmt.Sprintf("dev on %q", string(d)) }
func (f *Ops) Dev(dest string) *Ops {
*f = append(*f, MountDevOp(dest))
return f
}
func init() { gob.Register(new(MountMqueueOp)) }
// MountMqueueOp mounts a private mqueue instance on container Path.
type MountMqueueOp string
func (m MountMqueueOp) early(*Params) error { return nil }
func (m MountMqueueOp) apply(params *Params) error {
v := string(m)
if !path.IsAbs(v) {
return msg.WrapErr(EBADE, fmt.Sprintf("path %q is not absolute", v))
}
target := toSysroot(v)
if err := os.MkdirAll(target, params.ParentPerm); err != nil {
return wrapErrSelf(err)
}
return wrapErrSuffix(Mount("mqueue", target, "mqueue", MS_NOSUID|MS_NOEXEC|MS_NODEV, ""),
fmt.Sprintf("cannot mount mqueue on %q:", v))
}
func (m MountMqueueOp) Is(op Op) bool { vm, ok := op.(MountMqueueOp); return ok && m == vm }
func (MountMqueueOp) prefix() string { return "mounting" }
func (m MountMqueueOp) String() string { return fmt.Sprintf("mqueue on %q", string(m)) }
func (f *Ops) Mqueue(dest string) *Ops {
*f = append(*f, MountMqueueOp(dest))
return f
}
func init() { gob.Register(new(MountTmpfsOp)) }
// MountTmpfsOp mounts tmpfs on container Path.
type MountTmpfsOp struct {
Path string
Size int
Perm os.FileMode
}
func (t *MountTmpfsOp) early(*Params) error { return nil }
func (t *MountTmpfsOp) apply(*Params) error {
if !path.IsAbs(t.Path) {
return msg.WrapErr(EBADE, fmt.Sprintf("path %q is not absolute", t.Path))
}
if t.Size < 0 || t.Size > math.MaxUint>>1 {
return msg.WrapErr(EBADE, fmt.Sprintf("size %d out of bounds", t.Size))
}
return mountTmpfs("tmpfs", t.Path, t.Size, t.Perm)
}
func (t *MountTmpfsOp) Is(op Op) bool { vt, ok := op.(*MountTmpfsOp); return ok && *t == *vt }
func (*MountTmpfsOp) prefix() string { return "mounting" }
func (t *MountTmpfsOp) String() string { return fmt.Sprintf("tmpfs on %q size %d", t.Path, t.Size) }
func (f *Ops) Tmpfs(dest string, size int, perm os.FileMode) *Ops {
*f = append(*f, &MountTmpfsOp{dest, size, perm})
return f
}
func init() { gob.Register(new(SymlinkOp)) }
// SymlinkOp creates a symlink in the container filesystem.
type SymlinkOp [2]string
func (l *SymlinkOp) early(*Params) error {
if strings.HasPrefix(l[0], "*") {
l[0] = l[0][1:]
if !path.IsAbs(l[0]) {
return msg.WrapErr(EBADE, fmt.Sprintf("path %q is not absolute", l[0]))
}
if name, err := os.Readlink(l[0]); err != nil {
return wrapErrSelf(err)
} else {
l[0] = name
}
}
return nil
}
func (l *SymlinkOp) apply(params *Params) error {
// symlink target is an arbitrary path value, so only validate link name here
if !path.IsAbs(l[1]) {
return msg.WrapErr(EBADE, fmt.Sprintf("path %q is not absolute", l[1]))
}
target := toSysroot(l[1])
if err := os.MkdirAll(path.Dir(target), params.ParentPerm); err != nil {
return wrapErrSelf(err)
}
if err := os.Symlink(l[0], target); err != nil {
return wrapErrSelf(err)
}
return nil
}
func (l *SymlinkOp) Is(op Op) bool { vl, ok := op.(*SymlinkOp); return ok && *l == *vl }
func (*SymlinkOp) prefix() string { return "creating" }
func (l *SymlinkOp) String() string { return fmt.Sprintf("symlink on %q target %q", l[1], l[0]) }
func (f *Ops) Link(target, linkName string) *Ops {
*f = append(*f, &SymlinkOp{target, linkName})
return f
}
func init() { gob.Register(new(MkdirOp)) }
// MkdirOp creates a directory in the container filesystem.
type MkdirOp struct {
Path string
Perm os.FileMode
}
func (m *MkdirOp) early(*Params) error { return nil }
func (m *MkdirOp) apply(*Params) error {
if !path.IsAbs(m.Path) {
return msg.WrapErr(EBADE, fmt.Sprintf("path %q is not absolute", m.Path))
}
if err := os.MkdirAll(toSysroot(m.Path), m.Perm); err != nil {
return wrapErrSelf(err)
}
return nil
}
func (m *MkdirOp) Is(op Op) bool { vm, ok := op.(*MkdirOp); return ok && m == vm }
func (*MkdirOp) prefix() string { return "creating" }
func (m *MkdirOp) String() string { return fmt.Sprintf("directory %q perm %s", m.Path, m.Perm) }
func (f *Ops) Mkdir(dest string, perm os.FileMode) *Ops {
*f = append(*f, &MkdirOp{dest, perm})
return f
}
func init() { gob.Register(new(TmpfileOp)) }
// TmpfileOp places a file in container Path containing Data.
type TmpfileOp struct {
Path string
Data []byte
}
func (t *TmpfileOp) early(*Params) error { return nil }
func (t *TmpfileOp) apply(params *Params) error {
if !path.IsAbs(t.Path) {
return msg.WrapErr(EBADE, fmt.Sprintf("path %q is not absolute", t.Path))
}
var tmpPath string
if f, err := os.CreateTemp("/", "tmp.*"); err != nil {
return wrapErrSelf(err)
} else if _, err = f.Write(t.Data); err != nil {
return wrapErrSuffix(err,
"cannot write to intermediate file:")
} else if err = f.Close(); err != nil {
return wrapErrSuffix(err,
"cannot close intermediate file:")
} else {
tmpPath = f.Name()
}
target := toSysroot(t.Path)
if err := ensureFile(target, 0444, params.ParentPerm); err != nil {
return err
} else if err = hostProc.bindMount(
tmpPath,
target,
MS_RDONLY|MS_NODEV,
false,
); err != nil {
return err
} else if err = os.Remove(tmpPath); err != nil {
return wrapErrSelf(err)
}
return nil
}
func (t *TmpfileOp) Is(op Op) bool {
vt, ok := op.(*TmpfileOp)
return ok && t.Path == vt.Path && slices.Equal(t.Data, vt.Data)
}
func (*TmpfileOp) prefix() string { return "placing" }
func (t *TmpfileOp) String() string {
return fmt.Sprintf("tmpfile %q (%d bytes)", t.Path, len(t.Data))
}
func (f *Ops) Place(name string, data []byte) *Ops { *f = append(*f, &TmpfileOp{name, data}); return f }
func (f *Ops) PlaceP(name string, dataP **[]byte) *Ops {
t := &TmpfileOp{Path: name}
*dataP = &t.Data
*f = append(*f, t)
return f
}
func init() { gob.Register(new(AutoEtcOp)) }
// AutoEtcOp expands host /etc into a toplevel symlink mirror with /etc semantics.
// This is not a generic setup op. It is implemented here to reduce ipc overhead.
type AutoEtcOp struct{ Prefix string }
func (e *AutoEtcOp) early(*Params) error { return nil }
func (e *AutoEtcOp) apply(*Params) error {
const target = sysrootPath + "/etc/"
rel := e.hostRel() + "/"
if err := os.MkdirAll(target, 0755); err != nil {
return wrapErrSelf(err)
}
if d, err := os.ReadDir(toSysroot(e.hostPath())); err != nil {
return wrapErrSelf(err)
} else {
for _, ent := range d {
n := ent.Name()
switch n {
case ".host":
case "passwd":
case "group":
case "mtab":
if err = os.Symlink("/proc/mounts", target+n); err != nil {
return wrapErrSelf(err)
}
default:
if err = os.Symlink(rel+n, target+n); err != nil {
return wrapErrSelf(err)
}
}
}
}
return nil
}
func (e *AutoEtcOp) hostPath() string { return "/etc/" + e.hostRel() }
func (e *AutoEtcOp) hostRel() string { return ".host/" + e.Prefix }
func (e *AutoEtcOp) Is(op Op) bool {
ve, ok := op.(*AutoEtcOp)
return ok && ((e == nil && ve == nil) || (e != nil && ve != nil && *e == *ve))
}
func (*AutoEtcOp) prefix() string { return "setting up" }
func (e *AutoEtcOp) String() string { return fmt.Sprintf("auto etc %s", e.Prefix) }
func (f *Ops) Etc(host, prefix string) *Ops {
e := &AutoEtcOp{prefix}
f.Mkdir("/etc", 0755)
f.Bind(host, e.hostPath(), 0)
*f = append(*f, e)
return f
}

View File

@@ -1,26 +0,0 @@
package sandbox
var msg Msg = new(DefaultMsg)
func GetOutput() Msg { return msg }
func SetOutput(v Msg) {
if v == nil {
msg = new(DefaultMsg)
} else {
msg = v
}
}
func wrapErrSuffix(err error, a ...any) error {
if err == nil {
return nil
}
return msg.WrapErr(err, append(a, err)...)
}
func wrapErrSelf(err error) error {
if err == nil {
return nil
}
return msg.WrapErr(err, err.Error())
}

View File

@@ -1,47 +0,0 @@
package sandbox
import (
"encoding/gob"
"errors"
"os"
"strconv"
)
var (
ErrNotSet = errors.New("environment variable not set")
ErrInvalid = errors.New("bad file descriptor")
)
// Setup appends the read end of a pipe for setup params transmission and returns its fd.
func Setup(extraFiles *[]*os.File) (int, *gob.Encoder, error) {
if r, w, err := os.Pipe(); err != nil {
return -1, nil, err
} else {
fd := 3 + len(*extraFiles)
*extraFiles = append(*extraFiles, r)
return fd, gob.NewEncoder(w), nil
}
}
// Receive retrieves setup fd from the environment and receives params.
func Receive(key string, e any, v **os.File) (func() error, error) {
var setup *os.File
if s, ok := os.LookupEnv(key); !ok {
return nil, ErrNotSet
} else {
if fd, err := strconv.Atoi(s); err != nil {
return nil, err
} else {
setup = os.NewFile(uintptr(fd), "setup")
if setup == nil {
return nil, ErrInvalid
}
if v != nil {
*v = setup
}
}
}
return setup.Close, gob.NewDecoder(setup).Decode(e)
}

View File

@@ -1,94 +0,0 @@
package sandbox
import (
"errors"
"fmt"
"io/fs"
"os"
"path"
"strconv"
"strings"
"syscall"
"git.gensokyo.uk/security/hakurei/sandbox/vfs"
)
const (
hostPath = "/" + hostDir
hostDir = "host"
sysrootPath = "/" + sysrootDir
sysrootDir = "sysroot"
)
func toSysroot(name string) string {
name = strings.TrimLeftFunc(name, func(r rune) bool { return r == '/' })
return path.Join(sysrootPath, name)
}
func toHost(name string) string {
name = strings.TrimLeftFunc(name, func(r rune) bool { return r == '/' })
return path.Join(hostPath, name)
}
func createFile(name string, perm, pperm os.FileMode, content []byte) error {
if err := os.MkdirAll(path.Dir(name), pperm); err != nil {
return wrapErrSelf(err)
}
f, err := os.OpenFile(name, syscall.O_CREAT|syscall.O_EXCL|syscall.O_WRONLY, perm)
if err != nil {
return wrapErrSelf(err)
}
if content != nil {
_, err = f.Write(content)
if err != nil {
err = wrapErrSelf(err)
}
}
return errors.Join(f.Close(), err)
}
func ensureFile(name string, perm, pperm os.FileMode) error {
fi, err := os.Stat(name)
if err != nil {
if !os.IsNotExist(err) {
return err
}
return createFile(name, perm, pperm, nil)
}
if mode := fi.Mode(); mode&fs.ModeDir != 0 || mode&fs.ModeSymlink != 0 {
err = msg.WrapErr(syscall.EISDIR,
fmt.Sprintf("path %q is a directory", name))
}
return err
}
var hostProc = newProcPats(hostPath)
func newProcPats(prefix string) *procPaths {
return &procPaths{prefix + "/proc", prefix + "/proc/self"}
}
type procPaths struct {
prefix string
self string
}
func (p *procPaths) stdout() string { return p.self + "/fd/1" }
func (p *procPaths) fd(fd int) string { return p.self + "/fd/" + strconv.Itoa(fd) }
func (p *procPaths) mountinfo(f func(d *vfs.MountInfoDecoder) error) error {
if r, err := os.Open(p.self + "/mountinfo"); err != nil {
return wrapErrSelf(err)
} else {
d := vfs.NewMountInfoDecoder(r)
err0 := f(d)
if err = r.Close(); err != nil {
return wrapErrSuffix(err,
"cannot close mountinfo:")
} else if err = d.Err(); err != nil {
return wrapErrSuffix(err,
"cannot parse mountinfo:")
}
return err0
}
}

View File

@@ -1,81 +0,0 @@
package sandbox
import (
"syscall"
"unsafe"
)
const (
O_PATH = 0x200000
PR_SET_NO_NEW_PRIVS = 0x26
CAP_SYS_ADMIN = 0x15
CAP_SETPCAP = 0x8
)
const (
SUID_DUMP_DISABLE = iota
SUID_DUMP_USER
)
func SetDumpable(dumpable uintptr) error {
// linux/sched/coredump.h
if _, _, errno := syscall.Syscall(syscall.SYS_PRCTL, syscall.PR_SET_DUMPABLE, dumpable, 0); errno != 0 {
return errno
}
return nil
}
const (
_LINUX_CAPABILITY_VERSION_3 = 0x20080522
PR_CAP_AMBIENT = 0x2f
PR_CAP_AMBIENT_RAISE = 0x2
PR_CAP_AMBIENT_CLEAR_ALL = 0x4
)
type (
capHeader struct {
version uint32
pid int32
}
capData struct {
effective uint32
permitted uint32
inheritable uint32
}
)
// See CAP_TO_INDEX in linux/capability.h:
func capToIndex(cap uintptr) uintptr { return cap >> 5 }
// See CAP_TO_MASK in linux/capability.h:
func capToMask(cap uintptr) uint32 { return 1 << uint(cap&31) }
func capset(hdrp *capHeader, datap *[2]capData) error {
if _, _, errno := syscall.Syscall(syscall.SYS_CAPSET,
uintptr(unsafe.Pointer(hdrp)),
uintptr(unsafe.Pointer(&datap[0])), 0); errno != 0 {
return errno
}
return nil
}
// IgnoringEINTR makes a function call and repeats it if it returns an
// EINTR error. This appears to be required even though we install all
// signal handlers with SA_RESTART: see #22838, #38033, #38836, #40846.
// Also #20400 and #36644 are issues in which a signal handler is
// installed without setting SA_RESTART. None of these are the common case,
// but there are enough of them that it seems that we can't avoid
// an EINTR loop.
func IgnoringEINTR(fn func() error) error {
for {
err := fn()
if err != syscall.EINTR {
return err
}
}
}

View File

@@ -1,47 +0,0 @@
package sandbox
import (
"bytes"
"log"
"os"
"strconv"
"sync"
)
var (
kernelOverflowuid int
kernelOverflowgid int
kernelCapLastCap int
sysctlOnce sync.Once
)
const (
kernelOverflowuidPath = "/proc/sys/kernel/overflowuid"
kernelOverflowgidPath = "/proc/sys/kernel/overflowgid"
kernelCapLastCapPath = "/proc/sys/kernel/cap_last_cap"
)
func mustReadSysctl() {
if v, err := os.ReadFile(kernelOverflowuidPath); err != nil {
log.Fatalf("cannot read %q: %v", kernelOverflowuidPath, err)
} else if kernelOverflowuid, err = strconv.Atoi(string(bytes.TrimSpace(v))); err != nil {
log.Fatalf("cannot interpret %q: %v", kernelOverflowuidPath, err)
}
if v, err := os.ReadFile(kernelOverflowgidPath); err != nil {
log.Fatalf("cannot read %q: %v", kernelOverflowgidPath, err)
} else if kernelOverflowgid, err = strconv.Atoi(string(bytes.TrimSpace(v))); err != nil {
log.Fatalf("cannot interpret %q: %v", kernelOverflowgidPath, err)
}
if v, err := os.ReadFile(kernelCapLastCapPath); err != nil {
log.Fatalf("cannot read %q: %v", kernelCapLastCapPath, err)
} else if kernelCapLastCap, err = strconv.Atoi(string(bytes.TrimSpace(v))); err != nil {
log.Fatalf("cannot interpret %q: %v", kernelCapLastCapPath, err)
}
}
func OverflowUid() int { sysctlOnce.Do(mustReadSysctl); return kernelOverflowuid }
func OverflowGid() int { sysctlOnce.Do(mustReadSysctl); return kernelOverflowgid }
func LastCap() uintptr { sysctlOnce.Do(mustReadSysctl); return uintptr(kernelCapLastCap) }