container: move out of toplevel
All checks were successful
Test / Create distribution (push) Successful in 32s
Test / Sandbox (push) Successful in 1m52s
Test / Sandbox (race detector) (push) Successful in 3m14s
Test / Planterette (push) Successful in 3m36s
Test / Hakurei (race detector) (push) Successful in 4m31s
Test / Hakurei (push) Successful in 2m3s
Test / Flake checks (push) Successful in 1m13s
All checks were successful
Test / Create distribution (push) Successful in 32s
Test / Sandbox (push) Successful in 1m52s
Test / Sandbox (race detector) (push) Successful in 3m14s
Test / Planterette (push) Successful in 3m36s
Test / Hakurei (race detector) (push) Successful in 4m31s
Test / Hakurei (push) Successful in 2m3s
Test / Flake checks (push) Successful in 1m13s
This allows slightly easier use of the vanity url. This also provides some disambiguation between low level containers and hakurei app containers. Signed-off-by: Ophestra <cat@gensokyo.uk>
This commit is contained in:
229
container/container.go
Normal file
229
container/container.go
Normal file
@@ -0,0 +1,229 @@
|
||||
// Package container implements unprivileged Linux containers with built-in support for syscall filtering.
|
||||
package container
|
||||
|
||||
import (
|
||||
"context"
|
||||
"encoding/gob"
|
||||
"errors"
|
||||
"fmt"
|
||||
"io"
|
||||
"os"
|
||||
"os/exec"
|
||||
"path"
|
||||
"strconv"
|
||||
. "syscall"
|
||||
"time"
|
||||
|
||||
"git.gensokyo.uk/security/hakurei/container/seccomp"
|
||||
)
|
||||
|
||||
type (
|
||||
// Container represents a container environment being prepared or run.
|
||||
// None of [Container] methods are safe for concurrent use.
|
||||
Container struct {
|
||||
// Name of initial process in the container.
|
||||
name string
|
||||
// Cgroup fd, nil to disable.
|
||||
Cgroup *int
|
||||
// ExtraFiles passed through to initial process in the container,
|
||||
// with behaviour identical to its [exec.Cmd] counterpart.
|
||||
ExtraFiles []*os.File
|
||||
|
||||
// Custom [exec.Cmd] initialisation function.
|
||||
CommandContext func(ctx context.Context) (cmd *exec.Cmd)
|
||||
|
||||
// param encoder for shim and init
|
||||
setup *gob.Encoder
|
||||
// cancels cmd
|
||||
cancel context.CancelFunc
|
||||
|
||||
Stdin io.Reader
|
||||
Stdout io.Writer
|
||||
Stderr io.Writer
|
||||
|
||||
Cancel func(cmd *exec.Cmd) error
|
||||
WaitDelay time.Duration
|
||||
|
||||
cmd *exec.Cmd
|
||||
ctx context.Context
|
||||
Params
|
||||
}
|
||||
|
||||
// Params holds container configuration and is safe to serialise.
|
||||
Params struct {
|
||||
// Working directory in the container.
|
||||
Dir string
|
||||
// Initial process environment.
|
||||
Env []string
|
||||
// Absolute path of initial process in the container. Overrides name.
|
||||
Path string
|
||||
// Initial process argv.
|
||||
Args []string
|
||||
|
||||
// Mapped Uid in user namespace.
|
||||
Uid int
|
||||
// Mapped Gid in user namespace.
|
||||
Gid int
|
||||
// Hostname value in UTS namespace.
|
||||
Hostname string
|
||||
// Sequential container setup ops.
|
||||
*Ops
|
||||
// Seccomp system call filter rules.
|
||||
SeccompRules []seccomp.NativeRule
|
||||
// Extra seccomp flags.
|
||||
SeccompFlags seccomp.ExportFlag
|
||||
// Seccomp presets. Has no effect unless SeccompRules is zero-length.
|
||||
SeccompPresets seccomp.FilterPreset
|
||||
// Do not load seccomp program.
|
||||
SeccompDisable bool
|
||||
// Permission bits of newly created parent directories.
|
||||
// The zero value is interpreted as 0755.
|
||||
ParentPerm os.FileMode
|
||||
// Do not syscall.Setsid.
|
||||
RetainSession bool
|
||||
// Do not [syscall.CLONE_NEWNET].
|
||||
HostNet bool
|
||||
// Retain CAP_SYS_ADMIN.
|
||||
Privileged bool
|
||||
}
|
||||
)
|
||||
|
||||
func (p *Container) Start() error {
|
||||
if p.cmd != nil {
|
||||
return errors.New("sandbox: already started")
|
||||
}
|
||||
if p.Ops == nil || len(*p.Ops) == 0 {
|
||||
return errors.New("sandbox: starting an empty container")
|
||||
}
|
||||
|
||||
ctx, cancel := context.WithCancel(p.ctx)
|
||||
p.cancel = cancel
|
||||
|
||||
var cloneFlags uintptr = CLONE_NEWIPC | CLONE_NEWUTS | CLONE_NEWCGROUP
|
||||
if !p.HostNet {
|
||||
cloneFlags |= CLONE_NEWNET
|
||||
}
|
||||
|
||||
// map to overflow id to work around ownership checks
|
||||
if p.Uid < 1 {
|
||||
p.Uid = OverflowUid()
|
||||
}
|
||||
if p.Gid < 1 {
|
||||
p.Gid = OverflowGid()
|
||||
}
|
||||
|
||||
if !p.RetainSession {
|
||||
p.SeccompPresets |= seccomp.PresetDenyTTY
|
||||
}
|
||||
|
||||
if p.CommandContext != nil {
|
||||
p.cmd = p.CommandContext(ctx)
|
||||
} else {
|
||||
p.cmd = exec.CommandContext(ctx, MustExecutable())
|
||||
p.cmd.Args = []string{"init"}
|
||||
}
|
||||
|
||||
p.cmd.Stdin, p.cmd.Stdout, p.cmd.Stderr = p.Stdin, p.Stdout, p.Stderr
|
||||
p.cmd.WaitDelay = p.WaitDelay
|
||||
if p.Cancel != nil {
|
||||
p.cmd.Cancel = func() error { return p.Cancel(p.cmd) }
|
||||
} else {
|
||||
p.cmd.Cancel = func() error { return p.cmd.Process.Signal(SIGTERM) }
|
||||
}
|
||||
p.cmd.Dir = "/"
|
||||
p.cmd.SysProcAttr = &SysProcAttr{
|
||||
Setsid: !p.RetainSession,
|
||||
Pdeathsig: SIGKILL,
|
||||
Cloneflags: cloneFlags | CLONE_NEWUSER | CLONE_NEWPID | CLONE_NEWNS,
|
||||
|
||||
// remain privileged for setup
|
||||
AmbientCaps: []uintptr{CAP_SYS_ADMIN, CAP_SETPCAP},
|
||||
|
||||
UseCgroupFD: p.Cgroup != nil,
|
||||
}
|
||||
if p.cmd.SysProcAttr.UseCgroupFD {
|
||||
p.cmd.SysProcAttr.CgroupFD = *p.Cgroup
|
||||
}
|
||||
|
||||
// place setup pipe before user supplied extra files, this is later restored by init
|
||||
if fd, e, err := Setup(&p.cmd.ExtraFiles); err != nil {
|
||||
return wrapErrSuffix(err,
|
||||
"cannot create shim setup pipe:")
|
||||
} else {
|
||||
p.setup = e
|
||||
p.cmd.Env = []string{setupEnv + "=" + strconv.Itoa(fd)}
|
||||
}
|
||||
p.cmd.ExtraFiles = append(p.cmd.ExtraFiles, p.ExtraFiles...)
|
||||
|
||||
msg.Verbose("starting container init")
|
||||
if err := p.cmd.Start(); err != nil {
|
||||
return msg.WrapErr(err, err.Error())
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
func (p *Container) Serve() error {
|
||||
if p.setup == nil {
|
||||
panic("invalid serve")
|
||||
}
|
||||
|
||||
setup := p.setup
|
||||
p.setup = nil
|
||||
|
||||
if p.Path != "" && !path.IsAbs(p.Path) {
|
||||
p.cancel()
|
||||
return msg.WrapErr(EINVAL,
|
||||
fmt.Sprintf("invalid executable path %q", p.Path))
|
||||
}
|
||||
|
||||
if p.Path == "" {
|
||||
if p.name == "" {
|
||||
p.Path = os.Getenv("SHELL")
|
||||
if !path.IsAbs(p.Path) {
|
||||
p.cancel()
|
||||
return msg.WrapErr(EBADE,
|
||||
"no command specified and $SHELL is invalid")
|
||||
}
|
||||
p.name = path.Base(p.Path)
|
||||
} else if path.IsAbs(p.name) {
|
||||
p.Path = p.name
|
||||
} else if v, err := exec.LookPath(p.name); err != nil {
|
||||
p.cancel()
|
||||
return msg.WrapErr(err, err.Error())
|
||||
} else {
|
||||
p.Path = v
|
||||
}
|
||||
}
|
||||
|
||||
if p.SeccompRules == nil {
|
||||
// do not transmit nil
|
||||
p.SeccompRules = make([]seccomp.NativeRule, 0)
|
||||
}
|
||||
|
||||
err := setup.Encode(
|
||||
&initParams{
|
||||
p.Params,
|
||||
Getuid(),
|
||||
Getgid(),
|
||||
len(p.ExtraFiles),
|
||||
msg.IsVerbose(),
|
||||
},
|
||||
)
|
||||
if err != nil {
|
||||
p.cancel()
|
||||
}
|
||||
return err
|
||||
}
|
||||
|
||||
func (p *Container) Wait() error { defer p.cancel(); return p.cmd.Wait() }
|
||||
|
||||
func (p *Container) String() string {
|
||||
return fmt.Sprintf("argv: %q, filter: %v, rules: %d, flags: %#x, presets: %#x",
|
||||
p.Args, !p.SeccompDisable, len(p.SeccompRules), int(p.SeccompFlags), int(p.SeccompPresets))
|
||||
}
|
||||
|
||||
func New(ctx context.Context, name string, args ...string) *Container {
|
||||
return &Container{name: name, ctx: ctx,
|
||||
Params: Params{Args: append([]string{name}, args...), Dir: "/", Ops: new(Ops)},
|
||||
}
|
||||
}
|
||||
281
container/container_test.go
Normal file
281
container/container_test.go
Normal file
@@ -0,0 +1,281 @@
|
||||
package container_test
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"context"
|
||||
"encoding/gob"
|
||||
"log"
|
||||
"os"
|
||||
"os/exec"
|
||||
"strings"
|
||||
"syscall"
|
||||
"testing"
|
||||
"time"
|
||||
|
||||
"git.gensokyo.uk/security/hakurei/container"
|
||||
"git.gensokyo.uk/security/hakurei/container/seccomp"
|
||||
"git.gensokyo.uk/security/hakurei/container/vfs"
|
||||
"git.gensokyo.uk/security/hakurei/hst"
|
||||
"git.gensokyo.uk/security/hakurei/internal"
|
||||
"git.gensokyo.uk/security/hakurei/internal/hlog"
|
||||
"git.gensokyo.uk/security/hakurei/ldd"
|
||||
)
|
||||
|
||||
const (
|
||||
ignore = "\x00"
|
||||
ignoreV = -1
|
||||
)
|
||||
|
||||
func TestContainer(t *testing.T) {
|
||||
{
|
||||
oldVerbose := hlog.Load()
|
||||
oldOutput := container.GetOutput()
|
||||
internal.InstallOutput(true)
|
||||
t.Cleanup(func() { hlog.Store(oldVerbose) })
|
||||
t.Cleanup(func() { container.SetOutput(oldOutput) })
|
||||
}
|
||||
|
||||
testCases := []struct {
|
||||
name string
|
||||
filter bool
|
||||
session bool
|
||||
net bool
|
||||
ops *container.Ops
|
||||
mnt []*vfs.MountInfoEntry
|
||||
host string
|
||||
rules []seccomp.NativeRule
|
||||
flags seccomp.ExportFlag
|
||||
presets seccomp.FilterPreset
|
||||
}{
|
||||
{"minimal", true, false, false,
|
||||
new(container.Ops), nil, "test-minimal",
|
||||
nil, 0, seccomp.PresetStrict},
|
||||
{"allow", true, true, true,
|
||||
new(container.Ops), nil, "test-minimal",
|
||||
nil, 0, seccomp.PresetExt | seccomp.PresetDenyDevel},
|
||||
{"no filter", false, true, true,
|
||||
new(container.Ops), nil, "test-no-filter",
|
||||
nil, 0, seccomp.PresetExt},
|
||||
{"custom rules", true, true, true,
|
||||
new(container.Ops), nil, "test-no-filter",
|
||||
[]seccomp.NativeRule{
|
||||
{seccomp.ScmpSyscall(syscall.SYS_SETUID), seccomp.ScmpErrno(syscall.EPERM), nil},
|
||||
}, 0, seccomp.PresetExt},
|
||||
{"tmpfs", true, false, false,
|
||||
new(container.Ops).
|
||||
Tmpfs(hst.Tmp, 0, 0755),
|
||||
[]*vfs.MountInfoEntry{
|
||||
e("/", hst.Tmp, "rw,nosuid,nodev,relatime", "tmpfs", "tmpfs", ignore),
|
||||
}, "test-tmpfs",
|
||||
nil, 0, seccomp.PresetStrict},
|
||||
{"dev", true, true /* go test output is not a tty */, false,
|
||||
new(container.Ops).
|
||||
Dev("/dev").
|
||||
Mqueue("/dev/mqueue"),
|
||||
[]*vfs.MountInfoEntry{
|
||||
e("/", "/dev", "rw,nosuid,nodev,relatime", "tmpfs", "devtmpfs", ignore),
|
||||
e("/null", "/dev/null", "rw,nosuid", "devtmpfs", "devtmpfs", ignore),
|
||||
e("/zero", "/dev/zero", "rw,nosuid", "devtmpfs", "devtmpfs", ignore),
|
||||
e("/full", "/dev/full", "rw,nosuid", "devtmpfs", "devtmpfs", ignore),
|
||||
e("/random", "/dev/random", "rw,nosuid", "devtmpfs", "devtmpfs", ignore),
|
||||
e("/urandom", "/dev/urandom", "rw,nosuid", "devtmpfs", "devtmpfs", ignore),
|
||||
e("/tty", "/dev/tty", "rw,nosuid", "devtmpfs", "devtmpfs", ignore),
|
||||
e("/", "/dev/pts", "rw,nosuid,noexec,relatime", "devpts", "devpts", "rw,mode=620,ptmxmode=666"),
|
||||
e("/", "/dev/mqueue", "rw,nosuid,nodev,noexec,relatime", "mqueue", "mqueue", "rw"),
|
||||
}, "",
|
||||
nil, 0, seccomp.PresetStrict},
|
||||
}
|
||||
|
||||
for _, tc := range testCases {
|
||||
t.Run(tc.name, func(t *testing.T) {
|
||||
ctx, cancel := context.WithTimeout(t.Context(), 5*time.Second)
|
||||
defer cancel()
|
||||
|
||||
c := container.New(ctx, "/usr/bin/sandbox.test", "-test.v",
|
||||
"-test.run=TestHelperCheckContainer", "--", "check", tc.host)
|
||||
c.Uid = 1000
|
||||
c.Gid = 100
|
||||
c.Hostname = tc.host
|
||||
c.CommandContext = commandContext
|
||||
c.Stdout, c.Stderr = os.Stdout, os.Stderr
|
||||
c.Ops = tc.ops
|
||||
c.SeccompRules = tc.rules
|
||||
c.SeccompFlags = tc.flags | seccomp.AllowMultiarch
|
||||
c.SeccompPresets = tc.presets
|
||||
c.SeccompDisable = !tc.filter
|
||||
c.RetainSession = tc.session
|
||||
c.HostNet = tc.net
|
||||
if c.Args[5] == "" {
|
||||
if name, err := os.Hostname(); err != nil {
|
||||
t.Fatalf("cannot get hostname: %v", err)
|
||||
} else {
|
||||
c.Args[5] = name
|
||||
}
|
||||
}
|
||||
|
||||
c.
|
||||
Tmpfs("/tmp", 0, 0755).
|
||||
Bind(os.Args[0], os.Args[0], 0).
|
||||
Mkdir("/usr/bin", 0755).
|
||||
Link(os.Args[0], "/usr/bin/sandbox.test").
|
||||
Place("/etc/hostname", []byte(c.Args[5]))
|
||||
// in case test has cgo enabled
|
||||
var libPaths []string
|
||||
if entries, err := ldd.ExecFilter(ctx,
|
||||
commandContext,
|
||||
func(v []byte) []byte {
|
||||
return bytes.SplitN(v, []byte("TestHelperInit\n"), 2)[1]
|
||||
}, os.Args[0]); err != nil {
|
||||
log.Fatalf("ldd: %v", err)
|
||||
} else {
|
||||
libPaths = ldd.Path(entries)
|
||||
}
|
||||
for _, name := range libPaths {
|
||||
c.Bind(name, name, 0)
|
||||
}
|
||||
// needs /proc to check mountinfo
|
||||
c.Proc("/proc")
|
||||
|
||||
mnt := make([]*vfs.MountInfoEntry, 0, 3+len(libPaths))
|
||||
mnt = append(mnt, e("/sysroot", "/", "rw,nosuid,nodev,relatime", "tmpfs", "rootfs", ignore))
|
||||
mnt = append(mnt, tc.mnt...)
|
||||
mnt = append(mnt,
|
||||
e("/", "/tmp", "rw,nosuid,nodev,relatime", "tmpfs", "tmpfs", ignore),
|
||||
e(ignore, os.Args[0], "ro,nosuid,nodev,relatime", ignore, ignore, ignore),
|
||||
e(ignore, "/etc/hostname", "ro,nosuid,nodev,relatime", "tmpfs", "rootfs", ignore),
|
||||
)
|
||||
for _, name := range libPaths {
|
||||
mnt = append(mnt, e(ignore, name, "ro,nosuid,nodev,relatime", ignore, ignore, ignore))
|
||||
}
|
||||
mnt = append(mnt, e("/", "/proc", "rw,nosuid,nodev,noexec,relatime", "proc", "proc", "rw"))
|
||||
want := new(bytes.Buffer)
|
||||
if err := gob.NewEncoder(want).Encode(mnt); err != nil {
|
||||
t.Fatalf("cannot serialise expected mount points: %v", err)
|
||||
}
|
||||
c.Stdin = want
|
||||
|
||||
if err := c.Start(); err != nil {
|
||||
hlog.PrintBaseError(err, "start:")
|
||||
t.Fatalf("cannot start container: %v", err)
|
||||
} else if err = c.Serve(); err != nil {
|
||||
hlog.PrintBaseError(err, "serve:")
|
||||
t.Errorf("cannot serve setup params: %v", err)
|
||||
}
|
||||
if err := c.Wait(); err != nil {
|
||||
hlog.PrintBaseError(err, "wait:")
|
||||
t.Fatalf("wait: %v", err)
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
func e(root, target, vfsOptstr, fsType, source, fsOptstr string) *vfs.MountInfoEntry {
|
||||
return &vfs.MountInfoEntry{
|
||||
ID: ignoreV,
|
||||
Parent: ignoreV,
|
||||
Devno: vfs.DevT{ignoreV, ignoreV},
|
||||
Root: root,
|
||||
Target: target,
|
||||
VfsOptstr: vfsOptstr,
|
||||
OptFields: []string{ignore},
|
||||
FsType: fsType,
|
||||
Source: source,
|
||||
FsOptstr: fsOptstr,
|
||||
}
|
||||
}
|
||||
|
||||
func TestContainerString(t *testing.T) {
|
||||
c := container.New(t.Context(), "ldd", "/usr/bin/env")
|
||||
c.SeccompFlags |= seccomp.AllowMultiarch
|
||||
c.SeccompRules = seccomp.Preset(
|
||||
seccomp.PresetExt|seccomp.PresetDenyNS|seccomp.PresetDenyTTY,
|
||||
c.SeccompFlags)
|
||||
c.SeccompPresets = seccomp.PresetStrict
|
||||
want := `argv: ["ldd" "/usr/bin/env"], filter: true, rules: 65, flags: 0x1, presets: 0xf`
|
||||
if got := c.String(); got != want {
|
||||
t.Errorf("String: %s, want %s", got, want)
|
||||
}
|
||||
}
|
||||
|
||||
func TestHelperInit(t *testing.T) {
|
||||
if len(os.Args) != 5 || os.Args[4] != "init" {
|
||||
return
|
||||
}
|
||||
container.SetOutput(hlog.Output{})
|
||||
container.Init(hlog.Prepare, internal.InstallOutput)
|
||||
}
|
||||
|
||||
func TestHelperCheckContainer(t *testing.T) {
|
||||
if len(os.Args) != 6 || os.Args[4] != "check" {
|
||||
return
|
||||
}
|
||||
|
||||
t.Run("user", func(t *testing.T) {
|
||||
if uid := syscall.Getuid(); uid != 1000 {
|
||||
t.Errorf("Getuid: %d, want 1000", uid)
|
||||
}
|
||||
if gid := syscall.Getgid(); gid != 100 {
|
||||
t.Errorf("Getgid: %d, want 100", gid)
|
||||
}
|
||||
})
|
||||
t.Run("hostname", func(t *testing.T) {
|
||||
if name, err := os.Hostname(); err != nil {
|
||||
t.Fatalf("cannot get hostname: %v", err)
|
||||
} else if name != os.Args[5] {
|
||||
t.Errorf("Hostname: %q, want %q", name, os.Args[5])
|
||||
}
|
||||
|
||||
if p, err := os.ReadFile("/etc/hostname"); err != nil {
|
||||
t.Fatalf("%v", err)
|
||||
} else if string(p) != os.Args[5] {
|
||||
t.Errorf("/etc/hostname: %q, want %q", string(p), os.Args[5])
|
||||
}
|
||||
})
|
||||
t.Run("mount", func(t *testing.T) {
|
||||
var mnt []*vfs.MountInfoEntry
|
||||
if err := gob.NewDecoder(os.Stdin).Decode(&mnt); err != nil {
|
||||
t.Fatalf("cannot receive expected mount points: %v", err)
|
||||
}
|
||||
|
||||
var d *vfs.MountInfoDecoder
|
||||
if f, err := os.Open("/proc/self/mountinfo"); err != nil {
|
||||
t.Fatalf("cannot open mountinfo: %v", err)
|
||||
} else {
|
||||
d = vfs.NewMountInfoDecoder(f)
|
||||
}
|
||||
|
||||
i := 0
|
||||
for cur := range d.Entries() {
|
||||
if i == len(mnt) {
|
||||
t.Errorf("got more than %d entries", len(mnt))
|
||||
break
|
||||
}
|
||||
|
||||
// ugly hack but should be reliable and is less likely to false negative than comparing by parsed flags
|
||||
cur.VfsOptstr = strings.TrimSuffix(cur.VfsOptstr, ",relatime")
|
||||
cur.VfsOptstr = strings.TrimSuffix(cur.VfsOptstr, ",noatime")
|
||||
mnt[i].VfsOptstr = strings.TrimSuffix(mnt[i].VfsOptstr, ",relatime")
|
||||
mnt[i].VfsOptstr = strings.TrimSuffix(mnt[i].VfsOptstr, ",noatime")
|
||||
|
||||
if !cur.EqualWithIgnore(mnt[i], "\x00") {
|
||||
t.Errorf("[FAIL] %s", cur)
|
||||
} else {
|
||||
t.Logf("[ OK ] %s", cur)
|
||||
}
|
||||
|
||||
i++
|
||||
}
|
||||
if err := d.Err(); err != nil {
|
||||
t.Errorf("cannot parse mountinfo: %v", err)
|
||||
}
|
||||
|
||||
if i != len(mnt) {
|
||||
t.Errorf("got %d entries, want %d", i, len(mnt))
|
||||
}
|
||||
})
|
||||
}
|
||||
|
||||
func commandContext(ctx context.Context) *exec.Cmd {
|
||||
return exec.CommandContext(ctx, os.Args[0], "-test.v",
|
||||
"-test.run=TestHelperInit", "--", "init")
|
||||
}
|
||||
26
container/executable.go
Normal file
26
container/executable.go
Normal file
@@ -0,0 +1,26 @@
|
||||
package container
|
||||
|
||||
import (
|
||||
"log"
|
||||
"os"
|
||||
"sync"
|
||||
)
|
||||
|
||||
var (
|
||||
executable string
|
||||
executableOnce sync.Once
|
||||
)
|
||||
|
||||
func copyExecutable() {
|
||||
if name, err := os.Executable(); err != nil {
|
||||
msg.BeforeExit()
|
||||
log.Fatalf("cannot read executable path: %v", err)
|
||||
} else {
|
||||
executable = name
|
||||
}
|
||||
}
|
||||
|
||||
func MustExecutable() string {
|
||||
executableOnce.Do(copyExecutable)
|
||||
return executable
|
||||
}
|
||||
17
container/executable_test.go
Normal file
17
container/executable_test.go
Normal file
@@ -0,0 +1,17 @@
|
||||
package container_test
|
||||
|
||||
import (
|
||||
"os"
|
||||
"testing"
|
||||
|
||||
"git.gensokyo.uk/security/hakurei/container"
|
||||
)
|
||||
|
||||
func TestExecutable(t *testing.T) {
|
||||
for i := 0; i < 16; i++ {
|
||||
if got := container.MustExecutable(); got != os.Args[0] {
|
||||
t.Errorf("MustExecutable: %q, want %q",
|
||||
got, os.Args[0])
|
||||
}
|
||||
}
|
||||
}
|
||||
364
container/init.go
Normal file
364
container/init.go
Normal file
@@ -0,0 +1,364 @@
|
||||
package container
|
||||
|
||||
import (
|
||||
"errors"
|
||||
"fmt"
|
||||
"log"
|
||||
"os"
|
||||
"os/exec"
|
||||
"os/signal"
|
||||
"path"
|
||||
"runtime"
|
||||
"strconv"
|
||||
. "syscall"
|
||||
"time"
|
||||
|
||||
"git.gensokyo.uk/security/hakurei/container/seccomp"
|
||||
)
|
||||
|
||||
const (
|
||||
// time to wait for linger processes after death of initial process
|
||||
residualProcessTimeout = 5 * time.Second
|
||||
|
||||
// intermediate tmpfs mount point
|
||||
basePath = "/tmp"
|
||||
|
||||
// setup params file descriptor
|
||||
setupEnv = "HAKUREI_SETUP"
|
||||
)
|
||||
|
||||
type initParams struct {
|
||||
Params
|
||||
|
||||
HostUid, HostGid int
|
||||
// extra files count
|
||||
Count int
|
||||
// verbosity pass through
|
||||
Verbose bool
|
||||
}
|
||||
|
||||
func Init(prepare func(prefix string), setVerbose func(verbose bool)) {
|
||||
runtime.LockOSThread()
|
||||
prepare("init")
|
||||
|
||||
if os.Getpid() != 1 {
|
||||
log.Fatal("this process must run as pid 1")
|
||||
}
|
||||
|
||||
var (
|
||||
params initParams
|
||||
closeSetup func() error
|
||||
setupFile *os.File
|
||||
offsetSetup int
|
||||
)
|
||||
if f, err := Receive(setupEnv, ¶ms, &setupFile); err != nil {
|
||||
if errors.Is(err, ErrInvalid) {
|
||||
log.Fatal("invalid setup descriptor")
|
||||
}
|
||||
if errors.Is(err, ErrNotSet) {
|
||||
log.Fatal("HAKUREI_SETUP not set")
|
||||
}
|
||||
|
||||
log.Fatalf("cannot decode init setup payload: %v", err)
|
||||
} else {
|
||||
if params.Ops == nil {
|
||||
log.Fatal("invalid setup parameters")
|
||||
}
|
||||
if params.ParentPerm == 0 {
|
||||
params.ParentPerm = 0755
|
||||
}
|
||||
|
||||
setVerbose(params.Verbose)
|
||||
msg.Verbose("received setup parameters")
|
||||
closeSetup = f
|
||||
offsetSetup = int(setupFile.Fd() + 1)
|
||||
}
|
||||
|
||||
// write uid/gid map here so parent does not need to set dumpable
|
||||
if err := SetDumpable(SUID_DUMP_USER); err != nil {
|
||||
log.Fatalf("cannot set SUID_DUMP_USER: %s", err)
|
||||
}
|
||||
if err := os.WriteFile("/proc/self/uid_map",
|
||||
append([]byte{}, strconv.Itoa(params.Uid)+" "+strconv.Itoa(params.HostUid)+" 1\n"...),
|
||||
0); err != nil {
|
||||
log.Fatalf("%v", err)
|
||||
}
|
||||
if err := os.WriteFile("/proc/self/setgroups",
|
||||
[]byte("deny\n"),
|
||||
0); err != nil && !os.IsNotExist(err) {
|
||||
log.Fatalf("%v", err)
|
||||
}
|
||||
if err := os.WriteFile("/proc/self/gid_map",
|
||||
append([]byte{}, strconv.Itoa(params.Gid)+" "+strconv.Itoa(params.HostGid)+" 1\n"...),
|
||||
0); err != nil {
|
||||
log.Fatalf("%v", err)
|
||||
}
|
||||
if err := SetDumpable(SUID_DUMP_DISABLE); err != nil {
|
||||
log.Fatalf("cannot set SUID_DUMP_DISABLE: %s", err)
|
||||
}
|
||||
|
||||
oldmask := Umask(0)
|
||||
if params.Hostname != "" {
|
||||
if err := Sethostname([]byte(params.Hostname)); err != nil {
|
||||
log.Fatalf("cannot set hostname: %v", err)
|
||||
}
|
||||
}
|
||||
|
||||
// cache sysctl before pivot_root
|
||||
LastCap()
|
||||
|
||||
if err := Mount("", "/", "", MS_SILENT|MS_SLAVE|MS_REC, ""); err != nil {
|
||||
log.Fatalf("cannot make / rslave: %v", err)
|
||||
}
|
||||
|
||||
for i, op := range *params.Ops {
|
||||
if op == nil {
|
||||
log.Fatalf("invalid op %d", i)
|
||||
}
|
||||
|
||||
if err := op.early(¶ms.Params); err != nil {
|
||||
msg.PrintBaseErr(err,
|
||||
fmt.Sprintf("cannot prepare op %d:", i))
|
||||
msg.BeforeExit()
|
||||
os.Exit(1)
|
||||
}
|
||||
}
|
||||
|
||||
if err := Mount("rootfs", basePath, "tmpfs", MS_NODEV|MS_NOSUID, ""); err != nil {
|
||||
log.Fatalf("cannot mount intermediate root: %v", err)
|
||||
}
|
||||
if err := os.Chdir(basePath); err != nil {
|
||||
log.Fatalf("cannot enter base path: %v", err)
|
||||
}
|
||||
|
||||
if err := os.Mkdir(sysrootDir, 0755); err != nil {
|
||||
log.Fatalf("%v", err)
|
||||
}
|
||||
if err := Mount(sysrootDir, sysrootDir, "", MS_SILENT|MS_MGC_VAL|MS_BIND|MS_REC, ""); err != nil {
|
||||
log.Fatalf("cannot bind sysroot: %v", err)
|
||||
}
|
||||
|
||||
if err := os.Mkdir(hostDir, 0755); err != nil {
|
||||
log.Fatalf("%v", err)
|
||||
}
|
||||
// pivot_root uncovers basePath in hostDir
|
||||
if err := PivotRoot(basePath, hostDir); err != nil {
|
||||
log.Fatalf("cannot pivot into intermediate root: %v", err)
|
||||
}
|
||||
if err := os.Chdir("/"); err != nil {
|
||||
log.Fatalf("%v", err)
|
||||
}
|
||||
|
||||
for i, op := range *params.Ops {
|
||||
// ops already checked during early setup
|
||||
msg.Verbosef("%s %s", op.prefix(), op)
|
||||
if err := op.apply(¶ms.Params); err != nil {
|
||||
msg.PrintBaseErr(err,
|
||||
fmt.Sprintf("cannot apply op %d:", i))
|
||||
msg.BeforeExit()
|
||||
os.Exit(1)
|
||||
}
|
||||
}
|
||||
|
||||
// setup requiring host root complete at this point
|
||||
if err := Mount(hostDir, hostDir, "", MS_SILENT|MS_REC|MS_PRIVATE, ""); err != nil {
|
||||
log.Fatalf("cannot make host root rprivate: %v", err)
|
||||
}
|
||||
if err := Unmount(hostDir, MNT_DETACH); err != nil {
|
||||
log.Fatalf("cannot unmount host root: %v", err)
|
||||
}
|
||||
|
||||
{
|
||||
var fd int
|
||||
if err := IgnoringEINTR(func() (err error) {
|
||||
fd, err = Open("/", O_DIRECTORY|O_RDONLY, 0)
|
||||
return
|
||||
}); err != nil {
|
||||
log.Fatalf("cannot open intermediate root: %v", err)
|
||||
}
|
||||
if err := os.Chdir(sysrootPath); err != nil {
|
||||
log.Fatalf("%v", err)
|
||||
}
|
||||
|
||||
if err := PivotRoot(".", "."); err != nil {
|
||||
log.Fatalf("cannot pivot into sysroot: %v", err)
|
||||
}
|
||||
if err := Fchdir(fd); err != nil {
|
||||
log.Fatalf("cannot re-enter intermediate root: %v", err)
|
||||
}
|
||||
if err := Unmount(".", MNT_DETACH); err != nil {
|
||||
log.Fatalf("cannot unmount intemediate root: %v", err)
|
||||
}
|
||||
if err := os.Chdir("/"); err != nil {
|
||||
log.Fatalf("%v", err)
|
||||
}
|
||||
|
||||
if err := Close(fd); err != nil {
|
||||
log.Fatalf("cannot close intermediate root: %v", err)
|
||||
}
|
||||
}
|
||||
|
||||
if _, _, errno := Syscall(PR_SET_NO_NEW_PRIVS, 1, 0, 0); errno != 0 {
|
||||
log.Fatalf("prctl(PR_SET_NO_NEW_PRIVS): %v", errno)
|
||||
}
|
||||
|
||||
if _, _, errno := Syscall(SYS_PRCTL, PR_CAP_AMBIENT, PR_CAP_AMBIENT_CLEAR_ALL, 0); errno != 0 {
|
||||
log.Fatalf("cannot clear the ambient capability set: %v", errno)
|
||||
}
|
||||
for i := uintptr(0); i <= LastCap(); i++ {
|
||||
if params.Privileged && i == CAP_SYS_ADMIN {
|
||||
continue
|
||||
}
|
||||
if _, _, errno := Syscall(SYS_PRCTL, PR_CAPBSET_DROP, i, 0); errno != 0 {
|
||||
log.Fatalf("cannot drop capability from bonding set: %v", errno)
|
||||
}
|
||||
}
|
||||
|
||||
var keep [2]uint32
|
||||
if params.Privileged {
|
||||
keep[capToIndex(CAP_SYS_ADMIN)] |= capToMask(CAP_SYS_ADMIN)
|
||||
|
||||
if _, _, errno := Syscall(SYS_PRCTL, PR_CAP_AMBIENT, PR_CAP_AMBIENT_RAISE, CAP_SYS_ADMIN); errno != 0 {
|
||||
log.Fatalf("cannot raise CAP_SYS_ADMIN: %v", errno)
|
||||
}
|
||||
}
|
||||
if err := capset(
|
||||
&capHeader{_LINUX_CAPABILITY_VERSION_3, 0},
|
||||
&[2]capData{{0, keep[0], keep[0]}, {0, keep[1], keep[1]}},
|
||||
); err != nil {
|
||||
log.Fatalf("cannot capset: %v", err)
|
||||
}
|
||||
|
||||
if !params.SeccompDisable {
|
||||
rules := params.SeccompRules
|
||||
if len(rules) == 0 { // non-empty rules slice always overrides presets
|
||||
msg.Verbosef("resolving presets %#x", params.SeccompPresets)
|
||||
rules = seccomp.Preset(params.SeccompPresets, params.SeccompFlags)
|
||||
}
|
||||
if err := seccomp.Load(rules, params.SeccompFlags); err != nil {
|
||||
log.Fatalf("cannot load syscall filter: %v", err)
|
||||
}
|
||||
msg.Verbosef("%d filter rules loaded", len(rules))
|
||||
} else {
|
||||
msg.Verbose("syscall filter not configured")
|
||||
}
|
||||
|
||||
extraFiles := make([]*os.File, params.Count)
|
||||
for i := range extraFiles {
|
||||
// setup fd is placed before all extra files
|
||||
extraFiles[i] = os.NewFile(uintptr(offsetSetup+i), "extra file "+strconv.Itoa(i))
|
||||
}
|
||||
Umask(oldmask)
|
||||
|
||||
cmd := exec.Command(params.Path)
|
||||
cmd.Stdin, cmd.Stdout, cmd.Stderr = os.Stdin, os.Stdout, os.Stderr
|
||||
cmd.Args = params.Args
|
||||
cmd.Env = params.Env
|
||||
cmd.ExtraFiles = extraFiles
|
||||
cmd.Dir = params.Dir
|
||||
|
||||
if err := cmd.Start(); err != nil {
|
||||
log.Fatalf("%v", err)
|
||||
}
|
||||
msg.Suspend()
|
||||
|
||||
if err := closeSetup(); err != nil {
|
||||
log.Println("cannot close setup pipe:", err)
|
||||
// not fatal
|
||||
}
|
||||
|
||||
type winfo struct {
|
||||
wpid int
|
||||
wstatus WaitStatus
|
||||
}
|
||||
info := make(chan winfo, 1)
|
||||
done := make(chan struct{})
|
||||
|
||||
go func() {
|
||||
var (
|
||||
err error
|
||||
wpid = -2
|
||||
wstatus WaitStatus
|
||||
)
|
||||
|
||||
// keep going until no child process is left
|
||||
for wpid != -1 {
|
||||
if err != nil {
|
||||
break
|
||||
}
|
||||
|
||||
if wpid != -2 {
|
||||
info <- winfo{wpid, wstatus}
|
||||
}
|
||||
|
||||
err = EINTR
|
||||
for errors.Is(err, EINTR) {
|
||||
wpid, err = Wait4(-1, &wstatus, 0, nil)
|
||||
}
|
||||
}
|
||||
if !errors.Is(err, ECHILD) {
|
||||
log.Println("unexpected wait4 response:", err)
|
||||
}
|
||||
|
||||
close(done)
|
||||
}()
|
||||
|
||||
// handle signals to dump withheld messages
|
||||
sig := make(chan os.Signal, 2)
|
||||
signal.Notify(sig, SIGINT, SIGTERM)
|
||||
|
||||
// closed after residualProcessTimeout has elapsed after initial process death
|
||||
timeout := make(chan struct{})
|
||||
|
||||
r := 2
|
||||
for {
|
||||
select {
|
||||
case s := <-sig:
|
||||
if msg.Resume() {
|
||||
msg.Verbosef("terminating on %s after process start", s.String())
|
||||
} else {
|
||||
msg.Verbosef("terminating on %s", s.String())
|
||||
}
|
||||
os.Exit(0)
|
||||
case w := <-info:
|
||||
if w.wpid == cmd.Process.Pid {
|
||||
// initial process exited, output is most likely available again
|
||||
msg.Resume()
|
||||
|
||||
switch {
|
||||
case w.wstatus.Exited():
|
||||
r = w.wstatus.ExitStatus()
|
||||
msg.Verbosef("initial process exited with code %d", w.wstatus.ExitStatus())
|
||||
case w.wstatus.Signaled():
|
||||
r = 128 + int(w.wstatus.Signal())
|
||||
msg.Verbosef("initial process exited with signal %s", w.wstatus.Signal())
|
||||
default:
|
||||
r = 255
|
||||
msg.Verbosef("initial process exited with status %#x", w.wstatus)
|
||||
}
|
||||
|
||||
go func() {
|
||||
time.Sleep(residualProcessTimeout)
|
||||
close(timeout)
|
||||
}()
|
||||
}
|
||||
case <-done:
|
||||
msg.BeforeExit()
|
||||
os.Exit(r)
|
||||
case <-timeout:
|
||||
log.Println("timeout exceeded waiting for lingering processes")
|
||||
msg.BeforeExit()
|
||||
os.Exit(r)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// TryArgv0 calls [Init] if the last element of argv0 is "init".
|
||||
func TryArgv0(v Msg, prepare func(prefix string), setVerbose func(verbose bool)) {
|
||||
if len(os.Args) > 0 && path.Base(os.Args[0]) == "init" {
|
||||
msg = v
|
||||
Init(prepare, setVerbose)
|
||||
msg.BeforeExit()
|
||||
os.Exit(0)
|
||||
}
|
||||
}
|
||||
123
container/mount.go
Normal file
123
container/mount.go
Normal file
@@ -0,0 +1,123 @@
|
||||
package container
|
||||
|
||||
import (
|
||||
"errors"
|
||||
"fmt"
|
||||
"os"
|
||||
"path/filepath"
|
||||
. "syscall"
|
||||
|
||||
"git.gensokyo.uk/security/hakurei/container/vfs"
|
||||
)
|
||||
|
||||
func (p *procPaths) bindMount(source, target string, flags uintptr, eq bool) error {
|
||||
if eq {
|
||||
msg.Verbosef("resolved %q flags %#x", target, flags)
|
||||
} else {
|
||||
msg.Verbosef("resolved %q on %q flags %#x", source, target, flags)
|
||||
}
|
||||
|
||||
if err := Mount(source, target, "", MS_SILENT|MS_BIND|flags&MS_REC, ""); err != nil {
|
||||
return wrapErrSuffix(err,
|
||||
fmt.Sprintf("cannot mount %q on %q:", source, target))
|
||||
}
|
||||
|
||||
var targetFinal string
|
||||
if v, err := filepath.EvalSymlinks(target); err != nil {
|
||||
return wrapErrSelf(err)
|
||||
} else {
|
||||
targetFinal = v
|
||||
if targetFinal != target {
|
||||
msg.Verbosef("target resolves to %q", targetFinal)
|
||||
}
|
||||
}
|
||||
|
||||
// final target path according to the kernel through proc
|
||||
var targetKFinal string
|
||||
{
|
||||
var destFd int
|
||||
if err := IgnoringEINTR(func() (err error) {
|
||||
destFd, err = Open(targetFinal, O_PATH|O_CLOEXEC, 0)
|
||||
return
|
||||
}); err != nil {
|
||||
return wrapErrSuffix(err,
|
||||
fmt.Sprintf("cannot open %q:", targetFinal))
|
||||
}
|
||||
if v, err := os.Readlink(p.fd(destFd)); err != nil {
|
||||
return wrapErrSelf(err)
|
||||
} else if err = Close(destFd); err != nil {
|
||||
return wrapErrSuffix(err,
|
||||
fmt.Sprintf("cannot close %q:", targetFinal))
|
||||
} else {
|
||||
targetKFinal = v
|
||||
}
|
||||
}
|
||||
|
||||
mf := MS_NOSUID | flags&MS_NODEV | flags&MS_RDONLY
|
||||
return hostProc.mountinfo(func(d *vfs.MountInfoDecoder) error {
|
||||
n, err := d.Unfold(targetKFinal)
|
||||
if err != nil {
|
||||
if errors.Is(err, ESTALE) {
|
||||
return msg.WrapErr(err,
|
||||
fmt.Sprintf("mount point %q never appeared in mountinfo", targetKFinal))
|
||||
}
|
||||
return wrapErrSuffix(err,
|
||||
"cannot unfold mount hierarchy:")
|
||||
}
|
||||
|
||||
if err = remountWithFlags(n, mf); err != nil {
|
||||
return err
|
||||
}
|
||||
if flags&MS_REC == 0 {
|
||||
return nil
|
||||
}
|
||||
|
||||
for cur := range n.Collective() {
|
||||
err = remountWithFlags(cur, mf)
|
||||
if err != nil && !errors.Is(err, EACCES) {
|
||||
return err
|
||||
}
|
||||
}
|
||||
|
||||
return nil
|
||||
})
|
||||
}
|
||||
|
||||
func remountWithFlags(n *vfs.MountInfoNode, mf uintptr) error {
|
||||
kf, unmatched := n.Flags()
|
||||
if len(unmatched) != 0 {
|
||||
msg.Verbosef("unmatched vfs options: %q", unmatched)
|
||||
}
|
||||
|
||||
if kf&mf != mf {
|
||||
return wrapErrSuffix(
|
||||
Mount("none", n.Clean, "", MS_SILENT|MS_BIND|MS_REMOUNT|kf|mf, ""),
|
||||
fmt.Sprintf("cannot remount %q:", n.Clean))
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
func mountTmpfs(fsname, name string, size int, perm os.FileMode) error {
|
||||
target := toSysroot(name)
|
||||
if err := os.MkdirAll(target, parentPerm(perm)); err != nil {
|
||||
return wrapErrSelf(err)
|
||||
}
|
||||
opt := fmt.Sprintf("mode=%#o", perm)
|
||||
if size > 0 {
|
||||
opt += fmt.Sprintf(",size=%d", size)
|
||||
}
|
||||
return wrapErrSuffix(
|
||||
Mount(fsname, target, "tmpfs", MS_NOSUID|MS_NODEV, opt),
|
||||
fmt.Sprintf("cannot mount tmpfs on %q:", name))
|
||||
}
|
||||
|
||||
func parentPerm(perm os.FileMode) os.FileMode {
|
||||
pperm := 0755
|
||||
if perm&0070 == 0 {
|
||||
pperm &= ^0050
|
||||
}
|
||||
if perm&0007 == 0 {
|
||||
pperm &= ^0005
|
||||
}
|
||||
return os.FileMode(pperm)
|
||||
}
|
||||
43
container/msg.go
Normal file
43
container/msg.go
Normal file
@@ -0,0 +1,43 @@
|
||||
package container
|
||||
|
||||
import (
|
||||
"log"
|
||||
"sync/atomic"
|
||||
)
|
||||
|
||||
type Msg interface {
|
||||
IsVerbose() bool
|
||||
Verbose(v ...any)
|
||||
Verbosef(format string, v ...any)
|
||||
WrapErr(err error, a ...any) error
|
||||
PrintBaseErr(err error, fallback string)
|
||||
|
||||
Suspend()
|
||||
Resume() bool
|
||||
|
||||
BeforeExit()
|
||||
}
|
||||
|
||||
type DefaultMsg struct{ inactive atomic.Bool }
|
||||
|
||||
func (msg *DefaultMsg) IsVerbose() bool { return true }
|
||||
func (msg *DefaultMsg) Verbose(v ...any) {
|
||||
if !msg.inactive.Load() {
|
||||
log.Println(v...)
|
||||
}
|
||||
}
|
||||
func (msg *DefaultMsg) Verbosef(format string, v ...any) {
|
||||
if !msg.inactive.Load() {
|
||||
log.Printf(format, v...)
|
||||
}
|
||||
}
|
||||
|
||||
func (msg *DefaultMsg) WrapErr(err error, a ...any) error {
|
||||
log.Println(a...)
|
||||
return err
|
||||
}
|
||||
func (msg *DefaultMsg) PrintBaseErr(err error, fallback string) { log.Println(fallback, err) }
|
||||
|
||||
func (msg *DefaultMsg) Suspend() { msg.inactive.Store(true) }
|
||||
func (msg *DefaultMsg) Resume() bool { return msg.inactive.CompareAndSwap(true, false) }
|
||||
func (msg *DefaultMsg) BeforeExit() {}
|
||||
482
container/ops.go
Normal file
482
container/ops.go
Normal file
@@ -0,0 +1,482 @@
|
||||
package container
|
||||
|
||||
import (
|
||||
"encoding/gob"
|
||||
"fmt"
|
||||
"math"
|
||||
"os"
|
||||
"path"
|
||||
"path/filepath"
|
||||
"slices"
|
||||
"strings"
|
||||
. "syscall"
|
||||
"unsafe"
|
||||
)
|
||||
|
||||
type (
|
||||
Ops []Op
|
||||
Op interface {
|
||||
// early is called in host root.
|
||||
early(params *Params) error
|
||||
// apply is called in intermediate root.
|
||||
apply(params *Params) error
|
||||
|
||||
prefix() string
|
||||
Is(op Op) bool
|
||||
fmt.Stringer
|
||||
}
|
||||
)
|
||||
|
||||
func (f *Ops) Grow(n int) { *f = slices.Grow(*f, n) }
|
||||
|
||||
func init() { gob.Register(new(BindMountOp)) }
|
||||
|
||||
// BindMountOp bind mounts host path Source on container path Target.
|
||||
type BindMountOp struct {
|
||||
Source, SourceFinal, Target string
|
||||
|
||||
Flags int
|
||||
}
|
||||
|
||||
const (
|
||||
BindOptional = 1 << iota
|
||||
BindWritable
|
||||
BindDevice
|
||||
)
|
||||
|
||||
func (b *BindMountOp) early(*Params) error {
|
||||
if !path.IsAbs(b.Source) {
|
||||
return msg.WrapErr(EBADE, fmt.Sprintf("path %q is not absolute", b.Source))
|
||||
}
|
||||
|
||||
if v, err := filepath.EvalSymlinks(b.Source); err != nil {
|
||||
if os.IsNotExist(err) && b.Flags&BindOptional != 0 {
|
||||
b.SourceFinal = "\x00"
|
||||
return nil
|
||||
}
|
||||
return wrapErrSelf(err)
|
||||
} else {
|
||||
b.SourceFinal = v
|
||||
return nil
|
||||
}
|
||||
}
|
||||
|
||||
func (b *BindMountOp) apply(*Params) error {
|
||||
if b.SourceFinal == "\x00" {
|
||||
if b.Flags&BindOptional == 0 {
|
||||
// unreachable
|
||||
return EBADE
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
if !path.IsAbs(b.SourceFinal) || !path.IsAbs(b.Target) {
|
||||
return msg.WrapErr(EBADE, "path is not absolute")
|
||||
}
|
||||
|
||||
source := toHost(b.SourceFinal)
|
||||
target := toSysroot(b.Target)
|
||||
|
||||
// this perm value emulates bwrap behaviour as it clears bits from 0755 based on
|
||||
// op->perms which is never set for any bind setup op so always results in 0700
|
||||
if fi, err := os.Stat(source); err != nil {
|
||||
return wrapErrSelf(err)
|
||||
} else if fi.IsDir() {
|
||||
if err = os.MkdirAll(target, 0700); err != nil {
|
||||
return wrapErrSelf(err)
|
||||
}
|
||||
} else if err = ensureFile(target, 0444, 0700); err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
var flags uintptr = MS_REC
|
||||
if b.Flags&BindWritable == 0 {
|
||||
flags |= MS_RDONLY
|
||||
}
|
||||
if b.Flags&BindDevice == 0 {
|
||||
flags |= MS_NODEV
|
||||
}
|
||||
|
||||
return hostProc.bindMount(source, target, flags, b.SourceFinal == b.Target)
|
||||
}
|
||||
|
||||
func (b *BindMountOp) Is(op Op) bool { vb, ok := op.(*BindMountOp); return ok && *b == *vb }
|
||||
func (*BindMountOp) prefix() string { return "mounting" }
|
||||
func (b *BindMountOp) String() string {
|
||||
if b.Source == b.Target {
|
||||
return fmt.Sprintf("%q flags %#x", b.Source, b.Flags)
|
||||
}
|
||||
return fmt.Sprintf("%q on %q flags %#x", b.Source, b.Target, b.Flags&BindWritable)
|
||||
}
|
||||
func (f *Ops) Bind(source, target string, flags int) *Ops {
|
||||
*f = append(*f, &BindMountOp{source, "", target, flags})
|
||||
return f
|
||||
}
|
||||
|
||||
func init() { gob.Register(new(MountProcOp)) }
|
||||
|
||||
// MountProcOp mounts a private instance of proc.
|
||||
type MountProcOp string
|
||||
|
||||
func (p MountProcOp) early(*Params) error { return nil }
|
||||
func (p MountProcOp) apply(params *Params) error {
|
||||
v := string(p)
|
||||
|
||||
if !path.IsAbs(v) {
|
||||
return msg.WrapErr(EBADE, fmt.Sprintf("path %q is not absolute", v))
|
||||
}
|
||||
|
||||
target := toSysroot(v)
|
||||
if err := os.MkdirAll(target, params.ParentPerm); err != nil {
|
||||
return wrapErrSelf(err)
|
||||
}
|
||||
return wrapErrSuffix(Mount("proc", target, "proc", MS_NOSUID|MS_NOEXEC|MS_NODEV, ""),
|
||||
fmt.Sprintf("cannot mount proc on %q:", v))
|
||||
}
|
||||
|
||||
func (p MountProcOp) Is(op Op) bool { vp, ok := op.(MountProcOp); return ok && p == vp }
|
||||
func (MountProcOp) prefix() string { return "mounting" }
|
||||
func (p MountProcOp) String() string { return fmt.Sprintf("proc on %q", string(p)) }
|
||||
func (f *Ops) Proc(dest string) *Ops {
|
||||
*f = append(*f, MountProcOp(dest))
|
||||
return f
|
||||
}
|
||||
|
||||
func init() { gob.Register(new(MountDevOp)) }
|
||||
|
||||
// MountDevOp mounts part of host dev.
|
||||
type MountDevOp string
|
||||
|
||||
func (d MountDevOp) early(*Params) error { return nil }
|
||||
func (d MountDevOp) apply(params *Params) error {
|
||||
v := string(d)
|
||||
|
||||
if !path.IsAbs(v) {
|
||||
return msg.WrapErr(EBADE, fmt.Sprintf("path %q is not absolute", v))
|
||||
}
|
||||
target := toSysroot(v)
|
||||
|
||||
if err := mountTmpfs("devtmpfs", v, 0, params.ParentPerm); err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
for _, name := range []string{"null", "zero", "full", "random", "urandom", "tty"} {
|
||||
targetPath := toSysroot(path.Join(v, name))
|
||||
if err := ensureFile(targetPath, 0444, params.ParentPerm); err != nil {
|
||||
return err
|
||||
}
|
||||
if err := hostProc.bindMount(
|
||||
toHost("/dev/"+name),
|
||||
targetPath,
|
||||
0,
|
||||
true,
|
||||
); err != nil {
|
||||
return err
|
||||
}
|
||||
}
|
||||
for i, name := range []string{"stdin", "stdout", "stderr"} {
|
||||
if err := os.Symlink(
|
||||
"/proc/self/fd/"+string(rune(i+'0')),
|
||||
path.Join(target, name),
|
||||
); err != nil {
|
||||
return wrapErrSelf(err)
|
||||
}
|
||||
}
|
||||
for _, pair := range [][2]string{
|
||||
{"/proc/self/fd", "fd"},
|
||||
{"/proc/kcore", "core"},
|
||||
{"pts/ptmx", "ptmx"},
|
||||
} {
|
||||
if err := os.Symlink(pair[0], path.Join(target, pair[1])); err != nil {
|
||||
return wrapErrSelf(err)
|
||||
}
|
||||
}
|
||||
|
||||
devPtsPath := path.Join(target, "pts")
|
||||
for _, name := range []string{path.Join(target, "shm"), devPtsPath} {
|
||||
if err := os.Mkdir(name, params.ParentPerm); err != nil {
|
||||
return wrapErrSelf(err)
|
||||
}
|
||||
}
|
||||
|
||||
if err := Mount("devpts", devPtsPath, "devpts", MS_NOSUID|MS_NOEXEC,
|
||||
"newinstance,ptmxmode=0666,mode=620"); err != nil {
|
||||
return wrapErrSuffix(err,
|
||||
fmt.Sprintf("cannot mount devpts on %q:", devPtsPath))
|
||||
}
|
||||
|
||||
if params.RetainSession {
|
||||
var buf [8]byte
|
||||
if _, _, errno := Syscall(SYS_IOCTL, 1, TIOCGWINSZ, uintptr(unsafe.Pointer(&buf[0]))); errno == 0 {
|
||||
consolePath := toSysroot(path.Join(v, "console"))
|
||||
if err := ensureFile(consolePath, 0444, params.ParentPerm); err != nil {
|
||||
return err
|
||||
}
|
||||
if name, err := os.Readlink(hostProc.stdout()); err != nil {
|
||||
return wrapErrSelf(err)
|
||||
} else if err = hostProc.bindMount(
|
||||
toHost(name),
|
||||
consolePath,
|
||||
0,
|
||||
false,
|
||||
); err != nil {
|
||||
return err
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
func (d MountDevOp) Is(op Op) bool { vd, ok := op.(MountDevOp); return ok && d == vd }
|
||||
func (MountDevOp) prefix() string { return "mounting" }
|
||||
func (d MountDevOp) String() string { return fmt.Sprintf("dev on %q", string(d)) }
|
||||
func (f *Ops) Dev(dest string) *Ops {
|
||||
*f = append(*f, MountDevOp(dest))
|
||||
return f
|
||||
}
|
||||
|
||||
func init() { gob.Register(new(MountMqueueOp)) }
|
||||
|
||||
// MountMqueueOp mounts a private mqueue instance on container Path.
|
||||
type MountMqueueOp string
|
||||
|
||||
func (m MountMqueueOp) early(*Params) error { return nil }
|
||||
func (m MountMqueueOp) apply(params *Params) error {
|
||||
v := string(m)
|
||||
|
||||
if !path.IsAbs(v) {
|
||||
return msg.WrapErr(EBADE, fmt.Sprintf("path %q is not absolute", v))
|
||||
}
|
||||
|
||||
target := toSysroot(v)
|
||||
if err := os.MkdirAll(target, params.ParentPerm); err != nil {
|
||||
return wrapErrSelf(err)
|
||||
}
|
||||
return wrapErrSuffix(Mount("mqueue", target, "mqueue", MS_NOSUID|MS_NOEXEC|MS_NODEV, ""),
|
||||
fmt.Sprintf("cannot mount mqueue on %q:", v))
|
||||
}
|
||||
|
||||
func (m MountMqueueOp) Is(op Op) bool { vm, ok := op.(MountMqueueOp); return ok && m == vm }
|
||||
func (MountMqueueOp) prefix() string { return "mounting" }
|
||||
func (m MountMqueueOp) String() string { return fmt.Sprintf("mqueue on %q", string(m)) }
|
||||
func (f *Ops) Mqueue(dest string) *Ops {
|
||||
*f = append(*f, MountMqueueOp(dest))
|
||||
return f
|
||||
}
|
||||
|
||||
func init() { gob.Register(new(MountTmpfsOp)) }
|
||||
|
||||
// MountTmpfsOp mounts tmpfs on container Path.
|
||||
type MountTmpfsOp struct {
|
||||
Path string
|
||||
Size int
|
||||
Perm os.FileMode
|
||||
}
|
||||
|
||||
func (t *MountTmpfsOp) early(*Params) error { return nil }
|
||||
func (t *MountTmpfsOp) apply(*Params) error {
|
||||
if !path.IsAbs(t.Path) {
|
||||
return msg.WrapErr(EBADE, fmt.Sprintf("path %q is not absolute", t.Path))
|
||||
}
|
||||
if t.Size < 0 || t.Size > math.MaxUint>>1 {
|
||||
return msg.WrapErr(EBADE, fmt.Sprintf("size %d out of bounds", t.Size))
|
||||
}
|
||||
return mountTmpfs("tmpfs", t.Path, t.Size, t.Perm)
|
||||
}
|
||||
|
||||
func (t *MountTmpfsOp) Is(op Op) bool { vt, ok := op.(*MountTmpfsOp); return ok && *t == *vt }
|
||||
func (*MountTmpfsOp) prefix() string { return "mounting" }
|
||||
func (t *MountTmpfsOp) String() string { return fmt.Sprintf("tmpfs on %q size %d", t.Path, t.Size) }
|
||||
func (f *Ops) Tmpfs(dest string, size int, perm os.FileMode) *Ops {
|
||||
*f = append(*f, &MountTmpfsOp{dest, size, perm})
|
||||
return f
|
||||
}
|
||||
|
||||
func init() { gob.Register(new(SymlinkOp)) }
|
||||
|
||||
// SymlinkOp creates a symlink in the container filesystem.
|
||||
type SymlinkOp [2]string
|
||||
|
||||
func (l *SymlinkOp) early(*Params) error {
|
||||
if strings.HasPrefix(l[0], "*") {
|
||||
l[0] = l[0][1:]
|
||||
if !path.IsAbs(l[0]) {
|
||||
return msg.WrapErr(EBADE, fmt.Sprintf("path %q is not absolute", l[0]))
|
||||
}
|
||||
if name, err := os.Readlink(l[0]); err != nil {
|
||||
return wrapErrSelf(err)
|
||||
} else {
|
||||
l[0] = name
|
||||
}
|
||||
}
|
||||
return nil
|
||||
}
|
||||
func (l *SymlinkOp) apply(params *Params) error {
|
||||
// symlink target is an arbitrary path value, so only validate link name here
|
||||
if !path.IsAbs(l[1]) {
|
||||
return msg.WrapErr(EBADE, fmt.Sprintf("path %q is not absolute", l[1]))
|
||||
}
|
||||
|
||||
target := toSysroot(l[1])
|
||||
if err := os.MkdirAll(path.Dir(target), params.ParentPerm); err != nil {
|
||||
return wrapErrSelf(err)
|
||||
}
|
||||
if err := os.Symlink(l[0], target); err != nil {
|
||||
return wrapErrSelf(err)
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
func (l *SymlinkOp) Is(op Op) bool { vl, ok := op.(*SymlinkOp); return ok && *l == *vl }
|
||||
func (*SymlinkOp) prefix() string { return "creating" }
|
||||
func (l *SymlinkOp) String() string { return fmt.Sprintf("symlink on %q target %q", l[1], l[0]) }
|
||||
func (f *Ops) Link(target, linkName string) *Ops {
|
||||
*f = append(*f, &SymlinkOp{target, linkName})
|
||||
return f
|
||||
}
|
||||
|
||||
func init() { gob.Register(new(MkdirOp)) }
|
||||
|
||||
// MkdirOp creates a directory in the container filesystem.
|
||||
type MkdirOp struct {
|
||||
Path string
|
||||
Perm os.FileMode
|
||||
}
|
||||
|
||||
func (m *MkdirOp) early(*Params) error { return nil }
|
||||
func (m *MkdirOp) apply(*Params) error {
|
||||
if !path.IsAbs(m.Path) {
|
||||
return msg.WrapErr(EBADE, fmt.Sprintf("path %q is not absolute", m.Path))
|
||||
}
|
||||
|
||||
if err := os.MkdirAll(toSysroot(m.Path), m.Perm); err != nil {
|
||||
return wrapErrSelf(err)
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
func (m *MkdirOp) Is(op Op) bool { vm, ok := op.(*MkdirOp); return ok && m == vm }
|
||||
func (*MkdirOp) prefix() string { return "creating" }
|
||||
func (m *MkdirOp) String() string { return fmt.Sprintf("directory %q perm %s", m.Path, m.Perm) }
|
||||
func (f *Ops) Mkdir(dest string, perm os.FileMode) *Ops {
|
||||
*f = append(*f, &MkdirOp{dest, perm})
|
||||
return f
|
||||
}
|
||||
|
||||
func init() { gob.Register(new(TmpfileOp)) }
|
||||
|
||||
// TmpfileOp places a file in container Path containing Data.
|
||||
type TmpfileOp struct {
|
||||
Path string
|
||||
Data []byte
|
||||
}
|
||||
|
||||
func (t *TmpfileOp) early(*Params) error { return nil }
|
||||
func (t *TmpfileOp) apply(params *Params) error {
|
||||
if !path.IsAbs(t.Path) {
|
||||
return msg.WrapErr(EBADE, fmt.Sprintf("path %q is not absolute", t.Path))
|
||||
}
|
||||
|
||||
var tmpPath string
|
||||
if f, err := os.CreateTemp("/", "tmp.*"); err != nil {
|
||||
return wrapErrSelf(err)
|
||||
} else if _, err = f.Write(t.Data); err != nil {
|
||||
return wrapErrSuffix(err,
|
||||
"cannot write to intermediate file:")
|
||||
} else if err = f.Close(); err != nil {
|
||||
return wrapErrSuffix(err,
|
||||
"cannot close intermediate file:")
|
||||
} else {
|
||||
tmpPath = f.Name()
|
||||
}
|
||||
|
||||
target := toSysroot(t.Path)
|
||||
if err := ensureFile(target, 0444, params.ParentPerm); err != nil {
|
||||
return err
|
||||
} else if err = hostProc.bindMount(
|
||||
tmpPath,
|
||||
target,
|
||||
MS_RDONLY|MS_NODEV,
|
||||
false,
|
||||
); err != nil {
|
||||
return err
|
||||
} else if err = os.Remove(tmpPath); err != nil {
|
||||
return wrapErrSelf(err)
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
func (t *TmpfileOp) Is(op Op) bool {
|
||||
vt, ok := op.(*TmpfileOp)
|
||||
return ok && t.Path == vt.Path && slices.Equal(t.Data, vt.Data)
|
||||
}
|
||||
func (*TmpfileOp) prefix() string { return "placing" }
|
||||
func (t *TmpfileOp) String() string {
|
||||
return fmt.Sprintf("tmpfile %q (%d bytes)", t.Path, len(t.Data))
|
||||
}
|
||||
func (f *Ops) Place(name string, data []byte) *Ops { *f = append(*f, &TmpfileOp{name, data}); return f }
|
||||
func (f *Ops) PlaceP(name string, dataP **[]byte) *Ops {
|
||||
t := &TmpfileOp{Path: name}
|
||||
*dataP = &t.Data
|
||||
|
||||
*f = append(*f, t)
|
||||
return f
|
||||
}
|
||||
|
||||
func init() { gob.Register(new(AutoEtcOp)) }
|
||||
|
||||
// AutoEtcOp expands host /etc into a toplevel symlink mirror with /etc semantics.
|
||||
// This is not a generic setup op. It is implemented here to reduce ipc overhead.
|
||||
type AutoEtcOp struct{ Prefix string }
|
||||
|
||||
func (e *AutoEtcOp) early(*Params) error { return nil }
|
||||
func (e *AutoEtcOp) apply(*Params) error {
|
||||
const target = sysrootPath + "/etc/"
|
||||
rel := e.hostRel() + "/"
|
||||
|
||||
if err := os.MkdirAll(target, 0755); err != nil {
|
||||
return wrapErrSelf(err)
|
||||
}
|
||||
if d, err := os.ReadDir(toSysroot(e.hostPath())); err != nil {
|
||||
return wrapErrSelf(err)
|
||||
} else {
|
||||
for _, ent := range d {
|
||||
n := ent.Name()
|
||||
switch n {
|
||||
case ".host":
|
||||
|
||||
case "passwd":
|
||||
case "group":
|
||||
|
||||
case "mtab":
|
||||
if err = os.Symlink("/proc/mounts", target+n); err != nil {
|
||||
return wrapErrSelf(err)
|
||||
}
|
||||
|
||||
default:
|
||||
if err = os.Symlink(rel+n, target+n); err != nil {
|
||||
return wrapErrSelf(err)
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
func (e *AutoEtcOp) hostPath() string { return "/etc/" + e.hostRel() }
|
||||
func (e *AutoEtcOp) hostRel() string { return ".host/" + e.Prefix }
|
||||
|
||||
func (e *AutoEtcOp) Is(op Op) bool {
|
||||
ve, ok := op.(*AutoEtcOp)
|
||||
return ok && ((e == nil && ve == nil) || (e != nil && ve != nil && *e == *ve))
|
||||
}
|
||||
func (*AutoEtcOp) prefix() string { return "setting up" }
|
||||
func (e *AutoEtcOp) String() string { return fmt.Sprintf("auto etc %s", e.Prefix) }
|
||||
func (f *Ops) Etc(host, prefix string) *Ops {
|
||||
e := &AutoEtcOp{prefix}
|
||||
f.Mkdir("/etc", 0755)
|
||||
f.Bind(host, e.hostPath(), 0)
|
||||
*f = append(*f, e)
|
||||
return f
|
||||
}
|
||||
26
container/output.go
Normal file
26
container/output.go
Normal file
@@ -0,0 +1,26 @@
|
||||
package container
|
||||
|
||||
var msg Msg = new(DefaultMsg)
|
||||
|
||||
func GetOutput() Msg { return msg }
|
||||
func SetOutput(v Msg) {
|
||||
if v == nil {
|
||||
msg = new(DefaultMsg)
|
||||
} else {
|
||||
msg = v
|
||||
}
|
||||
}
|
||||
|
||||
func wrapErrSuffix(err error, a ...any) error {
|
||||
if err == nil {
|
||||
return nil
|
||||
}
|
||||
return msg.WrapErr(err, append(a, err)...)
|
||||
}
|
||||
|
||||
func wrapErrSelf(err error) error {
|
||||
if err == nil {
|
||||
return nil
|
||||
}
|
||||
return msg.WrapErr(err, err.Error())
|
||||
}
|
||||
47
container/params.go
Normal file
47
container/params.go
Normal file
@@ -0,0 +1,47 @@
|
||||
package container
|
||||
|
||||
import (
|
||||
"encoding/gob"
|
||||
"errors"
|
||||
"os"
|
||||
"strconv"
|
||||
)
|
||||
|
||||
var (
|
||||
ErrNotSet = errors.New("environment variable not set")
|
||||
ErrInvalid = errors.New("bad file descriptor")
|
||||
)
|
||||
|
||||
// Setup appends the read end of a pipe for setup params transmission and returns its fd.
|
||||
func Setup(extraFiles *[]*os.File) (int, *gob.Encoder, error) {
|
||||
if r, w, err := os.Pipe(); err != nil {
|
||||
return -1, nil, err
|
||||
} else {
|
||||
fd := 3 + len(*extraFiles)
|
||||
*extraFiles = append(*extraFiles, r)
|
||||
return fd, gob.NewEncoder(w), nil
|
||||
}
|
||||
}
|
||||
|
||||
// Receive retrieves setup fd from the environment and receives params.
|
||||
func Receive(key string, e any, v **os.File) (func() error, error) {
|
||||
var setup *os.File
|
||||
|
||||
if s, ok := os.LookupEnv(key); !ok {
|
||||
return nil, ErrNotSet
|
||||
} else {
|
||||
if fd, err := strconv.Atoi(s); err != nil {
|
||||
return nil, err
|
||||
} else {
|
||||
setup = os.NewFile(uintptr(fd), "setup")
|
||||
if setup == nil {
|
||||
return nil, ErrInvalid
|
||||
}
|
||||
if v != nil {
|
||||
*v = setup
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return setup.Close, gob.NewDecoder(setup).Decode(e)
|
||||
}
|
||||
94
container/path.go
Normal file
94
container/path.go
Normal file
@@ -0,0 +1,94 @@
|
||||
package container
|
||||
|
||||
import (
|
||||
"errors"
|
||||
"fmt"
|
||||
"io/fs"
|
||||
"os"
|
||||
"path"
|
||||
"strconv"
|
||||
"strings"
|
||||
"syscall"
|
||||
|
||||
"git.gensokyo.uk/security/hakurei/container/vfs"
|
||||
)
|
||||
|
||||
const (
|
||||
hostPath = "/" + hostDir
|
||||
hostDir = "host"
|
||||
sysrootPath = "/" + sysrootDir
|
||||
sysrootDir = "sysroot"
|
||||
)
|
||||
|
||||
func toSysroot(name string) string {
|
||||
name = strings.TrimLeftFunc(name, func(r rune) bool { return r == '/' })
|
||||
return path.Join(sysrootPath, name)
|
||||
}
|
||||
|
||||
func toHost(name string) string {
|
||||
name = strings.TrimLeftFunc(name, func(r rune) bool { return r == '/' })
|
||||
return path.Join(hostPath, name)
|
||||
}
|
||||
|
||||
func createFile(name string, perm, pperm os.FileMode, content []byte) error {
|
||||
if err := os.MkdirAll(path.Dir(name), pperm); err != nil {
|
||||
return wrapErrSelf(err)
|
||||
}
|
||||
f, err := os.OpenFile(name, syscall.O_CREAT|syscall.O_EXCL|syscall.O_WRONLY, perm)
|
||||
if err != nil {
|
||||
return wrapErrSelf(err)
|
||||
}
|
||||
if content != nil {
|
||||
_, err = f.Write(content)
|
||||
if err != nil {
|
||||
err = wrapErrSelf(err)
|
||||
}
|
||||
}
|
||||
return errors.Join(f.Close(), err)
|
||||
}
|
||||
|
||||
func ensureFile(name string, perm, pperm os.FileMode) error {
|
||||
fi, err := os.Stat(name)
|
||||
if err != nil {
|
||||
if !os.IsNotExist(err) {
|
||||
return err
|
||||
}
|
||||
return createFile(name, perm, pperm, nil)
|
||||
}
|
||||
|
||||
if mode := fi.Mode(); mode&fs.ModeDir != 0 || mode&fs.ModeSymlink != 0 {
|
||||
err = msg.WrapErr(syscall.EISDIR,
|
||||
fmt.Sprintf("path %q is a directory", name))
|
||||
}
|
||||
return err
|
||||
}
|
||||
|
||||
var hostProc = newProcPats(hostPath)
|
||||
|
||||
func newProcPats(prefix string) *procPaths {
|
||||
return &procPaths{prefix + "/proc", prefix + "/proc/self"}
|
||||
}
|
||||
|
||||
type procPaths struct {
|
||||
prefix string
|
||||
self string
|
||||
}
|
||||
|
||||
func (p *procPaths) stdout() string { return p.self + "/fd/1" }
|
||||
func (p *procPaths) fd(fd int) string { return p.self + "/fd/" + strconv.Itoa(fd) }
|
||||
func (p *procPaths) mountinfo(f func(d *vfs.MountInfoDecoder) error) error {
|
||||
if r, err := os.Open(p.self + "/mountinfo"); err != nil {
|
||||
return wrapErrSelf(err)
|
||||
} else {
|
||||
d := vfs.NewMountInfoDecoder(r)
|
||||
err0 := f(d)
|
||||
if err = r.Close(); err != nil {
|
||||
return wrapErrSuffix(err,
|
||||
"cannot close mountinfo:")
|
||||
} else if err = d.Err(); err != nil {
|
||||
return wrapErrSuffix(err,
|
||||
"cannot parse mountinfo:")
|
||||
}
|
||||
return err0
|
||||
}
|
||||
}
|
||||
130
container/seccomp/libseccomp-helper.c
Normal file
130
container/seccomp/libseccomp-helper.c
Normal file
@@ -0,0 +1,130 @@
|
||||
#ifndef _GNU_SOURCE
|
||||
#define _GNU_SOURCE /* CLONE_NEWUSER */
|
||||
#endif
|
||||
|
||||
#include "libseccomp-helper.h"
|
||||
#include <assert.h>
|
||||
#include <errno.h>
|
||||
#include <sys/socket.h>
|
||||
|
||||
#define LEN(arr) (sizeof(arr) / sizeof((arr)[0]))
|
||||
|
||||
int32_t hakurei_export_filter(int *ret_p, int fd, uint32_t arch,
|
||||
uint32_t multiarch,
|
||||
struct hakurei_syscall_rule *rules,
|
||||
size_t rules_sz, hakurei_export_flag flags) {
|
||||
int i;
|
||||
int last_allowed_family;
|
||||
int disallowed;
|
||||
struct hakurei_syscall_rule *rule;
|
||||
|
||||
int32_t res = 0; /* refer to resPrefix for message */
|
||||
|
||||
/* Blocklist all but unix, inet, inet6 and netlink */
|
||||
struct {
|
||||
int family;
|
||||
hakurei_export_flag flags_mask;
|
||||
} socket_family_allowlist[] = {
|
||||
/* NOTE: Keep in numerical order */
|
||||
{AF_UNSPEC, 0},
|
||||
{AF_LOCAL, 0},
|
||||
{AF_INET, 0},
|
||||
{AF_INET6, 0},
|
||||
{AF_NETLINK, 0},
|
||||
{AF_CAN, HAKUREI_EXPORT_CAN},
|
||||
{AF_BLUETOOTH, HAKUREI_EXPORT_BLUETOOTH},
|
||||
};
|
||||
|
||||
scmp_filter_ctx ctx = seccomp_init(SCMP_ACT_ALLOW);
|
||||
if (ctx == NULL) {
|
||||
res = 1;
|
||||
goto out;
|
||||
} else
|
||||
errno = 0;
|
||||
|
||||
/* We only really need to handle arches on multiarch systems.
|
||||
* If only one arch is supported the default is fine */
|
||||
if (arch != 0) {
|
||||
/* This *adds* the target arch, instead of replacing the
|
||||
* native one. This is not ideal, because we'd like to only
|
||||
* allow the target arch, but we can't really disallow the
|
||||
* native arch at this point, because then bubblewrap
|
||||
* couldn't continue running. */
|
||||
*ret_p = seccomp_arch_add(ctx, arch);
|
||||
if (*ret_p < 0 && *ret_p != -EEXIST) {
|
||||
res = 2;
|
||||
goto out;
|
||||
}
|
||||
|
||||
if (flags & HAKUREI_EXPORT_MULTIARCH && multiarch != 0) {
|
||||
*ret_p = seccomp_arch_add(ctx, multiarch);
|
||||
if (*ret_p < 0 && *ret_p != -EEXIST) {
|
||||
res = 3;
|
||||
goto out;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
for (i = 0; i < rules_sz; i++) {
|
||||
rule = &rules[i];
|
||||
assert(rule->m_errno == EPERM || rule->m_errno == ENOSYS);
|
||||
|
||||
if (rule->arg)
|
||||
*ret_p = seccomp_rule_add(ctx, SCMP_ACT_ERRNO(rule->m_errno),
|
||||
rule->syscall, 1, *rule->arg);
|
||||
else
|
||||
*ret_p = seccomp_rule_add(ctx, SCMP_ACT_ERRNO(rule->m_errno),
|
||||
rule->syscall, 0);
|
||||
|
||||
if (*ret_p == -EFAULT) {
|
||||
res = 4;
|
||||
goto out;
|
||||
} else if (*ret_p < 0) {
|
||||
res = 5;
|
||||
goto out;
|
||||
}
|
||||
}
|
||||
|
||||
/* Socket filtering doesn't work on e.g. i386, so ignore failures here
|
||||
* However, we need to user seccomp_rule_add_exact to avoid libseccomp doing
|
||||
* something else: https://github.com/seccomp/libseccomp/issues/8 */
|
||||
last_allowed_family = -1;
|
||||
for (i = 0; i < LEN(socket_family_allowlist); i++) {
|
||||
if (socket_family_allowlist[i].flags_mask != 0 &&
|
||||
(socket_family_allowlist[i].flags_mask & flags) !=
|
||||
socket_family_allowlist[i].flags_mask)
|
||||
continue;
|
||||
|
||||
for (disallowed = last_allowed_family + 1;
|
||||
disallowed < socket_family_allowlist[i].family; disallowed++) {
|
||||
/* Blocklist the in-between valid families */
|
||||
seccomp_rule_add_exact(ctx, SCMP_ACT_ERRNO(EAFNOSUPPORT),
|
||||
SCMP_SYS(socket), 1,
|
||||
SCMP_A0(SCMP_CMP_EQ, disallowed));
|
||||
}
|
||||
last_allowed_family = socket_family_allowlist[i].family;
|
||||
}
|
||||
/* Blocklist the rest */
|
||||
seccomp_rule_add_exact(ctx, SCMP_ACT_ERRNO(EAFNOSUPPORT), SCMP_SYS(socket), 1,
|
||||
SCMP_A0(SCMP_CMP_GE, last_allowed_family + 1));
|
||||
|
||||
if (fd < 0) {
|
||||
*ret_p = seccomp_load(ctx);
|
||||
if (*ret_p != 0) {
|
||||
res = 7;
|
||||
goto out;
|
||||
}
|
||||
} else {
|
||||
*ret_p = seccomp_export_bpf(ctx, fd);
|
||||
if (*ret_p != 0) {
|
||||
res = 6;
|
||||
goto out;
|
||||
}
|
||||
}
|
||||
|
||||
out:
|
||||
if (ctx)
|
||||
seccomp_release(ctx);
|
||||
|
||||
return res;
|
||||
}
|
||||
24
container/seccomp/libseccomp-helper.h
Normal file
24
container/seccomp/libseccomp-helper.h
Normal file
@@ -0,0 +1,24 @@
|
||||
#include <seccomp.h>
|
||||
#include <stdint.h>
|
||||
|
||||
#if (SCMP_VER_MAJOR < 2) || (SCMP_VER_MAJOR == 2 && SCMP_VER_MINOR < 5) || \
|
||||
(SCMP_VER_MAJOR == 2 && SCMP_VER_MINOR == 5 && SCMP_VER_MICRO < 1)
|
||||
#error This package requires libseccomp >= v2.5.1
|
||||
#endif
|
||||
|
||||
typedef enum {
|
||||
HAKUREI_EXPORT_MULTIARCH = 1 << 0,
|
||||
HAKUREI_EXPORT_CAN = 1 << 1,
|
||||
HAKUREI_EXPORT_BLUETOOTH = 1 << 2,
|
||||
} hakurei_export_flag;
|
||||
|
||||
struct hakurei_syscall_rule {
|
||||
int syscall;
|
||||
int m_errno;
|
||||
struct scmp_arg_cmp *arg;
|
||||
};
|
||||
|
||||
int32_t hakurei_export_filter(int *ret_p, int fd, uint32_t arch,
|
||||
uint32_t multiarch,
|
||||
struct hakurei_syscall_rule *rules,
|
||||
size_t rules_sz, hakurei_export_flag flags);
|
||||
188
container/seccomp/libseccomp.go
Normal file
188
container/seccomp/libseccomp.go
Normal file
@@ -0,0 +1,188 @@
|
||||
package seccomp
|
||||
|
||||
/*
|
||||
#cgo linux pkg-config: --static libseccomp
|
||||
|
||||
#include <libseccomp-helper.h>
|
||||
*/
|
||||
import "C"
|
||||
import (
|
||||
"errors"
|
||||
"fmt"
|
||||
"runtime"
|
||||
"syscall"
|
||||
"unsafe"
|
||||
)
|
||||
|
||||
var (
|
||||
ErrInvalidRules = errors.New("invalid native rules slice")
|
||||
)
|
||||
|
||||
// LibraryError represents a libseccomp error.
|
||||
type LibraryError struct {
|
||||
Prefix string
|
||||
Seccomp syscall.Errno
|
||||
Errno error
|
||||
}
|
||||
|
||||
func (e *LibraryError) Error() string {
|
||||
if e.Seccomp == 0 {
|
||||
if e.Errno == nil {
|
||||
panic("invalid libseccomp error")
|
||||
}
|
||||
return fmt.Sprintf("%s: %s", e.Prefix, e.Errno)
|
||||
}
|
||||
if e.Errno == nil {
|
||||
return fmt.Sprintf("%s: %s", e.Prefix, e.Seccomp)
|
||||
}
|
||||
return fmt.Sprintf("%s: %s (%s)", e.Prefix, e.Seccomp, e.Errno)
|
||||
}
|
||||
|
||||
func (e *LibraryError) Is(err error) bool {
|
||||
if e == nil {
|
||||
return err == nil
|
||||
}
|
||||
if ef, ok := err.(*LibraryError); ok {
|
||||
return *e == *ef
|
||||
}
|
||||
return (e.Seccomp != 0 && errors.Is(err, e.Seccomp)) ||
|
||||
(e.Errno != nil && errors.Is(err, e.Errno))
|
||||
}
|
||||
|
||||
type (
|
||||
ScmpSyscall = C.int
|
||||
ScmpErrno = C.int
|
||||
)
|
||||
|
||||
// A NativeRule specifies an arch-specific action taken by seccomp under certain conditions.
|
||||
type NativeRule struct {
|
||||
// Syscall is the arch-dependent syscall number to act against.
|
||||
Syscall ScmpSyscall
|
||||
// Errno is the errno value to return when the condition is satisfied.
|
||||
Errno ScmpErrno
|
||||
// Arg is the optional struct scmp_arg_cmp passed to libseccomp.
|
||||
Arg *ScmpArgCmp
|
||||
}
|
||||
|
||||
type ExportFlag = C.hakurei_export_flag
|
||||
|
||||
const (
|
||||
// AllowMultiarch allows multiarch/emulation.
|
||||
AllowMultiarch ExportFlag = C.HAKUREI_EXPORT_MULTIARCH
|
||||
// AllowCAN allows AF_CAN.
|
||||
AllowCAN ExportFlag = C.HAKUREI_EXPORT_CAN
|
||||
// AllowBluetooth allows AF_BLUETOOTH.
|
||||
AllowBluetooth ExportFlag = C.HAKUREI_EXPORT_BLUETOOTH
|
||||
)
|
||||
|
||||
var resPrefix = [...]string{
|
||||
0: "",
|
||||
1: "seccomp_init failed",
|
||||
2: "seccomp_arch_add failed",
|
||||
3: "seccomp_arch_add failed (multiarch)",
|
||||
4: "internal libseccomp failure",
|
||||
5: "seccomp_rule_add failed",
|
||||
6: "seccomp_export_bpf failed",
|
||||
7: "seccomp_load failed",
|
||||
}
|
||||
|
||||
// Export streams filter contents to fd, or installs it to the current process if fd < 0.
|
||||
func Export(fd int, rules []NativeRule, flags ExportFlag) error {
|
||||
if len(rules) == 0 {
|
||||
return ErrInvalidRules
|
||||
}
|
||||
|
||||
var (
|
||||
arch C.uint32_t = 0
|
||||
multiarch C.uint32_t = 0
|
||||
)
|
||||
switch runtime.GOARCH {
|
||||
case "386":
|
||||
arch = C.SCMP_ARCH_X86
|
||||
case "amd64":
|
||||
arch = C.SCMP_ARCH_X86_64
|
||||
multiarch = C.SCMP_ARCH_X86
|
||||
case "arm":
|
||||
arch = C.SCMP_ARCH_ARM
|
||||
case "arm64":
|
||||
arch = C.SCMP_ARCH_AARCH64
|
||||
multiarch = C.SCMP_ARCH_ARM
|
||||
}
|
||||
|
||||
var ret C.int
|
||||
|
||||
rulesPinner := new(runtime.Pinner)
|
||||
for i := range rules {
|
||||
rule := &rules[i]
|
||||
rulesPinner.Pin(rule)
|
||||
if rule.Arg != nil {
|
||||
rulesPinner.Pin(rule.Arg)
|
||||
}
|
||||
}
|
||||
res, err := C.hakurei_export_filter(
|
||||
&ret, C.int(fd),
|
||||
arch, multiarch,
|
||||
(*C.struct_hakurei_syscall_rule)(unsafe.Pointer(&rules[0])),
|
||||
C.size_t(len(rules)),
|
||||
flags,
|
||||
)
|
||||
rulesPinner.Unpin()
|
||||
|
||||
if prefix := resPrefix[res]; prefix != "" {
|
||||
return &LibraryError{
|
||||
prefix,
|
||||
-syscall.Errno(ret),
|
||||
err,
|
||||
}
|
||||
}
|
||||
return err
|
||||
}
|
||||
|
||||
// ScmpCompare is the equivalent of scmp_compare;
|
||||
// Comparison operators
|
||||
type ScmpCompare = C.enum_scmp_compare
|
||||
|
||||
const (
|
||||
_SCMP_CMP_MIN = C._SCMP_CMP_MIN
|
||||
|
||||
// not equal
|
||||
SCMP_CMP_NE = C.SCMP_CMP_NE
|
||||
// less than
|
||||
SCMP_CMP_LT = C.SCMP_CMP_LT
|
||||
// less than or equal
|
||||
SCMP_CMP_LE = C.SCMP_CMP_LE
|
||||
// equal
|
||||
SCMP_CMP_EQ = C.SCMP_CMP_EQ
|
||||
// greater than or equal
|
||||
SCMP_CMP_GE = C.SCMP_CMP_GE
|
||||
// greater than
|
||||
SCMP_CMP_GT = C.SCMP_CMP_GT
|
||||
// masked equality
|
||||
SCMP_CMP_MASKED_EQ = C.SCMP_CMP_MASKED_EQ
|
||||
|
||||
_SCMP_CMP_MAX = C._SCMP_CMP_MAX
|
||||
)
|
||||
|
||||
// ScmpDatum is the equivalent of scmp_datum_t;
|
||||
// Argument datum
|
||||
type ScmpDatum uint64
|
||||
|
||||
// ScmpArgCmp is the equivalent of struct scmp_arg_cmp;
|
||||
// Argument / Value comparison definition
|
||||
type ScmpArgCmp struct {
|
||||
// argument number, starting at 0
|
||||
Arg C.uint
|
||||
// the comparison op, e.g. SCMP_CMP_*
|
||||
Op ScmpCompare
|
||||
|
||||
DatumA, DatumB ScmpDatum
|
||||
}
|
||||
|
||||
// only used for testing
|
||||
func syscallResolveName(s string) (trap int) {
|
||||
v := C.CString(s)
|
||||
trap = int(C.seccomp_syscall_resolve_name(v))
|
||||
C.free(unsafe.Pointer(v))
|
||||
|
||||
return
|
||||
}
|
||||
147
container/seccomp/libseccomp_test.go
Normal file
147
container/seccomp/libseccomp_test.go
Normal file
@@ -0,0 +1,147 @@
|
||||
package seccomp_test
|
||||
|
||||
import (
|
||||
"crypto/sha512"
|
||||
"errors"
|
||||
"io"
|
||||
"slices"
|
||||
"syscall"
|
||||
"testing"
|
||||
|
||||
. "git.gensokyo.uk/security/hakurei/container/seccomp"
|
||||
)
|
||||
|
||||
func TestExport(t *testing.T) {
|
||||
testCases := []struct {
|
||||
name string
|
||||
presets FilterPreset
|
||||
flags ExportFlag
|
||||
want []byte
|
||||
wantErr bool
|
||||
}{
|
||||
{"compat", 0, 0, []byte{
|
||||
0x95, 0xec, 0x69, 0xd0, 0x17, 0x73, 0x3e, 0x07,
|
||||
0x21, 0x60, 0xe0, 0xda, 0x80, 0xfd, 0xeb, 0xec,
|
||||
0xdf, 0x27, 0xae, 0x81, 0x66, 0xf5, 0xe2, 0xa7,
|
||||
0x31, 0x27, 0x0c, 0x98, 0xea, 0x2d, 0x29, 0x46,
|
||||
0xcb, 0x52, 0x31, 0x02, 0x90, 0x63, 0x66, 0x8a,
|
||||
0xf2, 0x15, 0x87, 0x91, 0x55, 0xda, 0x21, 0xac,
|
||||
0xa7, 0x9b, 0x07, 0x0e, 0x04, 0xc0, 0xee, 0x9a,
|
||||
0xcd, 0xf5, 0x8f, 0x55, 0xcf, 0xa8, 0x15, 0xa5,
|
||||
}, false},
|
||||
{"base", PresetExt, 0, []byte{
|
||||
0xdc, 0x7f, 0x2e, 0x1c, 0x5e, 0x82, 0x9b, 0x79,
|
||||
0xeb, 0xb7, 0xef, 0xc7, 0x59, 0x15, 0x0f, 0x54,
|
||||
0xa8, 0x3a, 0x75, 0xc8, 0xdf, 0x6f, 0xee, 0x4d,
|
||||
0xce, 0x5d, 0xad, 0xc4, 0x73, 0x6c, 0x58, 0x5d,
|
||||
0x4d, 0xee, 0xbf, 0xeb, 0x3c, 0x79, 0x69, 0xaf,
|
||||
0x3a, 0x07, 0x7e, 0x90, 0xb7, 0x7b, 0xb4, 0x74,
|
||||
0x1d, 0xb0, 0x5d, 0x90, 0x99, 0x7c, 0x86, 0x59,
|
||||
0xb9, 0x58, 0x91, 0x20, 0x6a, 0xc9, 0x95, 0x2d,
|
||||
}, false},
|
||||
{"everything", PresetExt |
|
||||
PresetDenyNS | PresetDenyTTY | PresetDenyDevel |
|
||||
PresetLinux32, AllowMultiarch | AllowCAN |
|
||||
AllowBluetooth, []byte{
|
||||
0xe9, 0x9d, 0xd3, 0x45, 0xe1, 0x95, 0x41, 0x34,
|
||||
0x73, 0xd3, 0xcb, 0xee, 0x07, 0xb4, 0xed, 0x57,
|
||||
0xb9, 0x08, 0xbf, 0xa8, 0x9e, 0xa2, 0x07, 0x2f,
|
||||
0xe9, 0x34, 0x82, 0x84, 0x7f, 0x50, 0xb5, 0xb7,
|
||||
0x58, 0xda, 0x17, 0xe7, 0x4c, 0xa2, 0xbb, 0xc0,
|
||||
0x08, 0x13, 0xde, 0x49, 0xa2, 0xb9, 0xbf, 0x83,
|
||||
0x4c, 0x02, 0x4e, 0xd4, 0x88, 0x50, 0xbe, 0x69,
|
||||
0xb6, 0x8a, 0x9a, 0x4c, 0x5f, 0x53, 0xa9, 0xdb,
|
||||
}, false},
|
||||
{"strict", PresetStrict, 0, []byte{
|
||||
0xe8, 0x80, 0x29, 0x8d, 0xf2, 0xbd, 0x67, 0x51,
|
||||
0xd0, 0x04, 0x0f, 0xc2, 0x1b, 0xc0, 0xed, 0x4c,
|
||||
0x00, 0xf9, 0x5d, 0xc0, 0xd7, 0xba, 0x50, 0x6c,
|
||||
0x24, 0x4d, 0x8b, 0x8c, 0xf6, 0x86, 0x6d, 0xba,
|
||||
0x8e, 0xf4, 0xa3, 0x32, 0x96, 0xf2, 0x87, 0xb6,
|
||||
0x6c, 0xcc, 0xc1, 0xd7, 0x8e, 0x97, 0x02, 0x65,
|
||||
0x97, 0xf8, 0x4c, 0xc7, 0xde, 0xc1, 0x57, 0x3e,
|
||||
0x14, 0x89, 0x60, 0xfb, 0xd3, 0x5c, 0xd7, 0x35,
|
||||
}, false},
|
||||
{"strict compat", 0 |
|
||||
PresetDenyNS | PresetDenyTTY | PresetDenyDevel, 0, []byte{
|
||||
0x39, 0x87, 0x1b, 0x93, 0xff, 0xaf, 0xc8, 0xb9,
|
||||
0x79, 0xfc, 0xed, 0xc0, 0xb0, 0xc3, 0x7b, 0x9e,
|
||||
0x03, 0x92, 0x2f, 0x5b, 0x02, 0x74, 0x8d, 0xc5,
|
||||
0xc3, 0xc1, 0x7c, 0x92, 0x52, 0x7f, 0x6e, 0x02,
|
||||
0x2e, 0xde, 0x1f, 0x48, 0xbf, 0xf5, 0x92, 0x46,
|
||||
0xea, 0x45, 0x2c, 0x0d, 0x1d, 0xe5, 0x48, 0x27,
|
||||
0x80, 0x8b, 0x1a, 0x6f, 0x84, 0xf3, 0x2b, 0xbd,
|
||||
0xe1, 0xaa, 0x02, 0xae, 0x30, 0xee, 0xdc, 0xfa,
|
||||
}, false},
|
||||
{"hakurei default", PresetExt | PresetDenyDevel, 0, []byte{
|
||||
0xc6, 0x98, 0xb0, 0x81, 0xff, 0x95, 0x7a, 0xfe,
|
||||
0x17, 0xa6, 0xd9, 0x43, 0x74, 0x53, 0x7d, 0x37,
|
||||
0xf2, 0xa6, 0x3f, 0x6f, 0x9d, 0xd7, 0x5d, 0xa7,
|
||||
0x54, 0x65, 0x42, 0x40, 0x7a, 0x9e, 0x32, 0x47,
|
||||
0x6e, 0xbd, 0xa3, 0x31, 0x2b, 0xa7, 0x78, 0x5d,
|
||||
0x7f, 0x61, 0x85, 0x42, 0xbc, 0xfa, 0xf2, 0x7c,
|
||||
0xa2, 0x7d, 0xcc, 0x2d, 0xdd, 0xba, 0x85, 0x20,
|
||||
0x69, 0xd2, 0x8b, 0xcf, 0xe8, 0xca, 0xd3, 0x9a,
|
||||
}, false},
|
||||
}
|
||||
|
||||
buf := make([]byte, 8)
|
||||
for _, tc := range testCases {
|
||||
t.Run(tc.name, func(t *testing.T) {
|
||||
e := New(Preset(tc.presets, tc.flags), tc.flags)
|
||||
digest := sha512.New()
|
||||
|
||||
if _, err := io.CopyBuffer(digest, e, buf); (err != nil) != tc.wantErr {
|
||||
t.Errorf("Exporter: error = %v, wantErr %v", err, tc.wantErr)
|
||||
return
|
||||
}
|
||||
if err := e.Close(); err != nil {
|
||||
t.Errorf("Close: error = %v", err)
|
||||
}
|
||||
if got := digest.Sum(nil); !slices.Equal(got, tc.want) {
|
||||
t.Fatalf("Export() hash = %x, want %x",
|
||||
got, tc.want)
|
||||
return
|
||||
}
|
||||
})
|
||||
}
|
||||
|
||||
t.Run("close without use", func(t *testing.T) {
|
||||
e := New(Preset(0, 0), 0)
|
||||
if err := e.Close(); !errors.Is(err, syscall.EINVAL) {
|
||||
t.Errorf("Close: error = %v", err)
|
||||
return
|
||||
}
|
||||
})
|
||||
|
||||
t.Run("close partial read", func(t *testing.T) {
|
||||
e := New(Preset(0, 0), 0)
|
||||
if _, err := e.Read(nil); err != nil {
|
||||
t.Errorf("Read: error = %v", err)
|
||||
return
|
||||
}
|
||||
// the underlying implementation uses buffered io, so the outcome of this is nondeterministic;
|
||||
// that is not harmful however, so both outcomes are checked for here
|
||||
if err := e.Close(); err != nil &&
|
||||
(!errors.Is(err, syscall.ECANCELED) || !errors.Is(err, syscall.EBADF)) {
|
||||
t.Errorf("Close: error = %v", err)
|
||||
return
|
||||
}
|
||||
})
|
||||
}
|
||||
|
||||
func BenchmarkExport(b *testing.B) {
|
||||
buf := make([]byte, 8)
|
||||
for i := 0; i < b.N; i++ {
|
||||
e := New(
|
||||
Preset(PresetExt|PresetDenyNS|PresetDenyTTY|PresetDenyDevel|PresetLinux32,
|
||||
AllowMultiarch|AllowCAN|AllowBluetooth),
|
||||
AllowMultiarch|AllowCAN|AllowBluetooth)
|
||||
if _, err := io.CopyBuffer(io.Discard, e, buf); err != nil {
|
||||
b.Fatalf("cannot export: %v", err)
|
||||
}
|
||||
if err := e.Close(); err != nil {
|
||||
b.Fatalf("cannot close exporter: %v", err)
|
||||
}
|
||||
}
|
||||
}
|
||||
83
container/seccomp/mksysnum_linux.pl
Executable file
83
container/seccomp/mksysnum_linux.pl
Executable file
@@ -0,0 +1,83 @@
|
||||
#!/usr/bin/env perl
|
||||
# Copyright 2009 The Go Authors. All rights reserved.
|
||||
# Use of this source code is governed by a BSD-style
|
||||
# license that can be found in the LICENSE file.
|
||||
|
||||
use strict;
|
||||
|
||||
my $command = "mksysnum_linux.pl ". join(' ', @ARGV);
|
||||
|
||||
print <<EOF;
|
||||
// $command
|
||||
// Code generated by the command above; DO NOT EDIT.
|
||||
|
||||
package seccomp
|
||||
|
||||
import . "syscall"
|
||||
|
||||
var syscallNum = map[string]int{
|
||||
EOF
|
||||
|
||||
my $offset = 0;
|
||||
my $state = -1;
|
||||
|
||||
sub fmt {
|
||||
my ($name, $num) = @_;
|
||||
if($num > 999){
|
||||
# ignore deprecated syscalls that are no longer implemented
|
||||
# https://git.kernel.org/cgit/linux/kernel/git/torvalds/linux.git/tree/include/uapi/asm-generic/unistd.h?id=refs/heads/master#n716
|
||||
return;
|
||||
}
|
||||
(my $name_upper = $name) =~ y/a-z/A-Z/;
|
||||
$num = $num + $offset;
|
||||
if($num > 302){ # not wired in Go standard library
|
||||
if($state < 0){
|
||||
print " \"$name\": SYS_$name_upper,\n";
|
||||
}
|
||||
else{
|
||||
print " SYS_$name_upper = $num;\n";
|
||||
}
|
||||
}
|
||||
elsif($state < 0){
|
||||
print " \"$name\": SYS_$name_upper,\n";
|
||||
}
|
||||
else{
|
||||
return;
|
||||
}
|
||||
}
|
||||
|
||||
GENERATE:
|
||||
|
||||
my $prev;
|
||||
open(GCC, "gcc -E -dD $ARGV[0] |") || die "can't run gcc";
|
||||
while(<GCC>){
|
||||
if(/^#define __NR_Linux\s+([0-9]+)/){
|
||||
# mips/mips64: extract offset
|
||||
$offset = $1;
|
||||
}
|
||||
elsif(/^#define __NR_syscalls\s+/) {
|
||||
# ignore redefinitions of __NR_syscalls
|
||||
}
|
||||
elsif(/^#define __NR_(\w+)\s+([0-9]+)/){
|
||||
$prev = $2;
|
||||
fmt($1, $2);
|
||||
}
|
||||
elsif(/^#define __NR3264_(\w+)\s+([0-9]+)/){
|
||||
$prev = $2;
|
||||
fmt($1, $2);
|
||||
}
|
||||
elsif(/^#define __NR_(\w+)\s+\(\w+\+\s*([0-9]+)\)/){
|
||||
fmt($1, $prev+$2)
|
||||
}
|
||||
elsif(/^#define __NR_(\w+)\s+\(__NR_Linux \+ ([0-9]+)/){
|
||||
fmt($1, $2);
|
||||
}
|
||||
}
|
||||
|
||||
if($state < 0){
|
||||
$state = $state + 1;
|
||||
print "}\n\nconst (\n";
|
||||
goto GENERATE;
|
||||
}
|
||||
|
||||
print ")";
|
||||
229
container/seccomp/presets.go
Normal file
229
container/seccomp/presets.go
Normal file
@@ -0,0 +1,229 @@
|
||||
package seccomp
|
||||
|
||||
/* flatpak commit 4c3bf179e2e4a2a298cd1db1d045adaf3f564532 */
|
||||
|
||||
import (
|
||||
. "syscall"
|
||||
)
|
||||
|
||||
type FilterPreset int
|
||||
|
||||
const (
|
||||
// PresetExt are project-specific extensions.
|
||||
PresetExt FilterPreset = 1 << iota
|
||||
// PresetDenyNS denies namespace setup syscalls.
|
||||
PresetDenyNS
|
||||
// PresetDenyTTY denies faking input.
|
||||
PresetDenyTTY
|
||||
// PresetDenyDevel denies development-related syscalls.
|
||||
PresetDenyDevel
|
||||
// PresetLinux32 sets PER_LINUX32.
|
||||
PresetLinux32
|
||||
)
|
||||
|
||||
func Preset(presets FilterPreset, flags ExportFlag) (rules []NativeRule) {
|
||||
allowedPersonality := PER_LINUX
|
||||
if presets&PresetLinux32 != 0 {
|
||||
allowedPersonality = PER_LINUX32
|
||||
}
|
||||
presetDevelFinal := presetDevel(ScmpDatum(allowedPersonality))
|
||||
|
||||
l := len(presetCommon)
|
||||
if presets&PresetDenyNS != 0 {
|
||||
l += len(presetNamespace)
|
||||
}
|
||||
if presets&PresetDenyTTY != 0 {
|
||||
l += len(presetTTY)
|
||||
}
|
||||
if presets&PresetDenyDevel != 0 {
|
||||
l += len(presetDevelFinal)
|
||||
}
|
||||
if flags&AllowMultiarch == 0 {
|
||||
l += len(presetEmu)
|
||||
}
|
||||
if presets&PresetExt != 0 {
|
||||
l += len(presetCommonExt)
|
||||
if presets&PresetDenyNS != 0 {
|
||||
l += len(presetNamespaceExt)
|
||||
}
|
||||
if flags&AllowMultiarch == 0 {
|
||||
l += len(presetEmuExt)
|
||||
}
|
||||
}
|
||||
|
||||
rules = make([]NativeRule, 0, l)
|
||||
rules = append(rules, presetCommon...)
|
||||
if presets&PresetDenyNS != 0 {
|
||||
rules = append(rules, presetNamespace...)
|
||||
}
|
||||
if presets&PresetDenyTTY != 0 {
|
||||
rules = append(rules, presetTTY...)
|
||||
}
|
||||
if presets&PresetDenyDevel != 0 {
|
||||
rules = append(rules, presetDevelFinal...)
|
||||
}
|
||||
if flags&AllowMultiarch == 0 {
|
||||
rules = append(rules, presetEmu...)
|
||||
}
|
||||
if presets&PresetExt != 0 {
|
||||
rules = append(rules, presetCommonExt...)
|
||||
if presets&PresetDenyNS != 0 {
|
||||
rules = append(rules, presetNamespaceExt...)
|
||||
}
|
||||
if flags&AllowMultiarch == 0 {
|
||||
rules = append(rules, presetEmuExt...)
|
||||
}
|
||||
}
|
||||
|
||||
return
|
||||
}
|
||||
|
||||
var (
|
||||
presetCommon = []NativeRule{
|
||||
/* Block dmesg */
|
||||
{ScmpSyscall(SYS_SYSLOG), ScmpErrno(EPERM), nil},
|
||||
/* Useless old syscall */
|
||||
{ScmpSyscall(SYS_USELIB), ScmpErrno(EPERM), nil},
|
||||
/* Don't allow disabling accounting */
|
||||
{ScmpSyscall(SYS_ACCT), ScmpErrno(EPERM), nil},
|
||||
/* Don't allow reading current quota use */
|
||||
{ScmpSyscall(SYS_QUOTACTL), ScmpErrno(EPERM), nil},
|
||||
|
||||
/* Don't allow access to the kernel keyring */
|
||||
{ScmpSyscall(SYS_ADD_KEY), ScmpErrno(EPERM), nil},
|
||||
{ScmpSyscall(SYS_KEYCTL), ScmpErrno(EPERM), nil},
|
||||
{ScmpSyscall(SYS_REQUEST_KEY), ScmpErrno(EPERM), nil},
|
||||
|
||||
/* Scary VM/NUMA ops */
|
||||
{ScmpSyscall(SYS_MOVE_PAGES), ScmpErrno(EPERM), nil},
|
||||
{ScmpSyscall(SYS_MBIND), ScmpErrno(EPERM), nil},
|
||||
{ScmpSyscall(SYS_GET_MEMPOLICY), ScmpErrno(EPERM), nil},
|
||||
{ScmpSyscall(SYS_SET_MEMPOLICY), ScmpErrno(EPERM), nil},
|
||||
{ScmpSyscall(SYS_MIGRATE_PAGES), ScmpErrno(EPERM), nil},
|
||||
}
|
||||
|
||||
/* hakurei: project-specific extensions */
|
||||
presetCommonExt = []NativeRule{
|
||||
/* system calls for changing the system clock */
|
||||
{ScmpSyscall(SYS_ADJTIMEX), ScmpErrno(EPERM), nil},
|
||||
{ScmpSyscall(SYS_CLOCK_ADJTIME), ScmpErrno(EPERM), nil},
|
||||
{ScmpSyscall(SYS_CLOCK_ADJTIME64), ScmpErrno(EPERM), nil},
|
||||
{ScmpSyscall(SYS_CLOCK_SETTIME), ScmpErrno(EPERM), nil},
|
||||
{ScmpSyscall(SYS_CLOCK_SETTIME64), ScmpErrno(EPERM), nil},
|
||||
{ScmpSyscall(SYS_SETTIMEOFDAY), ScmpErrno(EPERM), nil},
|
||||
|
||||
/* loading and unloading of kernel modules */
|
||||
{ScmpSyscall(SYS_DELETE_MODULE), ScmpErrno(EPERM), nil},
|
||||
{ScmpSyscall(SYS_FINIT_MODULE), ScmpErrno(EPERM), nil},
|
||||
{ScmpSyscall(SYS_INIT_MODULE), ScmpErrno(EPERM), nil},
|
||||
|
||||
/* system calls for rebooting and reboot preparation */
|
||||
{ScmpSyscall(SYS_KEXEC_FILE_LOAD), ScmpErrno(EPERM), nil},
|
||||
{ScmpSyscall(SYS_KEXEC_LOAD), ScmpErrno(EPERM), nil},
|
||||
{ScmpSyscall(SYS_REBOOT), ScmpErrno(EPERM), nil},
|
||||
|
||||
/* system calls for enabling/disabling swap devices */
|
||||
{ScmpSyscall(SYS_SWAPOFF), ScmpErrno(EPERM), nil},
|
||||
{ScmpSyscall(SYS_SWAPON), ScmpErrno(EPERM), nil},
|
||||
}
|
||||
|
||||
presetNamespace = []NativeRule{
|
||||
/* Don't allow subnamespace setups: */
|
||||
{ScmpSyscall(SYS_UNSHARE), ScmpErrno(EPERM), nil},
|
||||
{ScmpSyscall(SYS_SETNS), ScmpErrno(EPERM), nil},
|
||||
{ScmpSyscall(SYS_MOUNT), ScmpErrno(EPERM), nil},
|
||||
{ScmpSyscall(SYS_UMOUNT), ScmpErrno(EPERM), nil},
|
||||
{ScmpSyscall(SYS_UMOUNT2), ScmpErrno(EPERM), nil},
|
||||
{ScmpSyscall(SYS_PIVOT_ROOT), ScmpErrno(EPERM), nil},
|
||||
{ScmpSyscall(SYS_CHROOT), ScmpErrno(EPERM), nil},
|
||||
{ScmpSyscall(SYS_CLONE), ScmpErrno(EPERM),
|
||||
&ScmpArgCmp{cloneArg, SCMP_CMP_MASKED_EQ, CLONE_NEWUSER, CLONE_NEWUSER}},
|
||||
|
||||
/* seccomp can't look into clone3()'s struct clone_args to check whether
|
||||
* the flags are OK, so we have no choice but to block clone3().
|
||||
* Return ENOSYS so user-space will fall back to clone().
|
||||
* (CVE-2021-41133; see also https://github.com/moby/moby/commit/9f6b562d)
|
||||
*/
|
||||
{ScmpSyscall(SYS_CLONE3), ScmpErrno(ENOSYS), nil},
|
||||
|
||||
/* New mount manipulation APIs can also change our VFS. There's no
|
||||
* legitimate reason to do these in the sandbox, so block all of them
|
||||
* rather than thinking about which ones might be dangerous.
|
||||
* (CVE-2021-41133) */
|
||||
{ScmpSyscall(SYS_OPEN_TREE), ScmpErrno(ENOSYS), nil},
|
||||
{ScmpSyscall(SYS_MOVE_MOUNT), ScmpErrno(ENOSYS), nil},
|
||||
{ScmpSyscall(SYS_FSOPEN), ScmpErrno(ENOSYS), nil},
|
||||
{ScmpSyscall(SYS_FSCONFIG), ScmpErrno(ENOSYS), nil},
|
||||
{ScmpSyscall(SYS_FSMOUNT), ScmpErrno(ENOSYS), nil},
|
||||
{ScmpSyscall(SYS_FSPICK), ScmpErrno(ENOSYS), nil},
|
||||
{ScmpSyscall(SYS_MOUNT_SETATTR), ScmpErrno(ENOSYS), nil},
|
||||
}
|
||||
|
||||
/* hakurei: project-specific extensions */
|
||||
presetNamespaceExt = []NativeRule{
|
||||
/* changing file ownership */
|
||||
{ScmpSyscall(SYS_CHOWN), ScmpErrno(EPERM), nil},
|
||||
{ScmpSyscall(SYS_CHOWN32), ScmpErrno(EPERM), nil},
|
||||
{ScmpSyscall(SYS_FCHOWN), ScmpErrno(EPERM), nil},
|
||||
{ScmpSyscall(SYS_FCHOWN32), ScmpErrno(EPERM), nil},
|
||||
{ScmpSyscall(SYS_FCHOWNAT), ScmpErrno(EPERM), nil},
|
||||
{ScmpSyscall(SYS_LCHOWN), ScmpErrno(EPERM), nil},
|
||||
{ScmpSyscall(SYS_LCHOWN32), ScmpErrno(EPERM), nil},
|
||||
|
||||
/* system calls for changing user ID and group ID credentials */
|
||||
{ScmpSyscall(SYS_SETGID), ScmpErrno(EPERM), nil},
|
||||
{ScmpSyscall(SYS_SETGID32), ScmpErrno(EPERM), nil},
|
||||
{ScmpSyscall(SYS_SETGROUPS), ScmpErrno(EPERM), nil},
|
||||
{ScmpSyscall(SYS_SETGROUPS32), ScmpErrno(EPERM), nil},
|
||||
{ScmpSyscall(SYS_SETREGID), ScmpErrno(EPERM), nil},
|
||||
{ScmpSyscall(SYS_SETREGID32), ScmpErrno(EPERM), nil},
|
||||
{ScmpSyscall(SYS_SETRESGID), ScmpErrno(EPERM), nil},
|
||||
{ScmpSyscall(SYS_SETRESGID32), ScmpErrno(EPERM), nil},
|
||||
{ScmpSyscall(SYS_SETRESUID), ScmpErrno(EPERM), nil},
|
||||
{ScmpSyscall(SYS_SETRESUID32), ScmpErrno(EPERM), nil},
|
||||
{ScmpSyscall(SYS_SETREUID), ScmpErrno(EPERM), nil},
|
||||
{ScmpSyscall(SYS_SETREUID32), ScmpErrno(EPERM), nil},
|
||||
{ScmpSyscall(SYS_SETUID), ScmpErrno(EPERM), nil},
|
||||
{ScmpSyscall(SYS_SETUID32), ScmpErrno(EPERM), nil},
|
||||
}
|
||||
|
||||
presetTTY = []NativeRule{
|
||||
/* Don't allow faking input to the controlling tty (CVE-2017-5226) */
|
||||
{ScmpSyscall(SYS_IOCTL), ScmpErrno(EPERM),
|
||||
&ScmpArgCmp{1, SCMP_CMP_MASKED_EQ, 0xFFFFFFFF, TIOCSTI}},
|
||||
/* In the unlikely event that the controlling tty is a Linux virtual
|
||||
* console (/dev/tty2 or similar), copy/paste operations have an effect
|
||||
* similar to TIOCSTI (CVE-2023-28100) */
|
||||
{ScmpSyscall(SYS_IOCTL), ScmpErrno(EPERM),
|
||||
&ScmpArgCmp{1, SCMP_CMP_MASKED_EQ, 0xFFFFFFFF, TIOCLINUX}},
|
||||
}
|
||||
|
||||
presetEmu = []NativeRule{
|
||||
/* modify_ldt is a historic source of interesting information leaks,
|
||||
* so it's disabled as a hardening measure.
|
||||
* However, it is required to run old 16-bit applications
|
||||
* as well as some Wine patches, so it's allowed in multiarch. */
|
||||
{ScmpSyscall(SYS_MODIFY_LDT), ScmpErrno(EPERM), nil},
|
||||
}
|
||||
|
||||
/* hakurei: project-specific extensions */
|
||||
presetEmuExt = []NativeRule{
|
||||
{ScmpSyscall(SYS_SUBPAGE_PROT), ScmpErrno(ENOSYS), nil},
|
||||
{ScmpSyscall(SYS_SWITCH_ENDIAN), ScmpErrno(ENOSYS), nil},
|
||||
{ScmpSyscall(SYS_VM86), ScmpErrno(ENOSYS), nil},
|
||||
{ScmpSyscall(SYS_VM86OLD), ScmpErrno(ENOSYS), nil},
|
||||
}
|
||||
)
|
||||
|
||||
func presetDevel(allowedPersonality ScmpDatum) []NativeRule {
|
||||
return []NativeRule{
|
||||
/* Profiling operations; we expect these to be done by tools from outside
|
||||
* the sandbox. In particular perf has been the source of many CVEs. */
|
||||
{ScmpSyscall(SYS_PERF_EVENT_OPEN), ScmpErrno(EPERM), nil},
|
||||
/* Don't allow you to switch to bsd emulation or whatnot */
|
||||
{ScmpSyscall(SYS_PERSONALITY), ScmpErrno(EPERM),
|
||||
&ScmpArgCmp{0, SCMP_CMP_NE, allowedPersonality, 0}},
|
||||
|
||||
{ScmpSyscall(SYS_PTRACE), ScmpErrno(EPERM), nil},
|
||||
}
|
||||
}
|
||||
7
container/seccomp/presets_clone_backwards2.go
Normal file
7
container/seccomp/presets_clone_backwards2.go
Normal file
@@ -0,0 +1,7 @@
|
||||
//go:build s390 || s390x
|
||||
|
||||
package seccomp
|
||||
|
||||
/* Architectures with CONFIG_CLONE_BACKWARDS2: the child stack
|
||||
* and flags arguments are reversed so the flags come second */
|
||||
const cloneArg = 1
|
||||
6
container/seccomp/presets_clone_generic.go
Normal file
6
container/seccomp/presets_clone_generic.go
Normal file
@@ -0,0 +1,6 @@
|
||||
//go:build !s390 && !s390x
|
||||
|
||||
package seccomp
|
||||
|
||||
/* Normally the flags come first */
|
||||
const cloneArg = 0
|
||||
78
container/seccomp/proc.go
Normal file
78
container/seccomp/proc.go
Normal file
@@ -0,0 +1,78 @@
|
||||
package seccomp
|
||||
|
||||
import (
|
||||
"context"
|
||||
"errors"
|
||||
"syscall"
|
||||
|
||||
"git.gensokyo.uk/security/hakurei/helper/proc"
|
||||
)
|
||||
|
||||
const (
|
||||
PresetStrict = PresetExt | PresetDenyNS | PresetDenyTTY | PresetDenyDevel
|
||||
)
|
||||
|
||||
// New returns an inactive Encoder instance.
|
||||
func New(rules []NativeRule, flags ExportFlag) *Encoder { return &Encoder{newExporter(rules, flags)} }
|
||||
|
||||
// Load loads a filter into the kernel.
|
||||
func Load(rules []NativeRule, flags ExportFlag) error { return Export(-1, rules, flags) }
|
||||
|
||||
/*
|
||||
An Encoder writes a BPF program to an output stream.
|
||||
|
||||
Methods of Encoder are not safe for concurrent use.
|
||||
|
||||
An Encoder must not be copied after first use.
|
||||
*/
|
||||
type Encoder struct {
|
||||
*exporter
|
||||
}
|
||||
|
||||
func (e *Encoder) Read(p []byte) (n int, err error) {
|
||||
if err = e.prepare(); err != nil {
|
||||
return
|
||||
}
|
||||
return e.r.Read(p)
|
||||
}
|
||||
|
||||
func (e *Encoder) Close() error {
|
||||
if e.r == nil {
|
||||
return syscall.EINVAL
|
||||
}
|
||||
|
||||
// this hangs if the cgo thread fails to exit
|
||||
return errors.Join(e.closeWrite(), <-e.exportErr)
|
||||
}
|
||||
|
||||
// NewFile returns an instance of exporter implementing [proc.File].
|
||||
func NewFile(rules []NativeRule, flags ExportFlag) proc.File {
|
||||
return &File{rules: rules, flags: flags}
|
||||
}
|
||||
|
||||
// File implements [proc.File] and provides access to the read end of exporter pipe.
|
||||
type File struct {
|
||||
rules []NativeRule
|
||||
flags ExportFlag
|
||||
proc.BaseFile
|
||||
}
|
||||
|
||||
func (f *File) ErrCount() int { return 2 }
|
||||
func (f *File) Fulfill(ctx context.Context, dispatchErr func(error)) error {
|
||||
e := newExporter(f.rules, f.flags)
|
||||
if err := e.prepare(); err != nil {
|
||||
return err
|
||||
}
|
||||
f.Set(e.r)
|
||||
go func() {
|
||||
select {
|
||||
case err := <-e.exportErr:
|
||||
dispatchErr(nil)
|
||||
dispatchErr(err)
|
||||
case <-ctx.Done():
|
||||
dispatchErr(e.closeWrite())
|
||||
dispatchErr(<-e.exportErr)
|
||||
}
|
||||
}()
|
||||
return nil
|
||||
}
|
||||
60
container/seccomp/seccomp.go
Normal file
60
container/seccomp/seccomp.go
Normal file
@@ -0,0 +1,60 @@
|
||||
// Package seccomp provides high level wrappers around libseccomp.
|
||||
package seccomp
|
||||
|
||||
import (
|
||||
"os"
|
||||
"runtime"
|
||||
"sync"
|
||||
)
|
||||
|
||||
type exporter struct {
|
||||
rules []NativeRule
|
||||
flags ExportFlag
|
||||
r, w *os.File
|
||||
|
||||
prepareOnce sync.Once
|
||||
prepareErr error
|
||||
closeOnce sync.Once
|
||||
closeErr error
|
||||
exportErr <-chan error
|
||||
}
|
||||
|
||||
func (e *exporter) prepare() error {
|
||||
e.prepareOnce.Do(func() {
|
||||
if r, w, err := os.Pipe(); err != nil {
|
||||
e.prepareErr = err
|
||||
return
|
||||
} else {
|
||||
e.r, e.w = r, w
|
||||
}
|
||||
|
||||
ec := make(chan error, 1)
|
||||
go func(fd uintptr) {
|
||||
ec <- Export(int(fd), e.rules, e.flags)
|
||||
close(ec)
|
||||
_ = e.closeWrite()
|
||||
runtime.KeepAlive(e.w)
|
||||
}(e.w.Fd())
|
||||
e.exportErr = ec
|
||||
runtime.SetFinalizer(e, (*exporter).closeWrite)
|
||||
})
|
||||
return e.prepareErr
|
||||
}
|
||||
|
||||
func (e *exporter) closeWrite() error {
|
||||
e.closeOnce.Do(func() {
|
||||
if e.w == nil {
|
||||
panic("closeWrite called on invalid exporter")
|
||||
}
|
||||
e.closeErr = e.w.Close()
|
||||
|
||||
// no need for a finalizer anymore
|
||||
runtime.SetFinalizer(e, nil)
|
||||
})
|
||||
|
||||
return e.closeErr
|
||||
}
|
||||
|
||||
func newExporter(rules []NativeRule, flags ExportFlag) *exporter {
|
||||
return &exporter{rules: rules, flags: flags}
|
||||
}
|
||||
65
container/seccomp/seccomp_test.go
Normal file
65
container/seccomp/seccomp_test.go
Normal file
@@ -0,0 +1,65 @@
|
||||
package seccomp_test
|
||||
|
||||
import (
|
||||
"errors"
|
||||
"runtime"
|
||||
"syscall"
|
||||
"testing"
|
||||
|
||||
"git.gensokyo.uk/security/hakurei/container/seccomp"
|
||||
)
|
||||
|
||||
func TestLibraryError(t *testing.T) {
|
||||
testCases := []struct {
|
||||
name string
|
||||
sample *seccomp.LibraryError
|
||||
want string
|
||||
wantIs bool
|
||||
compare error
|
||||
}{
|
||||
{
|
||||
"full",
|
||||
&seccomp.LibraryError{Prefix: "seccomp_export_bpf failed", Seccomp: syscall.ECANCELED, Errno: syscall.EBADF},
|
||||
"seccomp_export_bpf failed: operation canceled (bad file descriptor)",
|
||||
true,
|
||||
&seccomp.LibraryError{Prefix: "seccomp_export_bpf failed", Seccomp: syscall.ECANCELED, Errno: syscall.EBADF},
|
||||
},
|
||||
{
|
||||
"errno only",
|
||||
&seccomp.LibraryError{Prefix: "seccomp_init failed", Errno: syscall.ENOMEM},
|
||||
"seccomp_init failed: cannot allocate memory",
|
||||
false,
|
||||
nil,
|
||||
},
|
||||
{
|
||||
"seccomp only",
|
||||
&seccomp.LibraryError{Prefix: "internal libseccomp failure", Seccomp: syscall.EFAULT},
|
||||
"internal libseccomp failure: bad address",
|
||||
true,
|
||||
syscall.EFAULT,
|
||||
},
|
||||
}
|
||||
for _, tc := range testCases {
|
||||
t.Run(tc.name, func(t *testing.T) {
|
||||
if errors.Is(tc.sample, tc.compare) != tc.wantIs {
|
||||
t.Errorf("errors.Is(%#v, %#v) did not return %v",
|
||||
tc.sample, tc.compare, tc.wantIs)
|
||||
}
|
||||
|
||||
if got := tc.sample.Error(); got != tc.want {
|
||||
t.Errorf("Error: %q, want %q",
|
||||
got, tc.want)
|
||||
}
|
||||
})
|
||||
}
|
||||
|
||||
t.Run("invalid", func(t *testing.T) {
|
||||
wantPanic := "invalid libseccomp error"
|
||||
defer func() {
|
||||
if r := recover(); r != wantPanic {
|
||||
t.Errorf("panic: %q, want %q", r, wantPanic)
|
||||
}
|
||||
}()
|
||||
runtime.KeepAlive(new(seccomp.LibraryError).Error())
|
||||
})
|
||||
}
|
||||
28
container/seccomp/syscall.go
Normal file
28
container/seccomp/syscall.go
Normal file
@@ -0,0 +1,28 @@
|
||||
package seccomp
|
||||
|
||||
import "iter"
|
||||
|
||||
// Syscalls returns an iterator over all wired syscalls.
|
||||
func Syscalls() iter.Seq2[string, int] {
|
||||
return func(yield func(string, int) bool) {
|
||||
for name, num := range syscallNum {
|
||||
if !yield(name, num) {
|
||||
return
|
||||
}
|
||||
}
|
||||
for name, num := range syscallNumExtra {
|
||||
if !yield(name, num) {
|
||||
return
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// SyscallResolveName resolves a syscall number from its string representation.
|
||||
func SyscallResolveName(name string) (num int, ok bool) {
|
||||
if num, ok = syscallNum[name]; ok {
|
||||
return
|
||||
}
|
||||
num, ok = syscallNumExtra[name]
|
||||
return
|
||||
}
|
||||
54
container/seccomp/syscall_extra_linux_amd64.go
Normal file
54
container/seccomp/syscall_extra_linux_amd64.go
Normal file
@@ -0,0 +1,54 @@
|
||||
package seccomp
|
||||
|
||||
/*
|
||||
#cgo linux pkg-config: --static libseccomp
|
||||
|
||||
#include <seccomp.h>
|
||||
#include <sys/personality.h>
|
||||
*/
|
||||
import "C"
|
||||
|
||||
const (
|
||||
PER_LINUX = C.PER_LINUX
|
||||
PER_LINUX32 = C.PER_LINUX32
|
||||
)
|
||||
|
||||
var syscallNumExtra = map[string]int{
|
||||
"umount": SYS_UMOUNT,
|
||||
"subpage_prot": SYS_SUBPAGE_PROT,
|
||||
"switch_endian": SYS_SWITCH_ENDIAN,
|
||||
"vm86": SYS_VM86,
|
||||
"vm86old": SYS_VM86OLD,
|
||||
"clock_adjtime64": SYS_CLOCK_ADJTIME64,
|
||||
"clock_settime64": SYS_CLOCK_SETTIME64,
|
||||
"chown32": SYS_CHOWN32,
|
||||
"fchown32": SYS_FCHOWN32,
|
||||
"lchown32": SYS_LCHOWN32,
|
||||
"setgid32": SYS_SETGID32,
|
||||
"setgroups32": SYS_SETGROUPS32,
|
||||
"setregid32": SYS_SETREGID32,
|
||||
"setresgid32": SYS_SETRESGID32,
|
||||
"setresuid32": SYS_SETRESUID32,
|
||||
"setreuid32": SYS_SETREUID32,
|
||||
"setuid32": SYS_SETUID32,
|
||||
}
|
||||
|
||||
const (
|
||||
SYS_UMOUNT = C.__SNR_umount
|
||||
SYS_SUBPAGE_PROT = C.__SNR_subpage_prot
|
||||
SYS_SWITCH_ENDIAN = C.__SNR_switch_endian
|
||||
SYS_VM86 = C.__SNR_vm86
|
||||
SYS_VM86OLD = C.__SNR_vm86old
|
||||
SYS_CLOCK_ADJTIME64 = C.__SNR_clock_adjtime64
|
||||
SYS_CLOCK_SETTIME64 = C.__SNR_clock_settime64
|
||||
SYS_CHOWN32 = C.__SNR_chown32
|
||||
SYS_FCHOWN32 = C.__SNR_fchown32
|
||||
SYS_LCHOWN32 = C.__SNR_lchown32
|
||||
SYS_SETGID32 = C.__SNR_setgid32
|
||||
SYS_SETGROUPS32 = C.__SNR_setgroups32
|
||||
SYS_SETREGID32 = C.__SNR_setregid32
|
||||
SYS_SETRESGID32 = C.__SNR_setresgid32
|
||||
SYS_SETRESUID32 = C.__SNR_setresuid32
|
||||
SYS_SETREUID32 = C.__SNR_setreuid32
|
||||
SYS_SETUID32 = C.__SNR_setuid32
|
||||
)
|
||||
459
container/seccomp/syscall_linux_amd64.go
Normal file
459
container/seccomp/syscall_linux_amd64.go
Normal file
@@ -0,0 +1,459 @@
|
||||
// mksysnum_linux.pl /usr/include/asm/unistd_64.h
|
||||
// Code generated by the command above; DO NOT EDIT.
|
||||
|
||||
package seccomp
|
||||
|
||||
import . "syscall"
|
||||
|
||||
var syscallNum = map[string]int{
|
||||
"read": SYS_READ,
|
||||
"write": SYS_WRITE,
|
||||
"open": SYS_OPEN,
|
||||
"close": SYS_CLOSE,
|
||||
"stat": SYS_STAT,
|
||||
"fstat": SYS_FSTAT,
|
||||
"lstat": SYS_LSTAT,
|
||||
"poll": SYS_POLL,
|
||||
"lseek": SYS_LSEEK,
|
||||
"mmap": SYS_MMAP,
|
||||
"mprotect": SYS_MPROTECT,
|
||||
"munmap": SYS_MUNMAP,
|
||||
"brk": SYS_BRK,
|
||||
"rt_sigaction": SYS_RT_SIGACTION,
|
||||
"rt_sigprocmask": SYS_RT_SIGPROCMASK,
|
||||
"rt_sigreturn": SYS_RT_SIGRETURN,
|
||||
"ioctl": SYS_IOCTL,
|
||||
"pread64": SYS_PREAD64,
|
||||
"pwrite64": SYS_PWRITE64,
|
||||
"readv": SYS_READV,
|
||||
"writev": SYS_WRITEV,
|
||||
"access": SYS_ACCESS,
|
||||
"pipe": SYS_PIPE,
|
||||
"select": SYS_SELECT,
|
||||
"sched_yield": SYS_SCHED_YIELD,
|
||||
"mremap": SYS_MREMAP,
|
||||
"msync": SYS_MSYNC,
|
||||
"mincore": SYS_MINCORE,
|
||||
"madvise": SYS_MADVISE,
|
||||
"shmget": SYS_SHMGET,
|
||||
"shmat": SYS_SHMAT,
|
||||
"shmctl": SYS_SHMCTL,
|
||||
"dup": SYS_DUP,
|
||||
"dup2": SYS_DUP2,
|
||||
"pause": SYS_PAUSE,
|
||||
"nanosleep": SYS_NANOSLEEP,
|
||||
"getitimer": SYS_GETITIMER,
|
||||
"alarm": SYS_ALARM,
|
||||
"setitimer": SYS_SETITIMER,
|
||||
"getpid": SYS_GETPID,
|
||||
"sendfile": SYS_SENDFILE,
|
||||
"socket": SYS_SOCKET,
|
||||
"connect": SYS_CONNECT,
|
||||
"accept": SYS_ACCEPT,
|
||||
"sendto": SYS_SENDTO,
|
||||
"recvfrom": SYS_RECVFROM,
|
||||
"sendmsg": SYS_SENDMSG,
|
||||
"recvmsg": SYS_RECVMSG,
|
||||
"shutdown": SYS_SHUTDOWN,
|
||||
"bind": SYS_BIND,
|
||||
"listen": SYS_LISTEN,
|
||||
"getsockname": SYS_GETSOCKNAME,
|
||||
"getpeername": SYS_GETPEERNAME,
|
||||
"socketpair": SYS_SOCKETPAIR,
|
||||
"setsockopt": SYS_SETSOCKOPT,
|
||||
"getsockopt": SYS_GETSOCKOPT,
|
||||
"clone": SYS_CLONE,
|
||||
"fork": SYS_FORK,
|
||||
"vfork": SYS_VFORK,
|
||||
"execve": SYS_EXECVE,
|
||||
"exit": SYS_EXIT,
|
||||
"wait4": SYS_WAIT4,
|
||||
"kill": SYS_KILL,
|
||||
"uname": SYS_UNAME,
|
||||
"semget": SYS_SEMGET,
|
||||
"semop": SYS_SEMOP,
|
||||
"semctl": SYS_SEMCTL,
|
||||
"shmdt": SYS_SHMDT,
|
||||
"msgget": SYS_MSGGET,
|
||||
"msgsnd": SYS_MSGSND,
|
||||
"msgrcv": SYS_MSGRCV,
|
||||
"msgctl": SYS_MSGCTL,
|
||||
"fcntl": SYS_FCNTL,
|
||||
"flock": SYS_FLOCK,
|
||||
"fsync": SYS_FSYNC,
|
||||
"fdatasync": SYS_FDATASYNC,
|
||||
"truncate": SYS_TRUNCATE,
|
||||
"ftruncate": SYS_FTRUNCATE,
|
||||
"getdents": SYS_GETDENTS,
|
||||
"getcwd": SYS_GETCWD,
|
||||
"chdir": SYS_CHDIR,
|
||||
"fchdir": SYS_FCHDIR,
|
||||
"rename": SYS_RENAME,
|
||||
"mkdir": SYS_MKDIR,
|
||||
"rmdir": SYS_RMDIR,
|
||||
"creat": SYS_CREAT,
|
||||
"link": SYS_LINK,
|
||||
"unlink": SYS_UNLINK,
|
||||
"symlink": SYS_SYMLINK,
|
||||
"readlink": SYS_READLINK,
|
||||
"chmod": SYS_CHMOD,
|
||||
"fchmod": SYS_FCHMOD,
|
||||
"chown": SYS_CHOWN,
|
||||
"fchown": SYS_FCHOWN,
|
||||
"lchown": SYS_LCHOWN,
|
||||
"umask": SYS_UMASK,
|
||||
"gettimeofday": SYS_GETTIMEOFDAY,
|
||||
"getrlimit": SYS_GETRLIMIT,
|
||||
"getrusage": SYS_GETRUSAGE,
|
||||
"sysinfo": SYS_SYSINFO,
|
||||
"times": SYS_TIMES,
|
||||
"ptrace": SYS_PTRACE,
|
||||
"getuid": SYS_GETUID,
|
||||
"syslog": SYS_SYSLOG,
|
||||
"getgid": SYS_GETGID,
|
||||
"setuid": SYS_SETUID,
|
||||
"setgid": SYS_SETGID,
|
||||
"geteuid": SYS_GETEUID,
|
||||
"getegid": SYS_GETEGID,
|
||||
"setpgid": SYS_SETPGID,
|
||||
"getppid": SYS_GETPPID,
|
||||
"getpgrp": SYS_GETPGRP,
|
||||
"setsid": SYS_SETSID,
|
||||
"setreuid": SYS_SETREUID,
|
||||
"setregid": SYS_SETREGID,
|
||||
"getgroups": SYS_GETGROUPS,
|
||||
"setgroups": SYS_SETGROUPS,
|
||||
"setresuid": SYS_SETRESUID,
|
||||
"getresuid": SYS_GETRESUID,
|
||||
"setresgid": SYS_SETRESGID,
|
||||
"getresgid": SYS_GETRESGID,
|
||||
"getpgid": SYS_GETPGID,
|
||||
"setfsuid": SYS_SETFSUID,
|
||||
"setfsgid": SYS_SETFSGID,
|
||||
"getsid": SYS_GETSID,
|
||||
"capget": SYS_CAPGET,
|
||||
"capset": SYS_CAPSET,
|
||||
"rt_sigpending": SYS_RT_SIGPENDING,
|
||||
"rt_sigtimedwait": SYS_RT_SIGTIMEDWAIT,
|
||||
"rt_sigqueueinfo": SYS_RT_SIGQUEUEINFO,
|
||||
"rt_sigsuspend": SYS_RT_SIGSUSPEND,
|
||||
"sigaltstack": SYS_SIGALTSTACK,
|
||||
"utime": SYS_UTIME,
|
||||
"mknod": SYS_MKNOD,
|
||||
"uselib": SYS_USELIB,
|
||||
"personality": SYS_PERSONALITY,
|
||||
"ustat": SYS_USTAT,
|
||||
"statfs": SYS_STATFS,
|
||||
"fstatfs": SYS_FSTATFS,
|
||||
"sysfs": SYS_SYSFS,
|
||||
"getpriority": SYS_GETPRIORITY,
|
||||
"setpriority": SYS_SETPRIORITY,
|
||||
"sched_setparam": SYS_SCHED_SETPARAM,
|
||||
"sched_getparam": SYS_SCHED_GETPARAM,
|
||||
"sched_setscheduler": SYS_SCHED_SETSCHEDULER,
|
||||
"sched_getscheduler": SYS_SCHED_GETSCHEDULER,
|
||||
"sched_get_priority_max": SYS_SCHED_GET_PRIORITY_MAX,
|
||||
"sched_get_priority_min": SYS_SCHED_GET_PRIORITY_MIN,
|
||||
"sched_rr_get_interval": SYS_SCHED_RR_GET_INTERVAL,
|
||||
"mlock": SYS_MLOCK,
|
||||
"munlock": SYS_MUNLOCK,
|
||||
"mlockall": SYS_MLOCKALL,
|
||||
"munlockall": SYS_MUNLOCKALL,
|
||||
"vhangup": SYS_VHANGUP,
|
||||
"modify_ldt": SYS_MODIFY_LDT,
|
||||
"pivot_root": SYS_PIVOT_ROOT,
|
||||
"_sysctl": SYS__SYSCTL,
|
||||
"prctl": SYS_PRCTL,
|
||||
"arch_prctl": SYS_ARCH_PRCTL,
|
||||
"adjtimex": SYS_ADJTIMEX,
|
||||
"setrlimit": SYS_SETRLIMIT,
|
||||
"chroot": SYS_CHROOT,
|
||||
"sync": SYS_SYNC,
|
||||
"acct": SYS_ACCT,
|
||||
"settimeofday": SYS_SETTIMEOFDAY,
|
||||
"mount": SYS_MOUNT,
|
||||
"umount2": SYS_UMOUNT2,
|
||||
"swapon": SYS_SWAPON,
|
||||
"swapoff": SYS_SWAPOFF,
|
||||
"reboot": SYS_REBOOT,
|
||||
"sethostname": SYS_SETHOSTNAME,
|
||||
"setdomainname": SYS_SETDOMAINNAME,
|
||||
"iopl": SYS_IOPL,
|
||||
"ioperm": SYS_IOPERM,
|
||||
"create_module": SYS_CREATE_MODULE,
|
||||
"init_module": SYS_INIT_MODULE,
|
||||
"delete_module": SYS_DELETE_MODULE,
|
||||
"get_kernel_syms": SYS_GET_KERNEL_SYMS,
|
||||
"query_module": SYS_QUERY_MODULE,
|
||||
"quotactl": SYS_QUOTACTL,
|
||||
"nfsservctl": SYS_NFSSERVCTL,
|
||||
"getpmsg": SYS_GETPMSG,
|
||||
"putpmsg": SYS_PUTPMSG,
|
||||
"afs_syscall": SYS_AFS_SYSCALL,
|
||||
"tuxcall": SYS_TUXCALL,
|
||||
"security": SYS_SECURITY,
|
||||
"gettid": SYS_GETTID,
|
||||
"readahead": SYS_READAHEAD,
|
||||
"setxattr": SYS_SETXATTR,
|
||||
"lsetxattr": SYS_LSETXATTR,
|
||||
"fsetxattr": SYS_FSETXATTR,
|
||||
"getxattr": SYS_GETXATTR,
|
||||
"lgetxattr": SYS_LGETXATTR,
|
||||
"fgetxattr": SYS_FGETXATTR,
|
||||
"listxattr": SYS_LISTXATTR,
|
||||
"llistxattr": SYS_LLISTXATTR,
|
||||
"flistxattr": SYS_FLISTXATTR,
|
||||
"removexattr": SYS_REMOVEXATTR,
|
||||
"lremovexattr": SYS_LREMOVEXATTR,
|
||||
"fremovexattr": SYS_FREMOVEXATTR,
|
||||
"tkill": SYS_TKILL,
|
||||
"time": SYS_TIME,
|
||||
"futex": SYS_FUTEX,
|
||||
"sched_setaffinity": SYS_SCHED_SETAFFINITY,
|
||||
"sched_getaffinity": SYS_SCHED_GETAFFINITY,
|
||||
"set_thread_area": SYS_SET_THREAD_AREA,
|
||||
"io_setup": SYS_IO_SETUP,
|
||||
"io_destroy": SYS_IO_DESTROY,
|
||||
"io_getevents": SYS_IO_GETEVENTS,
|
||||
"io_submit": SYS_IO_SUBMIT,
|
||||
"io_cancel": SYS_IO_CANCEL,
|
||||
"get_thread_area": SYS_GET_THREAD_AREA,
|
||||
"lookup_dcookie": SYS_LOOKUP_DCOOKIE,
|
||||
"epoll_create": SYS_EPOLL_CREATE,
|
||||
"epoll_ctl_old": SYS_EPOLL_CTL_OLD,
|
||||
"epoll_wait_old": SYS_EPOLL_WAIT_OLD,
|
||||
"remap_file_pages": SYS_REMAP_FILE_PAGES,
|
||||
"getdents64": SYS_GETDENTS64,
|
||||
"set_tid_address": SYS_SET_TID_ADDRESS,
|
||||
"restart_syscall": SYS_RESTART_SYSCALL,
|
||||
"semtimedop": SYS_SEMTIMEDOP,
|
||||
"fadvise64": SYS_FADVISE64,
|
||||
"timer_create": SYS_TIMER_CREATE,
|
||||
"timer_settime": SYS_TIMER_SETTIME,
|
||||
"timer_gettime": SYS_TIMER_GETTIME,
|
||||
"timer_getoverrun": SYS_TIMER_GETOVERRUN,
|
||||
"timer_delete": SYS_TIMER_DELETE,
|
||||
"clock_settime": SYS_CLOCK_SETTIME,
|
||||
"clock_gettime": SYS_CLOCK_GETTIME,
|
||||
"clock_getres": SYS_CLOCK_GETRES,
|
||||
"clock_nanosleep": SYS_CLOCK_NANOSLEEP,
|
||||
"exit_group": SYS_EXIT_GROUP,
|
||||
"epoll_wait": SYS_EPOLL_WAIT,
|
||||
"epoll_ctl": SYS_EPOLL_CTL,
|
||||
"tgkill": SYS_TGKILL,
|
||||
"utimes": SYS_UTIMES,
|
||||
"vserver": SYS_VSERVER,
|
||||
"mbind": SYS_MBIND,
|
||||
"set_mempolicy": SYS_SET_MEMPOLICY,
|
||||
"get_mempolicy": SYS_GET_MEMPOLICY,
|
||||
"mq_open": SYS_MQ_OPEN,
|
||||
"mq_unlink": SYS_MQ_UNLINK,
|
||||
"mq_timedsend": SYS_MQ_TIMEDSEND,
|
||||
"mq_timedreceive": SYS_MQ_TIMEDRECEIVE,
|
||||
"mq_notify": SYS_MQ_NOTIFY,
|
||||
"mq_getsetattr": SYS_MQ_GETSETATTR,
|
||||
"kexec_load": SYS_KEXEC_LOAD,
|
||||
"waitid": SYS_WAITID,
|
||||
"add_key": SYS_ADD_KEY,
|
||||
"request_key": SYS_REQUEST_KEY,
|
||||
"keyctl": SYS_KEYCTL,
|
||||
"ioprio_set": SYS_IOPRIO_SET,
|
||||
"ioprio_get": SYS_IOPRIO_GET,
|
||||
"inotify_init": SYS_INOTIFY_INIT,
|
||||
"inotify_add_watch": SYS_INOTIFY_ADD_WATCH,
|
||||
"inotify_rm_watch": SYS_INOTIFY_RM_WATCH,
|
||||
"migrate_pages": SYS_MIGRATE_PAGES,
|
||||
"openat": SYS_OPENAT,
|
||||
"mkdirat": SYS_MKDIRAT,
|
||||
"mknodat": SYS_MKNODAT,
|
||||
"fchownat": SYS_FCHOWNAT,
|
||||
"futimesat": SYS_FUTIMESAT,
|
||||
"newfstatat": SYS_NEWFSTATAT,
|
||||
"unlinkat": SYS_UNLINKAT,
|
||||
"renameat": SYS_RENAMEAT,
|
||||
"linkat": SYS_LINKAT,
|
||||
"symlinkat": SYS_SYMLINKAT,
|
||||
"readlinkat": SYS_READLINKAT,
|
||||
"fchmodat": SYS_FCHMODAT,
|
||||
"faccessat": SYS_FACCESSAT,
|
||||
"pselect6": SYS_PSELECT6,
|
||||
"ppoll": SYS_PPOLL,
|
||||
"unshare": SYS_UNSHARE,
|
||||
"set_robust_list": SYS_SET_ROBUST_LIST,
|
||||
"get_robust_list": SYS_GET_ROBUST_LIST,
|
||||
"splice": SYS_SPLICE,
|
||||
"tee": SYS_TEE,
|
||||
"sync_file_range": SYS_SYNC_FILE_RANGE,
|
||||
"vmsplice": SYS_VMSPLICE,
|
||||
"move_pages": SYS_MOVE_PAGES,
|
||||
"utimensat": SYS_UTIMENSAT,
|
||||
"epoll_pwait": SYS_EPOLL_PWAIT,
|
||||
"signalfd": SYS_SIGNALFD,
|
||||
"timerfd_create": SYS_TIMERFD_CREATE,
|
||||
"eventfd": SYS_EVENTFD,
|
||||
"fallocate": SYS_FALLOCATE,
|
||||
"timerfd_settime": SYS_TIMERFD_SETTIME,
|
||||
"timerfd_gettime": SYS_TIMERFD_GETTIME,
|
||||
"accept4": SYS_ACCEPT4,
|
||||
"signalfd4": SYS_SIGNALFD4,
|
||||
"eventfd2": SYS_EVENTFD2,
|
||||
"epoll_create1": SYS_EPOLL_CREATE1,
|
||||
"dup3": SYS_DUP3,
|
||||
"pipe2": SYS_PIPE2,
|
||||
"inotify_init1": SYS_INOTIFY_INIT1,
|
||||
"preadv": SYS_PREADV,
|
||||
"pwritev": SYS_PWRITEV,
|
||||
"rt_tgsigqueueinfo": SYS_RT_TGSIGQUEUEINFO,
|
||||
"perf_event_open": SYS_PERF_EVENT_OPEN,
|
||||
"recvmmsg": SYS_RECVMMSG,
|
||||
"fanotify_init": SYS_FANOTIFY_INIT,
|
||||
"fanotify_mark": SYS_FANOTIFY_MARK,
|
||||
"prlimit64": SYS_PRLIMIT64,
|
||||
"name_to_handle_at": SYS_NAME_TO_HANDLE_AT,
|
||||
"open_by_handle_at": SYS_OPEN_BY_HANDLE_AT,
|
||||
"clock_adjtime": SYS_CLOCK_ADJTIME,
|
||||
"syncfs": SYS_SYNCFS,
|
||||
"sendmmsg": SYS_SENDMMSG,
|
||||
"setns": SYS_SETNS,
|
||||
"getcpu": SYS_GETCPU,
|
||||
"process_vm_readv": SYS_PROCESS_VM_READV,
|
||||
"process_vm_writev": SYS_PROCESS_VM_WRITEV,
|
||||
"kcmp": SYS_KCMP,
|
||||
"finit_module": SYS_FINIT_MODULE,
|
||||
"sched_setattr": SYS_SCHED_SETATTR,
|
||||
"sched_getattr": SYS_SCHED_GETATTR,
|
||||
"renameat2": SYS_RENAMEAT2,
|
||||
"seccomp": SYS_SECCOMP,
|
||||
"getrandom": SYS_GETRANDOM,
|
||||
"memfd_create": SYS_MEMFD_CREATE,
|
||||
"kexec_file_load": SYS_KEXEC_FILE_LOAD,
|
||||
"bpf": SYS_BPF,
|
||||
"execveat": SYS_EXECVEAT,
|
||||
"userfaultfd": SYS_USERFAULTFD,
|
||||
"membarrier": SYS_MEMBARRIER,
|
||||
"mlock2": SYS_MLOCK2,
|
||||
"copy_file_range": SYS_COPY_FILE_RANGE,
|
||||
"preadv2": SYS_PREADV2,
|
||||
"pwritev2": SYS_PWRITEV2,
|
||||
"pkey_mprotect": SYS_PKEY_MPROTECT,
|
||||
"pkey_alloc": SYS_PKEY_ALLOC,
|
||||
"pkey_free": SYS_PKEY_FREE,
|
||||
"statx": SYS_STATX,
|
||||
"io_pgetevents": SYS_IO_PGETEVENTS,
|
||||
"rseq": SYS_RSEQ,
|
||||
"uretprobe": SYS_URETPROBE,
|
||||
"pidfd_send_signal": SYS_PIDFD_SEND_SIGNAL,
|
||||
"io_uring_setup": SYS_IO_URING_SETUP,
|
||||
"io_uring_enter": SYS_IO_URING_ENTER,
|
||||
"io_uring_register": SYS_IO_URING_REGISTER,
|
||||
"open_tree": SYS_OPEN_TREE,
|
||||
"move_mount": SYS_MOVE_MOUNT,
|
||||
"fsopen": SYS_FSOPEN,
|
||||
"fsconfig": SYS_FSCONFIG,
|
||||
"fsmount": SYS_FSMOUNT,
|
||||
"fspick": SYS_FSPICK,
|
||||
"pidfd_open": SYS_PIDFD_OPEN,
|
||||
"clone3": SYS_CLONE3,
|
||||
"close_range": SYS_CLOSE_RANGE,
|
||||
"openat2": SYS_OPENAT2,
|
||||
"pidfd_getfd": SYS_PIDFD_GETFD,
|
||||
"faccessat2": SYS_FACCESSAT2,
|
||||
"process_madvise": SYS_PROCESS_MADVISE,
|
||||
"epoll_pwait2": SYS_EPOLL_PWAIT2,
|
||||
"mount_setattr": SYS_MOUNT_SETATTR,
|
||||
"quotactl_fd": SYS_QUOTACTL_FD,
|
||||
"landlock_create_ruleset": SYS_LANDLOCK_CREATE_RULESET,
|
||||
"landlock_add_rule": SYS_LANDLOCK_ADD_RULE,
|
||||
"landlock_restrict_self": SYS_LANDLOCK_RESTRICT_SELF,
|
||||
"memfd_secret": SYS_MEMFD_SECRET,
|
||||
"process_mrelease": SYS_PROCESS_MRELEASE,
|
||||
"futex_waitv": SYS_FUTEX_WAITV,
|
||||
"set_mempolicy_home_node": SYS_SET_MEMPOLICY_HOME_NODE,
|
||||
"cachestat": SYS_CACHESTAT,
|
||||
"fchmodat2": SYS_FCHMODAT2,
|
||||
"map_shadow_stack": SYS_MAP_SHADOW_STACK,
|
||||
"futex_wake": SYS_FUTEX_WAKE,
|
||||
"futex_wait": SYS_FUTEX_WAIT,
|
||||
"futex_requeue": SYS_FUTEX_REQUEUE,
|
||||
"statmount": SYS_STATMOUNT,
|
||||
"listmount": SYS_LISTMOUNT,
|
||||
"lsm_get_self_attr": SYS_LSM_GET_SELF_ATTR,
|
||||
"lsm_set_self_attr": SYS_LSM_SET_SELF_ATTR,
|
||||
"lsm_list_modules": SYS_LSM_LIST_MODULES,
|
||||
"mseal": SYS_MSEAL,
|
||||
}
|
||||
|
||||
const (
|
||||
SYS_NAME_TO_HANDLE_AT = 303
|
||||
SYS_OPEN_BY_HANDLE_AT = 304
|
||||
SYS_CLOCK_ADJTIME = 305
|
||||
SYS_SYNCFS = 306
|
||||
SYS_SENDMMSG = 307
|
||||
SYS_SETNS = 308
|
||||
SYS_GETCPU = 309
|
||||
SYS_PROCESS_VM_READV = 310
|
||||
SYS_PROCESS_VM_WRITEV = 311
|
||||
SYS_KCMP = 312
|
||||
SYS_FINIT_MODULE = 313
|
||||
SYS_SCHED_SETATTR = 314
|
||||
SYS_SCHED_GETATTR = 315
|
||||
SYS_RENAMEAT2 = 316
|
||||
SYS_SECCOMP = 317
|
||||
SYS_GETRANDOM = 318
|
||||
SYS_MEMFD_CREATE = 319
|
||||
SYS_KEXEC_FILE_LOAD = 320
|
||||
SYS_BPF = 321
|
||||
SYS_EXECVEAT = 322
|
||||
SYS_USERFAULTFD = 323
|
||||
SYS_MEMBARRIER = 324
|
||||
SYS_MLOCK2 = 325
|
||||
SYS_COPY_FILE_RANGE = 326
|
||||
SYS_PREADV2 = 327
|
||||
SYS_PWRITEV2 = 328
|
||||
SYS_PKEY_MPROTECT = 329
|
||||
SYS_PKEY_ALLOC = 330
|
||||
SYS_PKEY_FREE = 331
|
||||
SYS_STATX = 332
|
||||
SYS_IO_PGETEVENTS = 333
|
||||
SYS_RSEQ = 334
|
||||
SYS_URETPROBE = 335
|
||||
SYS_PIDFD_SEND_SIGNAL = 424
|
||||
SYS_IO_URING_SETUP = 425
|
||||
SYS_IO_URING_ENTER = 426
|
||||
SYS_IO_URING_REGISTER = 427
|
||||
SYS_OPEN_TREE = 428
|
||||
SYS_MOVE_MOUNT = 429
|
||||
SYS_FSOPEN = 430
|
||||
SYS_FSCONFIG = 431
|
||||
SYS_FSMOUNT = 432
|
||||
SYS_FSPICK = 433
|
||||
SYS_PIDFD_OPEN = 434
|
||||
SYS_CLONE3 = 435
|
||||
SYS_CLOSE_RANGE = 436
|
||||
SYS_OPENAT2 = 437
|
||||
SYS_PIDFD_GETFD = 438
|
||||
SYS_FACCESSAT2 = 439
|
||||
SYS_PROCESS_MADVISE = 440
|
||||
SYS_EPOLL_PWAIT2 = 441
|
||||
SYS_MOUNT_SETATTR = 442
|
||||
SYS_QUOTACTL_FD = 443
|
||||
SYS_LANDLOCK_CREATE_RULESET = 444
|
||||
SYS_LANDLOCK_ADD_RULE = 445
|
||||
SYS_LANDLOCK_RESTRICT_SELF = 446
|
||||
SYS_MEMFD_SECRET = 447
|
||||
SYS_PROCESS_MRELEASE = 448
|
||||
SYS_FUTEX_WAITV = 449
|
||||
SYS_SET_MEMPOLICY_HOME_NODE = 450
|
||||
SYS_CACHESTAT = 451
|
||||
SYS_FCHMODAT2 = 452
|
||||
SYS_MAP_SHADOW_STACK = 453
|
||||
SYS_FUTEX_WAKE = 454
|
||||
SYS_FUTEX_WAIT = 455
|
||||
SYS_FUTEX_REQUEUE = 456
|
||||
SYS_STATMOUNT = 457
|
||||
SYS_LISTMOUNT = 458
|
||||
SYS_LSM_GET_SELF_ATTR = 459
|
||||
SYS_LSM_SET_SELF_ATTR = 460
|
||||
SYS_LSM_LIST_MODULES = 461
|
||||
SYS_MSEAL = 462
|
||||
)
|
||||
20
container/seccomp/syscall_test.go
Normal file
20
container/seccomp/syscall_test.go
Normal file
@@ -0,0 +1,20 @@
|
||||
package seccomp
|
||||
|
||||
import (
|
||||
"testing"
|
||||
)
|
||||
|
||||
func TestSyscallResolveName(t *testing.T) {
|
||||
for name, want := range Syscalls() {
|
||||
t.Run(name, func(t *testing.T) {
|
||||
if got := syscallResolveName(name); got != want {
|
||||
t.Errorf("syscallResolveName(%q) = %d, want %d",
|
||||
name, got, want)
|
||||
}
|
||||
if got, ok := SyscallResolveName(name); !ok || got != want {
|
||||
t.Errorf("SyscallResolveName(%q) = %d, want %d",
|
||||
name, got, want)
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
81
container/syscall.go
Normal file
81
container/syscall.go
Normal file
@@ -0,0 +1,81 @@
|
||||
package container
|
||||
|
||||
import (
|
||||
"syscall"
|
||||
"unsafe"
|
||||
)
|
||||
|
||||
const (
|
||||
O_PATH = 0x200000
|
||||
|
||||
PR_SET_NO_NEW_PRIVS = 0x26
|
||||
|
||||
CAP_SYS_ADMIN = 0x15
|
||||
CAP_SETPCAP = 0x8
|
||||
)
|
||||
|
||||
const (
|
||||
SUID_DUMP_DISABLE = iota
|
||||
SUID_DUMP_USER
|
||||
)
|
||||
|
||||
func SetDumpable(dumpable uintptr) error {
|
||||
// linux/sched/coredump.h
|
||||
if _, _, errno := syscall.Syscall(syscall.SYS_PRCTL, syscall.PR_SET_DUMPABLE, dumpable, 0); errno != 0 {
|
||||
return errno
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
const (
|
||||
_LINUX_CAPABILITY_VERSION_3 = 0x20080522
|
||||
|
||||
PR_CAP_AMBIENT = 0x2f
|
||||
PR_CAP_AMBIENT_RAISE = 0x2
|
||||
PR_CAP_AMBIENT_CLEAR_ALL = 0x4
|
||||
)
|
||||
|
||||
type (
|
||||
capHeader struct {
|
||||
version uint32
|
||||
pid int32
|
||||
}
|
||||
|
||||
capData struct {
|
||||
effective uint32
|
||||
permitted uint32
|
||||
inheritable uint32
|
||||
}
|
||||
)
|
||||
|
||||
// See CAP_TO_INDEX in linux/capability.h:
|
||||
func capToIndex(cap uintptr) uintptr { return cap >> 5 }
|
||||
|
||||
// See CAP_TO_MASK in linux/capability.h:
|
||||
func capToMask(cap uintptr) uint32 { return 1 << uint(cap&31) }
|
||||
|
||||
func capset(hdrp *capHeader, datap *[2]capData) error {
|
||||
if _, _, errno := syscall.Syscall(syscall.SYS_CAPSET,
|
||||
uintptr(unsafe.Pointer(hdrp)),
|
||||
uintptr(unsafe.Pointer(&datap[0])), 0); errno != 0 {
|
||||
return errno
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
// IgnoringEINTR makes a function call and repeats it if it returns an
|
||||
// EINTR error. This appears to be required even though we install all
|
||||
// signal handlers with SA_RESTART: see #22838, #38033, #38836, #40846.
|
||||
// Also #20400 and #36644 are issues in which a signal handler is
|
||||
// installed without setting SA_RESTART. None of these are the common case,
|
||||
// but there are enough of them that it seems that we can't avoid
|
||||
// an EINTR loop.
|
||||
func IgnoringEINTR(fn func() error) error {
|
||||
for {
|
||||
err := fn()
|
||||
if err != syscall.EINTR {
|
||||
return err
|
||||
}
|
||||
}
|
||||
}
|
||||
47
container/sysctl.go
Normal file
47
container/sysctl.go
Normal file
@@ -0,0 +1,47 @@
|
||||
package container
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"log"
|
||||
"os"
|
||||
"strconv"
|
||||
"sync"
|
||||
)
|
||||
|
||||
var (
|
||||
kernelOverflowuid int
|
||||
kernelOverflowgid int
|
||||
kernelCapLastCap int
|
||||
|
||||
sysctlOnce sync.Once
|
||||
)
|
||||
|
||||
const (
|
||||
kernelOverflowuidPath = "/proc/sys/kernel/overflowuid"
|
||||
kernelOverflowgidPath = "/proc/sys/kernel/overflowgid"
|
||||
kernelCapLastCapPath = "/proc/sys/kernel/cap_last_cap"
|
||||
)
|
||||
|
||||
func mustReadSysctl() {
|
||||
if v, err := os.ReadFile(kernelOverflowuidPath); err != nil {
|
||||
log.Fatalf("cannot read %q: %v", kernelOverflowuidPath, err)
|
||||
} else if kernelOverflowuid, err = strconv.Atoi(string(bytes.TrimSpace(v))); err != nil {
|
||||
log.Fatalf("cannot interpret %q: %v", kernelOverflowuidPath, err)
|
||||
}
|
||||
|
||||
if v, err := os.ReadFile(kernelOverflowgidPath); err != nil {
|
||||
log.Fatalf("cannot read %q: %v", kernelOverflowgidPath, err)
|
||||
} else if kernelOverflowgid, err = strconv.Atoi(string(bytes.TrimSpace(v))); err != nil {
|
||||
log.Fatalf("cannot interpret %q: %v", kernelOverflowgidPath, err)
|
||||
}
|
||||
|
||||
if v, err := os.ReadFile(kernelCapLastCapPath); err != nil {
|
||||
log.Fatalf("cannot read %q: %v", kernelCapLastCapPath, err)
|
||||
} else if kernelCapLastCap, err = strconv.Atoi(string(bytes.TrimSpace(v))); err != nil {
|
||||
log.Fatalf("cannot interpret %q: %v", kernelCapLastCapPath, err)
|
||||
}
|
||||
}
|
||||
|
||||
func OverflowUid() int { sysctlOnce.Do(mustReadSysctl); return kernelOverflowuid }
|
||||
func OverflowGid() int { sysctlOnce.Do(mustReadSysctl); return kernelOverflowgid }
|
||||
func LastCap() uintptr { sysctlOnce.Do(mustReadSysctl); return uintptr(kernelCapLastCap) }
|
||||
30
container/vfs/mangle.go
Normal file
30
container/vfs/mangle.go
Normal file
@@ -0,0 +1,30 @@
|
||||
package vfs
|
||||
|
||||
import "strings"
|
||||
|
||||
func Unmangle(s string) string {
|
||||
if !strings.ContainsRune(s, '\\') {
|
||||
return s
|
||||
}
|
||||
|
||||
v := make([]byte, len(s))
|
||||
var (
|
||||
j int
|
||||
c byte
|
||||
)
|
||||
for i := 0; i < len(s); i++ {
|
||||
c = s[i]
|
||||
if c == '\\' && len(s) > i+3 &&
|
||||
(s[i+1] == '0' || s[i+1] == '1') &&
|
||||
(s[i+2] >= '0' && s[i+2] <= '7') &&
|
||||
(s[i+3] >= '0' && s[i+3] <= '7') {
|
||||
c = ((s[i+1] - '0') << 6) |
|
||||
((s[i+2] - '0') << 3) |
|
||||
(s[i+3] - '0')
|
||||
i += 3
|
||||
}
|
||||
v[j] = c
|
||||
j++
|
||||
}
|
||||
return string(v[:j])
|
||||
}
|
||||
27
container/vfs/mangle_test.go
Normal file
27
container/vfs/mangle_test.go
Normal file
@@ -0,0 +1,27 @@
|
||||
package vfs_test
|
||||
|
||||
import (
|
||||
"testing"
|
||||
|
||||
"git.gensokyo.uk/security/hakurei/container/vfs"
|
||||
)
|
||||
|
||||
func TestUnmangle(t *testing.T) {
|
||||
testCases := []struct {
|
||||
want string
|
||||
sample string
|
||||
}{
|
||||
{`\, `, `\134\054\040`},
|
||||
{`(10) source -- maybe empty string`, `(10)\040source\040--\040maybe empty string`},
|
||||
}
|
||||
|
||||
for _, tc := range testCases {
|
||||
t.Run(tc.want, func(t *testing.T) {
|
||||
got := vfs.Unmangle(tc.sample)
|
||||
if got != tc.want {
|
||||
t.Errorf("Unmangle: %q, want %q",
|
||||
got, tc.want)
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
260
container/vfs/mountinfo.go
Normal file
260
container/vfs/mountinfo.go
Normal file
@@ -0,0 +1,260 @@
|
||||
// Package vfs provides bindings and iterators over proc_pid_mountinfo(5).
|
||||
package vfs
|
||||
|
||||
import (
|
||||
"bufio"
|
||||
"errors"
|
||||
"fmt"
|
||||
"io"
|
||||
"iter"
|
||||
"slices"
|
||||
"strconv"
|
||||
"strings"
|
||||
"syscall"
|
||||
)
|
||||
|
||||
const (
|
||||
MS_NOSYMFOLLOW = 0x100
|
||||
)
|
||||
|
||||
var (
|
||||
ErrMountInfoFields = errors.New("unexpected field count")
|
||||
ErrMountInfoEmpty = errors.New("unexpected empty field")
|
||||
ErrMountInfoDevno = errors.New("bad maj:min field")
|
||||
ErrMountInfoSep = errors.New("bad optional fields separator")
|
||||
)
|
||||
|
||||
type (
|
||||
// A MountInfoDecoder reads and decodes proc_pid_mountinfo(5) entries from an input stream.
|
||||
MountInfoDecoder struct {
|
||||
s *bufio.Scanner
|
||||
m *MountInfo
|
||||
|
||||
current *MountInfo
|
||||
parseErr error
|
||||
complete bool
|
||||
}
|
||||
|
||||
// MountInfo represents the contents of a proc_pid_mountinfo(5) document.
|
||||
MountInfo struct {
|
||||
Next *MountInfo
|
||||
MountInfoEntry
|
||||
}
|
||||
|
||||
// MountInfoEntry represents a proc_pid_mountinfo(5) entry.
|
||||
MountInfoEntry struct {
|
||||
// mount ID: a unique ID for the mount (may be reused after umount(2)).
|
||||
ID int `json:"id"`
|
||||
// parent ID: the ID of the parent mount (or of self for the root of this mount namespace's mount tree).
|
||||
Parent int `json:"parent"`
|
||||
// major:minor: the value of st_dev for files on this filesystem (see stat(2)).
|
||||
Devno DevT `json:"devno"`
|
||||
// root: the pathname of the directory in the filesystem which forms the root of this mount.
|
||||
Root string `json:"root"`
|
||||
// mount point: the pathname of the mount point relative to the process's root directory.
|
||||
Target string `json:"target"`
|
||||
// mount options: per-mount options (see mount(2)).
|
||||
VfsOptstr string `json:"vfs_optstr"`
|
||||
// optional fields: zero or more fields of the form "tag[:value]"; see below.
|
||||
// separator: the end of the optional fields is marked by a single hyphen.
|
||||
OptFields []string `json:"opt_fields"`
|
||||
// filesystem type: the filesystem type in the form "type[.subtype]".
|
||||
FsType string `json:"fstype"`
|
||||
// mount source: filesystem-specific information or "none".
|
||||
Source string `json:"source"`
|
||||
// super options: per-superblock options (see mount(2)).
|
||||
FsOptstr string `json:"fs_optstr"`
|
||||
}
|
||||
|
||||
DevT [2]int
|
||||
)
|
||||
|
||||
// Flags interprets VfsOptstr and returns the resulting flags and unmatched options.
|
||||
func (e *MountInfoEntry) Flags() (flags uintptr, unmatched []string) {
|
||||
for _, s := range strings.Split(e.VfsOptstr, ",") {
|
||||
switch s {
|
||||
case "rw":
|
||||
case "ro":
|
||||
flags |= syscall.MS_RDONLY
|
||||
case "nosuid":
|
||||
flags |= syscall.MS_NOSUID
|
||||
case "nodev":
|
||||
flags |= syscall.MS_NODEV
|
||||
case "noexec":
|
||||
flags |= syscall.MS_NOEXEC
|
||||
case "nosymfollow":
|
||||
flags |= MS_NOSYMFOLLOW
|
||||
case "noatime":
|
||||
flags |= syscall.MS_NOATIME
|
||||
case "nodiratime":
|
||||
flags |= syscall.MS_NODIRATIME
|
||||
case "relatime":
|
||||
flags |= syscall.MS_RELATIME
|
||||
default:
|
||||
unmatched = append(unmatched, s)
|
||||
}
|
||||
}
|
||||
return
|
||||
}
|
||||
|
||||
// NewMountInfoDecoder returns a new decoder that reads from r.
|
||||
//
|
||||
// The decoder introduces its own buffering and may read data from r beyond the mountinfo entries requested.
|
||||
func NewMountInfoDecoder(r io.Reader) *MountInfoDecoder {
|
||||
return &MountInfoDecoder{s: bufio.NewScanner(r)}
|
||||
}
|
||||
|
||||
func (d *MountInfoDecoder) Decode(v **MountInfo) (err error) {
|
||||
for d.scan() {
|
||||
}
|
||||
err = d.Err()
|
||||
if err == nil {
|
||||
*v = d.m
|
||||
}
|
||||
return
|
||||
}
|
||||
|
||||
// Entries returns an iterator over mountinfo entries.
|
||||
func (d *MountInfoDecoder) Entries() iter.Seq[*MountInfoEntry] {
|
||||
return func(yield func(*MountInfoEntry) bool) {
|
||||
for cur := d.m; cur != nil; cur = cur.Next {
|
||||
if !yield(&cur.MountInfoEntry) {
|
||||
return
|
||||
}
|
||||
}
|
||||
for d.scan() {
|
||||
if !yield(&d.current.MountInfoEntry) {
|
||||
return
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func (d *MountInfoDecoder) Err() error {
|
||||
if err := d.s.Err(); err != nil {
|
||||
return err
|
||||
}
|
||||
return d.parseErr
|
||||
}
|
||||
|
||||
func (d *MountInfoDecoder) scan() bool {
|
||||
if d.complete {
|
||||
return false
|
||||
}
|
||||
if !d.s.Scan() {
|
||||
d.complete = true
|
||||
return false
|
||||
}
|
||||
|
||||
m := new(MountInfo)
|
||||
if err := parseMountInfoLine(d.s.Text(), &m.MountInfoEntry); err != nil {
|
||||
d.parseErr = err
|
||||
d.complete = true
|
||||
return false
|
||||
}
|
||||
|
||||
if d.current == nil {
|
||||
d.m = m
|
||||
d.current = d.m
|
||||
} else {
|
||||
d.current.Next = m
|
||||
d.current = d.current.Next
|
||||
}
|
||||
return true
|
||||
}
|
||||
|
||||
func parseMountInfoLine(s string, ent *MountInfoEntry) error {
|
||||
// prevent proceeding with misaligned fields due to optional fields
|
||||
f := strings.Split(s, " ")
|
||||
if len(f) < 10 {
|
||||
return ErrMountInfoFields
|
||||
}
|
||||
|
||||
// 36 35 98:0 /mnt1 /mnt2 rw,noatime master:1 - ext3 /dev/root rw,errors=continue
|
||||
// (1)(2)(3) (4) (5) (6) (7) (8) (9) (10) (11)
|
||||
|
||||
// (1) id
|
||||
if id, err := strconv.Atoi(f[0]); err != nil { // 0
|
||||
return err
|
||||
} else {
|
||||
ent.ID = id
|
||||
}
|
||||
|
||||
// (2) parent
|
||||
if parent, err := strconv.Atoi(f[1]); err != nil { // 1
|
||||
return err
|
||||
} else {
|
||||
ent.Parent = parent
|
||||
}
|
||||
|
||||
// (3) maj:min
|
||||
if n, err := fmt.Sscanf(f[2], "%d:%d", &ent.Devno[0], &ent.Devno[1]); err != nil {
|
||||
return err
|
||||
} else if n != 2 {
|
||||
// unreachable
|
||||
return ErrMountInfoDevno
|
||||
}
|
||||
|
||||
// (4) mountroot
|
||||
ent.Root = Unmangle(f[3])
|
||||
if ent.Root == "" {
|
||||
return ErrMountInfoEmpty
|
||||
}
|
||||
|
||||
// (5) target
|
||||
ent.Target = Unmangle(f[4])
|
||||
if ent.Target == "" {
|
||||
return ErrMountInfoEmpty
|
||||
}
|
||||
|
||||
// (6) vfs options (fs-independent)
|
||||
ent.VfsOptstr = Unmangle(f[5])
|
||||
if ent.VfsOptstr == "" {
|
||||
return ErrMountInfoEmpty
|
||||
}
|
||||
|
||||
// (7) optional fields, terminated by " - "
|
||||
i := len(f) - 4
|
||||
ent.OptFields = f[6:i]
|
||||
|
||||
// (8) optional fields end marker
|
||||
if f[i] != "-" {
|
||||
return ErrMountInfoSep
|
||||
}
|
||||
i++
|
||||
|
||||
// (9) FS type
|
||||
ent.FsType = Unmangle(f[i])
|
||||
if ent.FsType == "" {
|
||||
return ErrMountInfoEmpty
|
||||
}
|
||||
i++
|
||||
|
||||
// (10) source -- maybe empty string
|
||||
ent.Source = Unmangle(f[i])
|
||||
i++
|
||||
|
||||
// (11) fs options (fs specific)
|
||||
ent.FsOptstr = Unmangle(f[i])
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
func (e *MountInfoEntry) EqualWithIgnore(want *MountInfoEntry, ignore string) bool {
|
||||
return (e.ID == want.ID || want.ID == -1) &&
|
||||
(e.Parent == want.Parent || want.Parent == -1) &&
|
||||
(e.Devno == want.Devno || (want.Devno[0] == -1 && want.Devno[1] == -1)) &&
|
||||
(e.Root == want.Root || want.Root == ignore) &&
|
||||
(e.Target == want.Target || want.Target == ignore) &&
|
||||
(e.VfsOptstr == want.VfsOptstr || want.VfsOptstr == ignore) &&
|
||||
(slices.Equal(e.OptFields, want.OptFields) || (len(want.OptFields) == 1 && want.OptFields[0] == ignore)) &&
|
||||
(e.FsType == want.FsType || want.FsType == ignore) &&
|
||||
(e.Source == want.Source || want.Source == ignore) &&
|
||||
(e.FsOptstr == want.FsOptstr || want.FsOptstr == ignore)
|
||||
}
|
||||
|
||||
func (e *MountInfoEntry) String() string {
|
||||
return fmt.Sprintf("%d %d %d:%d %s %s %s %s %s %s %s",
|
||||
e.ID, e.Parent, e.Devno[0], e.Devno[1], e.Root, e.Target, e.VfsOptstr,
|
||||
strings.Join(append(e.OptFields, "-"), " "), e.FsType, e.Source, e.FsOptstr)
|
||||
}
|
||||
404
container/vfs/mountinfo_test.go
Normal file
404
container/vfs/mountinfo_test.go
Normal file
@@ -0,0 +1,404 @@
|
||||
package vfs_test
|
||||
|
||||
import (
|
||||
"encoding/json"
|
||||
"errors"
|
||||
"iter"
|
||||
"path"
|
||||
"reflect"
|
||||
"slices"
|
||||
"strconv"
|
||||
"strings"
|
||||
"syscall"
|
||||
"testing"
|
||||
|
||||
"git.gensokyo.uk/security/hakurei/container/vfs"
|
||||
)
|
||||
|
||||
func TestMountInfo(t *testing.T) {
|
||||
testCases := []mountInfoTest{
|
||||
{"count", sampleMountinfoBase + `
|
||||
21 20 0:53/ /mnt/test rw,relatime - tmpfs rw
|
||||
21 16 0:17 / /sys/fs/cgroup rw,nosuid,nodev,noexec,relatime - tmpfs tmpfs rw,mode=755`,
|
||||
vfs.ErrMountInfoFields, "", nil, nil, nil},
|
||||
|
||||
{"sep", sampleMountinfoBase + `
|
||||
21 20 0:53 / /mnt/test rw,relatime shared:212 _ tmpfs rw
|
||||
21 16 0:17 / /sys/fs/cgroup rw,nosuid,nodev,noexec,relatime - tmpfs tmpfs rw,mode=755`,
|
||||
vfs.ErrMountInfoSep, "", nil, nil, nil},
|
||||
|
||||
{"id", sampleMountinfoBase + `
|
||||
id 20 0:53 / /mnt/test rw,relatime shared:212 - tmpfs rw
|
||||
21 16 0:17 / /sys/fs/cgroup rw,nosuid,nodev,noexec,relatime - tmpfs tmpfs rw,mode=755`,
|
||||
strconv.ErrSyntax, "", nil, nil, nil},
|
||||
|
||||
{"parent", sampleMountinfoBase + `
|
||||
21 parent 0:53 / /mnt/test rw,relatime shared:212 - tmpfs rw
|
||||
21 16 0:17 / /sys/fs/cgroup rw,nosuid,nodev,noexec,relatime - tmpfs tmpfs rw,mode=755`,
|
||||
strconv.ErrSyntax, "", nil, nil, nil},
|
||||
|
||||
{"devno", sampleMountinfoBase + `
|
||||
21 20 053 / /mnt/test rw,relatime shared:212 - tmpfs rw
|
||||
21 16 0:17 / /sys/fs/cgroup rw,nosuid,nodev,noexec,relatime - tmpfs tmpfs rw,mode=755`,
|
||||
nil, "unexpected EOF", nil, nil, nil},
|
||||
|
||||
{"maj", sampleMountinfoBase + `
|
||||
21 20 maj:53 / /mnt/test rw,relatime shared:212 - tmpfs rw
|
||||
21 16 0:17 / /sys/fs/cgroup rw,nosuid,nodev,noexec,relatime - tmpfs tmpfs rw,mode=755`,
|
||||
nil, "expected integer", nil, nil, nil},
|
||||
|
||||
{"min", sampleMountinfoBase + `
|
||||
21 20 0:min / /mnt/test rw,relatime shared:212 - tmpfs rw
|
||||
21 16 0:17 / /sys/fs/cgroup rw,nosuid,nodev,noexec,relatime - tmpfs tmpfs rw,mode=755`,
|
||||
nil, "expected integer", nil, nil, nil},
|
||||
|
||||
{"mountroot", sampleMountinfoBase + `
|
||||
21 20 0:53 /mnt/test rw,relatime - tmpfs rw
|
||||
21 16 0:17 / /sys/fs/cgroup rw,nosuid,nodev,noexec,relatime - tmpfs tmpfs rw,mode=755`,
|
||||
vfs.ErrMountInfoEmpty, "", nil, nil, nil},
|
||||
|
||||
{"target", sampleMountinfoBase + `
|
||||
21 20 0:53 / rw,relatime - tmpfs rw
|
||||
21 16 0:17 / /sys/fs/cgroup rw,nosuid,nodev,noexec,relatime - tmpfs tmpfs rw,mode=755`,
|
||||
vfs.ErrMountInfoEmpty, "", nil, nil, nil},
|
||||
|
||||
{"vfs options", sampleMountinfoBase + `
|
||||
21 20 0:53 / /mnt/test - tmpfs rw
|
||||
21 16 0:17 / /sys/fs/cgroup rw,nosuid,nodev,noexec,relatime - tmpfs tmpfs rw,mode=755`,
|
||||
vfs.ErrMountInfoEmpty, "", nil, nil, nil},
|
||||
|
||||
{"FS type", sampleMountinfoBase + `
|
||||
21 20 0:53 / /mnt/test rw,relatime - rw
|
||||
21 16 0:17 / /sys/fs/cgroup rw,nosuid,nodev,noexec,relatime - tmpfs tmpfs rw,mode=755`,
|
||||
vfs.ErrMountInfoEmpty, "", nil, nil, nil},
|
||||
|
||||
{"base", sampleMountinfoBase, nil, "", []*wantMountInfo{
|
||||
m(15, 20, 0, 3, "/", "/proc", "rw,relatime", o(), "proc", "/proc", "rw", syscall.MS_RELATIME, nil),
|
||||
m(16, 20, 0, 15, "/", "/sys", "rw,relatime", o(), "sysfs", "/sys", "rw", syscall.MS_RELATIME, nil),
|
||||
m(17, 20, 0, 5, "/", "/dev", "rw,relatime", o(), "devtmpfs", "udev", "rw,size=1983516k,nr_inodes=495879,mode=755", syscall.MS_RELATIME, nil),
|
||||
m(18, 17, 0, 10, "/", "/dev/pts", "rw,relatime", o(), "devpts", "devpts", "rw,gid=5,mode=620,ptmxmode=000", syscall.MS_RELATIME, nil),
|
||||
m(19, 17, 0, 16, "/", "/dev/shm", "rw,relatime", o(), "tmpfs", "tmpfs", "rw", syscall.MS_RELATIME, nil),
|
||||
m(20, 1, 8, 4, "/", "/", "ro,noatime,nodiratime,meow", o(), "ext3", "/dev/sda4", "rw,errors=continue,user_xattr,acl,barrier=0,data=ordered", syscall.MS_RDONLY|syscall.MS_NOATIME|syscall.MS_NODIRATIME, []string{"meow"}),
|
||||
},
|
||||
mn(20, 1, 8, 4, "/", "/", "ro,noatime,nodiratime,meow", o(), "ext3", "/dev/sda4", "rw,errors=continue,user_xattr,acl,barrier=0,data=ordered", false,
|
||||
mn(15, 20, 0, 3, "/", "/proc", "rw,relatime", o(), "proc", "/proc", "rw", false, nil,
|
||||
mn(16, 20, 0, 15, "/", "/sys", "rw,relatime", o(), "sysfs", "/sys", "rw", false, nil,
|
||||
mn(17, 20, 0, 5, "/", "/dev", "rw,relatime", o(), "devtmpfs", "udev", "rw,size=1983516k,nr_inodes=495879,mode=755", false,
|
||||
mn(18, 17, 0, 10, "/", "/dev/pts", "rw,relatime", o(), "devpts", "devpts", "rw,gid=5,mode=620,ptmxmode=000", false, nil,
|
||||
mn(19, 17, 0, 16, "/", "/dev/shm", "rw,relatime", o(), "tmpfs", "tmpfs", "rw", false, nil, nil)),
|
||||
nil))), nil), func(n *vfs.MountInfoNode) []*vfs.MountInfoNode {
|
||||
return []*vfs.MountInfoNode{
|
||||
n,
|
||||
n.FirstChild,
|
||||
n.FirstChild.NextSibling,
|
||||
n.FirstChild.NextSibling.NextSibling,
|
||||
n.FirstChild.NextSibling.NextSibling.FirstChild,
|
||||
n.FirstChild.NextSibling.NextSibling.FirstChild.NextSibling,
|
||||
}
|
||||
}},
|
||||
|
||||
{"sample", sampleMountinfo, nil, "", []*wantMountInfo{
|
||||
m(15, 20, 0, 3, "/", "/proc", "rw,relatime", o(), "proc", "/proc", "rw", syscall.MS_RELATIME, nil),
|
||||
m(16, 20, 0, 15, "/", "/sys", "rw,relatime", o(), "sysfs", "/sys", "rw", syscall.MS_RELATIME, nil),
|
||||
m(17, 20, 0, 5, "/", "/dev", "rw,relatime", o(), "devtmpfs", "udev", "rw,size=1983516k,nr_inodes=495879,mode=755", syscall.MS_RELATIME, nil),
|
||||
m(18, 17, 0, 10, "/", "/dev/pts", "rw,relatime", o(), "devpts", "devpts", "rw,gid=5,mode=620,ptmxmode=000", syscall.MS_RELATIME, nil),
|
||||
m(19, 17, 0, 16, "/", "/dev/shm", "rw,relatime", o(), "tmpfs", "tmpfs", "rw", syscall.MS_RELATIME, nil),
|
||||
m(20, 1, 8, 4, "/", "/", "rw,noatime", o(), "ext3", "/dev/sda4", "rw,errors=continue,user_xattr,acl,barrier=0,data=ordered", syscall.MS_NOATIME, nil),
|
||||
m(21, 16, 0, 17, "/", "/sys/fs/cgroup", "rw,nosuid,nodev,noexec,relatime", o(), "tmpfs", "tmpfs", "rw,mode=755", syscall.MS_NOSUID|syscall.MS_NODEV|syscall.MS_NOEXEC|syscall.MS_RELATIME, nil),
|
||||
m(22, 21, 0, 18, "/", "/sys/fs/cgroup/systemd", "rw,nosuid,nodev,noexec,relatime", o(), "cgroup", "cgroup", "rw,release_agent=/lib/systemd/systemd-cgroups-agent,name=systemd", syscall.MS_NOSUID|syscall.MS_NODEV|syscall.MS_NOEXEC|syscall.MS_RELATIME, nil),
|
||||
m(23, 21, 0, 19, "/", "/sys/fs/cgroup/cpuset", "rw,nosuid,nodev,noexec,relatime", o(), "cgroup", "cgroup", "rw,cpuset", syscall.MS_NOSUID|syscall.MS_NODEV|syscall.MS_NOEXEC|syscall.MS_RELATIME, nil),
|
||||
m(24, 21, 0, 20, "/", "/sys/fs/cgroup/ns", "rw,nosuid,nodev,noexec,relatime", o(), "cgroup", "cgroup", "rw,ns", syscall.MS_NOSUID|syscall.MS_NODEV|syscall.MS_NOEXEC|syscall.MS_RELATIME, nil),
|
||||
m(25, 21, 0, 21, "/", "/sys/fs/cgroup/cpu", "rw,nosuid,nodev,noexec,relatime", o(), "cgroup", "cgroup", "rw,cpu", syscall.MS_NOSUID|syscall.MS_NODEV|syscall.MS_NOEXEC|syscall.MS_RELATIME, nil),
|
||||
m(26, 21, 0, 22, "/", "/sys/fs/cgroup/cpuacct", "rw,nosuid,nodev,noexec,relatime", o(), "cgroup", "cgroup", "rw,cpuacct", syscall.MS_NOSUID|syscall.MS_NODEV|syscall.MS_NOEXEC|syscall.MS_RELATIME, nil),
|
||||
m(27, 21, 0, 23, "/", "/sys/fs/cgroup/memory", "rw,nosuid,nodev,noexec,relatime", o(), "cgroup", "cgroup", "rw,memory", syscall.MS_NOSUID|syscall.MS_NODEV|syscall.MS_NOEXEC|syscall.MS_RELATIME, nil),
|
||||
m(28, 21, 0, 24, "/", "/sys/fs/cgroup/devices", "rw,nosuid,nodev,noexec,relatime", o(), "cgroup", "cgroup", "rw,devices", syscall.MS_NOSUID|syscall.MS_NODEV|syscall.MS_NOEXEC|syscall.MS_RELATIME, nil),
|
||||
m(29, 21, 0, 25, "/", "/sys/fs/cgroup/freezer", "rw,nosuid,nodev,noexec,relatime", o(), "cgroup", "cgroup", "rw,freezer", syscall.MS_NOSUID|syscall.MS_NODEV|syscall.MS_NOEXEC|syscall.MS_RELATIME, nil),
|
||||
m(30, 21, 0, 26, "/", "/sys/fs/cgroup/net_cls", "rw,nosuid,nodev,noexec,relatime", o(), "cgroup", "cgroup", "rw,net_cls", syscall.MS_NOSUID|syscall.MS_NODEV|syscall.MS_NOEXEC|syscall.MS_RELATIME, nil),
|
||||
m(31, 21, 0, 27, "/", "/sys/fs/cgroup/blkio", "rw,nosuid,nodev,noexec,relatime", o(), "cgroup", "cgroup", "rw,blkio", syscall.MS_NOSUID|syscall.MS_NODEV|syscall.MS_NOEXEC|syscall.MS_RELATIME, nil),
|
||||
m(32, 16, 0, 28, "/", "/sys/kernel/security", "rw,relatime", o(), "autofs", "systemd-1", "rw,fd=22,pgrp=1,timeout=300,minproto=5,maxproto=5,direct", syscall.MS_RELATIME, nil),
|
||||
m(33, 17, 0, 29, "/", "/dev/hugepages", "rw,relatime", o(), "autofs", "systemd-1", "rw,fd=23,pgrp=1,timeout=300,minproto=5,maxproto=5,direct", syscall.MS_RELATIME, nil),
|
||||
m(34, 16, 0, 30, "/", "/sys/kernel/debug", "rw,relatime", o(), "autofs", "systemd-1", "rw,fd=24,pgrp=1,timeout=300,minproto=5,maxproto=5,direct", syscall.MS_RELATIME, nil),
|
||||
m(35, 15, 0, 31, "/", "/proc/sys/fs/binfmt_misc", "rw,relatime", o(), "autofs", "systemd-1", "rw,fd=25,pgrp=1,timeout=300,minproto=5,maxproto=5,direct", syscall.MS_RELATIME, nil),
|
||||
m(36, 17, 0, 32, "/", "/dev/mqueue", "rw,relatime", o(), "autofs", "systemd-1", "rw,fd=26,pgrp=1,timeout=300,minproto=5,maxproto=5,direct", syscall.MS_RELATIME, nil),
|
||||
m(37, 15, 0, 14, "/", "/proc/bus/usb", "rw,relatime", o(), "usbfs", "/proc/bus/usb", "rw", syscall.MS_RELATIME, nil),
|
||||
m(38, 33, 0, 33, "/", "/dev/hugepages", "rw,relatime", o(), "hugetlbfs", "hugetlbfs", "rw", syscall.MS_RELATIME, nil),
|
||||
m(39, 36, 0, 12, "/", "/dev/mqueue", "rw,relatime", o(), "mqueue", "mqueue", "rw", syscall.MS_RELATIME, nil),
|
||||
m(40, 20, 8, 6, "/", "/boot", "rw,noatime", o(), "ext3", "/dev/sda6", "rw,errors=continue,barrier=0,data=ordered", syscall.MS_NOATIME, nil),
|
||||
m(41, 20, 253, 0, "/", "/home/kzak", "rw,noatime", o(), "ext4", "/dev/mapper/kzak-home", "rw,barrier=1,data=ordered", syscall.MS_NOATIME, nil),
|
||||
m(42, 35, 0, 34, "/", "/proc/sys/fs/binfmt_misc", "rw,relatime", o(), "binfmt_misc", "none", "rw", syscall.MS_RELATIME, nil),
|
||||
m(43, 16, 0, 35, "/", "/sys/fs/fuse/connections", "rw,relatime", o(), "fusectl", "fusectl", "rw", syscall.MS_RELATIME, nil),
|
||||
m(44, 41, 0, 36, "/", "/home/kzak/.gvfs", "rw,nosuid,nodev,relatime", o(), "fuse.gvfs-fuse-daemon", "gvfs-fuse-daemon", "rw,user_id=500,group_id=500", syscall.MS_NOSUID|syscall.MS_NODEV|syscall.MS_RELATIME, nil),
|
||||
m(45, 20, 0, 37, "/", "/var/lib/nfs/rpc_pipefs", "rw,relatime", o(), "rpc_pipefs", "sunrpc", "rw", syscall.MS_RELATIME, nil),
|
||||
m(47, 20, 0, 38, "/", "/mnt/sounds", "rw,relatime", o(), "cifs", "//foo.home/bar/", "rw,unc=\\\\foo.home\\bar,username=kzak,domain=SRGROUP,uid=0,noforceuid,gid=0,noforcegid,addr=192.168.111.1,posixpaths,serverino,acl,rsize=16384,wsize=57344", syscall.MS_RELATIME, nil),
|
||||
m(49, 20, 0, 56, "/", "/mnt/test/foobar", "rw,relatime,nosymfollow", o("shared:323"), "tmpfs", "tmpfs", "rw", syscall.MS_RELATIME|vfs.MS_NOSYMFOLLOW, nil),
|
||||
}, nil, nil},
|
||||
|
||||
{"sample nosrc", sampleMountinfoNoSrc, nil, "", []*wantMountInfo{
|
||||
m(15, 20, 0, 3, "/", "/proc", "rw,relatime", o(), "proc", "/proc", "rw", syscall.MS_RELATIME, nil),
|
||||
m(16, 20, 0, 15, "/", "/sys", "rw,relatime", o(), "sysfs", "/sys", "rw", syscall.MS_RELATIME, nil),
|
||||
m(17, 20, 0, 5, "/", "/dev", "rw,relatime", o(), "devtmpfs", "udev", "rw,size=1983516k,nr_inodes=495879,mode=755", syscall.MS_RELATIME, nil),
|
||||
m(18, 17, 0, 10, "/", "/dev/pts", "rw,relatime", o(), "devpts", "devpts", "rw,gid=5,mode=620,ptmxmode=000", syscall.MS_RELATIME, nil),
|
||||
m(19, 17, 0, 16, "/", "/dev/shm", "rw,relatime", o(), "tmpfs", "tmpfs", "rw", syscall.MS_RELATIME, nil),
|
||||
m(20, 1, 8, 4, "/", "/", "rw,noatime", o(), "ext3", "/dev/sda4", "rw,errors=continue,user_xattr,acl,barrier=0,data=ordered", syscall.MS_NOATIME, nil),
|
||||
m(21, 20, 0, 53, "/", "/mnt/test", "rw,relatime", o("shared:212"), "tmpfs", "", "rw", syscall.MS_RELATIME, nil),
|
||||
}, nil, nil},
|
||||
}
|
||||
|
||||
for _, tc := range testCases {
|
||||
t.Run(tc.name, func(t *testing.T) {
|
||||
t.Run("decode", func(t *testing.T) {
|
||||
var got *vfs.MountInfo
|
||||
d := vfs.NewMountInfoDecoder(strings.NewReader(tc.sample))
|
||||
err := d.Decode(&got)
|
||||
tc.check(t, d, "Decode",
|
||||
func(yield func(*vfs.MountInfoEntry) bool) {
|
||||
for cur := got; cur != nil; cur = cur.Next {
|
||||
if !yield(&cur.MountInfoEntry) {
|
||||
return
|
||||
}
|
||||
}
|
||||
}, func() error { return err })
|
||||
t.Run("reuse", func(t *testing.T) {
|
||||
tc.check(t, d, "Entries",
|
||||
d.Entries(), d.Err)
|
||||
})
|
||||
})
|
||||
|
||||
t.Run("iter", func(t *testing.T) {
|
||||
d := vfs.NewMountInfoDecoder(strings.NewReader(tc.sample))
|
||||
tc.check(t, d, "Entries",
|
||||
d.Entries(), d.Err)
|
||||
|
||||
t.Run("reuse", func(t *testing.T) {
|
||||
tc.check(t, d, "Entries",
|
||||
d.Entries(), d.Err)
|
||||
})
|
||||
})
|
||||
|
||||
t.Run("yield", func(t *testing.T) {
|
||||
d := vfs.NewMountInfoDecoder(strings.NewReader(tc.sample))
|
||||
v := false
|
||||
d.Entries()(func(entry *vfs.MountInfoEntry) bool { v = !v; return v })
|
||||
d.Entries()(func(entry *vfs.MountInfoEntry) bool { return false })
|
||||
|
||||
tc.check(t, d, "Entries",
|
||||
d.Entries(), d.Err)
|
||||
|
||||
t.Run("reuse", func(t *testing.T) {
|
||||
tc.check(t, d, "Entries",
|
||||
d.Entries(), d.Err)
|
||||
})
|
||||
})
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
type mountInfoTest struct {
|
||||
name string
|
||||
sample string
|
||||
wantErr error
|
||||
wantError string
|
||||
want []*wantMountInfo
|
||||
|
||||
wantNode *vfs.MountInfoNode
|
||||
wantCollectF func(n *vfs.MountInfoNode) []*vfs.MountInfoNode
|
||||
}
|
||||
|
||||
func (tc *mountInfoTest) check(t *testing.T, d *vfs.MountInfoDecoder, funcName string,
|
||||
got iter.Seq[*vfs.MountInfoEntry], gotErr func() error) {
|
||||
i := 0
|
||||
for cur := range got {
|
||||
if i == len(tc.want) {
|
||||
if funcName != "Decode" && (tc.wantErr != nil || tc.wantError != "") {
|
||||
continue
|
||||
}
|
||||
|
||||
t.Errorf("%s: got more than %d entries", funcName, len(tc.want))
|
||||
break
|
||||
}
|
||||
|
||||
if !reflect.DeepEqual(cur, &tc.want[i].MountInfoEntry) {
|
||||
t.Errorf("%s: entry %d\ngot: %#v\nwant: %#v",
|
||||
funcName, i, cur, tc.want[i])
|
||||
}
|
||||
|
||||
flags, unmatched := cur.Flags()
|
||||
if flags != tc.want[i].flags {
|
||||
t.Errorf("Flags(%q): %#x, want %#x",
|
||||
cur.VfsOptstr, flags, tc.want[i].flags)
|
||||
}
|
||||
if !slices.Equal(unmatched, tc.want[i].unmatched) {
|
||||
t.Errorf("Flags(%q): unmatched = %#q, want %#q",
|
||||
cur.VfsOptstr, unmatched, tc.want[i].unmatched)
|
||||
}
|
||||
|
||||
i++
|
||||
}
|
||||
|
||||
if i != len(tc.want) {
|
||||
t.Errorf("%s: got %d entries, want %d", funcName, i, len(tc.want))
|
||||
}
|
||||
|
||||
if tc.wantErr == nil && tc.wantError == "" && tc.wantCollectF != nil {
|
||||
t.Run("unfold", func(t *testing.T) {
|
||||
n, err := d.Unfold("/")
|
||||
if err != nil {
|
||||
t.Errorf("Unfold: error = %v", err)
|
||||
} else {
|
||||
t.Run("stop", func(t *testing.T) {
|
||||
v := false
|
||||
n.Collective()(func(node *vfs.MountInfoNode) bool { v = !v; return v })
|
||||
})
|
||||
|
||||
if !reflect.DeepEqual(n, tc.wantNode) {
|
||||
t.Errorf("Unfold: %s, want %s",
|
||||
mustMarshal(n), mustMarshal(tc.wantNode))
|
||||
}
|
||||
|
||||
t.Run("collective", func(t *testing.T) {
|
||||
wantCollect := tc.wantCollectF(n)
|
||||
if gotCollect := slices.Collect(n.Collective()); !reflect.DeepEqual(gotCollect, wantCollect) {
|
||||
t.Errorf("Collective: \ngot %#v\nwant %#v",
|
||||
gotCollect, wantCollect)
|
||||
}
|
||||
})
|
||||
}
|
||||
})
|
||||
} else if tc.wantNode != nil || tc.wantCollectF != nil {
|
||||
panic("invalid test case")
|
||||
} else if _, err := d.Unfold("/"); !errors.Is(err, tc.wantErr) {
|
||||
if tc.wantError == "" {
|
||||
t.Errorf("Unfold: error = %v, wantErr %v",
|
||||
err, tc.wantErr)
|
||||
} else if err != nil && err.Error() != tc.wantError {
|
||||
t.Errorf("Unfold: error = %q, wantError %q",
|
||||
err, tc.wantError)
|
||||
}
|
||||
}
|
||||
|
||||
if err := gotErr(); !errors.Is(err, tc.wantErr) {
|
||||
if tc.wantError == "" {
|
||||
t.Errorf("%s: error = %v, wantErr %v",
|
||||
funcName, err, tc.wantErr)
|
||||
} else if err != nil && err.Error() != tc.wantError {
|
||||
t.Errorf("%s: error = %q, wantError %q",
|
||||
funcName, err, tc.wantError)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func mustMarshal(v any) string {
|
||||
p, err := json.Marshal(v)
|
||||
if err != nil {
|
||||
panic(err.Error())
|
||||
}
|
||||
return string(p)
|
||||
}
|
||||
|
||||
type wantMountInfo struct {
|
||||
vfs.MountInfoEntry
|
||||
flags uintptr
|
||||
unmatched []string
|
||||
}
|
||||
|
||||
func m(
|
||||
id, parent, maj, min int, root, target, vfsOptstr string, optFields []string, fsType, source, fsOptstr string,
|
||||
flags uintptr, unmatched []string,
|
||||
) *wantMountInfo {
|
||||
return &wantMountInfo{
|
||||
vfs.MountInfoEntry{
|
||||
ID: id,
|
||||
Parent: parent,
|
||||
Devno: vfs.DevT{maj, min},
|
||||
Root: root,
|
||||
Target: target,
|
||||
VfsOptstr: vfsOptstr,
|
||||
OptFields: optFields,
|
||||
FsType: fsType,
|
||||
Source: source,
|
||||
FsOptstr: fsOptstr,
|
||||
}, flags, unmatched,
|
||||
}
|
||||
}
|
||||
|
||||
func mn(
|
||||
id, parent, maj, min int, root, target, vfsOptstr string, optFields []string, fsType, source, fsOptstr string,
|
||||
covered bool, firstChild, nextSibling *vfs.MountInfoNode,
|
||||
) *vfs.MountInfoNode {
|
||||
return &vfs.MountInfoNode{
|
||||
MountInfoEntry: &vfs.MountInfoEntry{
|
||||
ID: id,
|
||||
Parent: parent,
|
||||
Devno: vfs.DevT{maj, min},
|
||||
Root: root,
|
||||
Target: target,
|
||||
VfsOptstr: vfsOptstr,
|
||||
OptFields: optFields,
|
||||
FsType: fsType,
|
||||
Source: source,
|
||||
FsOptstr: fsOptstr,
|
||||
},
|
||||
FirstChild: firstChild,
|
||||
NextSibling: nextSibling,
|
||||
Clean: path.Clean(target),
|
||||
Covered: covered,
|
||||
}
|
||||
}
|
||||
|
||||
func o(field ...string) []string {
|
||||
if field == nil {
|
||||
return []string{}
|
||||
}
|
||||
return field
|
||||
}
|
||||
|
||||
const (
|
||||
sampleMountinfoBase = `15 20 0:3 / /proc rw,relatime - proc /proc rw
|
||||
16 20 0:15 / /sys rw,relatime - sysfs /sys rw
|
||||
17 20 0:5 / /dev rw,relatime - devtmpfs udev rw,size=1983516k,nr_inodes=495879,mode=755
|
||||
18 17 0:10 / /dev/pts rw,relatime - devpts devpts rw,gid=5,mode=620,ptmxmode=000
|
||||
19 17 0:16 / /dev/shm rw,relatime - tmpfs tmpfs rw
|
||||
20 1 8:4 / / ro,noatime,nodiratime,meow - ext3 /dev/sda4 rw,errors=continue,user_xattr,acl,barrier=0,data=ordered`
|
||||
|
||||
sampleMountinfo = `15 20 0:3 / /proc rw,relatime - proc /proc rw
|
||||
16 20 0:15 / /sys rw,relatime - sysfs /sys rw
|
||||
17 20 0:5 / /dev rw,relatime - devtmpfs udev rw,size=1983516k,nr_inodes=495879,mode=755
|
||||
18 17 0:10 / /dev/pts rw,relatime - devpts devpts rw,gid=5,mode=620,ptmxmode=000
|
||||
19 17 0:16 / /dev/shm rw,relatime - tmpfs tmpfs rw
|
||||
20 1 8:4 / / rw,noatime - ext3 /dev/sda4 rw,errors=continue,user_xattr,acl,barrier=0,data=ordered
|
||||
21 16 0:17 / /sys/fs/cgroup rw,nosuid,nodev,noexec,relatime - tmpfs tmpfs rw,mode=755
|
||||
22 21 0:18 / /sys/fs/cgroup/systemd rw,nosuid,nodev,noexec,relatime - cgroup cgroup rw,release_agent=/lib/systemd/systemd-cgroups-agent,name=systemd
|
||||
23 21 0:19 / /sys/fs/cgroup/cpuset rw,nosuid,nodev,noexec,relatime - cgroup cgroup rw,cpuset
|
||||
24 21 0:20 / /sys/fs/cgroup/ns rw,nosuid,nodev,noexec,relatime - cgroup cgroup rw,ns
|
||||
25 21 0:21 / /sys/fs/cgroup/cpu rw,nosuid,nodev,noexec,relatime - cgroup cgroup rw,cpu
|
||||
26 21 0:22 / /sys/fs/cgroup/cpuacct rw,nosuid,nodev,noexec,relatime - cgroup cgroup rw,cpuacct
|
||||
27 21 0:23 / /sys/fs/cgroup/memory rw,nosuid,nodev,noexec,relatime - cgroup cgroup rw,memory
|
||||
28 21 0:24 / /sys/fs/cgroup/devices rw,nosuid,nodev,noexec,relatime - cgroup cgroup rw,devices
|
||||
29 21 0:25 / /sys/fs/cgroup/freezer rw,nosuid,nodev,noexec,relatime - cgroup cgroup rw,freezer
|
||||
30 21 0:26 / /sys/fs/cgroup/net_cls rw,nosuid,nodev,noexec,relatime - cgroup cgroup rw,net_cls
|
||||
31 21 0:27 / /sys/fs/cgroup/blkio rw,nosuid,nodev,noexec,relatime - cgroup cgroup rw,blkio
|
||||
32 16 0:28 / /sys/kernel/security rw,relatime - autofs systemd-1 rw,fd=22,pgrp=1,timeout=300,minproto=5,maxproto=5,direct
|
||||
33 17 0:29 / /dev/hugepages rw,relatime - autofs systemd-1 rw,fd=23,pgrp=1,timeout=300,minproto=5,maxproto=5,direct
|
||||
34 16 0:30 / /sys/kernel/debug rw,relatime - autofs systemd-1 rw,fd=24,pgrp=1,timeout=300,minproto=5,maxproto=5,direct
|
||||
35 15 0:31 / /proc/sys/fs/binfmt_misc rw,relatime - autofs systemd-1 rw,fd=25,pgrp=1,timeout=300,minproto=5,maxproto=5,direct
|
||||
36 17 0:32 / /dev/mqueue rw,relatime - autofs systemd-1 rw,fd=26,pgrp=1,timeout=300,minproto=5,maxproto=5,direct
|
||||
37 15 0:14 / /proc/bus/usb rw,relatime - usbfs /proc/bus/usb rw
|
||||
38 33 0:33 / /dev/hugepages rw,relatime - hugetlbfs hugetlbfs rw
|
||||
39 36 0:12 / /dev/mqueue rw,relatime - mqueue mqueue rw
|
||||
40 20 8:6 / /boot rw,noatime - ext3 /dev/sda6 rw,errors=continue,barrier=0,data=ordered
|
||||
41 20 253:0 / /home/kzak rw,noatime - ext4 /dev/mapper/kzak-home rw,barrier=1,data=ordered
|
||||
42 35 0:34 / /proc/sys/fs/binfmt_misc rw,relatime - binfmt_misc none rw
|
||||
43 16 0:35 / /sys/fs/fuse/connections rw,relatime - fusectl fusectl rw
|
||||
44 41 0:36 / /home/kzak/.gvfs rw,nosuid,nodev,relatime - fuse.gvfs-fuse-daemon gvfs-fuse-daemon rw,user_id=500,group_id=500
|
||||
45 20 0:37 / /var/lib/nfs/rpc_pipefs rw,relatime - rpc_pipefs sunrpc rw
|
||||
47 20 0:38 / /mnt/sounds rw,relatime - cifs //foo.home/bar/ rw,unc=\\foo.home\bar,username=kzak,domain=SRGROUP,uid=0,noforceuid,gid=0,noforcegid,addr=192.168.111.1,posixpaths,serverino,acl,rsize=16384,wsize=57344
|
||||
49 20 0:56 / /mnt/test/foobar rw,relatime,nosymfollow shared:323 - tmpfs tmpfs rw`
|
||||
|
||||
sampleMountinfoNoSrc = `15 20 0:3 / /proc rw,relatime - proc /proc rw
|
||||
16 20 0:15 / /sys rw,relatime - sysfs /sys rw
|
||||
17 20 0:5 / /dev rw,relatime - devtmpfs udev rw,size=1983516k,nr_inodes=495879,mode=755
|
||||
18 17 0:10 / /dev/pts rw,relatime - devpts devpts rw,gid=5,mode=620,ptmxmode=000
|
||||
19 17 0:16 / /dev/shm rw,relatime - tmpfs tmpfs rw
|
||||
20 1 8:4 / / rw,noatime - ext3 /dev/sda4 rw,errors=continue,user_xattr,acl,barrier=0,data=ordered
|
||||
21 20 0:53 / /mnt/test rw,relatime shared:212 - tmpfs rw`
|
||||
)
|
||||
107
container/vfs/unfold.go
Normal file
107
container/vfs/unfold.go
Normal file
@@ -0,0 +1,107 @@
|
||||
package vfs
|
||||
|
||||
import (
|
||||
"iter"
|
||||
"path"
|
||||
"strings"
|
||||
"syscall"
|
||||
)
|
||||
|
||||
// MountInfoNode positions a [MountInfoEntry] in its mount hierarchy.
|
||||
type MountInfoNode struct {
|
||||
*MountInfoEntry
|
||||
FirstChild *MountInfoNode `json:"first_child"`
|
||||
NextSibling *MountInfoNode `json:"next_sibling"`
|
||||
|
||||
Clean string `json:"clean"`
|
||||
Covered bool `json:"covered"`
|
||||
}
|
||||
|
||||
// Collective returns an iterator over visible mountinfo nodes.
|
||||
func (n *MountInfoNode) Collective() iter.Seq[*MountInfoNode] {
|
||||
return func(yield func(*MountInfoNode) bool) { n.visit(yield) }
|
||||
}
|
||||
|
||||
func (n *MountInfoNode) visit(yield func(*MountInfoNode) bool) bool {
|
||||
if !n.Covered && !yield(n) {
|
||||
return false
|
||||
}
|
||||
for cur := n.FirstChild; cur != nil; cur = cur.NextSibling {
|
||||
if !cur.visit(yield) {
|
||||
return false
|
||||
}
|
||||
}
|
||||
return true
|
||||
}
|
||||
|
||||
// Unfold unfolds the mount hierarchy and resolves covered paths.
|
||||
func (d *MountInfoDecoder) Unfold(target string) (*MountInfoNode, error) {
|
||||
targetClean := path.Clean(target)
|
||||
|
||||
var mountinfoSize int
|
||||
for range d.Entries() {
|
||||
mountinfoSize++
|
||||
}
|
||||
if err := d.Err(); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
mountinfo := make([]*MountInfoNode, mountinfoSize)
|
||||
// mount ID to index lookup
|
||||
idIndex := make(map[int]int, mountinfoSize)
|
||||
// final entry to match target
|
||||
targetIndex := -1
|
||||
{
|
||||
i := 0
|
||||
for ent := range d.Entries() {
|
||||
mountinfo[i] = &MountInfoNode{Clean: path.Clean(ent.Target), MountInfoEntry: ent}
|
||||
idIndex[ent.ID] = i
|
||||
if mountinfo[i].Clean == targetClean {
|
||||
targetIndex = i
|
||||
}
|
||||
|
||||
i++
|
||||
}
|
||||
}
|
||||
|
||||
if targetIndex == -1 {
|
||||
return nil, syscall.ESTALE
|
||||
}
|
||||
|
||||
for _, cur := range mountinfo {
|
||||
var parent *MountInfoNode
|
||||
if p, ok := idIndex[cur.Parent]; !ok {
|
||||
continue
|
||||
} else {
|
||||
parent = mountinfo[p]
|
||||
}
|
||||
|
||||
if !strings.HasPrefix(cur.Clean, targetClean) {
|
||||
continue
|
||||
}
|
||||
if parent.Clean == cur.Clean {
|
||||
parent.Covered = true
|
||||
}
|
||||
|
||||
covered := false
|
||||
nsp := &parent.FirstChild
|
||||
for s := parent.FirstChild; s != nil; s = s.NextSibling {
|
||||
if strings.HasPrefix(cur.Clean, s.Clean) {
|
||||
covered = true
|
||||
break
|
||||
}
|
||||
|
||||
if strings.HasPrefix(s.Clean, cur.Clean) {
|
||||
*nsp = s.NextSibling
|
||||
} else {
|
||||
nsp = &s.NextSibling
|
||||
}
|
||||
}
|
||||
if covered {
|
||||
continue
|
||||
}
|
||||
*nsp = cur
|
||||
}
|
||||
|
||||
return mountinfo[targetIndex], nil
|
||||
}
|
||||
93
container/vfs/unfold_test.go
Normal file
93
container/vfs/unfold_test.go
Normal file
@@ -0,0 +1,93 @@
|
||||
package vfs_test
|
||||
|
||||
import (
|
||||
"errors"
|
||||
"reflect"
|
||||
"slices"
|
||||
"strings"
|
||||
"syscall"
|
||||
"testing"
|
||||
|
||||
"git.gensokyo.uk/security/hakurei/container/vfs"
|
||||
)
|
||||
|
||||
func TestUnfold(t *testing.T) {
|
||||
testCases := []struct {
|
||||
name string
|
||||
sample string
|
||||
target string
|
||||
wantErr error
|
||||
|
||||
want *vfs.MountInfoNode
|
||||
wantCollectF func(n *vfs.MountInfoNode) []*vfs.MountInfoNode
|
||||
wantCollectN []string
|
||||
}{
|
||||
{
|
||||
"no match",
|
||||
sampleMountinfoBase,
|
||||
"/mnt",
|
||||
syscall.ESTALE, nil, nil, nil,
|
||||
},
|
||||
{
|
||||
"cover",
|
||||
`33 1 0:33 / / rw,relatime shared:1 - tmpfs impure rw,size=16777216k,mode=755
|
||||
37 33 0:32 / /proc rw,nosuid,nodev,noexec,relatime shared:41 - proc proc rw
|
||||
551 33 0:121 / /mnt rw,relatime shared:666 - tmpfs tmpfs rw
|
||||
595 551 0:123 / /mnt rw,relatime shared:990 - tmpfs tmpfs rw
|
||||
611 595 0:142 / /mnt/etc rw,relatime shared:1112 - tmpfs tmpfs rw
|
||||
625 644 0:142 /passwd /mnt/etc/passwd rw,relatime shared:1112 - tmpfs tmpfs rw
|
||||
641 625 0:33 /etc/passwd /mnt/etc/passwd rw,relatime shared:1 - tmpfs impure rw,size=16777216k,mode=755
|
||||
644 611 0:33 /etc/passwd /mnt/etc/passwd rw,relatime shared:1 - tmpfs impure rw,size=16777216k,mode=755
|
||||
`, "/mnt", nil,
|
||||
mn(595, 551, 0, 123, "/", "/mnt", "rw,relatime", o("shared:990"), "tmpfs", "tmpfs", "rw", false,
|
||||
mn(611, 595, 0, 142, "/", "/mnt/etc", "rw,relatime", o("shared:1112"), "tmpfs", "tmpfs", "rw", false,
|
||||
mn(644, 611, 0, 33, "/etc/passwd", "/mnt/etc/passwd", "rw,relatime", o("shared:1"), "tmpfs", "impure", "rw,size=16777216k,mode=755", true,
|
||||
mn(625, 644, 0, 142, "/passwd", "/mnt/etc/passwd", "rw,relatime", o("shared:1112"), "tmpfs", "tmpfs", "rw", true,
|
||||
mn(641, 625, 0, 33, "/etc/passwd", "/mnt/etc/passwd", "rw,relatime", o("shared:1"), "tmpfs", "impure", "rw,size=16777216k,mode=755", false,
|
||||
nil, nil), nil), nil), nil), nil), func(n *vfs.MountInfoNode) []*vfs.MountInfoNode {
|
||||
return []*vfs.MountInfoNode{n, n.FirstChild, n.FirstChild.FirstChild.FirstChild.FirstChild}
|
||||
}, []string{"/mnt", "/mnt/etc", "/mnt/etc/passwd"},
|
||||
},
|
||||
}
|
||||
|
||||
for _, tc := range testCases {
|
||||
t.Run(tc.name, func(t *testing.T) {
|
||||
d := vfs.NewMountInfoDecoder(strings.NewReader(tc.sample))
|
||||
got, err := d.Unfold(tc.target)
|
||||
|
||||
if !errors.Is(err, tc.wantErr) {
|
||||
t.Errorf("Unfold: error = %v, wantErr %v",
|
||||
err, tc.wantErr)
|
||||
}
|
||||
|
||||
if !reflect.DeepEqual(got, tc.want) {
|
||||
t.Errorf("Unfold:\ngot %s\nwant %s",
|
||||
mustMarshal(got), mustMarshal(tc.want))
|
||||
}
|
||||
|
||||
if err == nil && tc.wantCollectF != nil {
|
||||
t.Run("collective", func(t *testing.T) {
|
||||
wantCollect := tc.wantCollectF(got)
|
||||
gotCollect := slices.Collect(got.Collective())
|
||||
if !reflect.DeepEqual(gotCollect, wantCollect) {
|
||||
t.Errorf("Collective: \ngot %#v\nwant %#v",
|
||||
gotCollect, wantCollect)
|
||||
}
|
||||
t.Run("target", func(t *testing.T) {
|
||||
gotCollectN := slices.Collect[string](func(yield func(v string) bool) {
|
||||
for _, cur := range gotCollect {
|
||||
if !yield(cur.Clean) {
|
||||
return
|
||||
}
|
||||
}
|
||||
})
|
||||
if !reflect.DeepEqual(gotCollectN, tc.wantCollectN) {
|
||||
t.Errorf("Collective: got %q, want %q",
|
||||
gotCollectN, tc.wantCollectN)
|
||||
}
|
||||
})
|
||||
})
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
Reference in New Issue
Block a user