7 Commits

Author SHA1 Message Date
8a3c3d145a internal/pkg: correctly generate cure expects
All checks were successful
Test / Create distribution (push) Successful in 1m4s
Test / Sandbox (push) Successful in 2m48s
Test / Hakurei (push) Successful in 3m47s
Test / ShareFS (push) Successful in 3m53s
Test / Sandbox (race detector) (push) Successful in 5m19s
Test / Hakurei (race detector) (push) Successful in 6m25s
Test / Flake checks (push) Successful in 1m22s
This needs to dereference the identifier symlink.

Signed-off-by: Ophestra <cat@gensokyo.uk>
2026-05-07 15:57:45 +09:00
575ef307ad container: binfmt registration
All checks were successful
Test / Create distribution (push) Successful in 1m4s
Test / Sandbox (push) Successful in 2m43s
Test / Hakurei (push) Successful in 3m49s
Test / ShareFS (push) Successful in 3m55s
Test / Sandbox (race detector) (push) Successful in 5m18s
Test / Hakurei (race detector) (push) Successful in 6m25s
Test / Flake checks (push) Successful in 1m22s
This arranges for binfmt entries to be registered for the container.

Signed-off-by: Ophestra <cat@gensokyo.uk>
2026-05-07 15:55:19 +09:00
d4144fcf7f container: optionally map uid/gid 0 as init
All checks were successful
Test / Create distribution (push) Successful in 1m3s
Test / Sandbox (push) Successful in 2m48s
Test / Hakurei (push) Successful in 3m45s
Test / ShareFS (push) Successful in 3m55s
Test / Sandbox (race detector) (push) Successful in 5m15s
Test / Hakurei (race detector) (push) Successful in 6m31s
Test / Flake checks (push) Successful in 1m21s
Unfortunately required to work around flawed APIs like binfmt_misc.

Signed-off-by: Ophestra <cat@gensokyo.uk>
2026-05-07 15:15:47 +09:00
bad66facbc container: improve capability handling
All checks were successful
Test / Create distribution (push) Successful in 1m4s
Test / Sandbox (push) Successful in 2m44s
Test / Hakurei (push) Successful in 4m5s
Test / ShareFS (push) Successful in 4m25s
Test / Sandbox (race detector) (push) Successful in 5m55s
Test / Hakurei (race detector) (push) Successful in 7m54s
Test / Flake checks (push) Successful in 1m35s
This cleans up preserving caps for expansion and correctly sets privileged caps.

Signed-off-by: Ophestra <cat@gensokyo.uk>
2026-05-07 14:27:28 +09:00
4aba014eac container: abandon response on termination
All checks were successful
Test / Create distribution (push) Successful in 1m3s
Test / Sandbox (push) Successful in 2m51s
Test / Hakurei (push) Successful in 3m49s
Test / ShareFS (push) Successful in 3m56s
Test / Sandbox (race detector) (push) Successful in 5m18s
Test / Hakurei (race detector) (push) Successful in 6m22s
Test / Flake checks (push) Successful in 1m23s
This prevents blocking on early failure.

Signed-off-by: Ophestra <cat@gensokyo.uk>
2026-05-07 00:58:02 +09:00
779ba994ce container: check capability in test helper
All checks were successful
Test / Create distribution (push) Successful in 1m4s
Test / Sandbox (push) Successful in 2m48s
Test / Hakurei (push) Successful in 3m51s
Test / ShareFS (push) Successful in 3m53s
Test / Sandbox (race detector) (push) Successful in 5m18s
Test / Hakurei (race detector) (push) Successful in 6m25s
Test / Flake checks (push) Successful in 1m22s
This makes corresponding nixos tests redundant.

Signed-off-by: Ophestra <cat@gensokyo.uk>
2026-05-06 21:05:54 +09:00
917be2de93 internal/pkg/exec: close early failure before wait
All checks were successful
Test / Create distribution (push) Successful in 1m4s
Test / Sandbox (push) Successful in 2m46s
Test / Hakurei (push) Successful in 3m44s
Test / ShareFS (push) Successful in 3m53s
Test / Sandbox (race detector) (push) Successful in 5m15s
Test / Hakurei (race detector) (push) Successful in 6m21s
Test / Flake checks (push) Successful in 1m22s
This avoids a deadlock on an early container failure.

Signed-off-by: Ophestra <cat@gensokyo.uk>
2026-05-06 18:38:16 +09:00
11 changed files with 445 additions and 107 deletions

46
container/binfmt.go Normal file
View File

@@ -0,0 +1,46 @@
package container
import (
"strings"
"unsafe"
"hakurei.app/check"
)
// escapeBinfmt escapes magic/mask sequences in a [BinfmtEntry].
func escapeBinfmt(buf *strings.Builder, s string) string {
const lowerhex = "0123456789abcdef"
buf.Reset()
for _, c := range unsafe.Slice(unsafe.StringData(s), len(s)) {
switch c {
case 0, '\\', ':':
buf.WriteString(`\x`)
buf.WriteByte(lowerhex[c>>4])
buf.WriteByte(lowerhex[c&0xf])
default:
buf.WriteByte(c)
}
}
return buf.String()
}
// BinfmtEntry is an entry to be registered by the init process.
type BinfmtEntry struct {
// The offset of the magic/mask in the file, counted in bytes.
Offset byte
// The byte sequence binfmt_misc is matching for.
Magic string
// An (optional, defaults to all 0xff) mask.
Mask string
// The program that should be invoked with the binary as first argument.
Interpreter *check.Absolute
}
// Valid returns whether e can be registered into the kernel.
func (e *BinfmtEntry) Valid() bool {
return e != nil &&
int(e.Offset)+max(len(e.Magic), len(e.Mask)) < 128 &&
e.Interpreter != nil && len(e.Interpreter.String()) < 128
}

62
container/binfmt_test.go Normal file
View File

@@ -0,0 +1,62 @@
package container
import (
"strings"
"testing"
"hakurei.app/fhs"
)
func TestEscapeBinfmt(t *testing.T) {
t.Parallel()
testCases := []struct {
name string
magic string
want string
}{
{"packed DOS applications", "\x0eDEX", "\x0eDEX"},
{"riscv64 magic",
"\x7fELF\x02\x01\x01\x00\x00\x00\x00\x00\x00\x00\x00\x00\x02\x00\xf3\x00",
"\x7fELF\x02\x01\x01\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\x02\\x00\xf3\\x00"},
{"riscv64 mask",
"\xff\xff\xff\xff\xff\xff\xff\x00\xff\xff\xff\xff\xff\xff\xff\xff\xfe\xff\xff\xff",
"\xff\xff\xff\xff\xff\xff\xff\\x00\xff\xff\xff\xff\xff\xff\xff\xff\xfe\xff\xff\xff"},
}
for _, tc := range testCases {
t.Run(tc.name, func(t *testing.T) {
t.Parallel()
got := escapeBinfmt(new(strings.Builder), tc.magic)
if got != tc.want {
t.Errorf("escapeBinfmt: %q, want %q", got, tc.want)
}
})
}
}
func TestBinfmtEntry(t *testing.T) {
t.Parallel()
testCases := []struct {
name string
e BinfmtEntry
valid bool
}{
{"zero", BinfmtEntry{}, false},
{"large offset", BinfmtEntry{Offset: 128}, false},
{"long magic", BinfmtEntry{Magic: strings.Repeat("\x00", 128)}, false},
{"long mask", BinfmtEntry{Mask: strings.Repeat("\x00", 128)}, false},
{"valid", BinfmtEntry{Interpreter: fhs.AbsRoot}, true},
}
for _, tc := range testCases {
t.Run(tc.name, func(t *testing.T) {
t.Parallel()
if tc.e.Valid() != tc.valid {
t.Errorf("Valid: %v", !tc.valid)
}
})
}
}

View File

@@ -18,6 +18,7 @@ const (
CAP_SETPCAP = 0x8 CAP_SETPCAP = 0x8
CAP_NET_ADMIN = 0xc CAP_NET_ADMIN = 0xc
CAP_DAC_OVERRIDE = 0x1 CAP_DAC_OVERRIDE = 0x1
CAP_SETFCAP = 0x1f
) )
type ( type (

View File

@@ -91,12 +91,20 @@ type (
// Time to wait for processes lingering after the initial process terminates. // Time to wait for processes lingering after the initial process terminates.
AdoptWaitDelay time.Duration AdoptWaitDelay time.Duration
// Map uid/gid 0 in the init process. Requires [FstypeProc] attached to
// [fhs.Proc] in the container filesystem.
InitAsRoot bool
// Mapped Uid in user namespace. // Mapped Uid in user namespace.
Uid int Uid int
// Mapped Gid in user namespace. // Mapped Gid in user namespace.
Gid int Gid int
// Hostname value in UTS namespace. // Hostname value in UTS namespace.
Hostname string Hostname string
// Register binfmt_misc entries.
Binfmt []BinfmtEntry
// Alternative pathname to attach binfmt_misc filesystem. The zero value
// requires [FstypeProc] to be made available at [fhs.Proc].
BinfmtPath *check.Absolute
// Sequential container setup ops. // Sequential container setup ops.
*Ops *Ops
@@ -216,6 +224,9 @@ func (p *Container) Start() error {
if p.cmd.Process != nil { if p.cmd.Process != nil {
return errors.New("container: already started") return errors.New("container: already started")
} }
if !p.InitAsRoot && len(p.Binfmt) > 0 {
return errors.New("container: init as root required, but not enabled")
}
if err := ensureCloseOnExec(); err != nil { if err := ensureCloseOnExec(); err != nil {
return err return err
@@ -286,6 +297,18 @@ func (p *Container) Start() error {
if !p.HostNet { if !p.HostNet {
p.cmd.SysProcAttr.Cloneflags |= CLONE_NEWNET p.cmd.SysProcAttr.Cloneflags |= CLONE_NEWNET
} }
if p.InitAsRoot {
p.cmd.SysProcAttr.AmbientCaps = append(p.cmd.SysProcAttr.AmbientCaps,
// mappings during init as root
CAP_SETFCAP,
)
if !p.SeccompDisable &&
len(p.SeccompRules) == 0 &&
p.SeccompPresets&std.PresetDenyNS != 0 {
return errors.New("container: as root requires late namespace creation")
}
}
// place setup pipe before user supplied extra files, this is later restored by init // place setup pipe before user supplied extra files, this is later restored by init
if r, w, err := os.Pipe(); err != nil { if r, w, err := os.Pipe(); err != nil {

View File

@@ -16,7 +16,8 @@ import (
"strings" "strings"
"syscall" "syscall"
"testing" "testing"
_ "unsafe" // for go:linkname "time"
"unsafe"
"hakurei.app/check" "hakurei.app/check"
"hakurei.app/command" "hakurei.app/command"
@@ -408,39 +409,11 @@ var containerTestCases = []struct {
func TestContainer(t *testing.T) { func TestContainer(t *testing.T) {
t.Parallel() t.Parallel()
t.Run("cancel", testContainerCancel(nil, func(t *testing.T, c *container.Container) { var suffix string
wantErr := context.Canceled runTests:
wantExitCode := 0
if err := c.Wait(); !reflect.DeepEqual(err, wantErr) {
if m, ok := container.InternalMessageFromError(err); ok {
t.Error(m)
}
t.Errorf("Wait: error = %#v, want %#v", err, wantErr)
}
if ps := c.ProcessState(); ps == nil {
t.Errorf("ProcessState unexpectedly returned nil")
} else if code := ps.ExitCode(); code != wantExitCode {
t.Errorf("ExitCode: %d, want %d", code, wantExitCode)
}
}))
t.Run("forward", testContainerCancel(func(c *container.Container) {
c.ForwardCancel = true
}, func(t *testing.T, c *container.Container) {
var exitError *exec.ExitError
if err := c.Wait(); !errors.As(err, &exitError) {
if m, ok := container.InternalMessageFromError(err); ok {
t.Error(m)
}
t.Errorf("Wait: error = %v", err)
}
if code := exitError.ExitCode(); code != blockExitCodeInterrupt {
t.Errorf("ExitCode: %d, want %d", code, blockExitCodeInterrupt)
}
}))
for i, tc := range containerTestCases { for i, tc := range containerTestCases {
t.Run(tc.name, func(t *testing.T) { _suffix := suffix
t.Run(tc.name+_suffix, func(t *testing.T) {
t.Parallel() t.Parallel()
wantOps, wantOpsCtx := tc.ops(t) wantOps, wantOpsCtx := tc.ops(t)
@@ -464,6 +437,8 @@ func TestContainer(t *testing.T) {
c.SeccompDisable = !tc.filter c.SeccompDisable = !tc.filter
c.RetainSession = tc.session c.RetainSession = tc.session
c.HostNet = tc.net c.HostNet = tc.net
c.InitAsRoot = _suffix != ""
c.Env = append(c.Env, "HAKUREI_TEST_SUFFIX="+_suffix)
if info.CanDegrade { if info.CanDegrade {
if _, err := landlock.GetABI(); err != nil { if _, err := landlock.GetABI(); err != nil {
if !errors.Is(err, syscall.ENOSYS) { if !errors.Is(err, syscall.ENOSYS) {
@@ -473,6 +448,9 @@ func TestContainer(t *testing.T) {
t.Log("Landlock LSM is unavailable, enabling HostAbstract") t.Log("Landlock LSM is unavailable, enabling HostAbstract")
} }
} }
if c.InitAsRoot {
c.SeccompPresets &= ^std.PresetDenyNS
}
c. c.
Readonly(check.MustAbs(pathReadonly), 0755). Readonly(check.MustAbs(pathReadonly), 0755).
@@ -541,6 +519,11 @@ func TestContainer(t *testing.T) {
} }
}) })
} }
if suffix == "" {
suffix = " as root"
goto runTests
}
} }
func ent(root, target, vfsOptstr, fsType, source, fsOptstr string) *vfs.MountInfoEntry { func ent(root, target, vfsOptstr, fsType, source, fsOptstr string) *vfs.MountInfoEntry {
@@ -563,11 +546,10 @@ func hostnameFromTestCase(name string) string {
} }
func testContainerCancel( func testContainerCancel(
t *testing.T,
containerExtra func(c *container.Container), containerExtra func(c *container.Container),
waitCheck func(t *testing.T, c *container.Container), waitCheck func(ps *os.ProcessState, waitErr error),
) func(t *testing.T) { ) {
return func(t *testing.T) {
t.Parallel()
ctx, cancel := context.WithCancel(t.Context()) ctx, cancel := context.WithCancel(t.Context())
c := helperNewContainer(ctx, "block") c := helperNewContainer(ctx, "block")
@@ -577,25 +559,36 @@ func testContainerCancel(
} }
ready := make(chan struct{}) ready := make(chan struct{})
if r, w, err := os.Pipe(); err != nil { var waitErr error
r, w, err := os.Pipe()
if err != nil {
t.Fatalf("cannot pipe: %v", err) t.Fatalf("cannot pipe: %v", err)
} else { }
c.ExtraFiles = append(c.ExtraFiles, w) c.ExtraFiles = append(c.ExtraFiles, w)
go func() { go func() {
defer close(ready) defer close(ready)
if _, err = r.Read(make([]byte, 1)); err != nil { if _, _err := r.Read(make([]byte, 1)); _err != nil {
panic(err.Error()) panic(_err)
} }
}() }()
}
if err := c.Start(); err != nil { if err = c.Start(); err != nil {
if m, ok := container.InternalMessageFromError(err); ok { if m, ok := container.InternalMessageFromError(err); ok {
t.Fatal(m) t.Fatal(m)
} else { } else {
t.Fatalf("cannot start container: %v", err) t.Fatalf("cannot start container: %v", err)
} }
} else if err = c.Serve(); err != nil { }
done := make(chan struct{})
go func() {
defer close(done)
waitErr = c.Wait()
_ = r.SetReadDeadline(time.Now())
}()
if err = c.Serve(); err != nil {
if m, ok := container.InternalMessageFromError(err); ok { if m, ok := container.InternalMessageFromError(err); ok {
t.Error(m) t.Error(m)
} else { } else {
@@ -604,8 +597,67 @@ func testContainerCancel(
} }
<-ready <-ready
cancel() cancel()
waitCheck(t, c) <-done
waitCheck(c.ProcessState(), waitErr)
} }
func TestForward(t *testing.T) {
t.Parallel()
f := func(ps *os.ProcessState, waitErr error) {
var exitError *exec.ExitError
if !errors.As(waitErr, &exitError) {
if m, ok := container.InternalMessageFromError(waitErr); ok {
t.Error(m)
}
t.Errorf("Wait: error = %v", waitErr)
}
if code := exitError.ExitCode(); code != blockExitCodeInterrupt {
t.Errorf("ExitCode: %d, want %d", code, blockExitCodeInterrupt)
}
}
t.Run("direct", func(t *testing.T) {
t.Parallel()
testContainerCancel(t, func(c *container.Container) {
c.ForwardCancel = true
}, f)
})
t.Run("as root", func(t *testing.T) {
testContainerCancel(t, func(c *container.Container) {
c.ForwardCancel = true
c.InitAsRoot = true
c.Proc(fhs.AbsProc)
}, f)
})
}
func TestCancel(t *testing.T) {
t.Parallel()
f := func(ps *os.ProcessState, waitErr error) {
wantErr := context.Canceled
if !reflect.DeepEqual(waitErr, wantErr) {
if m, ok := container.InternalMessageFromError(waitErr); ok {
t.Error(m)
}
t.Errorf("Wait: error = %#v, want %#v", waitErr, wantErr)
}
if ps == nil {
t.Errorf("ProcessState unexpectedly returned nil")
} else if code := ps.ExitCode(); code != 0 {
t.Errorf("ExitCode: %d, want %d", code, 0)
}
}
t.Run("direct", func(t *testing.T) {
t.Parallel()
testContainerCancel(t, nil, f)
})
t.Run("as root", func(t *testing.T) {
testContainerCancel(t, func(c *container.Container) {
c.InitAsRoot = true
c.Proc(fhs.AbsProc)
}, f)
})
} }
func TestContainerString(t *testing.T) { func TestContainerString(t *testing.T) {
@@ -641,6 +693,8 @@ func init() {
}) })
c.Command("container", command.UsageInternal, func(args []string) error { c.Command("container", command.UsageInternal, func(args []string) error {
asRoot := os.Getenv("HAKUREI_TEST_SUFFIX") == " as root"
if len(args) != 1 { if len(args) != 1 {
return syscall.EINVAL return syscall.EINVAL
} }
@@ -658,6 +712,66 @@ func init() {
return fmt.Errorf("gid: %d, want %d", gid, tc.gid) return fmt.Errorf("gid: %d, want %d", gid, tc.gid)
} }
// no attack surface increase during as root due to no_new_privs
var wantBounding uintptr = 1
asRootNot := " not"
if !asRoot {
wantBounding = 0
asRootNot = ""
}
const (
PR_CAP_AMBIENT = 0x2f
PR_CAP_AMBIENT_IS_SET = 0x1
)
for i := range container.LastCap(nil) + 1 {
r, _, errno := syscall.Syscall(
syscall.SYS_PRCTL,
PR_CAP_AMBIENT,
PR_CAP_AMBIENT_IS_SET,
i,
)
if errno != 0 {
return os.NewSyscallError("prctl", errno)
}
if r != 0 {
return fmt.Errorf("capability %d in ambient set", i)
}
r, _, errno = syscall.Syscall(
syscall.SYS_PRCTL,
syscall.PR_CAPBSET_READ,
i,
0,
)
if errno != 0 {
return os.NewSyscallError("prctl", errno)
}
if r != wantBounding {
return fmt.Errorf("capability %d%s in bounding set", i, asRootNot)
}
}
const _LINUX_CAPABILITY_VERSION_3 = 0x20080522
var capData struct {
effective uint32
permitted uint32
inheritable uint32
}
if _, _, errno := syscall.Syscall(syscall.SYS_CAPGET, uintptr(unsafe.Pointer(&struct {
version uint32
pid int32
}{_LINUX_CAPABILITY_VERSION_3, 0})), uintptr(unsafe.Pointer(&capData)), 0); errno != 0 {
return os.NewSyscallError("capget", errno)
}
if max(capData.effective, capData.permitted, capData.inheritable) != 0 {
return fmt.Errorf(
"effective = %d, permitted = %d, inheritable = %d",
capData.effective, capData.permitted, capData.inheritable,
)
}
wantHost := hostnameFromTestCase(tc.name) wantHost := hostnameFromTestCase(tc.name)
if host, err := os.Hostname(); err != nil { if host, err := os.Hostname(); err != nil {
return fmt.Errorf("cannot get hostname: %v", err) return fmt.Errorf("cannot get hostname: %v", err)
@@ -775,7 +889,7 @@ func TestMain(m *testing.M) {
} }
c.MustParse(os.Args[1:], func(err error) { c.MustParse(os.Args[1:], func(err error) {
if err != nil { if err != nil {
log.Fatal(err.Error()) log.Fatal(err)
} }
}) })
return return

View File

@@ -11,11 +11,13 @@ import (
"path/filepath" "path/filepath"
"slices" "slices"
"strconv" "strconv"
"strings"
"sync" "sync"
"sync/atomic" "sync/atomic"
. "syscall" . "syscall"
"time" "time"
"hakurei.app/check"
"hakurei.app/container/seccomp" "hakurei.app/container/seccomp"
"hakurei.app/ext" "hakurei.app/ext"
"hakurei.app/fhs" "hakurei.app/fhs"
@@ -182,23 +184,33 @@ func initEntrypoint(k syscallDispatcher, msg message.Msg) {
cancel() cancel()
} }
uid, gid := param.Uid, param.Gid
if param.InitAsRoot {
uid, gid = 0, 0
}
// write uid/gid map here so parent does not need to set dumpable // write uid/gid map here so parent does not need to set dumpable
if err := k.setDumpable(ext.SUID_DUMP_USER); err != nil { if err := k.setDumpable(ext.SUID_DUMP_USER); err != nil {
k.fatalf(msg, "cannot set SUID_DUMP_USER: %v", err) k.fatalf(msg, "cannot set SUID_DUMP_USER: %v", err)
} }
if err := k.writeFile(fhs.Proc+"self/uid_map", if err := k.writeFile(
append([]byte{}, strconv.Itoa(param.Uid)+" "+strconv.Itoa(param.HostUid)+" 1\n"...), fhs.Proc+"self/uid_map",
0); err != nil { []byte(strconv.Itoa(uid)+" "+strconv.Itoa(param.HostUid)+" 1\n"),
0,
); err != nil {
k.fatalf(msg, "%v", err) k.fatalf(msg, "%v", err)
} }
if err := k.writeFile(fhs.Proc+"self/setgroups", if err := k.writeFile(
fhs.Proc+"self/setgroups",
[]byte("deny\n"), []byte("deny\n"),
0); err != nil && !os.IsNotExist(err) { 0,
); err != nil && !os.IsNotExist(err) {
k.fatalf(msg, "%v", err) k.fatalf(msg, "%v", err)
} }
if err := k.writeFile(fhs.Proc+"self/gid_map", if err := k.writeFile(fhs.Proc+"self/gid_map",
append([]byte{}, strconv.Itoa(param.Gid)+" "+strconv.Itoa(param.HostGid)+" 1\n"...), []byte(strconv.Itoa(gid)+" "+strconv.Itoa(param.HostGid)+" 1\n"),
0); err != nil { 0,
); err != nil {
k.fatalf(msg, "%v", err) k.fatalf(msg, "%v", err)
} }
if err := k.setDumpable(ext.SUID_DUMP_DISABLE); err != nil { if err := k.setDumpable(ext.SUID_DUMP_DISABLE); err != nil {
@@ -230,6 +242,16 @@ func initEntrypoint(k syscallDispatcher, msg message.Msg) {
k.fatalf(msg, "cannot enter intermediate host path: %v", err) k.fatalf(msg, "cannot enter intermediate host path: %v", err)
} }
if len(param.Binfmt) > 0 {
for i, e := range param.Binfmt {
if pathname, err := k.evalSymlinks(e.Interpreter.String()); err != nil {
k.fatal(msg, err)
} else if param.Binfmt[i].Interpreter, err = check.NewAbs(pathname); err != nil {
k.fatal(msg, err)
}
}
}
/* early is called right before pivot_root into intermediate root; /* early is called right before pivot_root into intermediate root;
this step is mostly for gathering information that would otherwise be this step is mostly for gathering information that would otherwise be
difficult to obtain via library functions after pivot_root, and difficult to obtain via library functions after pivot_root, and
@@ -285,6 +307,48 @@ func initEntrypoint(k syscallDispatcher, msg message.Msg) {
} }
} }
if len(param.Binfmt) > 0 {
const interpreter = "/interpreter"
if param.BinfmtPath == nil {
param.BinfmtPath = fhs.AbsProcSys.Append("fs/binfmt_misc")
}
binfmt := sysrootPath + param.BinfmtPath.String()
if err := k.mkdirAll(binfmt, 0); err != nil {
k.fatal(msg, err)
}
if err := k.mount(
SourceBinfmtMisc,
binfmt,
FstypeBinfmtMisc,
MS_NOSUID|MS_NOEXEC|MS_NODEV,
zeroString,
); err != nil {
k.fatal(msg, err)
}
var buf strings.Builder
buf.Grow(1920)
register := binfmt + "/register"
for i, e := range param.Binfmt {
if err := k.symlink(hostPath+e.Interpreter.String(), interpreter); err != nil {
k.fatal(msg, err)
} else if err = k.writeFile(register, []byte(":"+
strconv.Itoa(i)+":"+
"M:"+
strconv.Itoa(int(e.Offset))+":"+
escapeBinfmt(&buf, e.Magic)+":"+
escapeBinfmt(&buf, e.Mask)+":"+
interpreter+":"+
"F"), 0); err != nil {
k.fatal(msg, err)
} else if err = k.remove(interpreter); err != nil {
k.fatal(msg, err)
}
}
}
// setup requiring host root complete at this point // setup requiring host root complete at this point
if err := k.mount(hostDir, hostDir, zeroString, MS_SILENT|MS_REC|MS_PRIVATE, zeroString); err != nil { if err := k.mount(hostDir, hostDir, zeroString, MS_SILENT|MS_REC|MS_PRIVATE, zeroString); err != nil {
k.fatalf(msg, "cannot make host root rprivate: %v", optionalErrorUnwrap(err)) k.fatalf(msg, "cannot make host root rprivate: %v", optionalErrorUnwrap(err))
@@ -323,11 +387,19 @@ func initEntrypoint(k syscallDispatcher, msg message.Msg) {
} }
} }
var keepCaps []uintptr
if param.Privileged {
keepCaps = append(keepCaps, CAP_SYS_ADMIN, CAP_SETPCAP)
}
if param.InitAsRoot {
keepCaps = append(keepCaps, CAP_SETFCAP)
}
if err := k.capAmbientClearAll(); err != nil { if err := k.capAmbientClearAll(); err != nil {
k.fatalf(msg, "cannot clear the ambient capability set: %v", err) k.fatalf(msg, "cannot clear the ambient capability set: %v", err)
} }
for i := uintptr(0); i <= lastcap; i++ { for i := range lastcap + 1 {
if param.Privileged && i == CAP_SYS_ADMIN { if slices.Contains(keepCaps, i) {
continue continue
} }
if err := k.capBoundingSetDrop(i); err != nil { if err := k.capBoundingSetDrop(i); err != nil {
@@ -336,20 +408,23 @@ func initEntrypoint(k syscallDispatcher, msg message.Msg) {
} }
var keep [2]uint32 var keep [2]uint32
if param.Privileged { for _, c := range keepCaps {
keep[capToIndex(CAP_SYS_ADMIN)] |= capToMask(CAP_SYS_ADMIN) keep[capToIndex(c)] |= capToMask(c)
}
if err := k.capAmbientRaise(CAP_SYS_ADMIN); err != nil {
k.fatalf(msg, "cannot raise CAP_SYS_ADMIN: %v", err)
}
}
if err := k.capset( if err := k.capset(
&capHeader{_LINUX_CAPABILITY_VERSION_3, 0}, &capHeader{_LINUX_CAPABILITY_VERSION_3, 0},
&[2]capData{{0, keep[0], keep[0]}, {0, keep[1], keep[1]}}, &[2]capData{{keep[0], keep[0], keep[0]}, {keep[1], keep[1], keep[1]}},
); err != nil { ); err != nil {
k.fatalf(msg, "cannot capset: %v", err) k.fatalf(msg, "cannot capset: %v", err)
} }
for _, c := range keepCaps {
if err := k.capAmbientRaise(c); err != nil {
k.fatalf(msg, "cannot raise %#x: %v", c, err)
}
}
if !param.SeccompDisable { if !param.SeccompDisable {
rules := param.SeccompRules rules := param.SeccompRules
if len(rules) == 0 { // non-empty rules slice always overrides presets if len(rules) == 0 { // non-empty rules slice always overrides presets
@@ -474,6 +549,14 @@ func initEntrypoint(k syscallDispatcher, msg message.Msg) {
cmd.ExtraFiles = extraFiles cmd.ExtraFiles = extraFiles
cmd.Dir = param.Dir.String() cmd.Dir = param.Dir.String()
if param.InitAsRoot {
cmd.SysProcAttr = &SysProcAttr{
Cloneflags: CLONE_NEWUSER,
UidMappings: []SysProcIDMap{{ContainerID: param.Uid, HostID: 0, Size: 1}},
GidMappings: []SysProcIDMap{{ContainerID: param.Gid, HostID: 0, Size: 1}},
}
}
msg.Verbosef("starting initial process %s", param.Path) msg.Verbosef("starting initial process %s", param.Path)
if err := k.start(cmd); err != nil { if err := k.start(cmd); err != nil {
k.fatalf(msg, "%v", err) k.fatalf(msg, "%v", err)

View File

@@ -1624,7 +1624,6 @@ func TestInitEntrypoint(t *testing.T) {
call("capBoundingSetDrop", stub.ExpectArgs{uintptr(0x5)}, nil, nil), call("capBoundingSetDrop", stub.ExpectArgs{uintptr(0x5)}, nil, nil),
call("capBoundingSetDrop", stub.ExpectArgs{uintptr(0x6)}, nil, nil), call("capBoundingSetDrop", stub.ExpectArgs{uintptr(0x6)}, nil, nil),
call("capBoundingSetDrop", stub.ExpectArgs{uintptr(0x7)}, nil, nil), call("capBoundingSetDrop", stub.ExpectArgs{uintptr(0x7)}, nil, nil),
call("capBoundingSetDrop", stub.ExpectArgs{uintptr(0x8)}, nil, nil),
call("capBoundingSetDrop", stub.ExpectArgs{uintptr(0x9)}, nil, nil), call("capBoundingSetDrop", stub.ExpectArgs{uintptr(0x9)}, nil, nil),
call("capBoundingSetDrop", stub.ExpectArgs{uintptr(0xa)}, nil, nil), call("capBoundingSetDrop", stub.ExpectArgs{uintptr(0xa)}, nil, nil),
call("capBoundingSetDrop", stub.ExpectArgs{uintptr(0xb)}, nil, nil), call("capBoundingSetDrop", stub.ExpectArgs{uintptr(0xb)}, nil, nil),
@@ -1656,8 +1655,9 @@ func TestInitEntrypoint(t *testing.T) {
call("capBoundingSetDrop", stub.ExpectArgs{uintptr(0x26)}, nil, nil), call("capBoundingSetDrop", stub.ExpectArgs{uintptr(0x26)}, nil, nil),
call("capBoundingSetDrop", stub.ExpectArgs{uintptr(0x27)}, nil, nil), call("capBoundingSetDrop", stub.ExpectArgs{uintptr(0x27)}, nil, nil),
call("capBoundingSetDrop", stub.ExpectArgs{uintptr(0x28)}, nil, nil), call("capBoundingSetDrop", stub.ExpectArgs{uintptr(0x28)}, nil, nil),
call("capset", stub.ExpectArgs{&capHeader{_LINUX_CAPABILITY_VERSION_3, 0}, &[2]capData{{0x200100, 0x200100, 0x200100}, {0, 0, 0}}}, nil, nil),
call("capAmbientRaise", stub.ExpectArgs{uintptr(0x15)}, nil, stub.UniqueError(19)), call("capAmbientRaise", stub.ExpectArgs{uintptr(0x15)}, nil, stub.UniqueError(19)),
call("fatalf", stub.ExpectArgs{"cannot raise CAP_SYS_ADMIN: %v", []any{stub.UniqueError(19)}}, nil, nil), call("fatalf", stub.ExpectArgs{"cannot raise %#x: %v", []any{uintptr(0x15), stub.UniqueError(19)}}, nil, nil),
}, },
}, nil}, }, nil},
@@ -1731,7 +1731,6 @@ func TestInitEntrypoint(t *testing.T) {
call("capBoundingSetDrop", stub.ExpectArgs{uintptr(0x5)}, nil, nil), call("capBoundingSetDrop", stub.ExpectArgs{uintptr(0x5)}, nil, nil),
call("capBoundingSetDrop", stub.ExpectArgs{uintptr(0x6)}, nil, nil), call("capBoundingSetDrop", stub.ExpectArgs{uintptr(0x6)}, nil, nil),
call("capBoundingSetDrop", stub.ExpectArgs{uintptr(0x7)}, nil, nil), call("capBoundingSetDrop", stub.ExpectArgs{uintptr(0x7)}, nil, nil),
call("capBoundingSetDrop", stub.ExpectArgs{uintptr(0x8)}, nil, nil),
call("capBoundingSetDrop", stub.ExpectArgs{uintptr(0x9)}, nil, nil), call("capBoundingSetDrop", stub.ExpectArgs{uintptr(0x9)}, nil, nil),
call("capBoundingSetDrop", stub.ExpectArgs{uintptr(0xa)}, nil, nil), call("capBoundingSetDrop", stub.ExpectArgs{uintptr(0xa)}, nil, nil),
call("capBoundingSetDrop", stub.ExpectArgs{uintptr(0xb)}, nil, nil), call("capBoundingSetDrop", stub.ExpectArgs{uintptr(0xb)}, nil, nil),
@@ -1763,8 +1762,7 @@ func TestInitEntrypoint(t *testing.T) {
call("capBoundingSetDrop", stub.ExpectArgs{uintptr(0x26)}, nil, nil), call("capBoundingSetDrop", stub.ExpectArgs{uintptr(0x26)}, nil, nil),
call("capBoundingSetDrop", stub.ExpectArgs{uintptr(0x27)}, nil, nil), call("capBoundingSetDrop", stub.ExpectArgs{uintptr(0x27)}, nil, nil),
call("capBoundingSetDrop", stub.ExpectArgs{uintptr(0x28)}, nil, nil), call("capBoundingSetDrop", stub.ExpectArgs{uintptr(0x28)}, nil, nil),
call("capAmbientRaise", stub.ExpectArgs{uintptr(0x15)}, nil, nil), call("capset", stub.ExpectArgs{&capHeader{_LINUX_CAPABILITY_VERSION_3, 0}, &[2]capData{{0x200100, 0x200100, 0x200100}, {0, 0, 0}}}, nil, stub.UniqueError(17)),
call("capset", stub.ExpectArgs{&capHeader{_LINUX_CAPABILITY_VERSION_3, 0}, &[2]capData{{0, 0x200000, 0x200000}, {0, 0, 0}}}, nil, stub.UniqueError(17)),
call("fatalf", stub.ExpectArgs{"cannot capset: %v", []any{stub.UniqueError(17)}}, nil, nil), call("fatalf", stub.ExpectArgs{"cannot capset: %v", []any{stub.UniqueError(17)}}, nil, nil),
}, },
}, nil}, }, nil},
@@ -1839,7 +1837,6 @@ func TestInitEntrypoint(t *testing.T) {
call("capBoundingSetDrop", stub.ExpectArgs{uintptr(0x5)}, nil, nil), call("capBoundingSetDrop", stub.ExpectArgs{uintptr(0x5)}, nil, nil),
call("capBoundingSetDrop", stub.ExpectArgs{uintptr(0x6)}, nil, nil), call("capBoundingSetDrop", stub.ExpectArgs{uintptr(0x6)}, nil, nil),
call("capBoundingSetDrop", stub.ExpectArgs{uintptr(0x7)}, nil, nil), call("capBoundingSetDrop", stub.ExpectArgs{uintptr(0x7)}, nil, nil),
call("capBoundingSetDrop", stub.ExpectArgs{uintptr(0x8)}, nil, nil),
call("capBoundingSetDrop", stub.ExpectArgs{uintptr(0x9)}, nil, nil), call("capBoundingSetDrop", stub.ExpectArgs{uintptr(0x9)}, nil, nil),
call("capBoundingSetDrop", stub.ExpectArgs{uintptr(0xa)}, nil, nil), call("capBoundingSetDrop", stub.ExpectArgs{uintptr(0xa)}, nil, nil),
call("capBoundingSetDrop", stub.ExpectArgs{uintptr(0xb)}, nil, nil), call("capBoundingSetDrop", stub.ExpectArgs{uintptr(0xb)}, nil, nil),
@@ -1871,8 +1868,9 @@ func TestInitEntrypoint(t *testing.T) {
call("capBoundingSetDrop", stub.ExpectArgs{uintptr(0x26)}, nil, nil), call("capBoundingSetDrop", stub.ExpectArgs{uintptr(0x26)}, nil, nil),
call("capBoundingSetDrop", stub.ExpectArgs{uintptr(0x27)}, nil, nil), call("capBoundingSetDrop", stub.ExpectArgs{uintptr(0x27)}, nil, nil),
call("capBoundingSetDrop", stub.ExpectArgs{uintptr(0x28)}, nil, nil), call("capBoundingSetDrop", stub.ExpectArgs{uintptr(0x28)}, nil, nil),
call("capset", stub.ExpectArgs{&capHeader{_LINUX_CAPABILITY_VERSION_3, 0}, &[2]capData{{0x200100, 0x200100, 0x200100}, {0, 0, 0}}}, nil, nil),
call("capAmbientRaise", stub.ExpectArgs{uintptr(0x15)}, nil, nil), call("capAmbientRaise", stub.ExpectArgs{uintptr(0x15)}, nil, nil),
call("capset", stub.ExpectArgs{&capHeader{_LINUX_CAPABILITY_VERSION_3, 0}, &[2]capData{{0, 0x200000, 0x200000}, {0, 0, 0}}}, nil, nil), call("capAmbientRaise", stub.ExpectArgs{uintptr(0x8)}, nil, nil),
call("verbosef", stub.ExpectArgs{"resolving presets %#x", []any{std.FilterPreset(0xf)}}, nil, nil), call("verbosef", stub.ExpectArgs{"resolving presets %#x", []any{std.FilterPreset(0xf)}}, nil, nil),
call("seccompLoad", stub.ExpectArgs{seccomp.Preset(0xf, 0), seccomp.ExportFlag(0)}, nil, stub.UniqueError(15)), call("seccompLoad", stub.ExpectArgs{seccomp.Preset(0xf, 0), seccomp.ExportFlag(0)}, nil, stub.UniqueError(15)),
call("fatalf", stub.ExpectArgs{"cannot load syscall filter: %v", []any{stub.UniqueError(15)}}, nil, nil), call("fatalf", stub.ExpectArgs{"cannot load syscall filter: %v", []any{stub.UniqueError(15)}}, nil, nil),
@@ -2699,7 +2697,6 @@ func TestInitEntrypoint(t *testing.T) {
call("capBoundingSetDrop", stub.ExpectArgs{uintptr(0x5)}, nil, nil), call("capBoundingSetDrop", stub.ExpectArgs{uintptr(0x5)}, nil, nil),
call("capBoundingSetDrop", stub.ExpectArgs{uintptr(0x6)}, nil, nil), call("capBoundingSetDrop", stub.ExpectArgs{uintptr(0x6)}, nil, nil),
call("capBoundingSetDrop", stub.ExpectArgs{uintptr(0x7)}, nil, nil), call("capBoundingSetDrop", stub.ExpectArgs{uintptr(0x7)}, nil, nil),
call("capBoundingSetDrop", stub.ExpectArgs{uintptr(0x8)}, nil, nil),
call("capBoundingSetDrop", stub.ExpectArgs{uintptr(0x9)}, nil, nil), call("capBoundingSetDrop", stub.ExpectArgs{uintptr(0x9)}, nil, nil),
call("capBoundingSetDrop", stub.ExpectArgs{uintptr(0xa)}, nil, nil), call("capBoundingSetDrop", stub.ExpectArgs{uintptr(0xa)}, nil, nil),
call("capBoundingSetDrop", stub.ExpectArgs{uintptr(0xb)}, nil, nil), call("capBoundingSetDrop", stub.ExpectArgs{uintptr(0xb)}, nil, nil),
@@ -2731,8 +2728,9 @@ func TestInitEntrypoint(t *testing.T) {
call("capBoundingSetDrop", stub.ExpectArgs{uintptr(0x26)}, nil, nil), call("capBoundingSetDrop", stub.ExpectArgs{uintptr(0x26)}, nil, nil),
call("capBoundingSetDrop", stub.ExpectArgs{uintptr(0x27)}, nil, nil), call("capBoundingSetDrop", stub.ExpectArgs{uintptr(0x27)}, nil, nil),
call("capBoundingSetDrop", stub.ExpectArgs{uintptr(0x28)}, nil, nil), call("capBoundingSetDrop", stub.ExpectArgs{uintptr(0x28)}, nil, nil),
call("capset", stub.ExpectArgs{&capHeader{_LINUX_CAPABILITY_VERSION_3, 0}, &[2]capData{{0x200100, 0x200100, 0x200100}, {0, 0, 0}}}, nil, nil),
call("capAmbientRaise", stub.ExpectArgs{uintptr(0x15)}, nil, nil), call("capAmbientRaise", stub.ExpectArgs{uintptr(0x15)}, nil, nil),
call("capset", stub.ExpectArgs{&capHeader{_LINUX_CAPABILITY_VERSION_3, 0}, &[2]capData{{0, 0x200000, 0x200000}, {0, 0, 0}}}, nil, nil), call("capAmbientRaise", stub.ExpectArgs{uintptr(0x8)}, nil, nil),
call("verbosef", stub.ExpectArgs{"resolving presets %#x", []any{std.FilterPreset(0xf)}}, nil, nil), call("verbosef", stub.ExpectArgs{"resolving presets %#x", []any{std.FilterPreset(0xf)}}, nil, nil),
call("seccompLoad", stub.ExpectArgs{seccomp.Preset(0xf, 0), seccomp.ExportFlag(0)}, nil, nil), call("seccompLoad", stub.ExpectArgs{seccomp.Preset(0xf, 0), seccomp.ExportFlag(0)}, nil, nil),
call("verbosef", stub.ExpectArgs{"%d filter rules loaded", []any{73}}, nil, nil), call("verbosef", stub.ExpectArgs{"%d filter rules loaded", []any{73}}, nil, nil),

View File

@@ -40,6 +40,9 @@ const (
// SourceMqueue is used when mounting mqueue. // SourceMqueue is used when mounting mqueue.
// Note that any source value is allowed when fstype is [FstypeMqueue]. // Note that any source value is allowed when fstype is [FstypeMqueue].
SourceMqueue = "mqueue" SourceMqueue = "mqueue"
// SourceBinfmtMisc is used when mounting binfmt_misc.
// Note that any source value is allowed when fstype is [SourceBinfmtMisc].
SourceBinfmtMisc = "binfmt_misc"
// SourceOverlay is used when mounting overlay. // SourceOverlay is used when mounting overlay.
// Note that any source value is allowed when fstype is [FstypeOverlay]. // Note that any source value is allowed when fstype is [FstypeOverlay].
SourceOverlay = "overlay" SourceOverlay = "overlay"
@@ -70,6 +73,9 @@ const (
// FstypeMqueue represents the mqueue pseudo-filesystem. // FstypeMqueue represents the mqueue pseudo-filesystem.
// This filesystem type is usually mounted on /dev/mqueue. // This filesystem type is usually mounted on /dev/mqueue.
FstypeMqueue = "mqueue" FstypeMqueue = "mqueue"
// FstypeBinfmtMisc represents the binfmt_misc pseudo-filesystem.
// This filesystem type is usually mounted on /proc/sys/fs/binfmt_misc.
FstypeBinfmtMisc = "binfmt_misc"
// FstypeOverlay represents the overlay pseudo-filesystem. // FstypeOverlay represents the overlay pseudo-filesystem.
// This filesystem type can be mounted anywhere in the container filesystem. // This filesystem type can be mounted anywhere in the container filesystem.
FstypeOverlay = "overlay" FstypeOverlay = "overlay"

View File

@@ -42,6 +42,8 @@ var (
AbsDevShm = unsafeAbs(DevShm) AbsDevShm = unsafeAbs(DevShm)
// AbsProc is [Proc] as [check.Absolute]. // AbsProc is [Proc] as [check.Absolute].
AbsProc = unsafeAbs(Proc) AbsProc = unsafeAbs(Proc)
// AbsProcSys is [ProcSys] as [check.Absolute].
AbsProcSys = unsafeAbs(ProcSys)
// AbsProcSelfExe is [ProcSelfExe] as [check.Absolute]. // AbsProcSelfExe is [ProcSelfExe] as [check.Absolute].
AbsProcSelfExe = unsafeAbs(ProcSelfExe) AbsProcSelfExe = unsafeAbs(ProcSelfExe)
// AbsSys is [Sys] as [check.Absolute]. // AbsSys is [Sys] as [check.Absolute].

View File

@@ -631,12 +631,6 @@ func (a *execArtifact) cure(f *FContext, hostNet bool) (err error) {
_ = stdout.Close() _ = stdout.Close()
return return
} }
defer func() {
if err != nil && !errors.As(err, new(*exec.ExitError)) {
_ = stdout.Close()
_ = stderr.Close()
}
}()
brStdout, brStderr := f.cache.getReader(stdout), f.cache.getReader(stderr) brStdout, brStderr := f.cache.getReader(stdout), f.cache.getReader(stderr)
stdoutDone, stderrDone := make(chan struct{}), make(chan struct{}) stdoutDone, stderrDone := make(chan struct{}), make(chan struct{})
@@ -651,6 +645,11 @@ func (a *execArtifact) cure(f *FContext, hostNet bool) (err error) {
io.TeeReader(brStderr, status), io.TeeReader(brStderr, status),
) )
defer func() { defer func() {
if err != nil && !errors.As(err, new(*exec.ExitError)) {
_ = stdout.Close()
_ = stderr.Close()
}
<-stdoutDone <-stdoutDone
<-stderrDone <-stderrDone
f.cache.putReader(brStdout) f.cache.putReader(brStdout)

View File

@@ -423,7 +423,7 @@ func checkWithCache(t *testing.T, testCases []cacheTestCase) {
msg := message.New(log.New(os.Stderr, "cache: ", 0)) msg := message.New(log.New(os.Stderr, "cache: ", 0))
msg.SwapVerbose(testing.Verbose()) msg.SwapVerbose(testing.Verbose())
flags := tc.flags flags := tc.flags | pkg.CSuppressInit
if info.CanDegrade { if info.CanDegrade {
if _, err := landlock.GetABI(); err != nil { if _, err := landlock.GetABI(); err != nil {
@@ -544,7 +544,11 @@ func cureMany(t *testing.T, c *pkg.Cache, steps []cureStep) {
t.Fatalf("Cure: pathname = %q, want %q", pathname, step.pathname) t.Fatalf("Cure: pathname = %q, want %q", pathname, step.pathname)
} else if step.output == nil || checksum != makeChecksumH(step.output.hash()) { } else if step.output == nil || checksum != makeChecksumH(step.output.hash()) {
if pathname != nil { if pathname != nil {
t.Fatal(expectsFrom(pathname.String())) if name, _err := filepath.EvalSymlinks(pathname.String()); _err != nil {
t.Fatal(_err)
} else {
t.Fatal(expectsFrom(name))
}
} else if checksum != (unique.Handle[pkg.Checksum]{}) { } else if checksum != (unique.Handle[pkg.Checksum]{}) {
t.Fatalf("Cure: unexpected checksum %s", pkg.Encode(checksum.Value())) t.Fatalf("Cure: unexpected checksum %s", pkg.Encode(checksum.Value()))
} }