container: optionally map uid/gid 0 as init
All checks were successful
Test / Create distribution (push) Successful in 1m3s
Test / Sandbox (push) Successful in 2m48s
Test / Hakurei (push) Successful in 3m45s
Test / ShareFS (push) Successful in 3m55s
Test / Sandbox (race detector) (push) Successful in 5m15s
Test / Hakurei (race detector) (push) Successful in 6m31s
Test / Flake checks (push) Successful in 1m21s

Mapping uid/gid 0 as init is unfortunately required to work around flawed APIs like binfmt_misc.

Signed-off-by: Ophestra <cat@gensokyo.uk>
This commit is contained in:
2026-05-07 15:15:28 +09:00
parent bad66facbc
commit d4144fcf7f
4 changed files with 91 additions and 11 deletions

View File

@@ -18,6 +18,7 @@ const (
CAP_SETPCAP = 0x8
CAP_NET_ADMIN = 0xc
CAP_DAC_OVERRIDE = 0x1
CAP_SETFCAP = 0x1f
)
type (

View File

@@ -91,6 +91,9 @@ type (
// Time to wait for processes lingering after the initial process terminates.
AdoptWaitDelay time.Duration
// Map uid/gid 0 in the init process. Requires [FstypeProc] attached to
// [fhs.Proc] in the container filesystem.
InitAsRoot bool
// Mapped Uid in user namespace.
Uid int
// Mapped Gid in user namespace.
@@ -286,6 +289,18 @@ func (p *Container) Start() error {
if !p.HostNet {
p.cmd.SysProcAttr.Cloneflags |= CLONE_NEWNET
}
if p.InitAsRoot {
p.cmd.SysProcAttr.AmbientCaps = append(p.cmd.SysProcAttr.AmbientCaps,
// mappings during init as root
CAP_SETFCAP,
)
if !p.SeccompDisable &&
len(p.SeccompRules) == 0 &&
p.SeccompPresets&std.PresetDenyNS != 0 {
return errors.New("container: as root requires late namespace creation")
}
}
// place setup pipe before user supplied extra files, this is later restored by init
if r, w, err := os.Pipe(); err != nil {

View File

@@ -409,8 +409,11 @@ var containerTestCases = []struct {
func TestContainer(t *testing.T) {
t.Parallel()
var suffix string
runTests:
for i, tc := range containerTestCases {
t.Run(tc.name, func(t *testing.T) {
_suffix := suffix
t.Run(tc.name+_suffix, func(t *testing.T) {
t.Parallel()
wantOps, wantOpsCtx := tc.ops(t)
@@ -434,6 +437,8 @@ func TestContainer(t *testing.T) {
c.SeccompDisable = !tc.filter
c.RetainSession = tc.session
c.HostNet = tc.net
c.InitAsRoot = _suffix != ""
c.Env = append(c.Env, "HAKUREI_TEST_SUFFIX="+_suffix)
if info.CanDegrade {
if _, err := landlock.GetABI(); err != nil {
if !errors.Is(err, syscall.ENOSYS) {
@@ -443,6 +448,9 @@ func TestContainer(t *testing.T) {
t.Log("Landlock LSM is unavailable, enabling HostAbstract")
}
}
if c.InitAsRoot {
c.SeccompPresets &= ^std.PresetDenyNS
}
c.
Readonly(check.MustAbs(pathReadonly), 0755).
@@ -511,6 +519,11 @@ func TestContainer(t *testing.T) {
}
})
}
if suffix == "" {
suffix = " as root"
goto runTests
}
}
func ent(root, target, vfsOptstr, fsType, source, fsOptstr string) *vfs.MountInfoEntry {
@@ -589,9 +602,9 @@ func testContainerCancel(
}
func TestForward(t *testing.T) {
testContainerCancel(t, func(c *container.Container) {
c.ForwardCancel = true
}, func(ps *os.ProcessState, waitErr error) {
t.Parallel()
f := func(ps *os.ProcessState, waitErr error) {
var exitError *exec.ExitError
if !errors.As(waitErr, &exitError) {
if m, ok := container.InternalMessageFromError(waitErr); ok {
@@ -602,11 +615,26 @@ func TestForward(t *testing.T) {
if code := exitError.ExitCode(); code != blockExitCodeInterrupt {
t.Errorf("ExitCode: %d, want %d", code, blockExitCodeInterrupt)
}
}
t.Run("direct", func(t *testing.T) {
t.Parallel()
testContainerCancel(t, func(c *container.Container) {
c.ForwardCancel = true
}, f)
})
t.Run("as root", func(t *testing.T) {
testContainerCancel(t, func(c *container.Container) {
c.ForwardCancel = true
c.InitAsRoot = true
c.Proc(fhs.AbsProc)
}, f)
})
}
func TestCancel(t *testing.T) {
testContainerCancel(t, nil, func(ps *os.ProcessState, waitErr error) {
t.Parallel()
f := func(ps *os.ProcessState, waitErr error) {
wantErr := context.Canceled
if !reflect.DeepEqual(waitErr, wantErr) {
if m, ok := container.InternalMessageFromError(waitErr); ok {
@@ -619,6 +647,16 @@ func TestCancel(t *testing.T) {
} else if code := ps.ExitCode(); code != 0 {
t.Errorf("ExitCode: %d, want %d", code, 0)
}
}
t.Run("direct", func(t *testing.T) {
t.Parallel()
testContainerCancel(t, nil, f)
})
t.Run("as root", func(t *testing.T) {
testContainerCancel(t, func(c *container.Container) {
c.InitAsRoot = true
c.Proc(fhs.AbsProc)
}, f)
})
}
@@ -655,6 +693,8 @@ func init() {
})
c.Command("container", command.UsageInternal, func(args []string) error {
asRoot := os.Getenv("HAKUREI_TEST_SUFFIX") == " as root"
if len(args) != 1 {
return syscall.EINVAL
}
@@ -672,11 +712,19 @@ func init() {
return fmt.Errorf("gid: %d, want %d", gid, tc.gid)
}
// as-root mode does not increase the attack surface, due to no_new_privs
var wantBounding uintptr = 1
asRootNot := " not"
if !asRoot {
wantBounding = 0
asRootNot = ""
}
const (
PR_CAP_AMBIENT = 0x2f
PR_CAP_AMBIENT_IS_SET = 0x1
)
for i := range container.LastCap(nil) {
for i := range container.LastCap(nil) + 1 {
r, _, errno := syscall.Syscall(
syscall.SYS_PRCTL,
PR_CAP_AMBIENT,
@@ -687,7 +735,7 @@ func init() {
return os.NewSyscallError("prctl", errno)
}
if r != 0 {
return fmt.Errorf("capability %d is set", i)
return fmt.Errorf("capability %d in ambient set", i)
}
r, _, errno = syscall.Syscall(
@@ -699,8 +747,8 @@ func init() {
if errno != 0 {
return os.NewSyscallError("prctl", errno)
}
if r != 0 {
return fmt.Errorf("capability %d in set", i)
if r != wantBounding {
return fmt.Errorf("capability %d%s in bounding set", i, asRootNot)
}
}

View File

@@ -182,13 +182,18 @@ func initEntrypoint(k syscallDispatcher, msg message.Msg) {
cancel()
}
uid, gid := param.Uid, param.Gid
if param.InitAsRoot {
uid, gid = 0, 0
}
// write uid/gid map here so parent does not need to set dumpable
if err := k.setDumpable(ext.SUID_DUMP_USER); err != nil {
k.fatalf(msg, "cannot set SUID_DUMP_USER: %v", err)
}
if err := k.writeFile(
fhs.Proc+"self/uid_map",
[]byte(strconv.Itoa(param.Uid)+" "+strconv.Itoa(param.HostUid)+" 1\n"),
[]byte(strconv.Itoa(uid)+" "+strconv.Itoa(param.HostUid)+" 1\n"),
0,
); err != nil {
k.fatalf(msg, "%v", err)
@@ -201,7 +206,7 @@ func initEntrypoint(k syscallDispatcher, msg message.Msg) {
k.fatalf(msg, "%v", err)
}
if err := k.writeFile(fhs.Proc+"self/gid_map",
[]byte(strconv.Itoa(param.Gid)+" "+strconv.Itoa(param.HostGid)+" 1\n"),
[]byte(strconv.Itoa(gid)+" "+strconv.Itoa(param.HostGid)+" 1\n"),
0,
); err != nil {
k.fatalf(msg, "%v", err)
@@ -332,6 +337,9 @@ func initEntrypoint(k syscallDispatcher, msg message.Msg) {
if param.Privileged {
keepCaps = append(keepCaps, CAP_SYS_ADMIN, CAP_SETPCAP)
}
if param.InitAsRoot {
keepCaps = append(keepCaps, CAP_SETFCAP)
}
if err := k.capAmbientClearAll(); err != nil {
k.fatalf(msg, "cannot clear the ambient capability set: %v", err)
@@ -487,6 +495,14 @@ func initEntrypoint(k syscallDispatcher, msg message.Msg) {
cmd.ExtraFiles = extraFiles
cmd.Dir = param.Dir.String()
if param.InitAsRoot {
cmd.SysProcAttr = &SysProcAttr{
Cloneflags: CLONE_NEWUSER,
UidMappings: []SysProcIDMap{{ContainerID: param.Uid, HostID: 0, Size: 1}},
GidMappings: []SysProcIDMap{{ContainerID: param.Gid, HostID: 0, Size: 1}},
}
}
msg.Verbosef("starting initial process %s", param.Path)
if err := k.start(cmd); err != nil {
k.fatalf(msg, "%v", err)