From d4144fcf7f9217d2a2dcec21202421d5fa9d4928 Mon Sep 17 00:00:00 2001 From: Ophestra Date: Thu, 7 May 2026 15:15:28 +0900 Subject: [PATCH] container: optionally map uid/gid 0 as init Unfortunately required to work around flawed APIs like binfmt_misc. Signed-off-by: Ophestra --- container/capability.go | 1 + container/container.go | 15 +++++++++ container/container_test.go | 66 ++++++++++++++++++++++++++++++++----- container/init.go | 20 +++++++++-- 4 files changed, 91 insertions(+), 11 deletions(-) diff --git a/container/capability.go b/container/capability.go index 1d98ec4b..ac9c8641 100644 --- a/container/capability.go +++ b/container/capability.go @@ -18,6 +18,7 @@ const ( CAP_SETPCAP = 0x8 CAP_NET_ADMIN = 0xc CAP_DAC_OVERRIDE = 0x1 + CAP_SETFCAP = 0x1f ) type ( diff --git a/container/container.go b/container/container.go index 4208a449..85e6596f 100644 --- a/container/container.go +++ b/container/container.go @@ -91,6 +91,9 @@ type ( // Time to wait for processes lingering after the initial process terminates. AdoptWaitDelay time.Duration + // Map uid/gid 0 in the init process. Requires [FstypeProc] attached to + // [fhs.Proc] in the container filesystem. + InitAsRoot bool // Mapped Uid in user namespace. Uid int // Mapped Gid in user namespace. @@ -286,6 +289,18 @@ func (p *Container) Start() error { if !p.HostNet { p.cmd.SysProcAttr.Cloneflags |= CLONE_NEWNET } + if p.InitAsRoot { + p.cmd.SysProcAttr.AmbientCaps = append(p.cmd.SysProcAttr.AmbientCaps, + // mappings during init as root + CAP_SETFCAP, + ) + + if !p.SeccompDisable && + len(p.SeccompRules) == 0 && + p.SeccompPresets&std.PresetDenyNS != 0 { + return errors.New("container: as root requires late namespace creation") + } + } // place setup pipe before user supplied extra files, this is later restored by init if r, w, err := os.Pipe(); err != nil { diff --git a/container/container_test.go b/container/container_test.go index 48b6e8a4..787720a8 100644 --- a/container/container_test.go +++ b/container/container_test.go @@ -409,8 +409,11 @@ var containerTestCases = []struct { func TestContainer(t *testing.T) { t.Parallel() + var suffix string +runTests: for i, tc := range containerTestCases { - t.Run(tc.name, func(t *testing.T) { + _suffix := suffix + t.Run(tc.name+_suffix, func(t *testing.T) { t.Parallel() wantOps, wantOpsCtx := tc.ops(t) @@ -434,6 +437,8 @@ func TestContainer(t *testing.T) { c.SeccompDisable = !tc.filter c.RetainSession = tc.session c.HostNet = tc.net + c.InitAsRoot = _suffix != "" + c.Env = append(c.Env, "HAKUREI_TEST_SUFFIX="+_suffix) if info.CanDegrade { if _, err := landlock.GetABI(); err != nil { if !errors.Is(err, syscall.ENOSYS) { @@ -443,6 +448,9 @@ func TestContainer(t *testing.T) { t.Log("Landlock LSM is unavailable, enabling HostAbstract") } } + if c.InitAsRoot { + c.SeccompPresets &= ^std.PresetDenyNS + } c. Readonly(check.MustAbs(pathReadonly), 0755). @@ -511,6 +519,11 @@ func TestContainer(t *testing.T) { } }) } + + if suffix == "" { + suffix = " as root" + goto runTests + } } func ent(root, target, vfsOptstr, fsType, source, fsOptstr string) *vfs.MountInfoEntry { @@ -589,9 +602,9 @@ func testContainerCancel( } func TestForward(t *testing.T) { - testContainerCancel(t, func(c *container.Container) { - c.ForwardCancel = true - }, func(ps *os.ProcessState, waitErr error) { + t.Parallel() + + f := func(ps *os.ProcessState, waitErr error) { var exitError *exec.ExitError if !errors.As(waitErr, &exitError) { if m, ok := container.InternalMessageFromError(waitErr); ok { @@ -602,11 +615,26 @@ func TestForward(t *testing.T) { if code := exitError.ExitCode(); code != blockExitCodeInterrupt { t.Errorf("ExitCode: %d, want %d", code, blockExitCodeInterrupt) } + } + t.Run("direct", func(t *testing.T) { + t.Parallel() + testContainerCancel(t, func(c *container.Container) { + c.ForwardCancel = true + }, f) + }) + t.Run("as root", func(t *testing.T) { + testContainerCancel(t, func(c *container.Container) { + c.ForwardCancel = true + c.InitAsRoot = true + c.Proc(fhs.AbsProc) + }, f) }) } func TestCancel(t *testing.T) { - testContainerCancel(t, nil, func(ps *os.ProcessState, waitErr error) { + t.Parallel() + + f := func(ps *os.ProcessState, waitErr error) { wantErr := context.Canceled if !reflect.DeepEqual(waitErr, wantErr) { if m, ok := container.InternalMessageFromError(waitErr); ok { @@ -619,6 +647,16 @@ func TestCancel(t *testing.T) { } else if code := ps.ExitCode(); code != 0 { t.Errorf("ExitCode: %d, want %d", code, 0) } + } + t.Run("direct", func(t *testing.T) { + t.Parallel() + testContainerCancel(t, nil, f) + }) + t.Run("as root", func(t *testing.T) { + testContainerCancel(t, func(c *container.Container) { + c.InitAsRoot = true + c.Proc(fhs.AbsProc) + }, f) }) } @@ -655,6 +693,8 @@ func init() { }) c.Command("container", command.UsageInternal, func(args []string) error { + asRoot := os.Getenv("HAKUREI_TEST_SUFFIX") == " as root" + if len(args) != 1 { return syscall.EINVAL } @@ -672,11 +712,19 @@ func init() { return fmt.Errorf("gid: %d, want %d", gid, tc.gid) } + // no attack surface increase during as root due to no_new_privs + var wantBounding uintptr = 1 + asRootNot := " not" + if !asRoot { + wantBounding = 0 + asRootNot = "" + } + const ( PR_CAP_AMBIENT = 0x2f PR_CAP_AMBIENT_IS_SET = 0x1 ) - for i := range container.LastCap(nil) { + for i := range container.LastCap(nil) + 1 { r, _, errno := syscall.Syscall( syscall.SYS_PRCTL, PR_CAP_AMBIENT, @@ -687,7 +735,7 @@ func init() { return os.NewSyscallError("prctl", errno) } if r != 0 { - return fmt.Errorf("capability %d is set", i) + return fmt.Errorf("capability %d in ambient set", i) } r, _, errno = syscall.Syscall( @@ -699,8 +747,8 @@ func init() { if errno != 0 { return os.NewSyscallError("prctl", errno) } - if r != 0 { - return fmt.Errorf("capability %d in set", i) + if r != wantBounding { + return fmt.Errorf("capability %d%s in bounding set", i, asRootNot) } } diff --git a/container/init.go b/container/init.go index 78c95439..3b850073 100644 --- a/container/init.go +++ b/container/init.go @@ -182,13 +182,18 @@ func initEntrypoint(k syscallDispatcher, msg message.Msg) { cancel() } + uid, gid := param.Uid, param.Gid + if param.InitAsRoot { + uid, gid = 0, 0 + } + // write uid/gid map here so parent does not need to set dumpable if err := k.setDumpable(ext.SUID_DUMP_USER); err != nil { k.fatalf(msg, "cannot set SUID_DUMP_USER: %v", err) } if err := k.writeFile( fhs.Proc+"self/uid_map", - []byte(strconv.Itoa(param.Uid)+" "+strconv.Itoa(param.HostUid)+" 1\n"), + []byte(strconv.Itoa(uid)+" "+strconv.Itoa(param.HostUid)+" 1\n"), 0, ); err != nil { k.fatalf(msg, "%v", err) @@ -201,7 +206,7 @@ func initEntrypoint(k syscallDispatcher, msg message.Msg) { k.fatalf(msg, "%v", err) } if err := k.writeFile(fhs.Proc+"self/gid_map", - []byte(strconv.Itoa(param.Gid)+" "+strconv.Itoa(param.HostGid)+" 1\n"), + []byte(strconv.Itoa(gid)+" "+strconv.Itoa(param.HostGid)+" 1\n"), 0, ); err != nil { k.fatalf(msg, "%v", err) @@ -332,6 +337,9 @@ func initEntrypoint(k syscallDispatcher, msg message.Msg) { if param.Privileged { keepCaps = append(keepCaps, CAP_SYS_ADMIN, CAP_SETPCAP) } + if param.InitAsRoot { + keepCaps = append(keepCaps, CAP_SETFCAP) + } if err := k.capAmbientClearAll(); err != nil { k.fatalf(msg, "cannot clear the ambient capability set: %v", err) @@ -487,6 +495,14 @@ func initEntrypoint(k syscallDispatcher, msg message.Msg) { cmd.ExtraFiles = extraFiles cmd.Dir = param.Dir.String() + if param.InitAsRoot { + cmd.SysProcAttr = &SysProcAttr{ + Cloneflags: CLONE_NEWUSER, + UidMappings: []SysProcIDMap{{ContainerID: param.Uid, HostID: 0, Size: 1}}, + GidMappings: []SysProcIDMap{{ContainerID: param.Gid, HostID: 0, Size: 1}}, + } + } + msg.Verbosef("starting initial process %s", param.Path) if err := k.start(cmd); err != nil { k.fatalf(msg, "%v", err)