container: optionally map uid/gid 0 as init
All checks were successful
Test / Create distribution (push) Successful in 1m3s
Test / Sandbox (push) Successful in 2m48s
Test / Hakurei (push) Successful in 3m45s
Test / ShareFS (push) Successful in 3m55s
Test / Sandbox (race detector) (push) Successful in 5m15s
Test / Hakurei (race detector) (push) Successful in 6m31s
Test / Flake checks (push) Successful in 1m21s
All checks were successful
Test / Create distribution (push) Successful in 1m3s
Test / Sandbox (push) Successful in 2m48s
Test / Hakurei (push) Successful in 3m45s
Test / ShareFS (push) Successful in 3m55s
Test / Sandbox (race detector) (push) Successful in 5m15s
Test / Hakurei (race detector) (push) Successful in 6m31s
Test / Flake checks (push) Successful in 1m21s
Unfortunately required to work around flawed APIs like binfmt_misc. Signed-off-by: Ophestra <cat@gensokyo.uk>
This commit is contained in:
@@ -18,6 +18,7 @@ const (
|
||||
CAP_SETPCAP = 0x8
|
||||
CAP_NET_ADMIN = 0xc
|
||||
CAP_DAC_OVERRIDE = 0x1
|
||||
CAP_SETFCAP = 0x1f
|
||||
)
|
||||
|
||||
type (
|
||||
|
||||
@@ -91,6 +91,9 @@ type (
|
||||
// Time to wait for processes lingering after the initial process terminates.
|
||||
AdoptWaitDelay time.Duration
|
||||
|
||||
// Map uid/gid 0 in the init process. Requires [FstypeProc] attached to
|
||||
// [fhs.Proc] in the container filesystem.
|
||||
InitAsRoot bool
|
||||
// Mapped Uid in user namespace.
|
||||
Uid int
|
||||
// Mapped Gid in user namespace.
|
||||
@@ -286,6 +289,18 @@ func (p *Container) Start() error {
|
||||
if !p.HostNet {
|
||||
p.cmd.SysProcAttr.Cloneflags |= CLONE_NEWNET
|
||||
}
|
||||
if p.InitAsRoot {
|
||||
p.cmd.SysProcAttr.AmbientCaps = append(p.cmd.SysProcAttr.AmbientCaps,
|
||||
// mappings during init as root
|
||||
CAP_SETFCAP,
|
||||
)
|
||||
|
||||
if !p.SeccompDisable &&
|
||||
len(p.SeccompRules) == 0 &&
|
||||
p.SeccompPresets&std.PresetDenyNS != 0 {
|
||||
return errors.New("container: as root requires late namespace creation")
|
||||
}
|
||||
}
|
||||
|
||||
// place setup pipe before user supplied extra files, this is later restored by init
|
||||
if r, w, err := os.Pipe(); err != nil {
|
||||
|
||||
@@ -409,8 +409,11 @@ var containerTestCases = []struct {
|
||||
func TestContainer(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
var suffix string
|
||||
runTests:
|
||||
for i, tc := range containerTestCases {
|
||||
t.Run(tc.name, func(t *testing.T) {
|
||||
_suffix := suffix
|
||||
t.Run(tc.name+_suffix, func(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
wantOps, wantOpsCtx := tc.ops(t)
|
||||
@@ -434,6 +437,8 @@ func TestContainer(t *testing.T) {
|
||||
c.SeccompDisable = !tc.filter
|
||||
c.RetainSession = tc.session
|
||||
c.HostNet = tc.net
|
||||
c.InitAsRoot = _suffix != ""
|
||||
c.Env = append(c.Env, "HAKUREI_TEST_SUFFIX="+_suffix)
|
||||
if info.CanDegrade {
|
||||
if _, err := landlock.GetABI(); err != nil {
|
||||
if !errors.Is(err, syscall.ENOSYS) {
|
||||
@@ -443,6 +448,9 @@ func TestContainer(t *testing.T) {
|
||||
t.Log("Landlock LSM is unavailable, enabling HostAbstract")
|
||||
}
|
||||
}
|
||||
if c.InitAsRoot {
|
||||
c.SeccompPresets &= ^std.PresetDenyNS
|
||||
}
|
||||
|
||||
c.
|
||||
Readonly(check.MustAbs(pathReadonly), 0755).
|
||||
@@ -511,6 +519,11 @@ func TestContainer(t *testing.T) {
|
||||
}
|
||||
})
|
||||
}
|
||||
|
||||
if suffix == "" {
|
||||
suffix = " as root"
|
||||
goto runTests
|
||||
}
|
||||
}
|
||||
|
||||
func ent(root, target, vfsOptstr, fsType, source, fsOptstr string) *vfs.MountInfoEntry {
|
||||
@@ -589,9 +602,9 @@ func testContainerCancel(
|
||||
}
|
||||
|
||||
func TestForward(t *testing.T) {
|
||||
testContainerCancel(t, func(c *container.Container) {
|
||||
c.ForwardCancel = true
|
||||
}, func(ps *os.ProcessState, waitErr error) {
|
||||
t.Parallel()
|
||||
|
||||
f := func(ps *os.ProcessState, waitErr error) {
|
||||
var exitError *exec.ExitError
|
||||
if !errors.As(waitErr, &exitError) {
|
||||
if m, ok := container.InternalMessageFromError(waitErr); ok {
|
||||
@@ -602,11 +615,26 @@ func TestForward(t *testing.T) {
|
||||
if code := exitError.ExitCode(); code != blockExitCodeInterrupt {
|
||||
t.Errorf("ExitCode: %d, want %d", code, blockExitCodeInterrupt)
|
||||
}
|
||||
}
|
||||
t.Run("direct", func(t *testing.T) {
|
||||
t.Parallel()
|
||||
testContainerCancel(t, func(c *container.Container) {
|
||||
c.ForwardCancel = true
|
||||
}, f)
|
||||
})
|
||||
t.Run("as root", func(t *testing.T) {
|
||||
testContainerCancel(t, func(c *container.Container) {
|
||||
c.ForwardCancel = true
|
||||
c.InitAsRoot = true
|
||||
c.Proc(fhs.AbsProc)
|
||||
}, f)
|
||||
})
|
||||
}
|
||||
|
||||
func TestCancel(t *testing.T) {
|
||||
testContainerCancel(t, nil, func(ps *os.ProcessState, waitErr error) {
|
||||
t.Parallel()
|
||||
|
||||
f := func(ps *os.ProcessState, waitErr error) {
|
||||
wantErr := context.Canceled
|
||||
if !reflect.DeepEqual(waitErr, wantErr) {
|
||||
if m, ok := container.InternalMessageFromError(waitErr); ok {
|
||||
@@ -619,6 +647,16 @@ func TestCancel(t *testing.T) {
|
||||
} else if code := ps.ExitCode(); code != 0 {
|
||||
t.Errorf("ExitCode: %d, want %d", code, 0)
|
||||
}
|
||||
}
|
||||
t.Run("direct", func(t *testing.T) {
|
||||
t.Parallel()
|
||||
testContainerCancel(t, nil, f)
|
||||
})
|
||||
t.Run("as root", func(t *testing.T) {
|
||||
testContainerCancel(t, func(c *container.Container) {
|
||||
c.InitAsRoot = true
|
||||
c.Proc(fhs.AbsProc)
|
||||
}, f)
|
||||
})
|
||||
}
|
||||
|
||||
@@ -655,6 +693,8 @@ func init() {
|
||||
})
|
||||
|
||||
c.Command("container", command.UsageInternal, func(args []string) error {
|
||||
asRoot := os.Getenv("HAKUREI_TEST_SUFFIX") == " as root"
|
||||
|
||||
if len(args) != 1 {
|
||||
return syscall.EINVAL
|
||||
}
|
||||
@@ -672,11 +712,19 @@ func init() {
|
||||
return fmt.Errorf("gid: %d, want %d", gid, tc.gid)
|
||||
}
|
||||
|
||||
// no attack surface increase during as root due to no_new_privs
|
||||
var wantBounding uintptr = 1
|
||||
asRootNot := " not"
|
||||
if !asRoot {
|
||||
wantBounding = 0
|
||||
asRootNot = ""
|
||||
}
|
||||
|
||||
const (
|
||||
PR_CAP_AMBIENT = 0x2f
|
||||
PR_CAP_AMBIENT_IS_SET = 0x1
|
||||
)
|
||||
for i := range container.LastCap(nil) {
|
||||
for i := range container.LastCap(nil) + 1 {
|
||||
r, _, errno := syscall.Syscall(
|
||||
syscall.SYS_PRCTL,
|
||||
PR_CAP_AMBIENT,
|
||||
@@ -687,7 +735,7 @@ func init() {
|
||||
return os.NewSyscallError("prctl", errno)
|
||||
}
|
||||
if r != 0 {
|
||||
return fmt.Errorf("capability %d is set", i)
|
||||
return fmt.Errorf("capability %d in ambient set", i)
|
||||
}
|
||||
|
||||
r, _, errno = syscall.Syscall(
|
||||
@@ -699,8 +747,8 @@ func init() {
|
||||
if errno != 0 {
|
||||
return os.NewSyscallError("prctl", errno)
|
||||
}
|
||||
if r != 0 {
|
||||
return fmt.Errorf("capability %d in set", i)
|
||||
if r != wantBounding {
|
||||
return fmt.Errorf("capability %d%s in bounding set", i, asRootNot)
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -182,13 +182,18 @@ func initEntrypoint(k syscallDispatcher, msg message.Msg) {
|
||||
cancel()
|
||||
}
|
||||
|
||||
uid, gid := param.Uid, param.Gid
|
||||
if param.InitAsRoot {
|
||||
uid, gid = 0, 0
|
||||
}
|
||||
|
||||
// write uid/gid map here so parent does not need to set dumpable
|
||||
if err := k.setDumpable(ext.SUID_DUMP_USER); err != nil {
|
||||
k.fatalf(msg, "cannot set SUID_DUMP_USER: %v", err)
|
||||
}
|
||||
if err := k.writeFile(
|
||||
fhs.Proc+"self/uid_map",
|
||||
[]byte(strconv.Itoa(param.Uid)+" "+strconv.Itoa(param.HostUid)+" 1\n"),
|
||||
[]byte(strconv.Itoa(uid)+" "+strconv.Itoa(param.HostUid)+" 1\n"),
|
||||
0,
|
||||
); err != nil {
|
||||
k.fatalf(msg, "%v", err)
|
||||
@@ -201,7 +206,7 @@ func initEntrypoint(k syscallDispatcher, msg message.Msg) {
|
||||
k.fatalf(msg, "%v", err)
|
||||
}
|
||||
if err := k.writeFile(fhs.Proc+"self/gid_map",
|
||||
[]byte(strconv.Itoa(param.Gid)+" "+strconv.Itoa(param.HostGid)+" 1\n"),
|
||||
[]byte(strconv.Itoa(gid)+" "+strconv.Itoa(param.HostGid)+" 1\n"),
|
||||
0,
|
||||
); err != nil {
|
||||
k.fatalf(msg, "%v", err)
|
||||
@@ -332,6 +337,9 @@ func initEntrypoint(k syscallDispatcher, msg message.Msg) {
|
||||
if param.Privileged {
|
||||
keepCaps = append(keepCaps, CAP_SYS_ADMIN, CAP_SETPCAP)
|
||||
}
|
||||
if param.InitAsRoot {
|
||||
keepCaps = append(keepCaps, CAP_SETFCAP)
|
||||
}
|
||||
|
||||
if err := k.capAmbientClearAll(); err != nil {
|
||||
k.fatalf(msg, "cannot clear the ambient capability set: %v", err)
|
||||
@@ -487,6 +495,14 @@ func initEntrypoint(k syscallDispatcher, msg message.Msg) {
|
||||
cmd.ExtraFiles = extraFiles
|
||||
cmd.Dir = param.Dir.String()
|
||||
|
||||
if param.InitAsRoot {
|
||||
cmd.SysProcAttr = &SysProcAttr{
|
||||
Cloneflags: CLONE_NEWUSER,
|
||||
UidMappings: []SysProcIDMap{{ContainerID: param.Uid, HostID: 0, Size: 1}},
|
||||
GidMappings: []SysProcIDMap{{ContainerID: param.Gid, HostID: 0, Size: 1}},
|
||||
}
|
||||
}
|
||||
|
||||
msg.Verbosef("starting initial process %s", param.Path)
|
||||
if err := k.start(cmd); err != nil {
|
||||
k.fatalf(msg, "%v", err)
|
||||
|
||||
Reference in New Issue
Block a user