diff --git a/sandbox/container.go b/sandbox/container.go index f2595c3..58c9749 100644 --- a/sandbox/container.go +++ b/sandbox/container.go @@ -99,6 +99,8 @@ type ( // Permission bits of newly created parent directories. // The zero value is interpreted as 0755. ParentPerm os.FileMode + // Retain CAP_SYS_ADMIN. + Privileged bool Flags HardeningFlags } diff --git a/sandbox/init.go b/sandbox/init.go index 0d14e5f..a03868b 100644 --- a/sandbox/init.go +++ b/sandbox/init.go @@ -223,17 +223,30 @@ func Init(prepare func(prefix string), setVerbose func(verbose bool)) { if _, _, errno := syscall.Syscall(PR_SET_NO_NEW_PRIVS, 1, 0, 0); errno != 0 { log.Fatalf("prctl(PR_SET_NO_NEW_PRIVS): %v", errno) } + if _, _, errno := syscall.Syscall(syscall.SYS_PRCTL, PR_CAP_AMBIENT, PR_CAP_AMBIENT_CLEAR_ALL, 0); errno != 0 { log.Fatalf("cannot clear the ambient capability set: %v", errno) } for i := uintptr(0); i <= LastCap(); i++ { + if params.Privileged && i == CAP_SYS_ADMIN { + continue + } if _, _, errno := syscall.Syscall(syscall.SYS_PRCTL, syscall.PR_CAPBSET_DROP, i, 0); errno != 0 { log.Fatalf("cannot drop capability from bonding set: %v", errno) } } + + var keep [2]uint32 + if params.Privileged { + keep[capToIndex(CAP_SYS_ADMIN)] |= capToMask(CAP_SYS_ADMIN) + + if _, _, errno := syscall.Syscall(syscall.SYS_PRCTL, PR_CAP_AMBIENT, PR_CAP_AMBIENT_RAISE, CAP_SYS_ADMIN); errno != 0 { + log.Fatalf("cannot raise CAP_SYS_ADMIN: %v", errno) + } + } if err := capset( &capHeader{_LINUX_CAPABILITY_VERSION_3, 0}, - &[2]capData{{0, 0, 0}, {0, 0, 0}}, + &[2]capData{{0, keep[0], keep[0]}, {0, keep[1], keep[1]}}, ); err != nil { log.Fatalf("cannot capset: %v", err) } diff --git a/sandbox/syscall.go b/sandbox/syscall.go index 7ddb513..9e46baf 100644 --- a/sandbox/syscall.go +++ b/sandbox/syscall.go @@ -11,6 +11,7 @@ const ( PR_SET_NO_NEW_PRIVS = 0x26 CAP_SYS_ADMIN = 0x15 + CAP_SETPCAP = 0x8 ) const ( @@ -30,10 +31,9 @@ func SetDumpable(dumpable uintptr) error { const ( _LINUX_CAPABILITY_VERSION_3 = 0x20080522 - PR_CAP_AMBIENT = 47 - PR_CAP_AMBIENT_CLEAR_ALL = 4 - - CAP_SETPCAP = 8 + PR_CAP_AMBIENT = 0x2f + PR_CAP_AMBIENT_RAISE = 0x2 + PR_CAP_AMBIENT_CLEAR_ALL = 0x4 ) type ( @@ -49,6 +49,12 @@ type ( } ) +// See CAP_TO_INDEX in linux/capability.h: +func capToIndex(cap uintptr) uintptr { return cap >> 5 } + +// See CAP_TO_MASK in linux/capability.h: +func capToMask(cap uintptr) uint32 { return 1 << uint(cap&31) } + func capset(hdrp *capHeader, datap *[2]capData) error { if _, _, errno := syscall.Syscall(syscall.SYS_CAPSET, uintptr(unsafe.Pointer(hdrp)),