diff --git a/container/binfmt.go b/container/binfmt.go new file mode 100644 index 00000000..837e35b4 --- /dev/null +++ b/container/binfmt.go @@ -0,0 +1,46 @@ +package container + +import ( + "strings" + "unsafe" + + "hakurei.app/check" +) + +// escapeBinfmt resets buf and returns s with every NUL, backslash and colon byte rewritten as a lowercase \xHH escape; it is meant for the magic/mask of a [BinfmtEntry]. +func escapeBinfmt(buf *strings.Builder, s string) string { + const lowerhex = "0123456789abcdef" + + buf.Reset() + for _, c := range unsafe.Slice(unsafe.StringData(s), len(s)) { + switch c { + case 0, '\\', ':': + buf.WriteString(`\x`) + buf.WriteByte(lowerhex[c>>4]) + buf.WriteByte(lowerhex[c&0xf]) + + default: + buf.WriteByte(c) + } + } + return buf.String() +} + +// BinfmtEntry is an entry to be registered by the init process. +type BinfmtEntry struct { + // The offset of the magic/mask in the file, counted in bytes. + Offset byte + // The byte sequence binfmt_misc is matching for. + Magic string + // An (optional, defaults to all 0xff) mask. + Mask string + // The program that should be invoked with the binary as first argument. + Interpreter *check.Absolute +} + +// Valid reports whether e is non-nil, has a non-nil Interpreter whose path is shorter than 128 bytes, and keeps Offset plus the longer of Magic and Mask below 128. NOTE(review): an empty Magic passes this check; confirm the kernel accepts such an entry.
+func (e *BinfmtEntry) Valid() bool { + return e != nil && + int(e.Offset)+max(len(e.Magic), len(e.Mask)) < 128 && + e.Interpreter != nil && len(e.Interpreter.String()) < 128 +} diff --git a/container/binfmt_test.go b/container/binfmt_test.go new file mode 100644 index 00000000..7345b094 --- /dev/null +++ b/container/binfmt_test.go @@ -0,0 +1,62 @@ +package container + +import ( + "strings" + "testing" + + "hakurei.app/fhs" +) + +func TestEscapeBinfmt(t *testing.T) { + t.Parallel() + + testCases := []struct { + name string + magic string + want string + }{ + {"packed DOS applications", "\x0eDEX", "\x0eDEX"}, + + {"riscv64 magic", + "\x7fELF\x02\x01\x01\x00\x00\x00\x00\x00\x00\x00\x00\x00\x02\x00\xf3\x00", + "\x7fELF\x02\x01\x01\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\x02\\x00\xf3\\x00"}, + {"riscv64 mask", + "\xff\xff\xff\xff\xff\xff\xff\x00\xff\xff\xff\xff\xff\xff\xff\xff\xfe\xff\xff\xff", + "\xff\xff\xff\xff\xff\xff\xff\\x00\xff\xff\xff\xff\xff\xff\xff\xff\xfe\xff\xff\xff"}, + } + for _, tc := range testCases { + t.Run(tc.name, func(t *testing.T) { + t.Parallel() + + got := escapeBinfmt(new(strings.Builder), tc.magic) + if got != tc.want { + t.Errorf("escapeBinfmt: %q, want %q", got, tc.want) + } + }) + } +} + +func TestBinfmtEntry(t *testing.T) { + t.Parallel() + + testCases := []struct { + name string + e BinfmtEntry + valid bool + }{ + {"zero", BinfmtEntry{}, false}, + {"large offset", BinfmtEntry{Offset: 128}, false}, + {"long magic", BinfmtEntry{Magic: strings.Repeat("\x00", 128)}, false}, + {"long mask", BinfmtEntry{Mask: strings.Repeat("\x00", 128)}, false}, + {"valid", BinfmtEntry{Interpreter: fhs.AbsRoot}, true}, + } + for _, tc := range testCases { + t.Run(tc.name, func(t *testing.T) { + t.Parallel() + + if tc.e.Valid() != tc.valid { + t.Errorf("Valid: %v", !tc.valid) + } + }) + } +} diff --git a/container/container.go b/container/container.go index 85e6596f..8af77315 100644 --- a/container/container.go +++ b/container/container.go @@ -100,6
+100,11 @@ type ( Gid int // Hostname value in UTS namespace. Hostname string + // Register binfmt_misc entries. [Container.Start] returns an error if this is non-empty while InitAsRoot is false. + Binfmt []BinfmtEntry + // Alternative pathname to attach binfmt_misc filesystem. The zero value + // requires [FstypeProc] to be made available at [fhs.Proc]. + BinfmtPath *check.Absolute // Sequential container setup ops. *Ops @@ -219,6 +224,9 @@ func (p *Container) Start() error { if p.cmd.Process != nil { return errors.New("container: already started") } + if !p.InitAsRoot && len(p.Binfmt) > 0 { + return errors.New("container: init as root required, but not enabled") + } if err := ensureCloseOnExec(); err != nil { return err diff --git a/container/init.go b/container/init.go index 3b850073..41de58cd 100644 --- a/container/init.go +++ b/container/init.go @@ -11,11 +11,13 @@ import ( "path/filepath" "slices" "strconv" + "strings" "sync" "sync/atomic" . "syscall" "time" + "hakurei.app/check" "hakurei.app/container/seccomp" "hakurei.app/ext" "hakurei.app/fhs" @@ -240,6 +242,16 @@ func initEntrypoint(k syscallDispatcher, msg message.Msg) { k.fatalf(msg, "cannot enter intermediate host path: %v", err) } + if len(param.Binfmt) > 0 { + for i, e := range param.Binfmt { + if pathname, err := k.evalSymlinks(e.Interpreter.String()); err != nil { + k.fatal(msg, err) + } else if param.Binfmt[i].Interpreter, err = check.NewAbs(pathname); err != nil { + k.fatal(msg, err) + } + } + } + /* early is called right before pivot_root into intermediate root; this step is mostly for gathering information that would otherwise be difficult to obtain via library functions after pivot_root, and @@ -295,6 +307,48 @@ func initEntrypoint(k syscallDispatcher, msg message.Msg) { } } + if len(param.Binfmt) > 0 { + const interpreter = "/interpreter" + + if param.BinfmtPath == nil { + param.BinfmtPath = fhs.AbsProcSys.Append("fs/binfmt_misc") + } + binfmt := sysrootPath + param.BinfmtPath.String() + if err := k.mkdirAll(binfmt, 0); err != nil { + k.fatal(msg, err) + } + if err :=
k.mount( + SourceBinfmtMisc, + binfmt, + FstypeBinfmtMisc, + MS_NOSUID|MS_NOEXEC|MS_NODEV, + zeroString, + ); err != nil { + k.fatal(msg, err) + } + + var buf strings.Builder + buf.Grow(1920) + + register := binfmt + "/register" + for i, e := range param.Binfmt { + if err := k.symlink(hostPath+e.Interpreter.String(), interpreter); err != nil { + k.fatal(msg, err) + } else if err = k.writeFile(register, []byte(":"+ + strconv.Itoa(i)+":"+ + "M:"+ + strconv.Itoa(int(e.Offset))+":"+ + escapeBinfmt(&buf, e.Magic)+":"+ + escapeBinfmt(&buf, e.Mask)+":"+ + interpreter+":"+ + "F"), 0); err != nil { + k.fatal(msg, err) + } else if err = k.remove(interpreter); err != nil { + k.fatal(msg, err) + } + } + } + // setup requiring host root complete at this point if err := k.mount(hostDir, hostDir, zeroString, MS_SILENT|MS_REC|MS_PRIVATE, zeroString); err != nil { k.fatalf(msg, "cannot make host root rprivate: %v", optionalErrorUnwrap(err)) diff --git a/container/mount.go b/container/mount.go index f6a8844f..37a9445c 100644 --- a/container/mount.go +++ b/container/mount.go @@ -40,6 +40,9 @@ const ( // SourceMqueue is used when mounting mqueue. // Note that any source value is allowed when fstype is [FstypeMqueue]. SourceMqueue = "mqueue" + // SourceBinfmtMisc is used when mounting binfmt_misc. + // Note that any source value is allowed when fstype is [FstypeBinfmtMisc]. + SourceBinfmtMisc = "binfmt_misc" // SourceOverlay is used when mounting overlay. // Note that any source value is allowed when fstype is [FstypeOverlay]. SourceOverlay = "overlay" @@ -70,6 +73,9 @@ const ( // FstypeMqueue represents the mqueue pseudo-filesystem. // This filesystem type is usually mounted on /dev/mqueue. FstypeMqueue = "mqueue" + // FstypeBinfmtMisc represents the binfmt_misc pseudo-filesystem. + // This filesystem type is usually mounted on /proc/sys/fs/binfmt_misc. + FstypeBinfmtMisc = "binfmt_misc" // FstypeOverlay represents the overlay pseudo-filesystem.
// This filesystem type can be mounted anywhere in the container filesystem. FstypeOverlay = "overlay" diff --git a/fhs/abs.go b/fhs/abs.go index ce2ae34d..7adde1db 100644 --- a/fhs/abs.go +++ b/fhs/abs.go @@ -42,6 +42,8 @@ var ( AbsDevShm = unsafeAbs(DevShm) // AbsProc is [Proc] as [check.Absolute]. AbsProc = unsafeAbs(Proc) + // AbsProcSys is [ProcSys] as [check.Absolute]. + AbsProcSys = unsafeAbs(ProcSys) // AbsProcSelfExe is [ProcSelfExe] as [check.Absolute]. AbsProcSelfExe = unsafeAbs(ProcSelfExe) // AbsSys is [Sys] as [check.Absolute].