container: binfmt registration
All checks were successful
Test / Create distribution (push) Successful in 1m4s
Test / Sandbox (push) Successful in 2m43s
Test / Hakurei (push) Successful in 3m49s
Test / ShareFS (push) Successful in 3m55s
Test / Sandbox (race detector) (push) Successful in 5m18s
Test / Hakurei (race detector) (push) Successful in 6m25s
Test / Flake checks (push) Successful in 1m22s

This arranges for binfmt entries to be registered for the container.

Signed-off-by: Ophestra <cat@gensokyo.uk>
This commit is contained in:
2026-05-07 15:40:47 +09:00
parent d4144fcf7f
commit 575ef307ad
6 changed files with 178 additions and 0 deletions

46
container/binfmt.go Normal file
View File

@@ -0,0 +1,46 @@
package container
import (
"strings"
"unsafe"
"hakurei.app/check"
)
// escapeBinfmt escapes magic/mask sequences in a [BinfmtEntry].
func escapeBinfmt(buf *strings.Builder, s string) string {
const lowerhex = "0123456789abcdef"
buf.Reset()
for _, c := range unsafe.Slice(unsafe.StringData(s), len(s)) {
switch c {
case 0, '\\', ':':
buf.WriteString(`\x`)
buf.WriteByte(lowerhex[c>>4])
buf.WriteByte(lowerhex[c&0xf])
default:
buf.WriteByte(c)
}
}
return buf.String()
}
// BinfmtEntry is an entry to be registered by the init process.
type BinfmtEntry struct {
// The offset of the magic/mask in the file, counted in bytes.
Offset byte
// The byte sequence binfmt_misc is matching for.
Magic string
// An (optional, defaults to all 0xff) mask.
Mask string
// The program that should be invoked with the binary as first argument.
Interpreter *check.Absolute
}
// Valid returns whether e can be registered into the kernel.
func (e *BinfmtEntry) Valid() bool {
return e != nil &&
int(e.Offset)+max(len(e.Magic), len(e.Mask)) < 128 &&
e.Interpreter != nil && len(e.Interpreter.String()) < 128
}

62
container/binfmt_test.go Normal file
View File

@@ -0,0 +1,62 @@
package container
import (
"strings"
"testing"
"hakurei.app/fhs"
)
func TestEscapeBinfmt(t *testing.T) {
t.Parallel()
testCases := []struct {
name string
magic string
want string
}{
{"packed DOS applications", "\x0eDEX", "\x0eDEX"},
{"riscv64 magic",
"\x7fELF\x02\x01\x01\x00\x00\x00\x00\x00\x00\x00\x00\x00\x02\x00\xf3\x00",
"\x7fELF\x02\x01\x01\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\x02\\x00\xf3\\x00"},
{"riscv64 mask",
"\xff\xff\xff\xff\xff\xff\xff\x00\xff\xff\xff\xff\xff\xff\xff\xff\xfe\xff\xff\xff",
"\xff\xff\xff\xff\xff\xff\xff\\x00\xff\xff\xff\xff\xff\xff\xff\xff\xfe\xff\xff\xff"},
}
for _, tc := range testCases {
t.Run(tc.name, func(t *testing.T) {
t.Parallel()
got := escapeBinfmt(new(strings.Builder), tc.magic)
if got != tc.want {
t.Errorf("escapeBinfmt: %q, want %q", got, tc.want)
}
})
}
}
func TestBinfmtEntry(t *testing.T) {
t.Parallel()
testCases := []struct {
name string
e BinfmtEntry
valid bool
}{
{"zero", BinfmtEntry{}, false},
{"large offset", BinfmtEntry{Offset: 128}, false},
{"long magic", BinfmtEntry{Magic: strings.Repeat("\x00", 128)}, false},
{"long mask", BinfmtEntry{Mask: strings.Repeat("\x00", 128)}, false},
{"valid", BinfmtEntry{Interpreter: fhs.AbsRoot}, true},
}
for _, tc := range testCases {
t.Run(tc.name, func(t *testing.T) {
t.Parallel()
if tc.e.Valid() != tc.valid {
t.Errorf("Valid: %v", !tc.valid)
}
})
}
}

View File

@@ -100,6 +100,11 @@ type (
Gid int Gid int
// Hostname value in UTS namespace. // Hostname value in UTS namespace.
Hostname string Hostname string
// Register binfmt_misc entries.
Binfmt []BinfmtEntry
// Alternative pathname to attach binfmt_misc filesystem. The zero value
// requires [FstypeProc] to be made available at [fhs.Proc].
BinfmtPath *check.Absolute
// Sequential container setup ops. // Sequential container setup ops.
*Ops *Ops
@@ -219,6 +224,9 @@ func (p *Container) Start() error {
if p.cmd.Process != nil { if p.cmd.Process != nil {
return errors.New("container: already started") return errors.New("container: already started")
} }
if !p.InitAsRoot && len(p.Binfmt) > 0 {
return errors.New("container: init as root required, but not enabled")
}
if err := ensureCloseOnExec(); err != nil { if err := ensureCloseOnExec(); err != nil {
return err return err

View File

@@ -11,11 +11,13 @@ import (
"path/filepath" "path/filepath"
"slices" "slices"
"strconv" "strconv"
"strings"
"sync" "sync"
"sync/atomic" "sync/atomic"
. "syscall" . "syscall"
"time" "time"
"hakurei.app/check"
"hakurei.app/container/seccomp" "hakurei.app/container/seccomp"
"hakurei.app/ext" "hakurei.app/ext"
"hakurei.app/fhs" "hakurei.app/fhs"
@@ -240,6 +242,16 @@ func initEntrypoint(k syscallDispatcher, msg message.Msg) {
k.fatalf(msg, "cannot enter intermediate host path: %v", err) k.fatalf(msg, "cannot enter intermediate host path: %v", err)
} }
if len(param.Binfmt) > 0 {
for i, e := range param.Binfmt {
if pathname, err := k.evalSymlinks(e.Interpreter.String()); err != nil {
k.fatal(msg, err)
} else if param.Binfmt[i].Interpreter, err = check.NewAbs(pathname); err != nil {
k.fatal(msg, err)
}
}
}
/* early is called right before pivot_root into intermediate root; /* early is called right before pivot_root into intermediate root;
this step is mostly for gathering information that would otherwise be this step is mostly for gathering information that would otherwise be
difficult to obtain via library functions after pivot_root, and difficult to obtain via library functions after pivot_root, and
@@ -295,6 +307,48 @@ func initEntrypoint(k syscallDispatcher, msg message.Msg) {
} }
} }
if len(param.Binfmt) > 0 {
const interpreter = "/interpreter"
if param.BinfmtPath == nil {
param.BinfmtPath = fhs.AbsProcSys.Append("fs/binfmt_misc")
}
binfmt := sysrootPath + param.BinfmtPath.String()
if err := k.mkdirAll(binfmt, 0); err != nil {
k.fatal(msg, err)
}
if err := k.mount(
SourceBinfmtMisc,
binfmt,
FstypeBinfmtMisc,
MS_NOSUID|MS_NOEXEC|MS_NODEV,
zeroString,
); err != nil {
k.fatal(msg, err)
}
var buf strings.Builder
buf.Grow(1920)
register := binfmt + "/register"
for i, e := range param.Binfmt {
if err := k.symlink(hostPath+e.Interpreter.String(), interpreter); err != nil {
k.fatal(msg, err)
} else if err = k.writeFile(register, []byte(":"+
strconv.Itoa(i)+":"+
"M:"+
strconv.Itoa(int(e.Offset))+":"+
escapeBinfmt(&buf, e.Magic)+":"+
escapeBinfmt(&buf, e.Mask)+":"+
interpreter+":"+
"F"), 0); err != nil {
k.fatal(msg, err)
} else if err = k.remove(interpreter); err != nil {
k.fatal(msg, err)
}
}
}
// setup requiring host root complete at this point // setup requiring host root complete at this point
if err := k.mount(hostDir, hostDir, zeroString, MS_SILENT|MS_REC|MS_PRIVATE, zeroString); err != nil { if err := k.mount(hostDir, hostDir, zeroString, MS_SILENT|MS_REC|MS_PRIVATE, zeroString); err != nil {
k.fatalf(msg, "cannot make host root rprivate: %v", optionalErrorUnwrap(err)) k.fatalf(msg, "cannot make host root rprivate: %v", optionalErrorUnwrap(err))

View File

@@ -40,6 +40,9 @@ const (
// SourceMqueue is used when mounting mqueue. // SourceMqueue is used when mounting mqueue.
// Note that any source value is allowed when fstype is [FstypeMqueue]. // Note that any source value is allowed when fstype is [FstypeMqueue].
SourceMqueue = "mqueue" SourceMqueue = "mqueue"
// SourceBinfmtMisc is used when mounting binfmt_misc.
// Note that any source value is allowed when fstype is [SourceBinfmtMisc].
SourceBinfmtMisc = "binfmt_misc"
// SourceOverlay is used when mounting overlay. // SourceOverlay is used when mounting overlay.
// Note that any source value is allowed when fstype is [FstypeOverlay]. // Note that any source value is allowed when fstype is [FstypeOverlay].
SourceOverlay = "overlay" SourceOverlay = "overlay"
@@ -70,6 +73,9 @@ const (
// FstypeMqueue represents the mqueue pseudo-filesystem. // FstypeMqueue represents the mqueue pseudo-filesystem.
// This filesystem type is usually mounted on /dev/mqueue. // This filesystem type is usually mounted on /dev/mqueue.
FstypeMqueue = "mqueue" FstypeMqueue = "mqueue"
// FstypeBinfmtMisc represents the binfmt_misc pseudo-filesystem.
// This filesystem type is usually mounted on /proc/sys/fs/binfmt_misc.
FstypeBinfmtMisc = "binfmt_misc"
// FstypeOverlay represents the overlay pseudo-filesystem. // FstypeOverlay represents the overlay pseudo-filesystem.
// This filesystem type can be mounted anywhere in the container filesystem. // This filesystem type can be mounted anywhere in the container filesystem.
FstypeOverlay = "overlay" FstypeOverlay = "overlay"

View File

@@ -42,6 +42,8 @@ var (
AbsDevShm = unsafeAbs(DevShm) AbsDevShm = unsafeAbs(DevShm)
// AbsProc is [Proc] as [check.Absolute]. // AbsProc is [Proc] as [check.Absolute].
AbsProc = unsafeAbs(Proc) AbsProc = unsafeAbs(Proc)
// AbsProcSys is [ProcSys] as [check.Absolute].
AbsProcSys = unsafeAbs(ProcSys)
// AbsProcSelfExe is [ProcSelfExe] as [check.Absolute]. // AbsProcSelfExe is [ProcSelfExe] as [check.Absolute].
AbsProcSelfExe = unsafeAbs(ProcSelfExe) AbsProcSelfExe = unsafeAbs(ProcSelfExe)
// AbsSys is [Sys] as [check.Absolute]. // AbsSys is [Sys] as [check.Absolute].