container/ops: implement overlay op
All checks were successful
Test / Create distribution (push) Successful in 32s
Test / Sandbox (push) Successful in 2m2s
Test / Hakurei (push) Successful in 2m57s
Test / Hpkg (push) Successful in 3m54s
Test / Sandbox (race detector) (push) Successful in 4m6s
Test / Hakurei (race detector) (push) Successful in 4m51s
Test / Flake checks (push) Successful in 1m22s

There are significant limitations to using the overlay mount, and its implementation in the kernel is quite quirky. For now the Op itself is quite robust; however, a higher-level interface for it has not been decided yet.

Signed-off-by: Ophestra <cat@gensokyo.uk>
This commit is contained in:
Ophestra 2025-08-08 01:50:38 +09:00
parent 8da76483e6
commit acffa76812
Signed by: cat
SSH Key Fingerprint: SHA256:gQ67O0enBZ7UdZypgtspB2FDM1g3GVw8nX0XSdcFw8Q
4 changed files with 327 additions and 4 deletions

View File

@ -10,6 +10,7 @@ import (
"os" "os"
"os/exec" "os/exec"
"os/signal" "os/signal"
"path"
"strconv" "strconv"
"strings" "strings"
"syscall" "syscall"
@ -115,6 +116,95 @@ var containerTestCases = []struct {
ent("/", "/dev/pts", "rw,nosuid,noexec,relatime", "devpts", "devpts", "rw,mode=620,ptmxmode=666"), ent("/", "/dev/pts", "rw,nosuid,noexec,relatime", "devpts", "devpts", "rw,mode=620,ptmxmode=666"),
), ),
1971, 100, nil, 0, seccomp.PresetStrict}, 1971, 100, nil, 0, seccomp.PresetStrict},
{"overlay", true, false, false, true,
func(t *testing.T) (*container.Ops, context.Context) {
tempDir := t.TempDir()
lower0, lower1, upper, work :=
path.Join(tempDir, "lower0"),
path.Join(tempDir, "lower1"),
path.Join(tempDir, "upper"),
path.Join(tempDir, "work")
for _, name := range []string{lower0, lower1, upper, work} {
if err := os.Mkdir(name, 0755); err != nil {
t.Fatalf("Mkdir: error = %v", err)
}
}
return new(container.Ops).
Overlay(hst.Tmp, upper, work, lower0, lower1),
context.WithValue(context.WithValue(context.WithValue(context.WithValue(t.Context(),
testVal("lower1"), lower1),
testVal("lower0"), lower0),
testVal("work"), work),
testVal("upper"), upper)
},
func(t *testing.T, ctx context.Context) []*vfs.MountInfoEntry {
return []*vfs.MountInfoEntry{
ent("/", hst.Tmp, "rw", "overlay", "overlay",
"rw,lowerdir="+
container.InternalToHostOvlEscape(ctx.Value(testVal("lower0")).(string))+":"+
container.InternalToHostOvlEscape(ctx.Value(testVal("lower1")).(string))+
",upperdir="+
container.InternalToHostOvlEscape(ctx.Value(testVal("upper")).(string))+
",workdir="+
container.InternalToHostOvlEscape(ctx.Value(testVal("work")).(string))+
",redirect_dir=nofollow,uuid=on,userxattr"),
}
},
1 << 3, 1 << 14, nil, 0, seccomp.PresetStrict},
{"overlay ephemeral", true, false, false, true,
func(t *testing.T) (*container.Ops, context.Context) {
tempDir := t.TempDir()
lower0, lower1 :=
path.Join(tempDir, "lower0"),
path.Join(tempDir, "lower1")
for _, name := range []string{lower0, lower1} {
if err := os.Mkdir(name, 0755); err != nil {
t.Fatalf("Mkdir: error = %v", err)
}
}
return new(container.Ops).
OverlayEphemeral(hst.Tmp, lower0, lower1),
t.Context()
},
func(t *testing.T, ctx context.Context) []*vfs.MountInfoEntry {
return []*vfs.MountInfoEntry{
// contains random suffix
ent("/", hst.Tmp, "rw", "overlay", "overlay", ignore),
}
},
1 << 3, 1 << 14, nil, 0, seccomp.PresetStrict},
{"overlay readonly", true, false, false, true,
func(t *testing.T) (*container.Ops, context.Context) {
tempDir := t.TempDir()
lower0, lower1 :=
path.Join(tempDir, "lower0"),
path.Join(tempDir, "lower1")
for _, name := range []string{lower0, lower1} {
if err := os.Mkdir(name, 0755); err != nil {
t.Fatalf("Mkdir: error = %v", err)
}
}
return new(container.Ops).
OverlayReadonly(hst.Tmp, lower0, lower1),
context.WithValue(context.WithValue(t.Context(),
testVal("lower1"), lower1),
testVal("lower0"), lower0)
},
func(t *testing.T, ctx context.Context) []*vfs.MountInfoEntry {
return []*vfs.MountInfoEntry{
ent("/", hst.Tmp, "rw", "overlay", "overlay",
"ro,lowerdir="+
container.InternalToHostOvlEscape(ctx.Value(testVal("lower0")).(string))+":"+
container.InternalToHostOvlEscape(ctx.Value(testVal("lower1")).(string))+
",redirect_dir=nofollow,userxattr"),
}
},
1 << 3, 1 << 14, nil, 0, seccomp.PresetStrict},
} }
func TestContainer(t *testing.T) { func TestContainer(t *testing.T) {

View File

@ -40,6 +40,9 @@ const (
// SourceMqueue is used when mounting mqueue. // SourceMqueue is used when mounting mqueue.
// Note that any source value is allowed when fstype is [FstypeMqueue]. // Note that any source value is allowed when fstype is [FstypeMqueue].
SourceMqueue = "mqueue" SourceMqueue = "mqueue"
// SourceOverlay is used when mounting overlay.
// Note that any source value is allowed when fstype is [FstypeOverlay].
SourceOverlay = "overlay"
// SourceTmpfsRootfs is used when mounting the tmpfs instance backing the intermediate root. // SourceTmpfsRootfs is used when mounting the tmpfs instance backing the intermediate root.
SourceTmpfsRootfs = "rootfs" SourceTmpfsRootfs = "rootfs"
@ -66,6 +69,29 @@ const (
// FstypeMqueue represents the mqueue pseudo-filesystem. // FstypeMqueue represents the mqueue pseudo-filesystem.
// This filesystem type is usually mounted on /dev/mqueue. // This filesystem type is usually mounted on /dev/mqueue.
FstypeMqueue = "mqueue" FstypeMqueue = "mqueue"
// FstypeOverlay represents the overlay pseudo-filesystem.
// This filesystem type can be mounted anywhere in the container filesystem.
FstypeOverlay = "overlay"
// OptionOverlayLowerdir represents the lowerdir option of the overlay pseudo-filesystem.
// The lowerdir can be on any filesystem and does not need to be on a writable filesystem.
OptionOverlayLowerdir = "lowerdir"
// OptionOverlayUpperdir represents the upperdir option of the overlay pseudo-filesystem.
// The upperdir is normally on a writable filesystem.
OptionOverlayUpperdir = "upperdir"
// OptionOverlayWorkdir represents the workdir option of the overlay pseudo-filesystem.
// The workdir needs to be an empty directory on the same filesystem as upperdir.
OptionOverlayWorkdir = "workdir"
// OptionOverlayUserxattr represents the userxattr option of the overlay pseudo-filesystem.
// Use the "user.overlay." xattr namespace instead of "trusted.overlay.".
OptionOverlayUserxattr = "userxattr"
// SpecialOverlayEscape is the escape string for overlay mount options.
SpecialOverlayEscape = `\`
// SpecialOverlayOption is the separator string between overlay mount options.
SpecialOverlayOption = ","
// SpecialOverlayPath is the separator string between overlay paths.
SpecialOverlayPath = ":"
) )
// bindMount mounts source on target and recursively applies flags if MS_REC is set. // bindMount mounts source on target and recursively applies flags if MS_REC is set.
@ -199,8 +225,8 @@ func escapeOverlayDataSegment(s string) string {
} }
return strings.NewReplacer( return strings.NewReplacer(
`\`, `\\`, SpecialOverlayEscape, SpecialOverlayEscape+SpecialOverlayEscape,
`,`, `\,`, SpecialOverlayOption, SpecialOverlayEscape+SpecialOverlayOption,
`:`, `\:`, SpecialOverlayPath, SpecialOverlayEscape+SpecialOverlayPath,
).Replace(s) ).Replace(s)
} }

View File

@ -13,6 +13,17 @@ import (
"unsafe" "unsafe"
) )
// Name patterns for temporary entries created in the intermediate root.
// Each pattern is passed to os.MkdirTemp or os.CreateTemp, which substitutes
// a random string for the trailing "*".
const (
// intermediate root directory name pattern for an ephemeral [MountOverlayOp.Upper];
// the directory remains after apply returns
intermediatePatternOverlayUpper = "overlay.upper.*"
// intermediate root directory name pattern for an ephemeral [MountOverlayOp.Work];
// the directory remains after apply returns
intermediatePatternOverlayWork = "overlay.work.*"
// intermediate root file name pattern for the temporary file created by [TmpfileOp]
intermediatePatternTmpfile = "tmp.*"
)
type ( type (
Ops []Op Ops []Op
@ -337,6 +348,160 @@ func (t *MountTmpfsOp) Is(op Op) bool { vt, ok := op.(*MountTmpfsOp); return ok
func (*MountTmpfsOp) prefix() string { return "mounting" } func (*MountTmpfsOp) prefix() string { return "mounting" }
func (t *MountTmpfsOp) String() string { return fmt.Sprintf("tmpfs on %q size %d", t.Path, t.Size) } func (t *MountTmpfsOp) String() string { return fmt.Sprintf("tmpfs on %q size %d", t.Path, t.Size) }
// Register *MountOverlayOp with gob so that it can be encoded and decoded
// when held in an interface value.
func init() { gob.Register(new(MountOverlayOp)) }
// Overlay appends an [Op] that mounts the overlay pseudo filesystem on [MountOverlayOp.Target].
//
// state is stored as [MountOverlayOp.Upper], work as [MountOverlayOp.Work]
// and layers as [MountOverlayOp.Lower]. The receiver is returned to allow chaining.
func (f *Ops) Overlay(target, state, work string, layers ...string) *Ops {
	op := MountOverlayOp{
		Target: target,
		Upper:  state,
		Work:   work,
		Lower:  layers,
	}
	*f = append(*f, &op)
	return f
}
// OverlayEphemeral appends an [Op] that mounts the overlay pseudo filesystem on [MountOverlayOp.Target]
// with an ephemeral upperdir and workdir.
//
// This is requested by passing [SourceTmpfsEphemeral] as the upperdir together with an empty workdir.
func (f *Ops) OverlayEphemeral(target string, layers ...string) *Ops {
	upper, work := SourceTmpfsEphemeral, zeroString
	return f.Overlay(target, upper, work, layers...)
}
// OverlayReadonly appends an [Op] that mounts the overlay pseudo filesystem readonly on [MountOverlayOp.Target].
//
// This is requested by leaving both the upperdir and the workdir empty.
func (f *Ops) OverlayReadonly(target string, layers ...string) *Ops {
	upper, work := zeroString, zeroString
	return f.Overlay(target, upper, work, layers...)
}
// MountOverlayOp mounts the overlay pseudo filesystem on Target.
//
// Depending on Upper and Work the overlay is mounted with caller supplied
// upperdir and workdir, with an ephemeral upperdir and workdir, or readonly.
type MountOverlayOp struct {
// mount point of the overlay; must be absolute, checked during apply
Target string
// formatted for [OptionOverlayLowerdir]; each entry is resolved, prefixed and escaped during early
Lower []string
// formatted for [OptionOverlayUpperdir]; resolved, prefixed and escaped during early
//
// If Work is an empty string and Upper holds the special value [SourceTmpfsEphemeral],
// an ephemeral upperdir and workdir will be set up.
//
// If both Work and Upper are empty strings, upperdir and workdir are omitted and the overlay is mounted readonly.
Upper string
// formatted for [OptionOverlayWorkdir]; resolved, prefixed and escaped during early
Work string
// set during early when an ephemeral upperdir and workdir was requested;
// when set, apply creates both directories and stores their pathnames in Upper and Work
ephemeral bool
}
// early validates the upperdir/workdir combination and rewrites every caller
// supplied pathname into the form used as overlay mount data.
//
// The following combinations are accepted:
//   - Work empty and Upper equal to [SourceTmpfsEphemeral]: ephemeral overlay,
//     upperdir and workdir are created later by apply;
//   - Work and Upper both empty: readonly overlay;
//   - Work and Upper both set: both are validated and rewritten.
//
// Any other combination is reported as EINVAL. A pathname that is not absolute
// is reported as EBADE, and a failure to resolve symlinks is returned via wrapErrSelf.
func (o *MountOverlayOp) early(*Params) error {
	if o.Work == zeroString {
		switch o.Upper {
		case SourceTmpfsEphemeral: // ephemeral
			o.ephemeral = true // intermediate root not yet available

		case zeroString: // readonly

		default:
			return msg.WrapErr(EINVAL, fmt.Sprintf("upperdir has unexpected value %q", o.Upper))
		}
	}

	if !o.ephemeral {
		// an empty Work with a non-empty Upper was rejected above, so the only
		// remaining mismatch is a workdir supplied without an upperdir; this is
		// caller error rather than an impossible state
		if o.Upper == zeroString && o.Work != zeroString {
			return msg.WrapErr(EINVAL, fmt.Sprintf("workdir %q requires an upperdir", o.Work))
		}

		if o.Upper != zeroString {
			v, err := resolveOverlayHostPath(OptionOverlayUpperdir, o.Upper)
			if err != nil {
				return err
			}
			o.Upper = v
		}

		if o.Work != zeroString {
			v, err := resolveOverlayHostPath(OptionOverlayWorkdir, o.Work)
			if err != nil {
				return err
			}
			o.Work = v
		}
	}

	for i := range o.Lower {
		v, err := resolveOverlayHostPath(OptionOverlayLowerdir, o.Lower[i])
		if err != nil {
			return err
		}
		o.Lower[i] = v
	}
	return nil
}

// resolveOverlayHostPath checks that pathname is absolute, resolves symlinks in it,
// then passes the result through toHost and escapeOverlayDataSegment.
// The option name is only used in the error message.
func resolveOverlayHostPath(option, pathname string) (string, error) {
	if !path.IsAbs(pathname) {
		return zeroString, msg.WrapErr(EBADE, fmt.Sprintf("%s %q is not absolute", option, pathname))
	}
	resolved, err := filepath.EvalSymlinks(pathname)
	if err != nil {
		return zeroString, wrapErrSelf(err)
	}
	return escapeOverlayDataSegment(toHost(resolved)), nil
}
// apply creates the mount point for Target below sysroot and mounts the overlay on it.
//
// For an ephemeral overlay the upperdir and workdir are created first via os.MkdirTemp
// in [FHSRoot]; their pathnames are stored in Upper and Work.
// A readonly overlay (both Upper and Work empty) requires at least two lowerdir,
// any other overlay requires at least one.
func (o *MountOverlayOp) apply(params *Params) error {
	if !path.IsAbs(o.Target) {
		return msg.WrapErr(EBADE, fmt.Sprintf("path %q is not absolute", o.Target))
	}

	mountpoint := toSysroot(o.Target)
	if mkdirErr := os.MkdirAll(mountpoint, params.ParentPerm); mkdirErr != nil {
		return wrapErrSelf(mkdirErr)
	}

	if o.ephemeral {
		// these directories are created internally, therefore early (absolute, symlink, prefix, escape) is bypassed
		var tempErr error
		o.Upper, tempErr = os.MkdirTemp(FHSRoot, intermediatePatternOverlayUpper)
		if tempErr != nil {
			return wrapErrSelf(tempErr)
		}
		o.Work, tempErr = os.MkdirTemp(FHSRoot, intermediatePatternOverlayWork)
		if tempErr != nil {
			return wrapErrSelf(tempErr)
		}
	}

	data := make([]string, 0, 4)
	readonly := o.Upper == zeroString && o.Work == zeroString
	switch {
	case readonly && len(o.Lower) < 2:
		return msg.WrapErr(EINVAL, "readonly overlay requires at least two lowerdir")

	case readonly:
		// "upperdir=" and "workdir=" may be omitted. In that case the overlay will be read-only

	case len(o.Lower) == 0:
		return msg.WrapErr(EINVAL, "overlay requires at least one lowerdir")

	default:
		data = append(data,
			OptionOverlayUpperdir+"="+o.Upper,
			OptionOverlayWorkdir+"="+o.Work)
	}

	lowerdir := OptionOverlayLowerdir + "=" + strings.Join(o.Lower, SpecialOverlayPath)
	data = append(data, lowerdir, OptionOverlayUserxattr)

	mountErr := Mount(SourceOverlay, mountpoint, FstypeOverlay, 0, strings.Join(data, SpecialOverlayOption))
	return wrapErrSuffix(mountErr, fmt.Sprintf("cannot mount overlay on %q:", o.Target))
}
// Is reports whether op is a *MountOverlayOp with the same Target, Lower, Upper and Work.
func (o *MountOverlayOp) Is(op Op) bool {
	other, ok := op.(*MountOverlayOp)
	if !ok {
		return false
	}
	if o.Target != other.Target {
		return false
	}
	if !slices.Equal(o.Lower, other.Lower) {
		return false
	}
	return o.Upper == other.Upper && o.Work == other.Work
}
// prefix returns the verb describing this op.
func (*MountOverlayOp) prefix() string {
	const verb = "mounting"
	return verb
}
// String describes the op by its quoted Target and the number of entries in Lower.
func (o *MountOverlayOp) String() string {
	layerCount := len(o.Lower)
	return fmt.Sprintf("overlay on %q with %d layers", o.Target, layerCount)
}
func init() { gob.Register(new(SymlinkOp)) } func init() { gob.Register(new(SymlinkOp)) }
// Link appends an [Op] that creates a symlink in the container filesystem. // Link appends an [Op] that creates a symlink in the container filesystem.
@ -436,7 +601,7 @@ func (t *TmpfileOp) apply(params *Params) error {
} }
var tmpPath string var tmpPath string
if f, err := os.CreateTemp(FHSRoot, "tmp.*"); err != nil { if f, err := os.CreateTemp(FHSRoot, intermediatePatternTmpfile); err != nil {
return wrapErrSelf(err) return wrapErrSelf(err)
} else if _, err = f.Write(t.Data); err != nil { } else if _, err = f.Write(t.Data); err != nil {
return wrapErrSuffix(err, return wrapErrSuffix(err,

42
container/path_test.go Normal file
View File

@ -0,0 +1,42 @@
package container
import "testing"
// TestToSysroot checks the pathname returned by toSysroot for empty, root and unclean input.
func TestToSysroot(t *testing.T) {
	for _, c := range []struct {
		name string
		want string
	}{
		{"", "/sysroot"},
		{"/", "/sysroot"},
		{"//etc///", "/sysroot/etc"},
	} {
		t.Run(c.name, func(t *testing.T) {
			got := toSysroot(c.name)
			if got == c.want {
				return
			}
			t.Errorf("toSysroot: %q, want %q", got, c.want)
		})
	}
}
// TestToHost checks the pathname returned by toHost for empty, root and unclean input.
func TestToHost(t *testing.T) {
	for _, c := range []struct {
		name string
		want string
	}{
		{"", "/host"},
		{"/", "/host"},
		{"//etc///", "/host/etc"},
	} {
		t.Run(c.name, func(t *testing.T) {
			got := toHost(c.name)
			if got == c.want {
				return
			}
			t.Errorf("toHost: %q, want %q", got, c.want)
		})
	}
}
// InternalToHostOvlEscape exports toHost passed to escapeOverlayDataSegment.
func InternalToHostOvlEscape(s string) string { return escapeOverlayDataSegment(toHost(s)) }