container/ops: implement overlay op

There are significant limitations to using the overlay mount, and the implementation in the kernel is quite quirky. For now the Op is quite robust, however a higher level interface for it has not been decided yet.

Signed-off-by: Ophestra <cat@gensokyo.uk>
This commit is contained in:
2025-08-08 01:50:38 +09:00
parent 8da76483e6
commit acffa76812
4 changed files with 327 additions and 4 deletions

View File

@@ -10,6 +10,7 @@ import (
"os"
"os/exec"
"os/signal"
"path"
"strconv"
"strings"
"syscall"
@@ -115,6 +116,95 @@ var containerTestCases = []struct {
ent("/", "/dev/pts", "rw,nosuid,noexec,relatime", "devpts", "devpts", "rw,mode=620,ptmxmode=666"),
),
1971, 100, nil, 0, seccomp.PresetStrict},
{"overlay", true, false, false, true,
func(t *testing.T) (*container.Ops, context.Context) {
tempDir := t.TempDir()
lower0, lower1, upper, work :=
path.Join(tempDir, "lower0"),
path.Join(tempDir, "lower1"),
path.Join(tempDir, "upper"),
path.Join(tempDir, "work")
for _, name := range []string{lower0, lower1, upper, work} {
if err := os.Mkdir(name, 0755); err != nil {
t.Fatalf("Mkdir: error = %v", err)
}
}
return new(container.Ops).
Overlay(hst.Tmp, upper, work, lower0, lower1),
context.WithValue(context.WithValue(context.WithValue(context.WithValue(t.Context(),
testVal("lower1"), lower1),
testVal("lower0"), lower0),
testVal("work"), work),
testVal("upper"), upper)
},
func(t *testing.T, ctx context.Context) []*vfs.MountInfoEntry {
return []*vfs.MountInfoEntry{
ent("/", hst.Tmp, "rw", "overlay", "overlay",
"rw,lowerdir="+
container.InternalToHostOvlEscape(ctx.Value(testVal("lower0")).(string))+":"+
container.InternalToHostOvlEscape(ctx.Value(testVal("lower1")).(string))+
",upperdir="+
container.InternalToHostOvlEscape(ctx.Value(testVal("upper")).(string))+
",workdir="+
container.InternalToHostOvlEscape(ctx.Value(testVal("work")).(string))+
",redirect_dir=nofollow,uuid=on,userxattr"),
}
},
1 << 3, 1 << 14, nil, 0, seccomp.PresetStrict},
{"overlay ephemeral", true, false, false, true,
func(t *testing.T) (*container.Ops, context.Context) {
tempDir := t.TempDir()
lower0, lower1 :=
path.Join(tempDir, "lower0"),
path.Join(tempDir, "lower1")
for _, name := range []string{lower0, lower1} {
if err := os.Mkdir(name, 0755); err != nil {
t.Fatalf("Mkdir: error = %v", err)
}
}
return new(container.Ops).
OverlayEphemeral(hst.Tmp, lower0, lower1),
t.Context()
},
func(t *testing.T, ctx context.Context) []*vfs.MountInfoEntry {
return []*vfs.MountInfoEntry{
// contains random suffix
ent("/", hst.Tmp, "rw", "overlay", "overlay", ignore),
}
},
1 << 3, 1 << 14, nil, 0, seccomp.PresetStrict},
{"overlay readonly", true, false, false, true,
func(t *testing.T) (*container.Ops, context.Context) {
tempDir := t.TempDir()
lower0, lower1 :=
path.Join(tempDir, "lower0"),
path.Join(tempDir, "lower1")
for _, name := range []string{lower0, lower1} {
if err := os.Mkdir(name, 0755); err != nil {
t.Fatalf("Mkdir: error = %v", err)
}
}
return new(container.Ops).
OverlayReadonly(hst.Tmp, lower0, lower1),
context.WithValue(context.WithValue(t.Context(),
testVal("lower1"), lower1),
testVal("lower0"), lower0)
},
func(t *testing.T, ctx context.Context) []*vfs.MountInfoEntry {
return []*vfs.MountInfoEntry{
ent("/", hst.Tmp, "rw", "overlay", "overlay",
"ro,lowerdir="+
container.InternalToHostOvlEscape(ctx.Value(testVal("lower0")).(string))+":"+
container.InternalToHostOvlEscape(ctx.Value(testVal("lower1")).(string))+
",redirect_dir=nofollow,userxattr"),
}
},
1 << 3, 1 << 14, nil, 0, seccomp.PresetStrict},
}
func TestContainer(t *testing.T) {

View File

@@ -40,6 +40,9 @@ const (
// SourceMqueue is used when mounting mqueue.
// Note that any source value is allowed when fstype is [FstypeMqueue].
SourceMqueue = "mqueue"
// SourceOverlay is used when mounting overlay.
// Note that any source value is allowed when fstype is [FstypeOverlay].
SourceOverlay = "overlay"
// SourceTmpfsRootfs is used when mounting the tmpfs instance backing the intermediate root.
SourceTmpfsRootfs = "rootfs"
@@ -66,6 +69,29 @@ const (
// FstypeMqueue represents the mqueue pseudo-filesystem.
// This filesystem type is usually mounted on /dev/mqueue.
FstypeMqueue = "mqueue"
// FstypeOverlay represents the overlay pseudo-filesystem.
// This filesystem type can be mounted anywhere in the container filesystem.
FstypeOverlay = "overlay"
// OptionOverlayLowerdir represents the lowerdir option of the overlay pseudo-filesystem.
// Any filesystem, does not need to be on a writable filesystem.
OptionOverlayLowerdir = "lowerdir"
// OptionOverlayUpperdir represents the upperdir option of the overlay pseudo-filesystem.
// The upperdir is normally on a writable filesystem.
OptionOverlayUpperdir = "upperdir"
// OptionOverlayWorkdir represents the workdir option of the overlay pseudo-filesystem.
// The workdir needs to be an empty directory on the same filesystem as upperdir.
OptionOverlayWorkdir = "workdir"
// OptionOverlayUserxattr represents the userxattr option of the overlay pseudo-filesystem.
// Use the "user.overlay." xattr namespace instead of "trusted.overlay.".
OptionOverlayUserxattr = "userxattr"
// SpecialOverlayEscape is the escape string for overlay mount options.
SpecialOverlayEscape = `\`
// SpecialOverlayOption is the separator string between overlay mount options.
SpecialOverlayOption = ","
// SpecialOverlayPath is the separator string between overlay paths.
SpecialOverlayPath = ":"
)
// bindMount mounts source on target and recursively applies flags if MS_REC is set.
@@ -199,8 +225,8 @@ func escapeOverlayDataSegment(s string) string {
}
return strings.NewReplacer(
`\`, `\\`,
`,`, `\,`,
`:`, `\:`,
SpecialOverlayEscape, SpecialOverlayEscape+SpecialOverlayEscape,
SpecialOverlayOption, SpecialOverlayEscape+SpecialOverlayOption,
SpecialOverlayPath, SpecialOverlayEscape+SpecialOverlayPath,
).Replace(s)
}

View File

@@ -13,6 +13,17 @@ import (
"unsafe"
)
const (
// intermediate root file name pattern for [MountOverlayOp.Upper];
// remains after apply returns
intermediatePatternOverlayUpper = "overlay.upper.*"
// intermediate root file name pattern for [MountOverlayOp.Work];
// remains after apply returns
intermediatePatternOverlayWork = "overlay.work.*"
// intermediate root file name pattern for [TmpfileOp]
intermediatePatternTmpfile = "tmp.*"
)
type (
Ops []Op
@@ -337,6 +348,160 @@ func (t *MountTmpfsOp) Is(op Op) bool { vt, ok := op.(*MountTmpfsOp); return ok
func (*MountTmpfsOp) prefix() string { return "mounting" }
func (t *MountTmpfsOp) String() string { return fmt.Sprintf("tmpfs on %q size %d", t.Path, t.Size) }
func init() { gob.Register(new(MountOverlayOp)) }
// Overlay appends an [Op] that mounts the overlay pseudo filesystem on [MountOverlayOp.Target].
func (f *Ops) Overlay(target, state, work string, layers ...string) *Ops {
*f = append(*f, &MountOverlayOp{
Target: target,
Lower: layers,
Upper: state,
Work: work,
})
return f
}
// OverlayEphemeral appends an [Op] that mounts the overlay pseudo filesystem on [MountOverlayOp.Target]
// with an ephemeral upperdir and workdir.
func (f *Ops) OverlayEphemeral(target string, layers ...string) *Ops {
return f.Overlay(target, SourceTmpfsEphemeral, zeroString, layers...)
}
// OverlayReadonly appends an [Op] that mounts the overlay pseudo filesystem readonly on [MountOverlayOp.Target]
func (f *Ops) OverlayReadonly(target string, layers ...string) *Ops {
return f.Overlay(target, zeroString, zeroString, layers...)
}
type MountOverlayOp struct {
Target string
// formatted for [OptionOverlayLowerdir], resolved, prefixed and escaped during early;
Lower []string
// formatted for [OptionOverlayUpperdir], resolved, prefixed and escaped during early;
//
// If Work is an empty string and Upper holds the special value [SourceTmpfsEphemeral],
// an ephemeral upperdir and workdir will be set up.
//
// If both Work and Upper are empty strings, upperdir and workdir is omitted and the overlay is mounted readonly.
Upper string
// formatted for [OptionOverlayWorkdir], resolved, prefixed and escaped during early;
Work string
ephemeral bool
}
func (o *MountOverlayOp) early(*Params) error {
if o.Work == zeroString {
switch o.Upper {
case SourceTmpfsEphemeral: // ephemeral
o.ephemeral = true // intermediate root not yet available
case zeroString: // readonly
default:
return msg.WrapErr(EINVAL, fmt.Sprintf("upperdir has unexpected value %q", o.Upper))
}
}
if !o.ephemeral {
if o.Upper != o.Work && (o.Upper == zeroString || o.Work == zeroString) {
// unreachable
return msg.WrapErr(ENOTRECOVERABLE, "impossible overlay state reached")
}
if o.Upper != zeroString {
if !path.IsAbs(o.Upper) {
return msg.WrapErr(EBADE, fmt.Sprintf("upperdir %q is not absolute", o.Upper))
}
if v, err := filepath.EvalSymlinks(o.Upper); err != nil {
return wrapErrSelf(err)
} else {
o.Upper = escapeOverlayDataSegment(toHost(v))
}
}
if o.Work != zeroString {
if !path.IsAbs(o.Work) {
return msg.WrapErr(EBADE, fmt.Sprintf("workdir %q is not absolute", o.Work))
}
if v, err := filepath.EvalSymlinks(o.Work); err != nil {
return wrapErrSelf(err)
} else {
o.Work = escapeOverlayDataSegment(toHost(v))
}
}
}
for i := range o.Lower {
if !path.IsAbs(o.Lower[i]) {
return msg.WrapErr(EBADE, fmt.Sprintf("lowerdir %q is not absolute", o.Lower[i]))
}
if v, err := filepath.EvalSymlinks(o.Lower[i]); err != nil {
return wrapErrSelf(err)
} else {
o.Lower[i] = escapeOverlayDataSegment(toHost(v))
}
}
return nil
}
func (o *MountOverlayOp) apply(params *Params) error {
if !path.IsAbs(o.Target) {
return msg.WrapErr(EBADE, fmt.Sprintf("path %q is not absolute", o.Target))
}
target := toSysroot(o.Target)
if err := os.MkdirAll(target, params.ParentPerm); err != nil {
return wrapErrSelf(err)
}
if o.ephemeral {
var err error
// these directories are created internally, therefore early (absolute, symlink, prefix, escape) is bypassed
if o.Upper, err = os.MkdirTemp(FHSRoot, intermediatePatternOverlayUpper); err != nil {
return wrapErrSelf(err)
}
if o.Work, err = os.MkdirTemp(FHSRoot, intermediatePatternOverlayWork); err != nil {
return wrapErrSelf(err)
}
}
options := make([]string, 0, 4)
if o.Upper == zeroString && o.Work == zeroString { // readonly
if len(o.Lower) < 2 {
return msg.WrapErr(EINVAL, "readonly overlay requires at least two lowerdir")
}
// "upperdir=" and "workdir=" may be omitted. In that case the overlay will be read-only
} else {
if len(o.Lower) == 0 {
return msg.WrapErr(EINVAL, "overlay requires at least one lowerdir")
}
options = append(options,
OptionOverlayUpperdir+"="+o.Upper,
OptionOverlayWorkdir+"="+o.Work)
}
options = append(options,
OptionOverlayLowerdir+"="+strings.Join(o.Lower, SpecialOverlayPath),
OptionOverlayUserxattr)
return wrapErrSuffix(Mount(SourceOverlay, target, FstypeOverlay, 0, strings.Join(options, SpecialOverlayOption)),
fmt.Sprintf("cannot mount overlay on %q:", o.Target))
}
func (o *MountOverlayOp) Is(op Op) bool {
vo, ok := op.(*MountOverlayOp)
return ok &&
o.Target == vo.Target &&
slices.Equal(o.Lower, vo.Lower) &&
o.Upper == vo.Upper &&
o.Work == vo.Work
}
func (*MountOverlayOp) prefix() string { return "mounting" }
func (o *MountOverlayOp) String() string {
return fmt.Sprintf("overlay on %q with %d layers", o.Target, len(o.Lower))
}
func init() { gob.Register(new(SymlinkOp)) }
// Link appends an [Op] that creates a symlink in the container filesystem.
@@ -436,7 +601,7 @@ func (t *TmpfileOp) apply(params *Params) error {
}
var tmpPath string
if f, err := os.CreateTemp(FHSRoot, "tmp.*"); err != nil {
if f, err := os.CreateTemp(FHSRoot, intermediatePatternTmpfile); err != nil {
return wrapErrSelf(err)
} else if _, err = f.Write(t.Data); err != nil {
return wrapErrSuffix(err,

42
container/path_test.go Normal file
View File

@@ -0,0 +1,42 @@
package container
import "testing"
func TestToSysroot(t *testing.T) {
testCases := []struct {
name string
want string
}{
{"", "/sysroot"},
{"/", "/sysroot"},
{"//etc///", "/sysroot/etc"},
}
for _, tc := range testCases {
t.Run(tc.name, func(t *testing.T) {
if got := toSysroot(tc.name); got != tc.want {
t.Errorf("toSysroot: %q, want %q", got, tc.want)
}
})
}
}
func TestToHost(t *testing.T) {
testCases := []struct {
name string
want string
}{
{"", "/host"},
{"/", "/host"},
{"//etc///", "/host/etc"},
}
for _, tc := range testCases {
t.Run(tc.name, func(t *testing.T) {
if got := toHost(tc.name); got != tc.want {
t.Errorf("toHost: %q, want %q", got, tc.want)
}
})
}
}
// InternalToHostOvlEscape exports toHost passed to escapeOverlayDataSegment.
func InternalToHostOvlEscape(s string) string { return escapeOverlayDataSegment(toHost(s)) }