From acffa76812eb2972f2478b55fe2ba1e0ae8fa41d Mon Sep 17 00:00:00 2001 From: Ophestra Date: Fri, 8 Aug 2025 01:50:38 +0900 Subject: [PATCH] container/ops: implement overlay op There are significant limitations to using the overlay mount, and the implementation in the kernel is quite quirky. For now the Op is quite robust, however a higher level interface for it has not been decided yet. Signed-off-by: Ophestra --- container/container_test.go | 90 +++++++++++++++++++ container/mount.go | 32 ++++++- container/ops.go | 167 +++++++++++++++++++++++++++++++++++- container/path_test.go | 42 +++++++++ 4 files changed, 327 insertions(+), 4 deletions(-) create mode 100644 container/path_test.go diff --git a/container/container_test.go b/container/container_test.go index e03ef43..62134f7 100644 --- a/container/container_test.go +++ b/container/container_test.go @@ -10,6 +10,7 @@ import ( "os" "os/exec" "os/signal" + "path" "strconv" "strings" "syscall" @@ -115,6 +116,95 @@ var containerTestCases = []struct { ent("/", "/dev/pts", "rw,nosuid,noexec,relatime", "devpts", "devpts", "rw,mode=620,ptmxmode=666"), ), 1971, 100, nil, 0, seccomp.PresetStrict}, + + {"overlay", true, false, false, true, + func(t *testing.T) (*container.Ops, context.Context) { + tempDir := t.TempDir() + lower0, lower1, upper, work := + path.Join(tempDir, "lower0"), + path.Join(tempDir, "lower1"), + path.Join(tempDir, "upper"), + path.Join(tempDir, "work") + for _, name := range []string{lower0, lower1, upper, work} { + if err := os.Mkdir(name, 0755); err != nil { + t.Fatalf("Mkdir: error = %v", err) + } + } + + return new(container.Ops). 
+ Overlay(hst.Tmp, upper, work, lower0, lower1), + context.WithValue(context.WithValue(context.WithValue(context.WithValue(t.Context(), + testVal("lower1"), lower1), + testVal("lower0"), lower0), + testVal("work"), work), + testVal("upper"), upper) + }, + func(t *testing.T, ctx context.Context) []*vfs.MountInfoEntry { + return []*vfs.MountInfoEntry{ + ent("/", hst.Tmp, "rw", "overlay", "overlay", + "rw,lowerdir="+ + container.InternalToHostOvlEscape(ctx.Value(testVal("lower0")).(string))+":"+ + container.InternalToHostOvlEscape(ctx.Value(testVal("lower1")).(string))+ + ",upperdir="+ + container.InternalToHostOvlEscape(ctx.Value(testVal("upper")).(string))+ + ",workdir="+ + container.InternalToHostOvlEscape(ctx.Value(testVal("work")).(string))+ + ",redirect_dir=nofollow,uuid=on,userxattr"), + } + }, + 1 << 3, 1 << 14, nil, 0, seccomp.PresetStrict}, + + {"overlay ephemeral", true, false, false, true, + func(t *testing.T) (*container.Ops, context.Context) { + tempDir := t.TempDir() + lower0, lower1 := + path.Join(tempDir, "lower0"), + path.Join(tempDir, "lower1") + for _, name := range []string{lower0, lower1} { + if err := os.Mkdir(name, 0755); err != nil { + t.Fatalf("Mkdir: error = %v", err) + } + } + + return new(container.Ops). + OverlayEphemeral(hst.Tmp, lower0, lower1), + t.Context() + }, + func(t *testing.T, ctx context.Context) []*vfs.MountInfoEntry { + return []*vfs.MountInfoEntry{ + // contains random suffix + ent("/", hst.Tmp, "rw", "overlay", "overlay", ignore), + } + }, + 1 << 3, 1 << 14, nil, 0, seccomp.PresetStrict}, + + {"overlay readonly", true, false, false, true, + func(t *testing.T) (*container.Ops, context.Context) { + tempDir := t.TempDir() + lower0, lower1 := + path.Join(tempDir, "lower0"), + path.Join(tempDir, "lower1") + for _, name := range []string{lower0, lower1} { + if err := os.Mkdir(name, 0755); err != nil { + t.Fatalf("Mkdir: error = %v", err) + } + } + return new(container.Ops). 
+ OverlayReadonly(hst.Tmp, lower0, lower1), + context.WithValue(context.WithValue(t.Context(), + testVal("lower1"), lower1), + testVal("lower0"), lower0) + }, + func(t *testing.T, ctx context.Context) []*vfs.MountInfoEntry { + return []*vfs.MountInfoEntry{ + ent("/", hst.Tmp, "rw", "overlay", "overlay", + "ro,lowerdir="+ + container.InternalToHostOvlEscape(ctx.Value(testVal("lower0")).(string))+":"+ + container.InternalToHostOvlEscape(ctx.Value(testVal("lower1")).(string))+ + ",redirect_dir=nofollow,userxattr"), + } + }, + 1 << 3, 1 << 14, nil, 0, seccomp.PresetStrict}, } func TestContainer(t *testing.T) { diff --git a/container/mount.go b/container/mount.go index 821cdc0..873f69c 100644 --- a/container/mount.go +++ b/container/mount.go @@ -40,6 +40,9 @@ const ( // SourceMqueue is used when mounting mqueue. // Note that any source value is allowed when fstype is [FstypeMqueue]. SourceMqueue = "mqueue" + // SourceOverlay is used when mounting overlay. + // Note that any source value is allowed when fstype is [FstypeOverlay]. + SourceOverlay = "overlay" // SourceTmpfsRootfs is used when mounting the tmpfs instance backing the intermediate root. SourceTmpfsRootfs = "rootfs" @@ -66,6 +69,29 @@ const ( // FstypeMqueue represents the mqueue pseudo-filesystem. // This filesystem type is usually mounted on /dev/mqueue. FstypeMqueue = "mqueue" + // FstypeOverlay represents the overlay pseudo-filesystem. + // This filesystem type can be mounted anywhere in the container filesystem. + FstypeOverlay = "overlay" + + // OptionOverlayLowerdir represents the lowerdir option of the overlay pseudo-filesystem. + // Any filesystem, does not need to be on a writable filesystem. + OptionOverlayLowerdir = "lowerdir" + // OptionOverlayUpperdir represents the upperdir option of the overlay pseudo-filesystem. + // The upperdir is normally on a writable filesystem. + OptionOverlayUpperdir = "upperdir" + // OptionOverlayWorkdir represents the workdir option of the overlay pseudo-filesystem. 
+ // The workdir needs to be an empty directory on the same filesystem as upperdir. + OptionOverlayWorkdir = "workdir" + // OptionOverlayUserxattr represents the userxattr option of the overlay pseudo-filesystem. + // Use the "user.overlay." xattr namespace instead of "trusted.overlay.". + OptionOverlayUserxattr = "userxattr" + + // SpecialOverlayEscape is the escape string for overlay mount options. + SpecialOverlayEscape = `\` + // SpecialOverlayOption is the separator string between overlay mount options. + SpecialOverlayOption = "," + // SpecialOverlayPath is the separator string between overlay paths. + SpecialOverlayPath = ":" ) // bindMount mounts source on target and recursively applies flags if MS_REC is set. @@ -199,8 +225,8 @@ func escapeOverlayDataSegment(s string) string { } return strings.NewReplacer( - `\`, `\\`, - `,`, `\,`, - `:`, `\:`, + SpecialOverlayEscape, SpecialOverlayEscape+SpecialOverlayEscape, + SpecialOverlayOption, SpecialOverlayEscape+SpecialOverlayOption, + SpecialOverlayPath, SpecialOverlayEscape+SpecialOverlayPath, ).Replace(s) } diff --git a/container/ops.go b/container/ops.go index 8d9d169..927c60a 100644 --- a/container/ops.go +++ b/container/ops.go @@ -13,6 +13,17 @@ import ( "unsafe" ) +const ( + // intermediate root file name pattern for [MountOverlayOp.Upper]; + // remains after apply returns + intermediatePatternOverlayUpper = "overlay.upper.*" + // intermediate root file name pattern for [MountOverlayOp.Work]; + // remains after apply returns + intermediatePatternOverlayWork = "overlay.work.*" + // intermediate root file name pattern for [TmpfileOp] + intermediatePatternTmpfile = "tmp.*" +) + type ( Ops []Op @@ -337,6 +348,160 @@ func (t *MountTmpfsOp) Is(op Op) bool { vt, ok := op.(*MountTmpfsOp); return ok func (*MountTmpfsOp) prefix() string { return "mounting" } func (t *MountTmpfsOp) String() string { return fmt.Sprintf("tmpfs on %q size %d", t.Path, t.Size) } +func init() { gob.Register(new(MountOverlayOp)) } + +// 
Overlay appends an [Op] that mounts the overlay pseudo filesystem on [MountOverlayOp.Target]. +func (f *Ops) Overlay(target, state, work string, layers ...string) *Ops { + *f = append(*f, &MountOverlayOp{ + Target: target, + Lower: layers, + Upper: state, + Work: work, + }) + return f +} + +// OverlayEphemeral appends an [Op] that mounts the overlay pseudo filesystem on [MountOverlayOp.Target] +// with an ephemeral upperdir and workdir. +func (f *Ops) OverlayEphemeral(target string, layers ...string) *Ops { + return f.Overlay(target, SourceTmpfsEphemeral, zeroString, layers...) +} + +// OverlayReadonly appends an [Op] that mounts the overlay pseudo filesystem readonly on [MountOverlayOp.Target]. +func (f *Ops) OverlayReadonly(target string, layers ...string) *Ops { + return f.Overlay(target, zeroString, zeroString, layers...) +} + +type MountOverlayOp struct { + Target string + + // formatted for [OptionOverlayLowerdir], resolved, prefixed and escaped during early; + Lower []string + // formatted for [OptionOverlayUpperdir], resolved, prefixed and escaped during early; + // + // If Work is an empty string and Upper holds the special value [SourceTmpfsEphemeral], + // an ephemeral upperdir and workdir will be set up. + // + // If both Work and Upper are empty strings, upperdir and workdir are omitted and the overlay is mounted readonly. 
+ Upper string + // formatted for [OptionOverlayWorkdir], resolved, prefixed and escaped during early; + Work string + + ephemeral bool +} + +func (o *MountOverlayOp) early(*Params) error { + if o.Work == zeroString { + switch o.Upper { + case SourceTmpfsEphemeral: // ephemeral + o.ephemeral = true // intermediate root not yet available + + case zeroString: // readonly + + default: + return msg.WrapErr(EINVAL, fmt.Sprintf("upperdir has unexpected value %q", o.Upper)) + } + } + + if !o.ephemeral { + if o.Upper != o.Work && (o.Upper == zeroString || o.Work == zeroString) { + // reached when workdir is set without upperdir; the reverse is already rejected by the switch above + return msg.WrapErr(EINVAL, fmt.Sprintf("workdir %q specified without upperdir", o.Work)) + } + + if o.Upper != zeroString { + if !path.IsAbs(o.Upper) { + return msg.WrapErr(EBADE, fmt.Sprintf("upperdir %q is not absolute", o.Upper)) + } + if v, err := filepath.EvalSymlinks(o.Upper); err != nil { + return wrapErrSelf(err) + } else { + o.Upper = escapeOverlayDataSegment(toHost(v)) + } + } + + if o.Work != zeroString { + if !path.IsAbs(o.Work) { + return msg.WrapErr(EBADE, fmt.Sprintf("workdir %q is not absolute", o.Work)) + } + if v, err := filepath.EvalSymlinks(o.Work); err != nil { + return wrapErrSelf(err) + } else { + o.Work = escapeOverlayDataSegment(toHost(v)) + } + } + } + + for i := range o.Lower { + if !path.IsAbs(o.Lower[i]) { + return msg.WrapErr(EBADE, fmt.Sprintf("lowerdir %q is not absolute", o.Lower[i])) + } + + if v, err := filepath.EvalSymlinks(o.Lower[i]); err != nil { + return wrapErrSelf(err) + } else { + o.Lower[i] = escapeOverlayDataSegment(toHost(v)) + } + } + return nil +} + +func (o *MountOverlayOp) apply(params *Params) error { + if !path.IsAbs(o.Target) { + return msg.WrapErr(EBADE, fmt.Sprintf("path %q is not absolute", o.Target)) + } + target := toSysroot(o.Target) + if err := os.MkdirAll(target, params.ParentPerm); err != nil { + return wrapErrSelf(err) + } + + if o.ephemeral { + var err error + // these directories are created internally, therefore early (absolute, 
symlink, prefix, escape) is bypassed + if o.Upper, err = os.MkdirTemp(FHSRoot, intermediatePatternOverlayUpper); err != nil { + return wrapErrSelf(err) + } + if o.Work, err = os.MkdirTemp(FHSRoot, intermediatePatternOverlayWork); err != nil { + return wrapErrSelf(err) + } + } + + options := make([]string, 0, 4) + + if o.Upper == zeroString && o.Work == zeroString { // readonly + if len(o.Lower) < 2 { + return msg.WrapErr(EINVAL, "readonly overlay requires at least two lowerdir") + } + // "upperdir=" and "workdir=" may be omitted. In that case the overlay will be read-only + } else { + if len(o.Lower) == 0 { + return msg.WrapErr(EINVAL, "overlay requires at least one lowerdir") + } + options = append(options, + OptionOverlayUpperdir+"="+o.Upper, + OptionOverlayWorkdir+"="+o.Work) + } + options = append(options, + OptionOverlayLowerdir+"="+strings.Join(o.Lower, SpecialOverlayPath), + OptionOverlayUserxattr) + + return wrapErrSuffix(Mount(SourceOverlay, target, FstypeOverlay, 0, strings.Join(options, SpecialOverlayOption)), + fmt.Sprintf("cannot mount overlay on %q:", o.Target)) +} + +func (o *MountOverlayOp) Is(op Op) bool { + vo, ok := op.(*MountOverlayOp) + return ok && + o.Target == vo.Target && + slices.Equal(o.Lower, vo.Lower) && + o.Upper == vo.Upper && + o.Work == vo.Work +} +func (*MountOverlayOp) prefix() string { return "mounting" } +func (o *MountOverlayOp) String() string { + return fmt.Sprintf("overlay on %q with %d layers", o.Target, len(o.Lower)) +} + func init() { gob.Register(new(SymlinkOp)) } // Link appends an [Op] that creates a symlink in the container filesystem. 
@@ -436,7 +601,7 @@ func (t *TmpfileOp) apply(params *Params) error { } var tmpPath string - if f, err := os.CreateTemp(FHSRoot, "tmp.*"); err != nil { + if f, err := os.CreateTemp(FHSRoot, intermediatePatternTmpfile); err != nil { return wrapErrSelf(err) } else if _, err = f.Write(t.Data); err != nil { return wrapErrSuffix(err, diff --git a/container/path_test.go b/container/path_test.go new file mode 100644 index 0000000..9af2d2f --- /dev/null +++ b/container/path_test.go @@ -0,0 +1,42 @@ +package container + +import "testing" + +func TestToSysroot(t *testing.T) { + testCases := []struct { + name string + want string + }{ + {"", "/sysroot"}, + {"/", "/sysroot"}, + {"//etc///", "/sysroot/etc"}, + } + for _, tc := range testCases { + t.Run(tc.name, func(t *testing.T) { + if got := toSysroot(tc.name); got != tc.want { + t.Errorf("toSysroot: %q, want %q", got, tc.want) + } + }) + } +} + +func TestToHost(t *testing.T) { + testCases := []struct { + name string + want string + }{ + {"", "/host"}, + {"/", "/host"}, + {"//etc///", "/host/etc"}, + } + for _, tc := range testCases { + t.Run(tc.name, func(t *testing.T) { + if got := toHost(tc.name); got != tc.want { + t.Errorf("toHost: %q, want %q", got, tc.want) + } + }) + } +} + +// InternalToHostOvlEscape exports toHost passed to escapeOverlayDataSegment. +func InternalToHostOvlEscape(s string) string { return escapeOverlayDataSegment(toHost(s)) }