From 1a8840bebc673672235b6e10b1b9386f24751757 Mon Sep 17 00:00:00 2001 From: Ophestra Date: Tue, 1 Jul 2025 20:23:33 +0900 Subject: [PATCH] sandbox/seccomp: resolve rules natively This enables loading syscall filter policies from external cross-platform config files. This also removes a significant amount of C code. Signed-off-by: Ophestra --- cmd/planterette/app.go | 4 +- cmd/planterette/with.go | 16 +- dbus/dbus_test.go | 2 +- dbus/proc.go | 2 +- hst/container.go | 4 +- hst/template.go | 19 +- hst/template_test.go | 3 +- internal/app/instance/common/container.go | 7 +- internal/app/internal/setuid/shim.go | 2 +- internal/output.go | 4 - print_test.go | 9 +- sandbox/container.go | 22 +- sandbox/container_test.go | 4 +- sandbox/init.go | 2 +- sandbox/seccomp/export.go | 58 ---- sandbox/seccomp/libseccomp-helper.c | 130 +++++++ sandbox/seccomp/libseccomp-helper.h | 24 ++ sandbox/seccomp/libseccomp.go | 183 ++++++++++ .../{export_test.go => libseccomp_test.go} | 41 +-- sandbox/seccomp/output.go | 30 -- sandbox/seccomp/presets.go | 230 +++++++++++++ sandbox/seccomp/presets_clone_backwards2.go | 7 + sandbox/seccomp/presets_clone_generic.go | 6 + sandbox/seccomp/{api.go => proc.go} | 20 +- sandbox/seccomp/seccomp-build.c | 321 ------------------ sandbox/seccomp/seccomp-build.h | 23 -- sandbox/seccomp/seccomp.go | 155 +++------ 27 files changed, 709 insertions(+), 619 deletions(-) delete mode 100644 sandbox/seccomp/export.go create mode 100644 sandbox/seccomp/libseccomp-helper.c create mode 100644 sandbox/seccomp/libseccomp-helper.h create mode 100644 sandbox/seccomp/libseccomp.go rename sandbox/seccomp/{export_test.go => libseccomp_test.go} (81%) delete mode 100644 sandbox/seccomp/output.go create mode 100644 sandbox/seccomp/presets.go create mode 100644 sandbox/seccomp/presets_clone_backwards2.go create mode 100644 sandbox/seccomp/presets_clone_generic.go rename sandbox/seccomp/{api.go => proc.go} (72%) delete mode 100644 sandbox/seccomp/seccomp-build.c delete mode 
100644 sandbox/seccomp/seccomp-build.h diff --git a/cmd/planterette/app.go b/cmd/planterette/app.go index 257956d..6d0993f 100644 --- a/cmd/planterette/app.go +++ b/cmd/planterette/app.go @@ -115,10 +115,10 @@ func (app *appInfo) toFst(pathSet *appPathSet, argv []string, flagDropShell bool }, } if app.Multiarch { - config.Container.Seccomp |= seccomp.FilterMultiarch + config.Container.SeccompFlags |= seccomp.AllowMultiarch } if app.Bluetooth { - config.Container.Seccomp |= seccomp.FilterBluetooth + config.Container.SeccompFlags |= seccomp.AllowBluetooth } return config } diff --git a/cmd/planterette/with.go b/cmd/planterette/with.go index ffacec7..f2f4541 100644 --- a/cmd/planterette/with.go +++ b/cmd/planterette/with.go @@ -43,11 +43,11 @@ func withNixDaemon( Identity: app.Identity, Container: &hst.ContainerConfig{ - Hostname: formatHostname(app.Name) + "-" + action, - Userns: true, // nix sandbox requires userns - Net: net, - Seccomp: seccomp.FilterMultiarch, - Tty: dropShell, + Hostname: formatHostname(app.Name) + "-" + action, + Userns: true, // nix sandbox requires userns + Net: net, + SeccompFlags: seccomp.AllowMultiarch, + Tty: dropShell, Filesystem: []*hst.FilesystemConfig{ {Src: pathSet.nixPath, Dst: "/nix", Write: true, Must: true}, }, @@ -85,9 +85,9 @@ func withCacheDir( Identity: app.Identity, Container: &hst.ContainerConfig{ - Hostname: formatHostname(app.Name) + "-" + action, - Seccomp: seccomp.FilterMultiarch, - Tty: dropShell, + Hostname: formatHostname(app.Name) + "-" + action, + SeccompFlags: seccomp.AllowMultiarch, + Tty: dropShell, Filesystem: []*hst.FilesystemConfig{ {Src: path.Join(workDir, "nix"), Dst: "/nix", Must: true}, {Src: workDir, Dst: path.Join(hst.Tmp, "bundle"), Must: true}, diff --git a/dbus/dbus_test.go b/dbus/dbus_test.go index 7f57cea..36e9d65 100644 --- a/dbus/dbus_test.go +++ b/dbus/dbus_test.go @@ -178,7 +178,7 @@ func testProxyFinaliseStartWaitCloseString(t *testing.T, useSandbox bool) { t.Run("string", func(t *testing.T) { 
wantSubstr := fmt.Sprintf("%s -test.run=TestHelperStub -- --args=3 --fd=4", os.Args[0]) if useSandbox { - wantSubstr = fmt.Sprintf(`argv: ["%s" "-test.run=TestHelperStub" "--" "--args=3" "--fd=4"], flags: 0x0, seccomp: 0x3e`, os.Args[0]) + wantSubstr = fmt.Sprintf(`argv: ["%s" "-test.run=TestHelperStub" "--" "--args=3" "--fd=4"], flags: 0x0, seccomp: 0x1, presets: 0xf`, os.Args[0]) } if got := p.String(); !strings.Contains(got, wantSubstr) { t.Errorf("String: %q, want %q", diff --git a/dbus/proc.go b/dbus/proc.go index 1b1a51c..1a51961 100644 --- a/dbus/proc.go +++ b/dbus/proc.go @@ -66,7 +66,7 @@ func (p *Proxy) Start() error { ctx, toolPath, p.final, true, argF, func(container *sandbox.Container) { - container.Seccomp |= seccomp.FilterMultiarch + container.SeccompFlags |= seccomp.AllowMultiarch container.Hostname = "hakurei-dbus" container.CommandContext = p.CommandContext if p.output != nil { diff --git a/hst/container.go b/hst/container.go index 796b82e..32ddef2 100644 --- a/hst/container.go +++ b/hst/container.go @@ -11,7 +11,9 @@ type ( Hostname string `json:"hostname,omitempty"` // extra seccomp flags - Seccomp seccomp.FilterOpts `json:"seccomp"` + SeccompFlags seccomp.PrepareFlag `json:"seccomp_flags"` + // extra seccomp presets + SeccompPresets seccomp.FilterPreset `json:"seccomp_presets"` // allow ptrace and friends Devel bool `json:"devel,omitempty"` // allow userns creation in container diff --git a/hst/template.go b/hst/template.go index 4d054a4..bcf14ac 100644 --- a/hst/template.go +++ b/hst/template.go @@ -57,15 +57,16 @@ func Template() *Config { Groups: []string{"video", "dialout", "plugdev"}, Container: &ContainerConfig{ - Hostname: "localhost", - Devel: true, - Userns: true, - Net: true, - Device: true, - Seccomp: seccomp.FilterMultiarch, - Tty: true, - Multiarch: true, - MapRealUID: true, + Hostname: "localhost", + Devel: true, + Userns: true, + Net: true, + Device: true, + SeccompFlags: seccomp.AllowMultiarch, + SeccompPresets: 
seccomp.PresetExt, + Tty: true, + Multiarch: true, + MapRealUID: true, // example API credentials pulled from Google Chrome // DO NOT USE THESE IN A REAL BROWSER Env: map[string]string{ diff --git a/hst/template_test.go b/hst/template_test.go index 74d258d..ad56331 100644 --- a/hst/template_test.go +++ b/hst/template_test.go @@ -80,7 +80,8 @@ func TestTemplate(t *testing.T) { ], "container": { "hostname": "localhost", - "seccomp": 32, + "seccomp_flags": 1, + "seccomp_presets": 1, "devel": true, "userns": true, "net": true, diff --git a/internal/app/instance/common/container.go b/internal/app/instance/common/container.go index 567a8aa..6a32e9a 100644 --- a/internal/app/instance/common/container.go +++ b/internal/app/instance/common/container.go @@ -27,8 +27,9 @@ func NewContainer(s *hst.ContainerConfig, os sys.State, uid, gid *int) (*sandbox } container := &sandbox.Params{ - Hostname: s.Hostname, - Seccomp: s.Seccomp, + Hostname: s.Hostname, + SeccompFlags: s.SeccompFlags, + SeccompPresets: s.SeccompPresets, } { @@ -37,7 +38,7 @@ func NewContainer(s *hst.ContainerConfig, os sys.State, uid, gid *int) (*sandbox } if s.Multiarch { - container.Seccomp |= seccomp.FilterMultiarch + container.SeccompFlags |= seccomp.AllowMultiarch } if s.Devel { diff --git a/internal/app/internal/setuid/shim.go b/internal/app/internal/setuid/shim.go index 15587cd..196fcfe 100644 --- a/internal/app/internal/setuid/shim.go +++ b/internal/app/internal/setuid/shim.go @@ -163,7 +163,7 @@ func ShimMain() { hlog.PrintBaseError(err, "cannot configure container:") } - if err := seccomp.Load(seccomp.PresetCommon); err != nil { + if err := seccomp.Load(seccomp.PresetStrict, seccomp.AllowMultiarch); err != nil { log.Fatalf("cannot load syscall filter: %v", err) } diff --git a/internal/output.go b/internal/output.go index 9d6133a..a00cc21 100644 --- a/internal/output.go +++ b/internal/output.go @@ -3,7 +3,6 @@ package internal import ( "git.gensokyo.uk/security/hakurei/internal/hlog" 
"git.gensokyo.uk/security/hakurei/sandbox" - "git.gensokyo.uk/security/hakurei/sandbox/seccomp" "git.gensokyo.uk/security/hakurei/system" ) @@ -11,7 +10,4 @@ func InstallFmsg(verbose bool) { hlog.Store(verbose) sandbox.SetOutput(hlog.Output{}) system.SetOutput(hlog.Output{}) - if verbose { - seccomp.SetOutput(hlog.Verbose) - } } diff --git a/print_test.go b/print_test.go index 883199d..012a48d 100644 --- a/print_test.go +++ b/print_test.go @@ -257,7 +257,8 @@ App ], "container": { "hostname": "localhost", - "seccomp": 32, + "seccomp_flags": 1, + "seccomp_presets": 1, "devel": true, "userns": true, "net": true, @@ -382,7 +383,8 @@ App ], "container": { "hostname": "localhost", - "seccomp": 32, + "seccomp_flags": 1, + "seccomp_presets": 1, "devel": true, "userns": true, "net": true, @@ -561,7 +563,8 @@ func Test_printPs(t *testing.T) { ], "container": { "hostname": "localhost", - "seccomp": 32, + "seccomp_flags": 1, + "seccomp_presets": 1, "devel": true, "userns": true, "net": true, diff --git a/sandbox/container.go b/sandbox/container.go index 475f614..efb3e48 100644 --- a/sandbox/container.go +++ b/sandbox/container.go @@ -27,20 +27,20 @@ const ( FAllowNet ) -func (flags HardeningFlags) seccomp(opts seccomp.FilterOpts) seccomp.FilterOpts { +func (flags HardeningFlags) seccomp(presets seccomp.FilterPreset) seccomp.FilterPreset { if flags&FSyscallCompat == 0 { - opts |= seccomp.FilterExt + presets |= seccomp.PresetExt } if flags&FAllowDevel == 0 { - opts |= seccomp.FilterDenyDevel + presets |= seccomp.PresetDenyDevel } if flags&FAllowUserns == 0 { - opts |= seccomp.FilterDenyNS + presets |= seccomp.PresetDenyNS } if flags&FAllowTTY == 0 { - opts |= seccomp.FilterDenyTTY + presets |= seccomp.PresetDenyTTY } - return opts + return presets } type ( @@ -94,8 +94,10 @@ type ( Hostname string // Sequential container setup ops. *Ops - // Extra seccomp options. - Seccomp seccomp.FilterOpts + // Extra seccomp flags. 
+ SeccompFlags seccomp.PrepareFlag + // Extra seccomp presets. + SeccompPresets seccomp.FilterPreset // Permission bits of newly created parent directories. // The zero value is interpreted as 0755. ParentPerm os.FileMode @@ -233,8 +235,8 @@ func (p *Container) Serve() error { func (p *Container) Wait() error { defer p.cancel(); return p.cmd.Wait() } func (p *Container) String() string { - return fmt.Sprintf("argv: %q, flags: %#x, seccomp: %#x", - p.Args, p.Flags, int(p.Flags.seccomp(p.Seccomp))) + return fmt.Sprintf("argv: %q, flags: %#x, seccomp: %#x, presets: %#x", + p.Args, p.Flags, int(p.SeccompFlags), int(p.Flags.seccomp(p.SeccompPresets))) } func New(ctx context.Context, name string, args ...string) *Container { diff --git a/sandbox/container_test.go b/sandbox/container_test.go index 7f64e12..85e40bf 100644 --- a/sandbox/container_test.go +++ b/sandbox/container_test.go @@ -164,8 +164,8 @@ func e(root, target, vfsOptstr, fsType, source, fsOptstr string) *vfs.MountInfoE func TestContainerString(t *testing.T) { container := sandbox.New(t.Context(), "ldd", "/usr/bin/env") container.Flags |= sandbox.FAllowDevel - container.Seccomp |= seccomp.FilterMultiarch - want := `argv: ["ldd" "/usr/bin/env"], flags: 0x2, seccomp: 0x2e` + container.SeccompFlags |= seccomp.AllowMultiarch + want := `argv: ["ldd" "/usr/bin/env"], flags: 0x2, seccomp: 0x1, presets: 0x7` if got := container.String(); got != want { t.Errorf("String: %s, want %s", got, want) } diff --git a/sandbox/init.go b/sandbox/init.go index 466d925..9828ada 100644 --- a/sandbox/init.go +++ b/sandbox/init.go @@ -237,7 +237,7 @@ func Init(prepare func(prefix string), setVerbose func(verbose bool)) { log.Fatalf("cannot capset: %v", err) } - if err := seccomp.Load(params.Flags.seccomp(params.Seccomp)); err != nil { + if err := seccomp.Load(params.Flags.seccomp(params.SeccompPresets), params.SeccompFlags); err != nil { log.Fatalf("cannot load syscall filter: %v", err) } diff --git a/sandbox/seccomp/export.go 
b/sandbox/seccomp/export.go deleted file mode 100644 index c0d2803..0000000 --- a/sandbox/seccomp/export.go +++ /dev/null @@ -1,58 +0,0 @@ -package seccomp - -import ( - "os" - "runtime" - "sync" -) - -type exporter struct { - opts FilterOpts - r, w *os.File - - prepareOnce sync.Once - prepareErr error - closeOnce sync.Once - closeErr error - exportErr <-chan error -} - -func (e *exporter) prepare() error { - e.prepareOnce.Do(func() { - if r, w, err := os.Pipe(); err != nil { - e.prepareErr = err - return - } else { - e.r, e.w = r, w - } - - ec := make(chan error, 1) - go func(fd uintptr) { - ec <- buildFilter(int(fd), e.opts) - close(ec) - _ = e.closeWrite() - runtime.KeepAlive(e.w) - }(e.w.Fd()) - e.exportErr = ec - runtime.SetFinalizer(e, (*exporter).closeWrite) - }) - return e.prepareErr -} - -func (e *exporter) closeWrite() error { - e.closeOnce.Do(func() { - if e.w == nil { - panic("closeWrite called on invalid exporter") - } - e.closeErr = e.w.Close() - - // no need for a finalizer anymore - runtime.SetFinalizer(e, nil) - }) - - return e.closeErr -} - -func newExporter(opts FilterOpts) *exporter { - return &exporter{opts: opts} -} diff --git a/sandbox/seccomp/libseccomp-helper.c b/sandbox/seccomp/libseccomp-helper.c new file mode 100644 index 0000000..aa11b6f --- /dev/null +++ b/sandbox/seccomp/libseccomp-helper.c @@ -0,0 +1,130 @@ +#ifndef _GNU_SOURCE +#define _GNU_SOURCE /* CLONE_NEWUSER */ +#endif + +#include "libseccomp-helper.h" +#include +#include +#include + +#define LEN(arr) (sizeof(arr) / sizeof((arr)[0])) + +int32_t hakurei_prepare_filter(int *ret_p, int fd, uint32_t arch, + uint32_t multiarch, + struct hakurei_syscall_rule *rules, + size_t rules_sz, hakurei_prepare_flag flags) { + int i; + int last_allowed_family; + int disallowed; + struct hakurei_syscall_rule *rule; + + int32_t res = 0; /* refer to resPrefix for message */ + + /* Blocklist all but unix, inet, inet6 and netlink */ + struct { + int family; + hakurei_prepare_flag flags_mask; + } 
socket_family_allowlist[] = { + /* NOTE: Keep in numerical order */ + {AF_UNSPEC, 0}, + {AF_LOCAL, 0}, + {AF_INET, 0}, + {AF_INET6, 0}, + {AF_NETLINK, 0}, + {AF_CAN, HAKUREI_PREPARE_CAN}, + {AF_BLUETOOTH, HAKUREI_PREPARE_BLUETOOTH}, + }; + + scmp_filter_ctx ctx = seccomp_init(SCMP_ACT_ALLOW); + if (ctx == NULL) { + res = 1; + goto out; + } else + errno = 0; + + /* We only really need to handle arches on multiarch systems. + * If only one arch is supported the default is fine */ + if (arch != 0) { + /* This *adds* the target arch, instead of replacing the + * native one. This is not ideal, because we'd like to only + * allow the target arch, but we can't really disallow the + * native arch at this point, because then bubblewrap + * couldn't continue running. */ + *ret_p = seccomp_arch_add(ctx, arch); + if (*ret_p < 0 && *ret_p != -EEXIST) { + res = 2; + goto out; + } + + if (flags & HAKUREI_PREPARE_MULTIARCH && multiarch != 0) { + *ret_p = seccomp_arch_add(ctx, multiarch); + if (*ret_p < 0 && *ret_p != -EEXIST) { + res = 3; + goto out; + } + } + } + + for (i = 0; i < rules_sz; i++) { + rule = &rules[i]; + assert(rule->m_errno == EPERM || rule->m_errno == ENOSYS); + + if (rule->arg) + *ret_p = seccomp_rule_add(ctx, SCMP_ACT_ERRNO(rule->m_errno), + rule->syscall, 1, *rule->arg); + else + *ret_p = seccomp_rule_add(ctx, SCMP_ACT_ERRNO(rule->m_errno), + rule->syscall, 0); + + if (*ret_p == -EFAULT) { + res = 4; + goto out; + } else if (*ret_p < 0) { + res = 5; + goto out; + } + } + + /* Socket filtering doesn't work on e.g. 
i386, so ignore failures here + * However, we need to use seccomp_rule_add_exact to avoid libseccomp doing + * something else: https://github.com/seccomp/libseccomp/issues/8 */ + last_allowed_family = -1; + for (i = 0; i < LEN(socket_family_allowlist); i++) { + if (socket_family_allowlist[i].flags_mask != 0 && + (socket_family_allowlist[i].flags_mask & flags) != + socket_family_allowlist[i].flags_mask) + continue; + + for (disallowed = last_allowed_family + 1; + disallowed < socket_family_allowlist[i].family; disallowed++) { + /* Blocklist the in-between valid families */ + seccomp_rule_add_exact(ctx, SCMP_ACT_ERRNO(EAFNOSUPPORT), + SCMP_SYS(socket), 1, + SCMP_A0(SCMP_CMP_EQ, disallowed)); + } + last_allowed_family = socket_family_allowlist[i].family; + } + /* Blocklist the rest */ + seccomp_rule_add_exact(ctx, SCMP_ACT_ERRNO(EAFNOSUPPORT), SCMP_SYS(socket), 1, + SCMP_A0(SCMP_CMP_GE, last_allowed_family + 1)); + + if (fd < 0) { + *ret_p = seccomp_load(ctx); + if (*ret_p != 0) { + res = 7; + goto out; + } + } else { + *ret_p = seccomp_export_bpf(ctx, fd); + if (*ret_p != 0) { + res = 6; + goto out; + } + } + +out: + if (ctx) + seccomp_release(ctx); + + return res; +} diff --git a/sandbox/seccomp/libseccomp-helper.h b/sandbox/seccomp/libseccomp-helper.h new file mode 100644 index 0000000..79a13de --- /dev/null +++ b/sandbox/seccomp/libseccomp-helper.h @@ -0,0 +1,24 @@ +#include +#include + +#if (SCMP_VER_MAJOR < 2) || (SCMP_VER_MAJOR == 2 && SCMP_VER_MINOR < 5) || \ + (SCMP_VER_MAJOR == 2 && SCMP_VER_MINOR == 5 && SCMP_VER_MICRO < 1) +#error This package requires libseccomp >= v2.5.1 +#endif + +typedef enum { + HAKUREI_PREPARE_MULTIARCH = 1 << 0, + HAKUREI_PREPARE_CAN = 1 << 1, + HAKUREI_PREPARE_BLUETOOTH = 1 << 2, +} hakurei_prepare_flag; + +struct hakurei_syscall_rule { + int syscall; + int m_errno; + struct scmp_arg_cmp *arg; +}; + +int32_t hakurei_prepare_filter(int *ret_p, int fd, uint32_t arch, + uint32_t multiarch, + struct hakurei_syscall_rule *rules, + 
size_t rules_sz, hakurei_prepare_flag flags); \ No newline at end of file diff --git a/sandbox/seccomp/libseccomp.go b/sandbox/seccomp/libseccomp.go new file mode 100644 index 0000000..58ae859 --- /dev/null +++ b/sandbox/seccomp/libseccomp.go @@ -0,0 +1,183 @@ +package seccomp + +/* +#cgo linux pkg-config: --static libseccomp + +#include +*/ +import "C" +import ( + "errors" + "fmt" + "runtime" + "syscall" + "unsafe" +) + +var ( + ErrInvalidRules = errors.New("invalid native rules slice") +) + +// LibraryError represents a libseccomp error. +type LibraryError struct { + Prefix string + Seccomp syscall.Errno + Errno error +} + +func (e *LibraryError) Error() string { + if e.Seccomp == 0 { + if e.Errno == nil { + panic("invalid libseccomp error") + } + return fmt.Sprintf("%s: %s", e.Prefix, e.Errno) + } + if e.Errno == nil { + return fmt.Sprintf("%s: %s", e.Prefix, e.Seccomp) + } + return fmt.Sprintf("%s: %s (%s)", e.Prefix, e.Seccomp, e.Errno) +} + +func (e *LibraryError) Is(err error) bool { + if e == nil { + return err == nil + } + if ef, ok := err.(*LibraryError); ok { + return *e == *ef + } + return (e.Seccomp != 0 && errors.Is(err, e.Seccomp)) || + (e.Errno != nil && errors.Is(err, e.Errno)) +} + +// A NativeRule specifies an arch-specific action taken by seccomp under certain conditions. +type NativeRule struct { + // Syscall is the arch-dependent syscall number to act against. + Syscall C.int + // Errno is the errno value to return when the condition is satisfied. + Errno C.int + // Arg is the optional struct scmp_arg_cmp passed to libseccomp. + Arg *ScmpArgCmp +} + +type PrepareFlag = C.hakurei_prepare_flag + +const ( + // AllowMultiarch allows multiarch/emulation. + AllowMultiarch PrepareFlag = C.HAKUREI_PREPARE_MULTIARCH + // AllowCAN allows AF_CAN. + AllowCAN PrepareFlag = C.HAKUREI_PREPARE_CAN + // AllowBluetooth allows AF_BLUETOOTH. 
+ AllowBluetooth PrepareFlag = C.HAKUREI_PREPARE_BLUETOOTH +) + +var resPrefix = [...]string{ + 0: "", + 1: "seccomp_init failed", + 2: "seccomp_arch_add failed", + 3: "seccomp_arch_add failed (multiarch)", + 4: "internal libseccomp failure", + 5: "seccomp_rule_add failed", + 6: "seccomp_export_bpf failed", + 7: "seccomp_load failed", +} + +// Prepare streams filter contents to fd, or installs it to the current process if fd < 0. +func Prepare(fd int, rules []NativeRule, flags PrepareFlag) error { + if len(rules) == 0 { + return ErrInvalidRules + } + + var ( + arch C.uint32_t = 0 + multiarch C.uint32_t = 0 + ) + switch runtime.GOARCH { + case "386": + arch = C.SCMP_ARCH_X86 + case "amd64": + arch = C.SCMP_ARCH_X86_64 + multiarch = C.SCMP_ARCH_X86 + case "arm": + arch = C.SCMP_ARCH_ARM + case "arm64": + arch = C.SCMP_ARCH_AARCH64 + multiarch = C.SCMP_ARCH_ARM + } + + var ret C.int + + rulesPinner := new(runtime.Pinner) + for i := range rules { + rule := &rules[i] + rulesPinner.Pin(rule) + if rule.Arg != nil { + rulesPinner.Pin(rule.Arg) + } + } + res, err := C.hakurei_prepare_filter( + &ret, C.int(fd), + arch, multiarch, + (*C.struct_hakurei_syscall_rule)(unsafe.Pointer(&rules[0])), + C.size_t(len(rules)), + flags, + ) + rulesPinner.Unpin() + + if prefix := resPrefix[res]; prefix != "" { + return &LibraryError{ + prefix, + -syscall.Errno(ret), + err, + } + } + return err +} + +// ScmpCompare is the equivalent of scmp_compare; +// Comparison operators +type ScmpCompare = C.enum_scmp_compare + +const ( + _SCMP_CMP_MIN = C._SCMP_CMP_MIN + + // not equal + SCMP_CMP_NE = C.SCMP_CMP_NE + // less than + SCMP_CMP_LT = C.SCMP_CMP_LT + // less than or equal + SCMP_CMP_LE = C.SCMP_CMP_LE + // equal + SCMP_CMP_EQ = C.SCMP_CMP_EQ + // greater than or equal + SCMP_CMP_GE = C.SCMP_CMP_GE + // greater than + SCMP_CMP_GT = C.SCMP_CMP_GT + // masked equality + SCMP_CMP_MASKED_EQ = C.SCMP_CMP_MASKED_EQ + + _SCMP_CMP_MAX = C._SCMP_CMP_MAX +) + +// ScmpDatum is the equivalent of 
scmp_datum_t; +// Argument datum +type ScmpDatum uint64 + +// ScmpArgCmp is the equivalent of struct scmp_arg_cmp; +// Argument / Value comparison definition +type ScmpArgCmp struct { + // argument number, starting at 0 + arg C.uint + // the comparison op, e.g. SCMP_CMP_* + op ScmpCompare + + datum_a, datum_b ScmpDatum +} + +// only used for testing +func syscallResolveName(s string) (trap int) { + v := C.CString(s) + trap = int(C.seccomp_syscall_resolve_name(v)) + C.free(unsafe.Pointer(v)) + + return +} diff --git a/sandbox/seccomp/export_test.go b/sandbox/seccomp/libseccomp_test.go similarity index 81% rename from sandbox/seccomp/export_test.go rename to sandbox/seccomp/libseccomp_test.go index 482d6e5..236a8d8 100644 --- a/sandbox/seccomp/export_test.go +++ b/sandbox/seccomp/libseccomp_test.go @@ -8,17 +8,18 @@ import ( "syscall" "testing" - "git.gensokyo.uk/security/hakurei/sandbox/seccomp" + . "git.gensokyo.uk/security/hakurei/sandbox/seccomp" ) func TestExport(t *testing.T) { testCases := []struct { name string - opts seccomp.FilterOpts + presets FilterPreset + flags PrepareFlag want []byte wantErr bool }{ - {"compat", 0, []byte{ + {"compat", 0, 0, []byte{ 0x95, 0xec, 0x69, 0xd0, 0x17, 0x73, 0x3e, 0x07, 0x21, 0x60, 0xe0, 0xda, 0x80, 0xfd, 0xeb, 0xec, 0xdf, 0x27, 0xae, 0x81, 0x66, 0xf5, 0xe2, 0xa7, @@ -28,7 +29,7 @@ func TestExport(t *testing.T) { 0xa7, 0x9b, 0x07, 0x0e, 0x04, 0xc0, 0xee, 0x9a, 0xcd, 0xf5, 0x8f, 0x55, 0xcf, 0xa8, 0x15, 0xa5, }, false}, - {"base", seccomp.FilterExt, []byte{ + {"base", PresetExt, 0, []byte{ 0xdc, 0x7f, 0x2e, 0x1c, 0x5e, 0x82, 0x9b, 0x79, 0xeb, 0xb7, 0xef, 0xc7, 0x59, 0x15, 0x0f, 0x54, 0xa8, 0x3a, 0x75, 0xc8, 0xdf, 0x6f, 0xee, 0x4d, @@ -38,10 +39,10 @@ func TestExport(t *testing.T) { 0x1d, 0xb0, 0x5d, 0x90, 0x99, 0x7c, 0x86, 0x59, 0xb9, 0x58, 0x91, 0x20, 0x6a, 0xc9, 0x95, 0x2d, }, false}, - {"everything", seccomp.FilterExt | - seccomp.FilterDenyNS | seccomp.FilterDenyTTY | seccomp.FilterDenyDevel | - seccomp.FilterMultiarch | 
seccomp.FilterLinux32 | seccomp.FilterCan | - seccomp.FilterBluetooth, []byte{ + {"everything", PresetExt | + PresetDenyNS | PresetDenyTTY | PresetDenyDevel | + PresetLinux32, AllowMultiarch | AllowCAN | + AllowBluetooth, []byte{ 0xe9, 0x9d, 0xd3, 0x45, 0xe1, 0x95, 0x41, 0x34, 0x73, 0xd3, 0xcb, 0xee, 0x07, 0xb4, 0xed, 0x57, 0xb9, 0x08, 0xbf, 0xa8, 0x9e, 0xa2, 0x07, 0x2f, @@ -51,7 +52,7 @@ func TestExport(t *testing.T) { 0x4c, 0x02, 0x4e, 0xd4, 0x88, 0x50, 0xbe, 0x69, 0xb6, 0x8a, 0x9a, 0x4c, 0x5f, 0x53, 0xa9, 0xdb, }, false}, - {"strict", seccomp.PresetStrict, []byte{ + {"strict", PresetStrict, 0, []byte{ 0xe8, 0x80, 0x29, 0x8d, 0xf2, 0xbd, 0x67, 0x51, 0xd0, 0x04, 0x0f, 0xc2, 0x1b, 0xc0, 0xed, 0x4c, 0x00, 0xf9, 0x5d, 0xc0, 0xd7, 0xba, 0x50, 0x6c, @@ -62,7 +63,7 @@ func TestExport(t *testing.T) { 0x14, 0x89, 0x60, 0xfb, 0xd3, 0x5c, 0xd7, 0x35, }, false}, {"strict compat", 0 | - seccomp.FilterDenyNS | seccomp.FilterDenyTTY | seccomp.FilterDenyDevel, []byte{ + PresetDenyNS | PresetDenyTTY | PresetDenyDevel, 0, []byte{ 0x39, 0x87, 0x1b, 0x93, 0xff, 0xaf, 0xc8, 0xb9, 0x79, 0xfc, 0xed, 0xc0, 0xb0, 0xc3, 0x7b, 0x9e, 0x03, 0x92, 0x2f, 0x5b, 0x02, 0x74, 0x8d, 0xc5, @@ -72,7 +73,7 @@ func TestExport(t *testing.T) { 0x80, 0x8b, 0x1a, 0x6f, 0x84, 0xf3, 0x2b, 0xbd, 0xe1, 0xaa, 0x02, 0xae, 0x30, 0xee, 0xdc, 0xfa, }, false}, - {"hakurei default", seccomp.FilterExt | seccomp.FilterDenyDevel, []byte{ + {"hakurei default", PresetExt | PresetDenyDevel, 0, []byte{ 0xc6, 0x98, 0xb0, 0x81, 0xff, 0x95, 0x7a, 0xfe, 0x17, 0xa6, 0xd9, 0x43, 0x74, 0x53, 0x7d, 0x37, 0xf2, 0xa6, 0x3f, 0x6f, 0x9d, 0xd7, 0x5d, 0xa7, @@ -87,11 +88,7 @@ func TestExport(t *testing.T) { buf := make([]byte, 8) for _, tc := range testCases { t.Run(tc.name, func(t *testing.T) { - oldF := seccomp.GetOutput() - seccomp.SetOutput(t.Log) - t.Cleanup(func() { seccomp.SetOutput(oldF) }) - - e := seccomp.New(tc.opts) + e := New(tc.presets, tc.flags) digest := sha512.New() if _, err := io.CopyBuffer(digest, e, buf); (err != 
nil) != tc.wantErr { @@ -100,7 +97,6 @@ func TestExport(t *testing.T) { } if err := e.Close(); err != nil { t.Errorf("Close: error = %v", err) - return } if got := digest.Sum(nil); !slices.Equal(got, tc.want) { t.Fatalf("Export() hash = %x, want %x", @@ -111,7 +107,7 @@ func TestExport(t *testing.T) { } t.Run("close without use", func(t *testing.T) { - e := seccomp.New(0) + e := New(0, 0) if err := e.Close(); !errors.Is(err, syscall.EINVAL) { t.Errorf("Close: error = %v", err) return @@ -119,7 +115,7 @@ func TestExport(t *testing.T) { }) t.Run("close partial read", func(t *testing.T) { - e := seccomp.New(0) + e := New(0, 0) if _, err := e.Read(nil); err != nil { t.Errorf("Read: error = %v", err) return @@ -137,10 +133,9 @@ func TestExport(t *testing.T) { func BenchmarkExport(b *testing.B) { buf := make([]byte, 8) for i := 0; i < b.N; i++ { - e := seccomp.New(seccomp.FilterExt | - seccomp.FilterDenyNS | seccomp.FilterDenyTTY | seccomp.FilterDenyDevel | - seccomp.FilterMultiarch | seccomp.FilterLinux32 | seccomp.FilterCan | - seccomp.FilterBluetooth) + e := New(PresetExt| + PresetDenyNS|PresetDenyTTY|PresetDenyDevel|PresetLinux32, + AllowMultiarch|AllowCAN|AllowBluetooth) if _, err := io.CopyBuffer(io.Discard, e, buf); err != nil { b.Fatalf("cannot export: %v", err) } diff --git a/sandbox/seccomp/output.go b/sandbox/seccomp/output.go deleted file mode 100644 index 27d5098..0000000 --- a/sandbox/seccomp/output.go +++ /dev/null @@ -1,30 +0,0 @@ -package seccomp - -import "C" -import "sync/atomic" - -var printlnP atomic.Pointer[func(v ...any)] - -func SetOutput(f func(v ...any)) { - if f == nil { - // avoid storing nil function - printlnP.Store(nil) - } else { - printlnP.Store(&f) - } -} - -func GetOutput() func(v ...any) { - if fp := printlnP.Load(); fp == nil { - return nil - } else { - return *fp - } -} - -//export hakurei_println -func hakurei_println(v *C.char) { - if fp := printlnP.Load(); fp != nil { - (*fp)(C.GoString(v)) - } -} diff --git 
a/sandbox/seccomp/presets.go b/sandbox/seccomp/presets.go new file mode 100644 index 0000000..caf08c0 --- /dev/null +++ b/sandbox/seccomp/presets.go @@ -0,0 +1,230 @@ +package seccomp + +/* flatpak commit 4c3bf179e2e4a2a298cd1db1d045adaf3f564532 */ + +import "C" +import ( + . "syscall" +) + +type FilterPreset int + +const ( + // PresetExt are project-specific extensions. + PresetExt FilterPreset = 1 << iota + // PresetDenyNS denies namespace setup syscalls. + PresetDenyNS + // PresetDenyTTY denies faking input. + PresetDenyTTY + // PresetDenyDevel denies development-related syscalls. + PresetDenyDevel + // PresetLinux32 sets PER_LINUX32. + PresetLinux32 +) + +func preparePreset(fd int, presets FilterPreset, flags PrepareFlag) error { + allowedPersonality := PER_LINUX + if presets&PresetLinux32 != 0 { + allowedPersonality = PER_LINUX32 + } + presetDevelFinal := presetDevel(ScmpDatum(allowedPersonality)) + + l := len(presetCommon) + if presets&PresetDenyNS != 0 { + l += len(presetNamespace) + } + if presets&PresetDenyTTY != 0 { + l += len(presetTTY) + } + if presets&PresetDenyDevel != 0 { + l += len(presetDevelFinal) + } + if flags&AllowMultiarch == 0 { + l += len(presetEmu) + } + if presets&PresetExt != 0 { + l += len(presetCommonExt) + if presets&PresetDenyNS != 0 { + l += len(presetNamespaceExt) + } + if flags&AllowMultiarch == 0 { + l += len(presetEmuExt) + } + } + + rules := make([]NativeRule, 0, l) + rules = append(rules, presetCommon...) + if presets&PresetDenyNS != 0 { + rules = append(rules, presetNamespace...) + } + if presets&PresetDenyTTY != 0 { + rules = append(rules, presetTTY...) + } + if presets&PresetDenyDevel != 0 { + rules = append(rules, presetDevelFinal...) + } + if flags&AllowMultiarch == 0 { + rules = append(rules, presetEmu...) + } + if presets&PresetExt != 0 { + rules = append(rules, presetCommonExt...) + if presets&PresetDenyNS != 0 { + rules = append(rules, presetNamespaceExt...) 
+ } + if flags&AllowMultiarch == 0 { + rules = append(rules, presetEmuExt...) + } + } + + return Prepare(fd, rules, flags) +} + +var ( + presetCommon = []NativeRule{ + /* Block dmesg */ + {C.int(SYS_SYSLOG), C.int(EPERM), nil}, + /* Useless old syscall */ + {C.int(SYS_USELIB), C.int(EPERM), nil}, + /* Don't allow disabling accounting */ + {C.int(SYS_ACCT), C.int(EPERM), nil}, + /* Don't allow reading current quota use */ + {C.int(SYS_QUOTACTL), C.int(EPERM), nil}, + + /* Don't allow access to the kernel keyring */ + {C.int(SYS_ADD_KEY), C.int(EPERM), nil}, + {C.int(SYS_KEYCTL), C.int(EPERM), nil}, + {C.int(SYS_REQUEST_KEY), C.int(EPERM), nil}, + + /* Scary VM/NUMA ops */ + {C.int(SYS_MOVE_PAGES), C.int(EPERM), nil}, + {C.int(SYS_MBIND), C.int(EPERM), nil}, + {C.int(SYS_GET_MEMPOLICY), C.int(EPERM), nil}, + {C.int(SYS_SET_MEMPOLICY), C.int(EPERM), nil}, + {C.int(SYS_MIGRATE_PAGES), C.int(EPERM), nil}, + } + + /* hakurei: project-specific extensions */ + presetCommonExt = []NativeRule{ + /* system calls for changing the system clock */ + {C.int(SYS_ADJTIMEX), C.int(EPERM), nil}, + {C.int(SYS_CLOCK_ADJTIME), C.int(EPERM), nil}, + {C.int(SYS_CLOCK_ADJTIME64), C.int(EPERM), nil}, + {C.int(SYS_CLOCK_SETTIME), C.int(EPERM), nil}, + {C.int(SYS_CLOCK_SETTIME64), C.int(EPERM), nil}, + {C.int(SYS_SETTIMEOFDAY), C.int(EPERM), nil}, + + /* loading and unloading of kernel modules */ + {C.int(SYS_DELETE_MODULE), C.int(EPERM), nil}, + {C.int(SYS_FINIT_MODULE), C.int(EPERM), nil}, + {C.int(SYS_INIT_MODULE), C.int(EPERM), nil}, + + /* system calls for rebooting and reboot preparation */ + {C.int(SYS_KEXEC_FILE_LOAD), C.int(EPERM), nil}, + {C.int(SYS_KEXEC_LOAD), C.int(EPERM), nil}, + {C.int(SYS_REBOOT), C.int(EPERM), nil}, + + /* system calls for enabling/disabling swap devices */ + {C.int(SYS_SWAPOFF), C.int(EPERM), nil}, + {C.int(SYS_SWAPON), C.int(EPERM), nil}, + } + + presetNamespace = []NativeRule{ + /* Don't allow subnamespace setups: */ + {C.int(SYS_UNSHARE), C.int(EPERM), 
nil}, + {C.int(SYS_SETNS), C.int(EPERM), nil}, + {C.int(SYS_MOUNT), C.int(EPERM), nil}, + {C.int(SYS_UMOUNT), C.int(EPERM), nil}, + {C.int(SYS_UMOUNT2), C.int(EPERM), nil}, + {C.int(SYS_PIVOT_ROOT), C.int(EPERM), nil}, + {C.int(SYS_CHROOT), C.int(EPERM), nil}, + {C.int(SYS_CLONE), C.int(EPERM), + &ScmpArgCmp{cloneArg, SCMP_CMP_MASKED_EQ, CLONE_NEWUSER, CLONE_NEWUSER}}, + + /* seccomp can't look into clone3()'s struct clone_args to check whether + * the flags are OK, so we have no choice but to block clone3(). + * Return ENOSYS so user-space will fall back to clone(). + * (CVE-2021-41133; see also https://github.com/moby/moby/commit/9f6b562d) + */ + {C.int(SYS_CLONE3), C.int(ENOSYS), nil}, + + /* New mount manipulation APIs can also change our VFS. There's no + * legitimate reason to do these in the sandbox, so block all of them + * rather than thinking about which ones might be dangerous. + * (CVE-2021-41133) */ + {C.int(SYS_OPEN_TREE), C.int(ENOSYS), nil}, + {C.int(SYS_MOVE_MOUNT), C.int(ENOSYS), nil}, + {C.int(SYS_FSOPEN), C.int(ENOSYS), nil}, + {C.int(SYS_FSCONFIG), C.int(ENOSYS), nil}, + {C.int(SYS_FSMOUNT), C.int(ENOSYS), nil}, + {C.int(SYS_FSPICK), C.int(ENOSYS), nil}, + {C.int(SYS_MOUNT_SETATTR), C.int(ENOSYS), nil}, + } + + /* hakurei: project-specific extensions */ + presetNamespaceExt = []NativeRule{ + /* changing file ownership */ + {C.int(SYS_CHOWN), C.int(EPERM), nil}, + {C.int(SYS_CHOWN32), C.int(EPERM), nil}, + {C.int(SYS_FCHOWN), C.int(EPERM), nil}, + {C.int(SYS_FCHOWN32), C.int(EPERM), nil}, + {C.int(SYS_FCHOWNAT), C.int(EPERM), nil}, + {C.int(SYS_LCHOWN), C.int(EPERM), nil}, + {C.int(SYS_LCHOWN32), C.int(EPERM), nil}, + + /* system calls for changing user ID and group ID credentials */ + {C.int(SYS_SETGID), C.int(EPERM), nil}, + {C.int(SYS_SETGID32), C.int(EPERM), nil}, + {C.int(SYS_SETGROUPS), C.int(EPERM), nil}, + {C.int(SYS_SETGROUPS32), C.int(EPERM), nil}, + {C.int(SYS_SETREGID), C.int(EPERM), nil}, + {C.int(SYS_SETREGID32), C.int(EPERM), 
nil}, + {C.int(SYS_SETRESGID), C.int(EPERM), nil}, + {C.int(SYS_SETRESGID32), C.int(EPERM), nil}, + {C.int(SYS_SETRESUID), C.int(EPERM), nil}, + {C.int(SYS_SETRESUID32), C.int(EPERM), nil}, + {C.int(SYS_SETREUID), C.int(EPERM), nil}, + {C.int(SYS_SETREUID32), C.int(EPERM), nil}, + {C.int(SYS_SETUID), C.int(EPERM), nil}, + {C.int(SYS_SETUID32), C.int(EPERM), nil}, + } + + presetTTY = []NativeRule{ + /* Don't allow faking input to the controlling tty (CVE-2017-5226) */ + {C.int(SYS_IOCTL), C.int(EPERM), + &ScmpArgCmp{1, SCMP_CMP_MASKED_EQ, 0xFFFFFFFF, TIOCSTI}}, + /* In the unlikely event that the controlling tty is a Linux virtual + * console (/dev/tty2 or similar), copy/paste operations have an effect + * similar to TIOCSTI (CVE-2023-28100) */ + {C.int(SYS_IOCTL), C.int(EPERM), + &ScmpArgCmp{1, SCMP_CMP_MASKED_EQ, 0xFFFFFFFF, TIOCLINUX}}, + } + + presetEmu = []NativeRule{ + /* modify_ldt is a historic source of interesting information leaks, + * so it's disabled as a hardening measure. + * However, it is required to run old 16-bit applications + * as well as some Wine patches, so it's allowed in multiarch. */ + {C.int(SYS_MODIFY_LDT), C.int(EPERM), nil}, + } + + /* hakurei: project-specific extensions */ + presetEmuExt = []NativeRule{ + {C.int(SYS_SUBPAGE_PROT), C.int(ENOSYS), nil}, + {C.int(SYS_SWITCH_ENDIAN), C.int(ENOSYS), nil}, + {C.int(SYS_VM86), C.int(ENOSYS), nil}, + {C.int(SYS_VM86OLD), C.int(ENOSYS), nil}, + } +) + +func presetDevel(allowedPersonality ScmpDatum) []NativeRule { + return []NativeRule{ + /* Profiling operations; we expect these to be done by tools from outside + * the sandbox. In particular perf has been the source of many CVEs. 
*/ + {C.int(SYS_PERF_EVENT_OPEN), C.int(EPERM), nil}, + /* Don't allow you to switch to bsd emulation or whatnot */ + {C.int(SYS_PERSONALITY), C.int(EPERM), + &ScmpArgCmp{0, SCMP_CMP_NE, allowedPersonality, 0}}, + + {C.int(SYS_PTRACE), C.int(EPERM), nil}, + } +} diff --git a/sandbox/seccomp/presets_clone_backwards2.go b/sandbox/seccomp/presets_clone_backwards2.go new file mode 100644 index 0000000..6a7a636 --- /dev/null +++ b/sandbox/seccomp/presets_clone_backwards2.go @@ -0,0 +1,7 @@ +//go:build s390 || s390x + +package seccomp + +/* Architectures with CONFIG_CLONE_BACKWARDS2: the child stack + * and flags arguments are reversed so the flags come second */ +const cloneArg = 1 diff --git a/sandbox/seccomp/presets_clone_generic.go b/sandbox/seccomp/presets_clone_generic.go new file mode 100644 index 0000000..9d20890 --- /dev/null +++ b/sandbox/seccomp/presets_clone_generic.go @@ -0,0 +1,6 @@ +//go:build !s390 && !s390x + +package seccomp + +/* Normally the flags come first */ +const cloneArg = 0 diff --git a/sandbox/seccomp/api.go b/sandbox/seccomp/proc.go similarity index 72% rename from sandbox/seccomp/api.go rename to sandbox/seccomp/proc.go index 81fb68c..2541782 100644 --- a/sandbox/seccomp/api.go +++ b/sandbox/seccomp/proc.go @@ -9,15 +9,18 @@ import ( ) const ( - PresetStrict = FilterExt | FilterDenyNS | FilterDenyTTY | FilterDenyDevel - PresetCommon = PresetStrict | FilterMultiarch + PresetStrict = PresetExt | PresetDenyNS | PresetDenyTTY | PresetDenyDevel ) // New returns an inactive Encoder instance. -func New(opts FilterOpts) *Encoder { return &Encoder{newExporter(opts)} } +func New(presets FilterPreset, flags PrepareFlag) *Encoder { + return &Encoder{newExporter(presets, flags)} +} // Load loads a filter into the kernel. -func Load(opts FilterOpts) error { return buildFilter(-1, opts) } +func Load(presets FilterPreset, flags PrepareFlag) error { + return preparePreset(-1, presets, flags) +} /* An Encoder writes a BPF program to an output stream. 
@@ -47,17 +50,20 @@ func (e *Encoder) Close() error { } // NewFile returns an instance of exporter implementing [proc.File]. -func NewFile(opts FilterOpts) proc.File { return &File{opts: opts} } +func NewFile(presets FilterPreset, flags PrepareFlag) proc.File { + return &File{presets: presets, flags: flags} +} // File implements [proc.File] and provides access to the read end of exporter pipe. type File struct { - opts FilterOpts + presets FilterPreset + flags PrepareFlag proc.BaseFile } func (f *File) ErrCount() int { return 2 } func (f *File) Fulfill(ctx context.Context, dispatchErr func(error)) error { - e := newExporter(f.opts) + e := newExporter(f.presets, f.flags) if err := e.prepare(); err != nil { return err } diff --git a/sandbox/seccomp/seccomp-build.c b/sandbox/seccomp/seccomp-build.c deleted file mode 100644 index 3fe47df..0000000 --- a/sandbox/seccomp/seccomp-build.c +++ /dev/null @@ -1,321 +0,0 @@ -#ifndef _GNU_SOURCE -#define _GNU_SOURCE /* CLONE_NEWUSER */ -#endif - -#include "seccomp-build.h" -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#if (SCMP_VER_MAJOR < 2) || (SCMP_VER_MAJOR == 2 && SCMP_VER_MINOR < 5) || \ - (SCMP_VER_MAJOR == 2 && SCMP_VER_MINOR == 5 && SCMP_VER_MICRO < 1) -#error This package requires libseccomp >= v2.5.1 -#endif - -struct hakurei_syscall_act { - int syscall; - int m_errno; - struct scmp_arg_cmp *arg; -}; - -#define LEN(arr) (sizeof(arr) / sizeof((arr)[0])) - -#define SECCOMP_RULESET_ADD(ruleset) \ - do { \ - if (opts & HAKUREI_VERBOSE) \ - hakurei_println("adding seccomp ruleset \"" #ruleset "\""); \ - for (int i = 0; i < LEN(ruleset); i++) { \ - assert(ruleset[i].m_errno == EPERM || ruleset[i].m_errno == ENOSYS); \ - \ - if (ruleset[i].arg) \ - *ret_p = seccomp_rule_add(ctx, SCMP_ACT_ERRNO(ruleset[i].m_errno), \ - ruleset[i].syscall, 1, *ruleset[i].arg); \ - else \ - *ret_p = seccomp_rule_add(ctx, SCMP_ACT_ERRNO(ruleset[i].m_errno), \ - ruleset[i].syscall, 0); \ - \ - if 
(*ret_p == -EFAULT) { \ - res = 4; \ - goto out; \ - } else if (*ret_p < 0) { \ - res = 5; \ - goto out; \ - } \ - } \ - } while (0) - -int32_t hakurei_build_filter(int *ret_p, int fd, uint32_t arch, - uint32_t multiarch, hakurei_filter_opts opts) { - int32_t res = 0; /* refer to resPrefix for message */ - int allow_multiarch = opts & HAKUREI_MULTIARCH; - int allowed_personality = PER_LINUX; - - if (opts & HAKUREI_LINUX32) - allowed_personality = PER_LINUX32; - - /* flatpak commit 4c3bf179e2e4a2a298cd1db1d045adaf3f564532 */ - - struct hakurei_syscall_act deny_common[] = { - /* Block dmesg */ - {SCMP_SYS(syslog), EPERM}, - /* Useless old syscall */ - {SCMP_SYS(uselib), EPERM}, - /* Don't allow disabling accounting */ - {SCMP_SYS(acct), EPERM}, - /* Don't allow reading current quota use */ - {SCMP_SYS(quotactl), EPERM}, - - /* Don't allow access to the kernel keyring */ - {SCMP_SYS(add_key), EPERM}, - {SCMP_SYS(keyctl), EPERM}, - {SCMP_SYS(request_key), EPERM}, - - /* Scary VM/NUMA ops */ - {SCMP_SYS(move_pages), EPERM}, - {SCMP_SYS(mbind), EPERM}, - {SCMP_SYS(get_mempolicy), EPERM}, - {SCMP_SYS(set_mempolicy), EPERM}, - {SCMP_SYS(migrate_pages), EPERM}, - }; - - /* hakurei: project-specific extensions */ - struct hakurei_syscall_act deny_common_ext[] = { - /* system calls for changing the system clock */ - {SCMP_SYS(adjtimex), EPERM}, - {SCMP_SYS(clock_adjtime), EPERM}, - {SCMP_SYS(clock_adjtime64), EPERM}, - {SCMP_SYS(clock_settime), EPERM}, - {SCMP_SYS(clock_settime64), EPERM}, - {SCMP_SYS(settimeofday), EPERM}, - - /* loading and unloading of kernel modules */ - {SCMP_SYS(delete_module), EPERM}, - {SCMP_SYS(finit_module), EPERM}, - {SCMP_SYS(init_module), EPERM}, - - /* system calls for rebooting and reboot preparation */ - {SCMP_SYS(kexec_file_load), EPERM}, - {SCMP_SYS(kexec_load), EPERM}, - {SCMP_SYS(reboot), EPERM}, - - /* system calls for enabling/disabling swap devices */ - {SCMP_SYS(swapoff), EPERM}, - {SCMP_SYS(swapon), EPERM}, - }; - - struct 
hakurei_syscall_act deny_ns[] = { - /* Don't allow subnamespace setups: */ - {SCMP_SYS(unshare), EPERM}, - {SCMP_SYS(setns), EPERM}, - {SCMP_SYS(mount), EPERM}, - {SCMP_SYS(umount), EPERM}, - {SCMP_SYS(umount2), EPERM}, - {SCMP_SYS(pivot_root), EPERM}, - {SCMP_SYS(chroot), EPERM}, -#if defined(__s390__) || defined(__s390x__) || defined(__CRIS__) - /* Architectures with CONFIG_CLONE_BACKWARDS2: the child stack - * and flags arguments are reversed so the flags come second */ - {SCMP_SYS(clone), EPERM, - &SCMP_A1(SCMP_CMP_MASKED_EQ, CLONE_NEWUSER, CLONE_NEWUSER)}, -#else - /* Normally the flags come first */ - {SCMP_SYS(clone), EPERM, - &SCMP_A0(SCMP_CMP_MASKED_EQ, CLONE_NEWUSER, CLONE_NEWUSER)}, -#endif - - /* seccomp can't look into clone3()'s struct clone_args to check whether - * the flags are OK, so we have no choice but to block clone3(). - * Return ENOSYS so user-space will fall back to clone(). - * (CVE-2021-41133; see also https://github.com/moby/moby/commit/9f6b562d) - */ - {SCMP_SYS(clone3), ENOSYS}, - - /* New mount manipulation APIs can also change our VFS. There's no - * legitimate reason to do these in the sandbox, so block all of them - * rather than thinking about which ones might be dangerous. 
- * (CVE-2021-41133) */ - {SCMP_SYS(open_tree), ENOSYS}, - {SCMP_SYS(move_mount), ENOSYS}, - {SCMP_SYS(fsopen), ENOSYS}, - {SCMP_SYS(fsconfig), ENOSYS}, - {SCMP_SYS(fsmount), ENOSYS}, - {SCMP_SYS(fspick), ENOSYS}, - {SCMP_SYS(mount_setattr), ENOSYS}, - }; - - /* hakurei: project-specific extensions */ - struct hakurei_syscall_act deny_ns_ext[] = { - /* changing file ownership */ - {SCMP_SYS(chown), EPERM}, - {SCMP_SYS(chown32), EPERM}, - {SCMP_SYS(fchown), EPERM}, - {SCMP_SYS(fchown32), EPERM}, - {SCMP_SYS(fchownat), EPERM}, - {SCMP_SYS(lchown), EPERM}, - {SCMP_SYS(lchown32), EPERM}, - - /* system calls for changing user ID and group ID credentials */ - {SCMP_SYS(setgid), EPERM}, - {SCMP_SYS(setgid32), EPERM}, - {SCMP_SYS(setgroups), EPERM}, - {SCMP_SYS(setgroups32), EPERM}, - {SCMP_SYS(setregid), EPERM}, - {SCMP_SYS(setregid32), EPERM}, - {SCMP_SYS(setresgid), EPERM}, - {SCMP_SYS(setresgid32), EPERM}, - {SCMP_SYS(setresuid), EPERM}, - {SCMP_SYS(setresuid32), EPERM}, - {SCMP_SYS(setreuid), EPERM}, - {SCMP_SYS(setreuid32), EPERM}, - {SCMP_SYS(setuid), EPERM}, - {SCMP_SYS(setuid32), EPERM}, - }; - - struct hakurei_syscall_act deny_tty[] = { - /* Don't allow faking input to the controlling tty (CVE-2017-5226) */ - {SCMP_SYS(ioctl), EPERM, - &SCMP_A1(SCMP_CMP_MASKED_EQ, 0xFFFFFFFFu, (int)TIOCSTI)}, - /* In the unlikely event that the controlling tty is a Linux virtual - * console (/dev/tty2 or similar), copy/paste operations have an effect - * similar to TIOCSTI (CVE-2023-28100) */ - {SCMP_SYS(ioctl), EPERM, - &SCMP_A1(SCMP_CMP_MASKED_EQ, 0xFFFFFFFFu, (int)TIOCLINUX)}, - }; - - struct hakurei_syscall_act deny_devel[] = { - /* Profiling operations; we expect these to be done by tools from outside - * the sandbox. In particular perf has been the source of many CVEs. 
*/ - {SCMP_SYS(perf_event_open), EPERM}, - /* Don't allow you to switch to bsd emulation or whatnot */ - {SCMP_SYS(personality), EPERM, - &SCMP_A0(SCMP_CMP_NE, allowed_personality)}, - - {SCMP_SYS(ptrace), EPERM}}; - - struct hakurei_syscall_act deny_emu[] = { - /* modify_ldt is a historic source of interesting information leaks, - * so it's disabled as a hardening measure. - * However, it is required to run old 16-bit applications - * as well as some Wine patches, so it's allowed in multiarch. */ - {SCMP_SYS(modify_ldt), EPERM}, - }; - - /* hakurei: project-specific extensions */ - struct hakurei_syscall_act deny_emu_ext[] = { - {SCMP_SYS(subpage_prot), ENOSYS}, - {SCMP_SYS(switch_endian), ENOSYS}, - {SCMP_SYS(vm86), ENOSYS}, - {SCMP_SYS(vm86old), ENOSYS}, - }; - - /* Blocklist all but unix, inet, inet6 and netlink */ - struct { - int family; - hakurei_filter_opts flags_mask; - } socket_family_allowlist[] = { - /* NOTE: Keep in numerical order */ - {AF_UNSPEC, 0}, - {AF_LOCAL, 0}, - {AF_INET, 0}, - {AF_INET6, 0}, - {AF_NETLINK, 0}, - {AF_CAN, HAKUREI_CAN}, - {AF_BLUETOOTH, HAKUREI_BLUETOOTH}, - }; - - scmp_filter_ctx ctx = seccomp_init(SCMP_ACT_ALLOW); - if (ctx == NULL) { - res = 1; - goto out; - } else - errno = 0; - - /* We only really need to handle arches on multiarch systems. - * If only one arch is supported the default is fine */ - if (arch != 0) { - /* This *adds* the target arch, instead of replacing the - * native one. This is not ideal, because we'd like to only - * allow the target arch, but we can't really disallow the - * native arch at this point, because then bubblewrap - * couldn't continue running. 
*/ - *ret_p = seccomp_arch_add(ctx, arch); - if (*ret_p < 0 && *ret_p != -EEXIST) { - res = 2; - goto out; - } - - if (allow_multiarch && multiarch != 0) { - *ret_p = seccomp_arch_add(ctx, multiarch); - if (*ret_p < 0 && *ret_p != -EEXIST) { - res = 3; - goto out; - } - } - } - - SECCOMP_RULESET_ADD(deny_common); - if (opts & HAKUREI_DENY_NS) - SECCOMP_RULESET_ADD(deny_ns); - if (opts & HAKUREI_DENY_TTY) - SECCOMP_RULESET_ADD(deny_tty); - if (opts & HAKUREI_DENY_DEVEL) - SECCOMP_RULESET_ADD(deny_devel); - if (!allow_multiarch) - SECCOMP_RULESET_ADD(deny_emu); - if (opts & HAKUREI_EXT) { - SECCOMP_RULESET_ADD(deny_common_ext); - if (opts & HAKUREI_DENY_NS) - SECCOMP_RULESET_ADD(deny_ns_ext); - if (!allow_multiarch) - SECCOMP_RULESET_ADD(deny_emu_ext); - } - - /* Socket filtering doesn't work on e.g. i386, so ignore failures here - * However, we need to user seccomp_rule_add_exact to avoid libseccomp doing - * something else: https://github.com/seccomp/libseccomp/issues/8 */ - int last_allowed_family = -1; - for (int i = 0; i < LEN(socket_family_allowlist); i++) { - if (socket_family_allowlist[i].flags_mask != 0 && - (socket_family_allowlist[i].flags_mask & opts) != - socket_family_allowlist[i].flags_mask) - continue; - - for (int disallowed = last_allowed_family + 1; - disallowed < socket_family_allowlist[i].family; disallowed++) { - /* Blocklist the in-between valid families */ - seccomp_rule_add_exact(ctx, SCMP_ACT_ERRNO(EAFNOSUPPORT), - SCMP_SYS(socket), 1, - SCMP_A0(SCMP_CMP_EQ, disallowed)); - } - last_allowed_family = socket_family_allowlist[i].family; - } - /* Blocklist the rest */ - seccomp_rule_add_exact(ctx, SCMP_ACT_ERRNO(EAFNOSUPPORT), SCMP_SYS(socket), 1, - SCMP_A0(SCMP_CMP_GE, last_allowed_family + 1)); - - if (fd < 0) { - *ret_p = seccomp_load(ctx); - if (*ret_p != 0) { - res = 7; - goto out; - } - } else { - *ret_p = seccomp_export_bpf(ctx, fd); - if (*ret_p != 0) { - res = 6; - goto out; - } - } - -out: - if (ctx) - seccomp_release(ctx); - - return 
res; -} diff --git a/sandbox/seccomp/seccomp-build.h b/sandbox/seccomp/seccomp-build.h deleted file mode 100644 index 324e6d2..0000000 --- a/sandbox/seccomp/seccomp-build.h +++ /dev/null @@ -1,23 +0,0 @@ -#include -#include - -#if (SCMP_VER_MAJOR < 2) || (SCMP_VER_MAJOR == 2 && SCMP_VER_MINOR < 5) || \ - (SCMP_VER_MAJOR == 2 && SCMP_VER_MINOR == 5 && SCMP_VER_MICRO < 1) -#error This package requires libseccomp >= v2.5.1 -#endif - -typedef enum { - HAKUREI_VERBOSE = 1 << 0, - HAKUREI_EXT = 1 << 1, - HAKUREI_DENY_NS = 1 << 2, - HAKUREI_DENY_TTY = 1 << 3, - HAKUREI_DENY_DEVEL = 1 << 4, - HAKUREI_MULTIARCH = 1 << 5, - HAKUREI_LINUX32 = 1 << 6, - HAKUREI_CAN = 1 << 7, - HAKUREI_BLUETOOTH = 1 << 8, -} hakurei_filter_opts; - -extern void hakurei_println(char *v); -int32_t hakurei_build_filter(int *ret_p, int fd, uint32_t arch, - uint32_t multiarch, hakurei_filter_opts opts); \ No newline at end of file diff --git a/sandbox/seccomp/seccomp.go b/sandbox/seccomp/seccomp.go index c9a201e..adbe4d0 100644 --- a/sandbox/seccomp/seccomp.go +++ b/sandbox/seccomp/seccomp.go @@ -1,125 +1,60 @@ -// Package seccomp provides filter presets and high level wrappers around libseccomp. +// Package seccomp provides high level wrappers around libseccomp. package seccomp -/* -#cgo linux pkg-config: --static libseccomp - -#include "seccomp-build.h" -*/ -import "C" - import ( - "errors" - "fmt" + "os" "runtime" - "syscall" - "unsafe" + "sync" ) -// LibraryError represents a libseccomp error. 
-type LibraryError struct { - Prefix string - Seccomp syscall.Errno - Errno error +type exporter struct { + presets FilterPreset + flags PrepareFlag + r, w *os.File + + prepareOnce sync.Once + prepareErr error + closeOnce sync.Once + closeErr error + exportErr <-chan error } -func (e *LibraryError) Error() string { - if e.Seccomp == 0 { - if e.Errno == nil { - panic("invalid libseccomp error") +func (e *exporter) prepare() error { + e.prepareOnce.Do(func() { + if r, w, err := os.Pipe(); err != nil { + e.prepareErr = err + return + } else { + e.r, e.w = r, w } - return fmt.Sprintf("%s: %s", e.Prefix, e.Errno) - } - if e.Errno == nil { - return fmt.Sprintf("%s: %s", e.Prefix, e.Seccomp) - } - return fmt.Sprintf("%s: %s (%s)", e.Prefix, e.Seccomp, e.Errno) + + ec := make(chan error, 1) + go func(fd uintptr) { + ec <- preparePreset(int(fd), e.presets, e.flags) + close(ec) + _ = e.closeWrite() + runtime.KeepAlive(e.w) + }(e.w.Fd()) + e.exportErr = ec + runtime.SetFinalizer(e, (*exporter).closeWrite) + }) + return e.prepareErr } -func (e *LibraryError) Is(err error) bool { - if e == nil { - return err == nil - } - if ef, ok := err.(*LibraryError); ok { - return *e == *ef - } - return (e.Seccomp != 0 && errors.Is(err, e.Seccomp)) || - (e.Errno != nil && errors.Is(err, e.Errno)) -} - -var resPrefix = [...]string{ - 0: "", - 1: "seccomp_init failed", - 2: "seccomp_arch_add failed", - 3: "seccomp_arch_add failed (multiarch)", - 4: "internal libseccomp failure", - 5: "seccomp_rule_add failed", - 6: "seccomp_export_bpf failed", - 7: "seccomp_load failed", -} - -type FilterOpts = C.hakurei_filter_opts - -const ( - filterVerbose FilterOpts = C.HAKUREI_VERBOSE - // FilterExt are project-specific extensions. - FilterExt FilterOpts = C.HAKUREI_EXT - // FilterDenyNS denies namespace setup syscalls. - FilterDenyNS FilterOpts = C.HAKUREI_DENY_NS - // FilterDenyTTY denies faking input. 
- FilterDenyTTY FilterOpts = C.HAKUREI_DENY_TTY - // FilterDenyDevel denies development-related syscalls. - FilterDenyDevel FilterOpts = C.HAKUREI_DENY_DEVEL - // FilterMultiarch allows multiarch/emulation. - FilterMultiarch FilterOpts = C.HAKUREI_MULTIARCH - // FilterLinux32 sets PER_LINUX32. - FilterLinux32 FilterOpts = C.HAKUREI_LINUX32 - // FilterCan allows AF_CAN. - FilterCan FilterOpts = C.HAKUREI_CAN - // FilterBluetooth allows AF_BLUETOOTH. - FilterBluetooth FilterOpts = C.HAKUREI_BLUETOOTH -) - -func buildFilter(fd int, opts FilterOpts) error { - var ( - arch C.uint32_t = 0 - multiarch C.uint32_t = 0 - ) - switch runtime.GOARCH { - case "386": - arch = C.SCMP_ARCH_X86 - case "amd64": - arch = C.SCMP_ARCH_X86_64 - multiarch = C.SCMP_ARCH_X86 - case "arm": - arch = C.SCMP_ARCH_ARM - case "arm64": - arch = C.SCMP_ARCH_AARCH64 - multiarch = C.SCMP_ARCH_ARM - } - - // this removes repeated transitions between C and Go execution - // when producing log output via hakurei_println and CPrintln is nil - if fp := printlnP.Load(); fp != nil { - opts |= filterVerbose - } - - var ret C.int - res, err := C.hakurei_build_filter(&ret, C.int(fd), arch, multiarch, opts) - if prefix := resPrefix[res]; prefix != "" { - return &LibraryError{ - prefix, - -syscall.Errno(ret), - err, +func (e *exporter) closeWrite() error { + e.closeOnce.Do(func() { + if e.w == nil { + panic("closeWrite called on invalid exporter") } - } - return err + e.closeErr = e.w.Close() + + // no need for a finalizer anymore + runtime.SetFinalizer(e, nil) + }) + + return e.closeErr } -// only used for testing -func syscallResolveName(s string) (trap int) { - v := C.CString(s) - trap = int(C.seccomp_syscall_resolve_name(v)) - C.free(unsafe.Pointer(v)) - return +func newExporter(presets FilterPreset, flags PrepareFlag) *exporter { + return &exporter{presets: presets, flags: flags} }