From 196b200d0f19c41730db439c62db9b39e424f21f Mon Sep 17 00:00:00 2001 From: Ophestra Date: Thu, 12 Mar 2026 01:14:03 +0900 Subject: [PATCH] container: expose priority and SCHED_OTHER policy The more explicit API removes the arbitrary limit preventing use of SCHED_OTHER (referred to as SCHED_NORMAL in the kernel). This change also exposes priority value to set. Signed-off-by: Ophestra --- cmd/mbf/main.go | 2 +- container/container.go | 27 +++++++++++++++++++++------ container/std/syscall.go | 5 ++--- container/syscall.go | 4 ++-- internal/outcome/shim.go | 1 + internal/pkg/exec.go | 7 ++++--- test/test.py | 3 +++ 7 files changed, 34 insertions(+), 15 deletions(-) diff --git a/cmd/mbf/main.go b/cmd/mbf/main.go index 72d9068..1a03757 100644 --- a/cmd/mbf/main.go +++ b/cmd/mbf/main.go @@ -87,7 +87,7 @@ func main() { } if flagIdle { - pkg.SchedPolicy = std.SCHED_IDLE + pkg.SetSchedIdle = true } return diff --git a/container/container.go b/container/container.go index ab597b7..ad86fb0 100644 --- a/container/container.go +++ b/container/container.go @@ -38,9 +38,13 @@ type ( Container struct { // Whether the container init should stay alive after its parent terminates. AllowOrphan bool - // Scheduling policy to set via sched_setscheduler(2). The zero value - // skips this call. Supported policies are [SCHED_BATCH], [SCHED_IDLE]. + // Whether to set SchedPolicy and SchedPriority via sched_setscheduler(2). + SetScheduler bool + // Scheduling policy to set via sched_setscheduler(2). SchedPolicy std.SchedPolicy + // Scheduling priority to set via sched_setscheduler(2). The zero value + // implies the minimum value supported by the current SchedPolicy. + SchedPriority std.Int // Cgroup fd, nil to disable. 
Cgroup *int // ExtraFiles passed through to initial process in the container, with @@ -373,7 +377,15 @@ func (p *Container) Start() error { // sched_setscheduler: thread-directed but acts on all processes // created from the calling thread - if p.SchedPolicy > 0 && p.SchedPolicy <= std.SCHED_LAST { + if p.SetScheduler { + if p.SchedPolicy < 0 || p.SchedPolicy > std.SCHED_LAST { + return &StartError{ + Fatal: false, + Step: "set scheduling policy", + Err: EINVAL, + } + } + var param schedParam if priority, err := p.SchedPolicy.GetPriorityMin(); err != nil { return &StartError{ @@ -382,10 +394,13 @@ func (p *Container) Start() error { Err: err, } } else { - param.priority = priority + param.priority = max(priority, p.SchedPriority) } - p.msg.Verbosef("setting scheduling policy %s", p.SchedPolicy) + p.msg.Verbosef( + "setting scheduling policy %s priority %d", + p.SchedPolicy, param.priority, + ) if err := schedSetscheduler( 0, // calling thread p.SchedPolicy, @@ -393,7 +408,7 @@ func (p *Container) Start() error { ); err != nil { return &StartError{ Fatal: true, - Step: "enforce landlock ruleset", + Step: "set scheduling policy", Err: err, } } diff --git a/container/std/syscall.go b/container/std/syscall.go index 8ebf33d..61c5487 100644 --- a/container/std/syscall.go +++ b/container/std/syscall.go @@ -134,7 +134,7 @@ func (policy SchedPolicy) GetPriorityMax() (Int, error) { 0, 0, ) schedPriority[policy][0] = Int(priority) - if schedPriority[policy][0] < 0 { + if errno != 0 { schedPriorityErr[policy][0] = errno } }) @@ -151,10 +151,9 @@ func (policy SchedPolicy) GetPriorityMin() (Int, error) { 0, 0, ) schedPriority[policy][1] = Int(priority) - if schedPriority[policy][1] < 0 { + if errno != 0 { schedPriorityErr[policy][1] = errno } }) return schedPriority[policy][1], schedPriorityErr[policy][1] - } diff --git a/container/syscall.go b/container/syscall.go index cba9b7c..35f6d00 100644 --- a/container/syscall.go +++ b/container/syscall.go @@ -63,12 +63,12 @@ type 
schedParam struct { // // [very subtle to use correctly]: https://www.openwall.com/lists/musl/2016/03/01/4 func schedSetscheduler(tid int, policy std.SchedPolicy, param *schedParam) error { - if r, _, errno := Syscall( + if _, _, errno := Syscall( SYS_SCHED_SETSCHEDULER, uintptr(tid), uintptr(policy), uintptr(unsafe.Pointer(param)), - ); r < 0 { + ); errno != 0 { return errno } return nil diff --git a/internal/outcome/shim.go b/internal/outcome/shim.go index 53b3731..6d3e8d8 100644 --- a/internal/outcome/shim.go +++ b/internal/outcome/shim.go @@ -274,6 +274,7 @@ func shimEntrypoint(k syscallDispatcher) { cancelContainer.Store(&stop) sp := shimPrivate{k: k, id: state.id} z := container.New(ctx, msg) + z.SetScheduler = state.Shim.SchedPolicy > 0 z.SchedPolicy = state.Shim.SchedPolicy z.Params = *stateParams.params z.Stdin, z.Stdout, z.Stderr = os.Stdin, os.Stdout, os.Stderr diff --git a/internal/pkg/exec.go b/internal/pkg/exec.go index 3bcffaf..998dce7 100644 --- a/internal/pkg/exec.go +++ b/internal/pkg/exec.go @@ -39,8 +39,8 @@ type ExecPath struct { W bool } -// SchedPolicy is the [container] scheduling policy. -var SchedPolicy std.SchedPolicy +// SetSchedIdle is whether to set the [std.SCHED_IDLE] scheduling policy. +var SetSchedIdle bool // PromoteLayers returns artifacts with identical-by-content layers promoted to // the highest priority instance, as if mounted via [ExecPath]. 
@@ -413,7 +413,8 @@ func (a *execArtifact) cure(f *FContext, hostNet bool) (err error) { z.ParentPerm = 0700 z.HostNet = hostNet z.Hostname = "cure" - z.SchedPolicy = SchedPolicy + z.SetScheduler = SetSchedIdle + z.SchedPolicy = std.SCHED_IDLE if z.HostNet { z.Hostname = "cure-net" } diff --git a/test/test.py b/test/test.py index 9071f56..57cf208 100644 --- a/test/test.py +++ b/test/test.py @@ -213,6 +213,9 @@ if sched_unset != 0: sched_idle = int(machine.succeed("sudo -u alice -i hakurei -v run --sched=idle cat /proc/self/sched | grep '^policy' | tr -d ' ' | cut -d ':' -f 2")) if sched_idle != 5: raise Exception(f"unexpected idle policy: {sched_idle}") +sched_rr = int(machine.succeed("sudo -u alice -i hakurei -v run --sched=rr cat /proc/self/sched | grep '^policy' | tr -d ' ' | cut -d ':' -f 2")) +if sched_rr != 2: + raise Exception(f"unexpected round-robin policy: {sched_rr}") # Start app (foot) with Wayland enablement: swaymsg("exec ne-foot")