From a6160cd410b1def222018cd88951fcbbf1a65f5c Mon Sep 17 00:00:00 2001
From: Ophestra
Date: Thu, 26 Feb 2026 16:29:47 +0900
Subject: [PATCH] container: set scheduling policy

This is thread-directed so cannot be done externally. The glibc wrapper
exposes this behaviour so most multithreaded programs using it are
straight up incorrect.

Signed-off-by: Ophestra
---
 container/container.go | 20 +++++++++++++++++++
 container/syscall.go   | 45 ++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 65 insertions(+)

diff --git a/container/container.go b/container/container.go
index d76a090..2a6363a 100644
--- a/container/container.go
+++ b/container/container.go
@@ -37,6 +37,9 @@ type (
 	Container struct {
 		// Whether the container init should stay alive after its parent terminates.
 		AllowOrphan bool
+		// Scheduling policy to set via sched_setscheduler(2). The zero value
+		// skips this call. Supported policies are [SCHED_BATCH], [SCHED_IDLE].
+		SchedPolicy int
 		// Cgroup fd, nil to disable.
 		Cgroup *int
 		// ExtraFiles passed through to initial process in the container,
@@ -342,6 +345,23 @@ func (p *Container) Start() error {
 	landlockOut:
 	}
 
+	// sched_setscheduler: thread-directed but acts on all processes
+	// created from that thread
+	if p.SchedPolicy > 0 {
+		p.msg.Verbosef("setting scheduling policy %d", p.SchedPolicy)
+		if err := schedSetscheduler(
+			0, // calling thread
+			p.SchedPolicy,
+			&schedParam{0},
+		); err != nil {
+			return &StartError{
+				Fatal: true,
+				Step:  "set scheduling policy",
+				Err:   err,
+			}
+		}
+	}
+
 	p.msg.Verbose("starting container init")
 	if err := p.cmd.Start(); err != nil {
 		return &StartError{false, "start container init", err, false, true}
diff --git a/container/syscall.go b/container/syscall.go
index ef9fb15..e5f86cf 100644
--- a/container/syscall.go
+++ b/container/syscall.go
@@ -3,6 +3,8 @@ package container
 import (
 	. "syscall"
 	"unsafe"
+
+	"hakurei.app/container/std"
 )
 
 // Prctl manipulates various aspects of the behavior of the calling thread or process.
@@ -41,6 +43,49 @@ func Isatty(fd int) bool {
 	return r == 0
 }
 
+// include/uapi/linux/sched.h
+const (
+	SCHED_NORMAL = iota
+	SCHED_FIFO
+	SCHED_RR
+	SCHED_BATCH
+	_ // SCHED_ISO: reserved but not implemented yet
+	SCHED_IDLE
+	SCHED_DEADLINE
+	SCHED_EXT
+)
+
+// schedParam is equivalent to struct sched_param from include/linux/sched.h.
+type schedParam struct {
+	// sched_priority
+	priority std.ScmpInt
+}
+
+// schedSetscheduler sets both the scheduling policy and parameters for the
+// thread whose ID is specified in tid. If tid equals zero, the scheduling
+// policy and parameters of the calling thread will be set.
+//
+// This function is unexported because it is [very subtle to use correctly]. The
+// function signature in libc is misleading: pid actually refers to a thread ID.
+// The glibc wrapper for this system call ignores this semantic and exposes
+// this counterintuitive behaviour.
+//
+// This function is only called from the container setup thread. Do not reuse
+// this if you do not have something similar in place!
+//
+// [very subtle to use correctly]: https://www.openwall.com/lists/musl/2016/03/01/4
+func schedSetscheduler(tid, policy int, param *schedParam) error {
+	if _, _, errno := Syscall(
+		SYS_SCHED_SETSCHEDULER,
+		uintptr(tid),
+		uintptr(policy),
+		uintptr(unsafe.Pointer(param)),
+	); errno != 0 { // r1 is an unsigned uintptr, so only errno signals failure
+		return errno
+	}
+	return nil
+}
+
 // IgnoringEINTR makes a function call and repeats it if it returns an
 // EINTR error. This appears to be required even though we install all
 // signal handlers with SA_RESTART: see #22838, #38033, #38836, #40846.