fix(trickplay): stop scan-time sprite generation from saturating the host
Some checks failed
CI / Test (push) Failing after 6m21s
CI / Build (push) Successful in 1m34s
CI / Build-1 (push) Successful in 2m0s
CI / Build-2 (push) Successful in 1m33s
CI / Build-3 (push) Successful in 1m38s
CI / Build-4 (push) Successful in 1m35s
CI / Build-5 (push) Successful in 1m38s
CI / Lint (push) Failing after 2m34s
CI / Coverage (push) Failing after 2m44s
CI / Vet (push) Successful in 2m3s
Some checks failed
CI / Test (push) Failing after 6m21s
CI / Build (push) Successful in 1m34s
CI / Build-1 (push) Successful in 2m0s
CI / Build-2 (push) Successful in 1m33s
CI / Build-3 (push) Successful in 1m38s
CI / Build-4 (push) Successful in 1m35s
CI / Build-5 (push) Successful in 1m38s
CI / Lint (push) Failing after 2m34s
CI / Coverage (push) Failing after 2m44s
CI / Vet (push) Successful in 2m3s
Trickplay sprite generation (one full-decode ffmpeg pass per file) could pin a machine: multiple agents on the same library decoded the same 4K file at once, there was no CPU throttling, and crashed/restarted agents orphaned ffmpeg to init (where it ran the full 45-min decode to completion). Stacked orphans spiked a box to load ~140.

- Single-flight lock: an O_CREATE|O_EXCL .lock file in the shared sidecar dir, so two agents watching the same library never decode the same file twice (stale locks are reclaimed after a TTL). Returns ErrTrickplayInProgress, which prewarm treats as a skip rather than a failure.
- Load gate: defer the heavy decode until the 1-min load ≤ max(ratio×NumCPU, 1.5), capped at 15 min so it throttles without ever becoming a permanent off-switch on busy / small hosts. New knob library.prewarm_max_load_ratio (default 0.7).
- Concurrency: trickSem caps trickplay to ONE decode at a time per agent.
- CPU priority: setLowCPUPriority (nice 19) alongside the existing idle ionice.
- No orphans: hardenCmd sets Setpgid + Pdeathsig=SIGKILL, with runtime.LockOSThread around the child so the kernel kills ffmpeg exactly when the agent dies (and not spuriously — golang/go#27505).

Tests: single-flight/stale-reclaim, load-gate immediate/cancel, and an e2e Pdeathsig orphan-kill check.
This commit is contained in:
parent
aba20e2078
commit
c82826bf68
10 changed files with 399 additions and 8 deletions
65
internal/library/mediainfo/harden_linux_test.go
Normal file
65
internal/library/mediainfo/harden_linux_test.go
Normal file
|
|
@ -0,0 +1,65 @@
|
|||
//go:build linux
|
||||
|
||||
package mediainfo
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"os"
|
||||
"os/exec"
|
||||
"strconv"
|
||||
"strings"
|
||||
"syscall"
|
||||
"testing"
|
||||
"time"
|
||||
)
|
||||
|
||||
// TestHardenCmd_KillsChildOnParentDeath is the e2e guarantee for the orphan fix:
|
||||
// a child spawned with hardenCmd must be SIGKILL'd by the kernel the instant its
|
||||
// parent process dies (Pdeathsig), so an agent crash/restart can never leave an
|
||||
// ffmpeg running to ppid 1. It re-execs this test binary as a short-lived helper
|
||||
// that starts `sleep`, prints the sleep PID, then exits — and asserts that PID is
|
||||
// gone afterwards.
|
||||
func TestHardenCmd_KillsChildOnParentDeath(t *testing.T) {
|
||||
if os.Getenv("UNARR_PDEATHSIG_CHILD") == "1" {
|
||||
// Helper role: start a hardened long sleep, announce its PID, then exit so
|
||||
// the kernel fires Pdeathsig on it.
|
||||
cmd := exec.Command("sleep", "120")
|
||||
hardenCmd(cmd)
|
||||
if err := cmd.Start(); err != nil {
|
||||
fmt.Println("ERR", err)
|
||||
os.Exit(2)
|
||||
}
|
||||
fmt.Println(cmd.Process.Pid)
|
||||
os.Exit(0)
|
||||
}
|
||||
|
||||
helper := exec.Command(os.Args[0], "-test.run=TestHardenCmd_KillsChildOnParentDeath", "-test.v")
|
||||
helper.Env = append(os.Environ(), "UNARR_PDEATHSIG_CHILD=1")
|
||||
out, err := helper.Output()
|
||||
if err != nil {
|
||||
t.Fatalf("helper run: %v (out=%q)", err, out)
|
||||
}
|
||||
|
||||
var sleepPID int
|
||||
for _, line := range strings.Split(string(out), "\n") {
|
||||
if n, perr := strconv.Atoi(strings.TrimSpace(line)); perr == nil && n > 0 {
|
||||
sleepPID = n
|
||||
break
|
||||
}
|
||||
}
|
||||
if sleepPID == 0 {
|
||||
t.Fatalf("could not parse child PID from helper output: %q", out)
|
||||
}
|
||||
|
||||
// Give the kernel a moment to deliver SIGKILL after the helper exited.
|
||||
deadline := time.Now().Add(3 * time.Second)
|
||||
for time.Now().Before(deadline) {
|
||||
if syscall.Kill(sleepPID, 0) != nil {
|
||||
return // process gone → Pdeathsig worked
|
||||
}
|
||||
time.Sleep(50 * time.Millisecond)
|
||||
}
|
||||
// Cleanup if it somehow survived, then fail.
|
||||
_ = syscall.Kill(sleepPID, syscall.SIGKILL)
|
||||
t.Fatalf("child %d survived parent death — Pdeathsig not applied (orphan leak)", sleepPID)
|
||||
}
|
||||
Loading…
Add table
Add a link
Reference in a new issue