fix(trickplay): stop scan-time sprite generation from saturating the host
Some checks failed
CI / Test (push) Failing after 6m21s
CI / Build (push) Successful in 1m34s
CI / Build-1 (push) Successful in 2m0s
CI / Build-2 (push) Successful in 1m33s
CI / Build-3 (push) Successful in 1m38s
CI / Build-4 (push) Successful in 1m35s
CI / Build-5 (push) Successful in 1m38s
CI / Lint (push) Failing after 2m34s
CI / Coverage (push) Failing after 2m44s
CI / Vet (push) Successful in 2m3s

Trickplay sprite generation (one full-decode ffmpeg pass per file) could pin a
machine: multiple agents on the same library decoded the same 4K file at once, no
CPU throttling, and crashed/restarted agents orphaned ffmpeg to init (it ran the
full 45-min decode to completion). Stacked orphans spiked a box to load ~140.

- Single-flight lock: O_CREATE|O_EXCL .lock in the shared sidecar dir so two
  agents watching the same library never decode the same file twice (stale locks
  reclaimed after a TTL). Returns ErrTrickplayInProgress → prewarm skips, not fail.
- Load gate: defer the heavy decode until 1-min load ≤ max(ratio×NumCPU, 1.5),
  capped at 15 min so it throttles without ever becoming a permanent off-switch on
  busy / small hosts. New knob library.prewarm_max_load_ratio (default 0.7).
- Concurrency: trickSem caps trickplay to ONE decode at a time per agent.
- CPU priority: setLowCPUPriority (nice 19) alongside the existing idle ionice.
- No orphans: hardenCmd sets Setpgid + Pdeathsig=SIGKILL, with runtime.LockOSThread
  around the child so the kernel kills ffmpeg exactly when the agent dies (and not
  spuriously — golang/go#27505).

Tests: single-flight/stale-reclaim, load-gate immediate/cancel, and an e2e
Pdeathsig orphan-kill check.
This commit is contained in:
Deivid Soto 2026-06-04 08:25:00 +02:00
parent aba20e2078
commit c82826bf68
10 changed files with 399 additions and 8 deletions

View file

@ -2,8 +2,10 @@ package library
import (
"context"
"errors"
"log"
"math"
"runtime"
"sync"
"time"
@ -33,6 +35,12 @@ type PrewarmOptions struct {
Trickplay bool
TrickplayIntervalSec float64
TrickplayWidth int
// MaxLoadRatio gates the heavy trickplay decode on system load: a job is
// deferred while the 1-min load average is above max(MaxLoadRatio×NumCPU, 1.5),
// for at most prewarmLoadWaitCap, so sprite generation avoids saturating the
// machine or the NAS without being permanently switched off on busy or small
// hosts. ≤0 → default 0.7. Has no effect on platforms without a load reading
// (proceeds unthrottled).
MaxLoadRatio float64
}
// prewarmJob is one extraction unit: all text subtitles of a file in one ffmpeg
@ -65,6 +73,15 @@ func PrewarmSidecars(ctx context.Context, cache *LibraryCache, opts PrewarmOptio
if workers < 1 {
workers = 2
}
maxLoadRatio := opts.MaxLoadRatio
if maxLoadRatio <= 0 {
maxLoadRatio = 0.7
}
// Trickplay is the heaviest job (full 4K decode). Cap it to ONE concurrent
// decode across this agent's workers — the thumbnail/subtitle jobs (light /
// I/O-bound) keep their `workers` parallelism. Cross-agent dup work is stopped
// by the per-file lock inside GenerateTrickplay.
trickSem := make(chan struct{}, 1)
jobs := make(chan prewarmJob)
var wg sync.WaitGroup
@ -115,19 +132,39 @@ func PrewarmSidecars(ctx context.Context, cache *LibraryCache, opts PrewarmOptio
if _, ok := mediainfo.ReadCachedTrickplay(j.path, j.width); ok {
continue
}
// Serialize the heavy decode (1 at a time) and wait for the box to
// be idle enough before starting — sprite generation must never
// saturate the CPU or the NAS.
select {
case trickSem <- struct{}{}:
case <-ctx.Done():
return
}
waitForLowLoad(ctx, maxLoadRatio)
if ctx.Err() != nil {
<-trickSem
return
}
// Full-decode pass (samples 1 frame per interval over the whole
// file) — generous deadline like subtitles; idempotent + cached.
// INVARIANT: this deadline MUST stay below mediainfo.trickplayLockTTL,
// or another agent could reclaim a still-running job's lock and double
// the decode. If you raise this, raise trickplayLockTTL too.
jctx, cancel := context.WithTimeout(ctx, 45*time.Minute)
_, err := mediainfo.GenerateTrickplay(jctx, opts.FFmpegPath, j.path, opts.TrickplayIntervalSec, j.width, j.duration)
cancel()
<-trickSem
mu.Lock()
if err != nil {
switch {
case err == nil:
trickCached++
case errors.Is(err, mediainfo.ErrTrickplayInProgress):
// another worker/agent owns this file — skip, not a failure.
default:
failed++
if sampleErr == "" {
sampleErr = err.Error()
}
} else {
trickCached++
}
mu.Unlock()
continue
@ -239,6 +276,47 @@ func PrewarmSidecars(ctx context.Context, cache *LibraryCache, opts PrewarmOptio
}
}
// prewarmLoadWaitCap bounds how long the load gate DEFERS a trickplay job. It's a
// throttle, not an off-switch: on a host whose baseline load is permanently above
// the threshold (a shared prod box, or any 1–2 core machine), an unbounded wait
// would mean sprites NEVER generate. After the cap we proceed anyway — the other
// safeguards (single-flight lock, trickSem=1, nice 19 + idle I/O, Pdeathsig) keep
// one throttled decode from saturating the box.
const prewarmLoadWaitCap = 15 * time.Minute
// waitForLowLoad defers until the 1-minute system load is at or below
// max(maxRatio×NumCPU, 1.5), or ctx is cancelled, or prewarmLoadWaitCap elapses —
// so the heavy trickplay decode prefers an idle machine but never stalls forever.
// The 1.5 floor reserves ~one core so the gate can still open on 1–2 core hosts
// (without it, threshold 0.7–1.4 is below almost any active machine's load and the
// feature would be permanently off). No load reading (non-Linux) → returns at once.
//
// It returns nothing: a caller that needs to tell cancellation apart from
// "load is fine" or "cap reached" must check ctx.Err() afterwards.
func waitForLowLoad(ctx context.Context, maxRatio float64) {
	// pollInterval is how often the load average is re-sampled while waiting.
	const pollInterval = 15 * time.Second

	threshold := maxRatio * float64(runtime.NumCPU())
	if threshold < 1.5 {
		threshold = 1.5
	}

	// Explicit timers (instead of time.After) so they are released as soon as we
	// return; otherwise every early return would leave a 15-minute timer pending.
	deadline := time.NewTimer(prewarmLoadWaitCap)
	defer deadline.Stop()
	poll := time.NewTicker(pollInterval)
	defer poll.Stop()

	logged := false
	for {
		load, ok := mediainfo.LoadAverage1()
		if !ok || load <= threshold {
			// Either the platform gives no reading or the box is idle enough.
			return
		}
		if !logged {
			// Log the deferral once per wait, not on every poll.
			log.Printf("[prewarm] system load %.1f > %.1f — deferring trickplay (≤ %s)", load, threshold, prewarmLoadWaitCap)
			logged = true
		}
		select {
		case <-ctx.Done():
			return
		case <-deadline.C:
			log.Printf("[prewarm] load still high after %s — proceeding with throttled trickplay (nice + idle I/O + single-flight still apply)", prewarmLoadWaitCap)
			return
		case <-poll.C:
			// Re-sample the load on the next loop iteration.
		}
	}
}
// thumbPositions returns the sample frame offsets (whole seconds) for an item,
// matching the web panel: fractions of a known runtime, else fixed fallbacks.
func thumbPositions(item LibraryItem) []float64 {