fix(trickplay): stop scan-time sprite generation from saturating the host

Trickplay sprite generation (one full-decode ffmpeg pass per file) could pin a machine: multiple agents on the same library decoded the same 4K file at once, there was no CPU throttling, and crashed/restarted agents orphaned ffmpeg to init (where it ran the full 45-min decode to completion). Stacked orphans spiked a box to load ~140.
- Single-flight lock: an O_CREATE|O_EXCL .lock file in the shared sidecar dir, so two agents watching the same library never decode the same file twice (stale locks are reclaimed after a TTL). Returns ErrTrickplayInProgress → prewarm skips the file instead of counting a failure.
- Load gate: defer the heavy decode until the 1-min load is ≤ max(ratio×NumCPU, 1.5); the wait is capped at 15 min so it throttles without ever becoming a permanent off-switch on busy / small hosts. New knob library.prewarm_max_load_ratio (default 0.7).
- Concurrency: trickSem caps trickplay to ONE decode at a time per agent.
- CPU priority: setLowCPUPriority (nice 19) alongside the existing idle ionice.
- No orphans: hardenCmd sets Setpgid + Pdeathsig=SIGKILL, with runtime.LockOSThread around the child so the kernel kills ffmpeg exactly when the agent dies (and not spuriously — golang/go#27505).
Tests: single-flight/stale-reclaim, load-gate immediate/cancel, and an e2e Pdeathsig orphan-kill check.
2026-06-04 08:25:00 +02:00 · 2026-06-04 08:25:00 +02:00 · c82826bf68
commit c82826bf68
parent aba20e2078
10 changed files with 399 additions and 8 deletions
--- a/internal/library/prewarm.go
+++ b/internal/library/prewarm.go
@ -2,8 +2,10 @@ package library

 import (
 	"context"
+	"errors"
 	"log"
 	"math"
+	"runtime"
 	"sync"
 	"time"

@ -33,6 +35,12 @@ type PrewarmOptions struct {
 	Trickplay            bool
 	TrickplayIntervalSec float64
 	TrickplayWidth       int
+
+	// MaxLoadRatio gates the heavy trickplay decode on system load: a job is
+	// deferred while the 1-min load average exceeds max(MaxLoadRatio×NumCPU, 1.5),
+	// for at most prewarmLoadWaitCap (15 min), then runs anyway. ≤0 → default 0.7.
+	// Has no effect on platforms without a load reading (proceeds unthrottled).
+	MaxLoadRatio float64
 }

 // prewarmJob is one extraction unit: all text subtitles of a file in one ffmpeg
@ -65,6 +73,15 @@ func PrewarmSidecars(ctx context.Context, cache *LibraryCache, opts PrewarmOptio
 	if workers < 1 {
 		workers = 2
 	}
+	maxLoadRatio := opts.MaxLoadRatio
+	if maxLoadRatio <= 0 {
+		maxLoadRatio = 0.7
+	}
+	// Trickplay is the heaviest job (full 4K decode). Cap it to ONE concurrent
+	// decode across this agent's workers — the thumbnail/subtitle jobs (light /
+	// I/O-bound) keep their `workers` parallelism. Cross-agent dup work is stopped
+	// by the per-file lock inside GenerateTrickplay.
+	trickSem := make(chan struct{}, 1)

 	jobs := make(chan prewarmJob)
 	var wg sync.WaitGroup
@ -115,19 +132,39 @@ func PrewarmSidecars(ctx context.Context, cache *LibraryCache, opts PrewarmOptio
 					if _, ok := mediainfo.ReadCachedTrickplay(j.path, j.width); ok {
 						continue
 					}
+					// Serialize the heavy decode (1 at a time) and wait for the box to
+					// be idle enough before starting — sprite generation must never
+					// saturate the CPU or the NAS.
+					select {
+					case trickSem <- struct{}{}:
+					case <-ctx.Done():
+						return
+					}
+					waitForLowLoad(ctx, maxLoadRatio)
+					if ctx.Err() != nil {
+						<-trickSem
+						return
+					}
 					// Full-decode pass (samples 1 frame per interval over the whole
 					// file) — generous deadline like subtitles; idempotent + cached.
+					// INVARIANT: this deadline MUST stay below mediainfo.trickplayLockTTL,
+					// or another agent could reclaim a still-running job's lock and double
+					// the decode. If you raise this, raise trickplayLockTTL too.
 					jctx, cancel := context.WithTimeout(ctx, 45*time.Minute)
 					_, err := mediainfo.GenerateTrickplay(jctx, opts.FFmpegPath, j.path, opts.TrickplayIntervalSec, j.width, j.duration)
 					cancel()
+					<-trickSem
 					mu.Lock()
-					if err != nil {
+					switch {
+					case err == nil:
+						trickCached++
+					case errors.Is(err, mediainfo.ErrTrickplayInProgress):
+						// another worker/agent owns this file — skip, not a failure.
+					default:
 						failed++
 						if sampleErr == "" {
 							sampleErr = err.Error()
 						}
-					} else {
-						trickCached++
 					}
 					mu.Unlock()
 					continue
@ -239,6 +276,47 @@ func PrewarmSidecars(ctx context.Context, cache *LibraryCache, opts PrewarmOptio
 	}
 }

+// prewarmLoadWaitCap bounds how long the load gate DEFERS a single trickplay job;
+// the clock starts anew on every waitForLowLoad call. It is a throttle, not an
+// off-switch: on a host whose baseline load is permanently above the threshold (a
+// shared prod box, or any 1–2 core machine), an unbounded wait would mean sprites
+// NEVER generate. After the cap we proceed anyway — the other safeguards (single-flight
+// lock, trickSem=1, nice 19 + idle I/O, Pdeathsig) are meant to keep one throttled decode from saturating the box.
+const prewarmLoadWaitCap = 15 * time.Minute
+
+// waitForLowLoad blocks until the 1-minute system load is at or below
+// max(maxRatio×NumCPU, 1.5), or ctx is cancelled, or prewarmLoadWaitCap elapses —
+// so the heavy trickplay decode prefers an idle machine but never stalls forever.
+// It returns no value: the caller must check ctx.Err() afterwards to tell a cancel
+// from a go-ahead. The 1.5 floor lets the gate open on 1–2 core hosts, where a
+// threshold of 0.7–1.4 would sit below almost any active machine's load. No load reading (LoadAverage1 !ok; presumably non-Linux) → returns at once.
+func waitForLowLoad(ctx context.Context, maxRatio float64) {
+	threshold := maxRatio * float64(runtime.NumCPU()) // ratio scaled by the logical CPU count
+	if threshold < 1.5 {
+		threshold = 1.5
+	}
+	deadline := time.After(prewarmLoadWaitCap) // one timer for the whole wait: the cap is total, not per poll
+	logged := false
+	for {
+		load, ok := mediainfo.LoadAverage1()
+		if !ok || load <= threshold { // no reading available, or load already acceptable: proceed
+			return
+		}
+		if !logged { // announce the deferral once, not on every poll
+			log.Printf("[prewarm] system load %.1f > %.1f — deferring trickplay (≤ %s)", load, threshold, prewarmLoadWaitCap)
+			logged = true
+		}
+		select {
+		case <-ctx.Done(): // cancelled: return and let the caller observe ctx.Err()
+			return
+		case <-deadline: // cap reached: stop deferring and proceed regardless of load
+			log.Printf("[prewarm] load still high after %s — proceeding with throttled trickplay (nice + idle I/O + single-flight still apply)", prewarmLoadWaitCap)
+			return
+		case <-time.After(15 * time.Second): // poll interval before re-reading the load average
+		}
+	}
+}
+
 // thumbPositions returns the sample frame offsets (whole seconds) for an item,
 // matching the web panel: fractions of a known runtime, else fixed fallbacks.
 func thumbPositions(item LibraryItem) []float64 {