fix(trickplay): stop scan-time sprite generation from saturating the host
Some checks failed
CI / Test (push) Failing after 6m21s
CI / Build (push) Successful in 1m34s
CI / Build-1 (push) Successful in 2m0s
CI / Build-2 (push) Successful in 1m33s
CI / Build-3 (push) Successful in 1m38s
CI / Build-4 (push) Successful in 1m35s
CI / Build-5 (push) Successful in 1m38s
CI / Lint (push) Failing after 2m34s
CI / Coverage (push) Failing after 2m44s
CI / Vet (push) Successful in 2m3s

Trickplay sprite generation (one full-decode ffmpeg pass per file) could pin a
machine: multiple agents on the same library decoded the same 4K file at once,
there was no CPU throttling, and crashed/restarted agents orphaned ffmpeg to init
(where it ran the full 45-min decode to completion). Stacked orphans spiked a box
to load ~140.

- Single-flight lock: O_CREATE|O_EXCL .lock in the shared sidecar dir so two
  agents watching the same library never decode the same file twice (stale locks
  reclaimed after a TTL). Returns ErrTrickplayInProgress → prewarm skips, not fail.
- Load gate: defer the heavy decode until 1-min load ≤ max(ratio×NumCPU, 1.5),
  capped at 15 min so it throttles without ever becoming a permanent off-switch on
  busy / small hosts. New knob library.prewarm_max_load_ratio (default 0.7).
- Concurrency: trickSem caps trickplay to ONE decode at a time per agent.
- CPU priority: setLowCPUPriority (nice 19) alongside the existing idle ionice.
- No orphans: hardenCmd sets Setpgid + Pdeathsig=SIGKILL, with runtime.LockOSThread
  around the child so the kernel kills ffmpeg exactly when the agent dies (and not
  spuriously — golang/go#27505).

Tests: single-flight/stale-reclaim, load-gate immediate/cancel, and an e2e
Pdeathsig orphan-kill check.
This commit is contained in:
Deivid Soto 2026-06-04 08:25:00 +02:00
parent aba20e2078
commit c82826bf68
10 changed files with 399 additions and 8 deletions

View file

@ -3,15 +3,56 @@ package mediainfo
import (
"context"
"encoding/json"
"errors"
"fmt"
"math"
"os"
"os/exec"
"path/filepath"
"runtime"
"strconv"
"strings"
"time"
)
// ErrTrickplayInProgress reports that this sprite's lock file already exists and
// is still fresh, i.e. some other worker is generating the sprite right now. That
// worker may be an agent on a different host sharing the same library (e.g. the
// dev binary on /mnt/nas and the docker agent on /downloads, which are the SAME
// files). Callers should SKIP the item rather than count it as a failure.
var ErrTrickplayInProgress = errors.New("trickplay: generation already in progress")

// trickplayLockTTL is the age beyond which a lock file is treated as abandoned.
// It is deliberately longer than the caller's 45-min generation deadline so a
// running job never has its lock taken away, yet short enough that a lock left
// behind by a crashed or killed worker gets reclaimed on a later scan.
const trickplayLockTTL = 90 * time.Minute

// acquireTrickplayLock tries to become the single generator for one sprite by
// exclusively creating lockPath (O_CREATE|O_EXCL), a ".lock" file that lives in
// the shared sidecar dir so that agents watching the same library do not decode
// the same file concurrently.
//
// On success it returns a release function that deletes the lock file; the
// caller is expected to invoke it once generation has finished. If the lock
// file already exists and its modification time is within trickplayLockTTL,
// ErrTrickplayInProgress is returned. A lock older than trickplayLockTTL is
// assumed to belong to a crashed owner: it is removed and creation is retried
// once. Any other failure to create the file is returned wrapped with a
// "trickplay lock" prefix.
func acquireTrickplayLock(lockPath string) (func(), error) {
	// One regular attempt plus a single retry after reclaiming a stale lock.
	const maxTries = 2

	for try := 1; try <= maxTries; try++ {
		lockFile, openErr := os.OpenFile(lockPath, os.O_CREATE|os.O_EXCL|os.O_WRONLY, 0o644)
		switch {
		case openErr == nil:
			// The exclusive create succeeded, so the lock is ours. The owner
			// details written here are informational only; a failed write
			// does not invalidate the lock, hence the ignored errors.
			hostname, _ := os.Hostname()
			fmt.Fprintf(lockFile, "%s pid=%d t=%d\n", hostname, os.Getpid(), time.Now().Unix())
			_ = lockFile.Close()
			release := func() { _ = os.Remove(lockPath) }
			return release, nil
		case !os.IsExist(openErr):
			// Anything other than "already exists" is a genuine failure.
			return nil, fmt.Errorf("trickplay lock: %w", openErr)
		}

		// Someone else's lock file is present. Unless it can be inspected and
		// is provably older than the TTL, treat it as held by a live worker.
		info, statErr := os.Stat(lockPath)
		if statErr != nil || time.Since(info.ModTime()) <= trickplayLockTTL {
			return nil, ErrTrickplayInProgress
		}

		// Stale lock: drop it and go around again to claim it ourselves.
		_ = os.Remove(lockPath)
	}

	// The retry after a reclaim also lost the race for the lock file.
	return nil, ErrTrickplayInProgress
}
// TrickplayManifest describes the montage sprite layout so a client can map a
// playback time to one tile: tileIndex = floor(timeSec / IntervalSec), then
// col = tileIndex % Cols, row = tileIndex / Cols, and the tile's pixel box is
@ -126,6 +167,15 @@ func GenerateTrickplay(ctx context.Context, ffmpegPath, mediaPath string, interv
if err := os.MkdirAll(filepath.Dir(spritePath), 0o755); err != nil {
return TrickplayManifest{}, err
}
// Single-flight across processes/agents: only one worker decodes this file at
// a time. Returns ErrTrickplayInProgress (skip, not fail) if another holds it.
release, err := acquireTrickplayLock(spritePath + ".lock")
if err != nil {
return TrickplayManifest{}, err
}
defer release()
tmpSprite := spritePath + ".tmp"
// fps filter wants a rational; format 1/effInterval with enough precision.
@ -144,17 +194,31 @@ func GenerateTrickplay(ctx context.Context, ffmpegPath, mediaPath string, interv
"-f", "mjpeg",
tmpSprite,
}
// Pin this goroutine to its OS thread for the whole child lifetime. hardenCmd's
// Pdeathsig is delivered when the THREAD that forked dies, not the process
// (golang/go#27505); without the lock Go could recycle that thread mid-decode
// and the kernel would SIGKILL a perfectly healthy ffmpeg. Locked here (before
// the fork in Start) and released after Wait, the thread lives exactly as long
// as ffmpeg: it dies only when the agent process itself dies → SIGKILL fires
// only then, which is precisely the orphan we want to prevent.
runtime.LockOSThread()
defer runtime.UnlockOSThread()
cmd := exec.CommandContext(ctx, ffmpegPath, args...)
var stderr strings.Builder
cmd.Stderr = &stderr
// Start + idle I/O priority + Wait (matches the subtitle/thumbnail extractors):
// this full-decode pass is the heaviest sidecar job and runs in the background
// alongside live streaming on the same disk/NFS, so it must yield I/O.
// Die-with-parent BEFORE Start so an agent crash can't orphan this decode.
hardenCmd(cmd)
// Start + idle I/O + lowest CPU niceness + Wait (matches the subtitle/thumbnail
// extractors): this full-decode pass is the heaviest sidecar job and runs in the
// background alongside live streaming on the same box/NFS, so it must yield both
// disk AND CPU. The prewarm also gates it on system load before getting here.
if err := cmd.Start(); err != nil {
_ = os.Remove(tmpSprite)
return TrickplayManifest{}, fmt.Errorf("ffmpeg tile start: %w", err)
}
setIdleIOPriority(cmd.Process.Pid)
setLowCPUPriority(cmd.Process.Pid)
if err := cmd.Wait(); err != nil {
_ = os.Remove(tmpSprite)
return TrickplayManifest{}, fmt.Errorf("ffmpeg tile: %w: %s", err, strings.TrimSpace(stderr.String()))