feat(hls): full-GPU scale_cuda for NVENC SDR downscales

Keep an NVENC downscale of an SDR source entirely on the GPU (decode -> scale_cuda -> h264_nvenc) instead of copying every frame to the CPU for `scale=` and back. That GPU->CPU->GPU round-trip is the wall on modest GPUs; even a strong box gains ~37% (scale_cuda 14.9x vs CPU 10.9x on a 4K SDR HEVC -> 1080p encode).

Strictly gated so every case that needs CPU frames is unchanged:
- HDR (libplacebo Vulkan / zscale CPU tonemap can't consume a CUDA surface),
- burn-in (the scale2ref+overlay composite runs on CPU frames),
- non-NVENC encoders, and no-op when not actually downscaling.

- hwscale.go: FFmpegSupportsScaleCuda — a functional 1-frame probe mirroring the libplacebo probe (presence in -filters lies; needs a real CUDA device). Probes the worst-case real input (10-bit p010 -> 8-bit yuv420p) so a host whose scale_cuda can't do the 10->8-bit conversion fails closed to CPU.
- hls.go: useCudaScale gate + `-hwaccel_output_format cuda` + a `scale_cuda=-2:H:format=yuv420p` filter branch. Output is 8-bit (format=yuv420p + `-profile:v main`), browser-safe.
- transcode_quality.go / player_session_registry.go / daemon.go: HasScaleCuda flag, populated + warmed at startup like the other ffmpeg capability probes.

Fail-closed: probe absent/fails -> keep the CPU scale path, no regression. Verified live (real 4K SDR HEVC Main10 session emitted scale_cuda, 5.54x realtime, nvenc at 100%) + 8 arg-builder unit tests for the gate.
2026-06-10 21:44:58 +02:00 · 2026-06-10 21:44:58 +02:00 · cda2e1322c
commit cda2e1322c
parent 671bee8317
6 changed files with 251 additions and 7 deletions
--- a/internal/engine/hwscale.go
+++ b/internal/engine/hwscale.go
@@ -0,0 +1,75 @@
+package engine
+
+import (
+	"context"
+	"log"
+	"os/exec"
+	"strings"
+	"sync"
+	"time"
+)
+
+// Hardware downscale filter probes (F4). Mirror the libplacebo probe in
+// tonemap.go: presence in `ffmpeg -filters` does NOT prove the filter RUNS —
+// scale_cuda needs a working CUDA runtime + device, which the prod debian-slim
+// image may lack even with the filter compiled in. So we run the real filter on
+// one synthetic frame and require a clean exit, cached per binary.
+
+var (
+	scaleCudaCache   = make(map[string]bool) // ffmpeg binary path → scale_cuda probe verdict
+	scaleCudaCacheMu sync.Mutex              // held around every read/write of scaleCudaCache
+)
+
+// FFmpegSupportsScaleCuda tells the caller whether the ffmpeg binary at
+// ffmpegPath can really execute scale_cuda on this host (filter compiled in
+// AND a CUDA device that initialises). A true result lets an NVENC downscale
+// stay on the GPU (decode → scale_cuda → h264_nvenc) rather than bouncing each
+// frame through the CPU for `scale=`. It fails closed: an empty path or any
+// probe error yields false and the caller keeps the CPU scale path. Verdicts
+// are remembered per path, except a failure that hit the probe deadline.
+func FFmpegSupportsScaleCuda(ffmpegPath string) bool {
+	if ffmpegPath == "" {
+		return false
+	}
+
+	scaleCudaCacheMu.Lock()
+	verdict, known := scaleCudaCache[ffmpegPath]
+	scaleCudaCacheMu.Unlock()
+	if known {
+		return verdict
+	}
+
+	// One synthetic frame through the real filter chain; 10 s budget because
+	// cold CUDA device creation + filter init can be slow. Input is the worst
+	// real case: a 10-bit p010 surface scaled down to 8-bit yuv420p, so a
+	// scale_cuda that cannot do 10→8 bit fails here, not in a live session.
+	// testsrc2 is CPU-side, so format=p010le + hwupload_cuda fakes the decode.
+	probeArgs := []string{
+		"-hide_banner", "-loglevel", "error", "-nostats",
+		"-init_hw_device", "cuda=cu:0", "-filter_hw_device", "cu",
+		"-f", "lavfi", "-i", "testsrc2=size=256x256:rate=1:duration=1",
+		"-vf", "format=p010le,hwupload_cuda,scale_cuda=64:64:format=yuv420p,hwdownload,format=yuv420p",
+		"-frames:v", "1", "-f", "null", "-",
+	}
+	probeCtx, stop := context.WithTimeout(context.Background(), 10*time.Second)
+	defer stop()
+	output, runErr := exec.CommandContext(probeCtx, ffmpegPath, probeArgs...).CombinedOutput()
+	works := runErr == nil
+
+	// Keep the verdict unless the failure was a timeout: a busy box is transient.
+	if timedOut := probeCtx.Err() == context.DeadlineExceeded; works || !timedOut {
+		scaleCudaCacheMu.Lock()
+		scaleCudaCache[ffmpegPath] = works
+		scaleCudaCacheMu.Unlock()
+	}
+	if works {
+		log.Printf("[hwscale] ffmpeg scale_cuda works — NVENC SDR downscales stay on the GPU (no CPU round-trip)")
+		return true
+	}
+	reason := strings.TrimSpace(lastLine(output))
+	if reason == "" {
+		reason = runErr.Error()
+	}
+	log.Printf("[hwscale] ffmpeg scale_cuda unavailable — NVENC keeps the CPU scale path: %v", reason)
+	return false
+}