feat(hls): full-GPU scale_cuda for NVENC SDR downscales

Keep an NVENC downscale of an SDR source entirely on the GPU (decode -> scale_cuda -> h264_nvenc) instead of copying every frame to the CPU for `scale=` and back. That GPU->CPU->GPU round-trip is the wall on modest GPUs; even a strong box gains ~37% (scale_cuda 14.9x vs CPU 10.9x on a 4K SDR HEVC -> 1080p encode).

Strictly gated so every case that needs CPU frames is unchanged:
- HDR (libplacebo Vulkan / zscale CPU tonemap can't consume a CUDA surface),
- burn-in (the scale2ref+overlay composite runs on CPU frames),
- non-NVENC encoders, and no-op when not actually downscaling.

- hwscale.go: FFmpegSupportsScaleCuda — a functional 1-frame probe mirroring the libplacebo probe (presence in -filters lies; needs a real CUDA device). Probes the worst-case real input (10-bit p010 -> 8-bit yuv420p) so a host whose scale_cuda can't do the 10->8-bit conversion fails closed to CPU.
- hls.go: useCudaScale gate + `-hwaccel_output_format cuda` + a `scale_cuda=-2:H:format=yuv420p` filter branch. Output is 8-bit (format=yuv420p + `-profile:v main`), browser-safe.
- transcode_quality.go / player_session_registry.go / daemon.go: HasScaleCuda flag, populated + warmed at startup like the other ffmpeg capability probes.

Fail-closed: probe absent/fails -> keep the CPU scale path, no regression.

Verified live (real 4K SDR HEVC Main10 session emitted scale_cuda, 5.54x realtime, nvenc at 100%) + 8 arg-builder unit tests for the gate.
2026-06-10 21:44:58 +02:00 · 2026-06-10 21:44:58 +02:00 · cda2e1322c
commit cda2e1322c
parent 671bee8317
6 changed files with 251 additions and 7 deletions
--- a/internal/engine/hls.go
+++ b/internal/engine/hls.go
@ -1359,6 +1359,31 @@ func buildHLSFFmpegArgsAt(cfg HLSSessionConfig, probe *StreamProbe, tmpDir strin
 	// transcode telemetry (F3) without logging it.
 	args := []string{"-y", "-hide_banner", "-loglevel", "warning", "-stats"}

+	// F4 — full-GPU NVENC downscale. When we're downscaling an SDR source with
+	// NVENC on a host whose ffmpeg can run scale_cuda, and NO subtitle is burned
+	// in, keep the decoded frame on the GPU through scale + encode (scale_cuda →
+	// h264_nvenc) instead of copying every frame to the CPU for `scale=`. That
+	// CPU round-trip is the wall on modest GPUs (a strong box still gains ~37%).
+	// Strictly gated — the cases that need CPU frames stay on the CPU path:
+	//   - HDR (the libplacebo Vulkan / zscale CPU tonemap can't consume a CUDA
+	//     surface, and mixing CUDA scale with the Vulkan pass is fragile),
+	//   - burn-in (the scale2ref+overlay composite runs on CPU frames),
+	//   - non-NVENC encoders, and no-op when not actually downscaling.
+	// Output height cap for this session — resolved once here so the F4 gate and
+	// the filter chain below share ONE value (a drift between them would emit
+	// scale_cuda for a height that isn't actually a downscale).
+	qcap := resolveQualityCap(cfg.Quality)
+	maxH := qcap.MaxHeight
+	if maxH == 0 {
+		maxH = cfg.Transcode.MaxHeight
+	}
+	useCudaScale := profile.Codec == "h264_nvenc" &&
+		profile.DecodeHwAccel == "cuda" &&
+		cfg.Transcode.HasScaleCuda &&
+		probe.HDR == "" &&
+		cfg.burnSubtitleIndexOrNone() < 0 &&
+		maxH > 0 && probe.Height > maxH
+
 	// Demuxer-side HW-decode hint. Sourced from the profile so a future
 	// codec/hint mismatch is impossible — the encoder + decode hint are
 	// computed once and stay coherent. Notably we do NOT add
@ -1369,6 +1394,12 @@ func buildHLSFFmpegArgsAt(cfg HLSSessionConfig, probe *StreamProbe, tmpDir strin
 	// decode on the input side.
 	if profile.DecodeHwAccel != "" {
 		args = append(args, "-hwaccel", profile.DecodeHwAccel)
+		// F4: pin decoded frames as CUDA surfaces ONLY on the gated scale_cuda
+		// path, so scale_cuda + h264_nvenc avoid the CPU copy. Off otherwise —
+		// the CPU filter chain can't consume CUDA surfaces.
+		if useCudaScale {
+			args = append(args, "-hwaccel_output_format", "cuda")
+		}
 	}

 	// Seek before -i for fast keyframe-aligned start. The new ffmpeg writes
@ -1527,7 +1558,7 @@ func buildHLSFFmpegArgsAt(cfg HLSSessionConfig, probe *StreamProbe, tmpDir strin
 	// on libx264) and stalls the session. The output height matches qcap.MaxHeight
 	// when the source is downscaled, otherwise probe.Height; the output width is
 	// the source width scaled by the same factor (the filter chain preserves AR).
-	qcap := resolveQualityCap(cfg.Quality)
+	// qcap + maxH were resolved once at the top (shared with the F4 gate).
 	outputHeight := qcap.MaxHeight
 	if outputHeight == 0 {
 		outputHeight = cfg.Transcode.MaxHeight
@ -1595,10 +1626,7 @@ func buildHLSFFmpegArgsAt(cfg HLSSessionConfig, probe *StreamProbe, tmpDir strin
 	// emit the exact computed width — which can be odd (e.g. 853×480) and
 	// libx264 then refuses to open. We chain a second `scale=trunc(iw/2)*2:...`
 	// after the cap to guarantee even dimensions before format/setparams.
-	maxH := qcap.MaxHeight
-	if maxH == 0 {
-		maxH = cfg.Transcode.MaxHeight
-	}
+	// (maxH was resolved once at the top, shared with the F4 cuda-scale gate.)
 	// VAAPI needs frames as nv12 VAAPI surfaces before the encoder. We do
 	// scale + format conversion on CPU then `hwupload` once at the end —
 	// skips the mesa 25 + Raphael iGPU "Cannot allocate memory" log spam
@ -1643,12 +1671,21 @@ func buildHLSFFmpegArgsAt(cfg HLSSessionConfig, probe *StreamProbe, tmpDir strin
 	// hwUploadTail — that has to run last, after any subtitle overlay, so it's
 	// appended separately below.
 	var vchain string
-	if maxH > 0 && probe.Height > maxH {
+	switch {
+	case useCudaScale:
+		// F4: scale on the CUDA surface and hand h264_nvenc a yuv420p CUDA frame
+		// directly — no CPU `format`/`setparams` tail (the frame never leaves the
+		// GPU; nvenc records BT.709 SDR metadata from the source). scale_cuda's
+		// `-2` already yields an even width, so the second even-rounding pass the
+		// CPU path needs is unnecessary. useCudaScale already implies a real
+		// downscale (probe.Height > maxH) on an SDR, non-burn-in NVENC source.
+		vchain = fmt.Sprintf("scale_cuda=-2:%d:format=yuv420p", maxH)
+	case maxH > 0 && probe.Height > maxH:
 		vchain = fmt.Sprintf(
 			"scale=-2:%d:force_original_aspect_ratio=decrease,scale=trunc(iw/2)*2:trunc(ih/2)*2,%s",
 			maxH, videoTail,
 		)
-	} else {
+	default:
 		vchain = fmt.Sprintf(
 			"scale=trunc(iw/2)*2:trunc(ih/2)*2,%s",
 			videoTail,
--- a/internal/engine/hls_cudascale_test.go
+++ b/internal/engine/hls_cudascale_test.go
@ -0,0 +1,122 @@
+package engine
+
+import (
+	"strings"
+	"testing"
+)
+
+// F4: buildHLSFFmpegArgsAt must use the full-GPU scale_cuda path ONLY for an
+// SDR NVENC downscale with no burn-in on a host that probed scale_cuda — and
+// keep the CPU `scale=` path for every case that needs CPU frames (HDR tonemap,
+// burn-in, no downscale, non-NVENC, or scale_cuda unavailable).
+
+// nvencCfg returns a session config for an NVENC host on which every optional
+// capability flag used by these tests (HasScaleCuda, HasLibplacebo, TonemapHDR)
+// is switched on. quality is stored as the session Quality label and burn as
+// the BurnSubtitleIndex pointer, so individual tests only flip what they probe.
+func nvencCfg(quality string, burn *int) HLSSessionConfig {
+	rt := TranscodeRuntime{
+		FFmpegPath:    "/usr/bin/ffmpeg",
+		HWAccel:       HWAccelNVENC,
+		HasScaleCuda:  true,
+		HasLibplacebo: true,
+		TonemapHDR:    true,
+	}
+	return HLSSessionConfig{
+		SessionID:         "test-cudascale",
+		SourcePath:        "/tmp/in.mkv",
+		Quality:           quality,
+		AudioIndex:        -1,
+		BurnSubtitleIndex: burn,
+		Transcode:         rt,
+	}
+}
+
+// argsFor builds the ffmpeg argument list for cfg/probe and flattens it into a
+// single space-separated string so tests can do plain substring checks on it.
+func argsFor(cfg HLSSessionConfig, probe *StreamProbe) string {
+	ffArgs := buildHLSFFmpegArgsAt(cfg, probe, "/tmp/tmpdir", 0, 0)
+	return strings.Join(ffArgs, " ")
+}
+
+// TestCudaScale_SDRDownscale_UsesGPU covers the positive case: a 2160p source
+// with an empty HDR field, capped to 1080p on the all-capabilities NVENC
+// config, must emit the scale_cuda filter and the CUDA output-format flag, and
+// must not emit the CPU `scale=-2:1080` filter.
+func TestCudaScale_SDRDownscale_UsesGPU(t *testing.T) {
+	// HDR is left at its zero value, which the gate treats as SDR.
+	src := &StreamProbe{Width: 3840, Height: 2160, DurationSec: 100}
+	cmdline := argsFor(nvencCfg("1080p", nil), src)
+
+	hasCudaScale := strings.Contains(cmdline, "scale_cuda=-2:1080")
+	hasCudaFrames := strings.Contains(cmdline, "-hwaccel_output_format cuda")
+	hasCPUScale := strings.Contains(cmdline, "scale=-2:1080")
+
+	if !hasCudaScale {
+		t.Errorf("expected scale_cuda for SDR NVENC downscale; got:\n%s", cmdline)
+	}
+	if !hasCudaFrames {
+		t.Errorf("expected -hwaccel_output_format cuda; got:\n%s", cmdline)
+	}
+	if hasCPUScale {
+		t.Errorf("CPU scale must NOT appear on the cuda path; got:\n%s", cmdline)
+	}
+}
+
+// TestCudaScale_HDR_StaysOnCPU checks that a source probed as HDR10 is kept off
+// the scale_cuda path (no filter, no CUDA output-format flag) and that its
+// arguments still mention libplacebo.
+func TestCudaScale_HDR_StaysOnCPU(t *testing.T) {
+	hdrSrc := &StreamProbe{Width: 3840, Height: 2160, HDR: "HDR10", DurationSec: 100}
+	cmdline := argsFor(nvencCfg("1080p", nil), hdrSrc)
+
+	usesCudaScale := strings.Contains(cmdline, "scale_cuda")
+	pinsCudaFrames := strings.Contains(cmdline, "-hwaccel_output_format cuda")
+	tonemaps := strings.Contains(cmdline, "libplacebo")
+
+	if usesCudaScale {
+		t.Errorf("HDR must NOT use scale_cuda (needs the tonemap on CPU frames); got:\n%s", cmdline)
+	}
+	if pinsCudaFrames {
+		t.Errorf("HDR must NOT pin frames to CUDA; got:\n%s", cmdline)
+	}
+	if !tonemaps {
+		t.Errorf("HDR should still tonemap via libplacebo; got:\n%s", cmdline)
+	}
+}
+
+// TestCudaScale_BurnIn_StaysOnCPU checks that requesting a burned-in subtitle
+// (BurnSubtitleIndex pointing at stream 0) keeps an otherwise eligible SDR
+// NVENC downscale off the scale_cuda path. Both halves of the gate are
+// asserted: the filter itself and the `-hwaccel_output_format cuda` input flag.
+// The flag matters at least as much as the filter, because it is what would
+// hand CUDA surfaces to the CPU-side subtitle overlay chain.
+func TestCudaScale_BurnIn_StaysOnCPU(t *testing.T) {
+	idx := 0
+	probe := &StreamProbe{Width: 3840, Height: 2160, DurationSec: 100}
+	got := argsFor(nvencCfg("1080p", &idx), probe)
+	if strings.Contains(got, "scale_cuda") {
+		t.Errorf("burn-in requested must NOT use scale_cuda (overlay runs on CPU frames); got:\n%s", got)
+	}
+	if strings.Contains(got, "-hwaccel_output_format cuda") {
+		t.Errorf("burn-in requested must NOT pin frames to CUDA (overlay runs on CPU frames); got:\n%s", got)
+	}
+}
+
+// TestCudaScale_NoDownscale_StaysOnCPU checks that a 1080p source requested at
+// "1080p" (source height not above the cap, so nothing to downscale) emits
+// neither the scale_cuda filter nor the CUDA output-format flag.
+func TestCudaScale_NoDownscale_StaysOnCPU(t *testing.T) {
+	src := &StreamProbe{Width: 1920, Height: 1080, DurationSec: 100}
+	cmdline := argsFor(nvencCfg("1080p", nil), src)
+
+	usesCudaScale := strings.Contains(cmdline, "scale_cuda")
+	pinsCudaFrames := strings.Contains(cmdline, "-hwaccel_output_format cuda")
+	if usesCudaScale || pinsCudaFrames {
+		t.Errorf("no downscale must NOT use the cuda scale path; got:\n%s", cmdline)
+	}
+}
+
+// TestCudaScale_ProbeAbsent_StaysOnCPU checks the fail-closed behaviour: with
+// HasScaleCuda cleared (the probe said no, or the host has no CUDA), an SDR
+// NVENC downscale must fall back to the CPU `scale=` filter. It also asserts
+// that `-hwaccel_output_format cuda` is absent, since pinning decoded frames
+// as CUDA surfaces while the CPU filter chain is in use would break the
+// fallback this test exists to protect.
+func TestCudaScale_ProbeAbsent_StaysOnCPU(t *testing.T) {
+	cfg := nvencCfg("1080p", nil)
+	cfg.Transcode.HasScaleCuda = false // probe said no / non-CUDA host
+	probe := &StreamProbe{Width: 3840, Height: 2160, DurationSec: 100}
+	got := argsFor(cfg, probe)
+	if strings.Contains(got, "scale_cuda") {
+		t.Errorf("scale_cuda unavailable must fall back to CPU scale; got:\n%s", got)
+	}
+	if strings.Contains(got, "-hwaccel_output_format cuda") {
+		t.Errorf("scale_cuda unavailable must NOT pin frames to CUDA; got:\n%s", got)
+	}
+	if !strings.Contains(got, "scale=-2:1080") {
+		t.Errorf("expected CPU scale fallback; got:\n%s", got)
+	}
+}
+
+// TestCudaScale_Software_StaysOnCPU checks that with HWAccel set to
+// HWAccelNone a 2160p → 1080p session emits neither the scale_cuda filter nor
+// the CUDA output-format flag, even though HasScaleCuda is still true.
+func TestCudaScale_Software_StaysOnCPU(t *testing.T) {
+	swCfg := nvencCfg("1080p", nil)
+	swCfg.Transcode.HWAccel = HWAccelNone // software encode, no CUDA decode
+	src := &StreamProbe{Width: 3840, Height: 2160, DurationSec: 100}
+	cmdline := argsFor(swCfg, src)
+
+	usesCudaScale := strings.Contains(cmdline, "scale_cuda")
+	pinsCudaFrames := strings.Contains(cmdline, "-hwaccel_output_format cuda")
+	if usesCudaScale || pinsCudaFrames {
+		t.Errorf("software encoder must NOT use the cuda scale path; got:\n%s", cmdline)
+	}
+}
+
+// TestCudaScale_QSV_StaysOnCPU checks that a hardware encoder other than NVENC
+// (HWAccelQSV here) stays off the scale_cuda path. This is kept separate from
+// the software case because the host still has a hardware accelerator
+// configured, just not the NVIDIA one that scale_cuda requires.
+func TestCudaScale_QSV_StaysOnCPU(t *testing.T) {
+	qsvCfg := nvencCfg("1080p", nil)
+	qsvCfg.Transcode.HWAccel = HWAccelQSV
+	src := &StreamProbe{Width: 3840, Height: 2160, DurationSec: 100}
+	cmdline := argsFor(qsvCfg, src)
+
+	usesCudaScale := strings.Contains(cmdline, "scale_cuda")
+	pinsCudaFrames := strings.Contains(cmdline, "-hwaccel_output_format cuda")
+	if usesCudaScale || pinsCudaFrames {
+		t.Errorf("QSV must NOT use the cuda scale path; got:\n%s", cmdline)
+	}
+}
+
+// TestCudaScale_OriginalQuality_StaysOnCPU checks that the "original" quality
+// label keeps a 2160p NVENC session off the scale_cuda path. The label is
+// expected to resolve to no height cap (maxH == 0), and the gate requires
+// maxH > 0, so there is no downscale to run on the GPU.
+func TestCudaScale_OriginalQuality_StaysOnCPU(t *testing.T) {
+	src := &StreamProbe{Width: 3840, Height: 2160, DurationSec: 100}
+	cmdline := argsFor(nvencCfg("original", nil), src)
+
+	usesCudaScale := strings.Contains(cmdline, "scale_cuda")
+	pinsCudaFrames := strings.Contains(cmdline, "-hwaccel_output_format cuda")
+	if usesCudaScale || pinsCudaFrames {
+		t.Errorf("original quality (no cap) must NOT use the cuda scale path; got:\n%s", cmdline)
+	}
+}
--- a/internal/engine/hwscale.go
+++ b/internal/engine/hwscale.go
@ -0,0 +1,75 @@
+package engine
+
+import (
+	"context"
+	"log"
+	"os/exec"
+	"strings"
+	"sync"
+	"time"
+)
+
+// Hardware downscale filter probes (F4). Mirror the libplacebo probe in
+// tonemap.go: presence in `ffmpeg -filters` does NOT prove the filter RUNS —
+// scale_cuda needs a working CUDA runtime + device, which the prod debian-slim
+// image may lack even with the filter compiled in. So we run the real filter on
+// one synthetic frame and require a clean exit, cached per binary.
+
+var (
+	// scaleCudaCacheMu guards scaleCudaCache.
+	scaleCudaCacheMu sync.Mutex
+	// scaleCudaCache remembers the probe verdict, keyed by ffmpeg binary path.
+	scaleCudaCache = make(map[string]bool)
+)
+
+// FFmpegSupportsScaleCuda reports whether the ffmpeg binary at ffmpegPath can
+// really execute the scale_cuda filter on this host, which takes both the
+// filter being compiled in and a usable CUDA device. Callers use the answer to
+// decide whether an NVENC downscale may stay on the GPU (decode → scale_cuda →
+// h264_nvenc) rather than bouncing every frame through the CPU for `scale=`.
+//
+// The check is functional: ffmpeg is run on one synthetic frame and must exit
+// cleanly. An empty path or any failure yields false, so the caller simply
+// keeps the CPU scale path. Verdicts are cached per path, with one exception:
+// a failure caused by the probe's own timeout is treated as transient (a busy
+// box) and is not cached, so a later call can try again.
+func FFmpegSupportsScaleCuda(ffmpegPath string) bool {
+	if ffmpegPath == "" {
+		return false
+	}
+
+	scaleCudaCacheMu.Lock()
+	cached, hit := scaleCudaCache[ffmpegPath]
+	scaleCudaCacheMu.Unlock()
+	if hit {
+		return cached
+	}
+
+	// 10 s budget: creating the CUDA device and initialising the filter for
+	// the first time can be slow on a cold or loaded machine.
+	ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
+	defer cancel()
+
+	// The probe feeds scale_cuda the hardest input the gated path produces: a
+	// 10-bit p010 surface converted down to 8-bit yuv420p. Most 4K SDR HEVC is
+	// Main10, so an 8-bit-only probe could pass on a host that cannot do the
+	// 10→8-bit conversion and the real session would then fail. testsrc2 is
+	// generated on the CPU, so format=p010le + hwupload_cuda stands in for a
+	// CUDA Main10 decode.
+	probeArgs := []string{
+		"-hide_banner", "-loglevel", "error", "-nostats",
+		"-init_hw_device", "cuda=cu:0", "-filter_hw_device", "cu",
+		"-f", "lavfi", "-i", "testsrc2=size=256x256:rate=1:duration=1",
+		"-vf", "format=p010le,hwupload_cuda,scale_cuda=64:64:format=yuv420p,hwdownload,format=yuv420p",
+		"-frames:v", "1", "-f", "null", "-",
+	}
+	output, runErr := exec.CommandContext(ctx, ffmpegPath, probeArgs...).CombinedOutput()
+
+	if runErr == nil {
+		scaleCudaCacheMu.Lock()
+		scaleCudaCache[ffmpegPath] = true
+		scaleCudaCacheMu.Unlock()
+		log.Printf("[hwscale] ffmpeg scale_cuda works — NVENC SDR downscales stay on the GPU (no CPU round-trip)")
+		return true
+	}
+
+	// A stable "no" is cached; a blown deadline is not, so it cannot pin the
+	// slow path for the rest of the run.
+	if timedOut := ctx.Err() == context.DeadlineExceeded; !timedOut {
+		scaleCudaCacheMu.Lock()
+		scaleCudaCache[ffmpegPath] = false
+		scaleCudaCacheMu.Unlock()
+	}
+
+	// Prefer ffmpeg's own last output line as the reason; fall back to the
+	// exec error when ffmpeg printed nothing.
+	reason := strings.TrimSpace(lastLine(output))
+	if reason == "" {
+		reason = runErr.Error()
+	}
+	log.Printf("[hwscale] ffmpeg scale_cuda unavailable — NVENC keeps the CPU scale path: %v", reason)
+	return false
+}
--- a/internal/engine/transcode_quality.go
+++ b/internal/engine/transcode_quality.go
@ -27,6 +27,11 @@ type TranscodeRuntime struct {
 	// Preferred over the zscale chain for HDR sources — one GPU pass, higher
 	// quality, and present where zscale is missing.
 	HasLibplacebo bool
+	// HasScaleCuda: this host can run scale_cuda (CUDA device + filter). Lets an
+	// NVENC downscale of an SDR source stay fully on the GPU (decode → scale_cuda
+	// → h264_nvenc) instead of round-tripping each frame to the CPU for `scale=`.
+	// Probed functionally (FFmpegSupportsScaleCuda); false ⇒ keep the CPU scale.
+	HasScaleCuda bool
 }

 // qualityCap maps a session's Quality label to a (MaxHeight, VideoBitrate)