feat(hls): full-GPU scale_cuda for NVENC SDR downscales

Keep an NVENC downscale of an SDR source entirely on the GPU
(decode -> scale_cuda -> h264_nvenc) instead of copying every frame to the
CPU for `scale=` and back. That GPU->CPU->GPU round-trip is the bottleneck on
modest GPUs; even a strong box gains ~37% (scale_cuda 14.9x vs CPU 10.9x
on a 4K SDR HEVC -> 1080p encode).

Strictly gated so every case that needs CPU frames is unchanged:
- HDR (libplacebo Vulkan / zscale CPU tonemap can't consume a CUDA surface),
- burn-in (the scale2ref+overlay composite runs on CPU frames),
- non-NVENC encoders, and no-op when not actually downscaling.

- hwscale.go: FFmpegSupportsScaleCuda — a functional 1-frame probe mirroring
  the libplacebo probe (presence in -filters lies; needs a real CUDA device).
  Probes the worst-case real input (10-bit p010 -> 8-bit yuv420p) so a host
  whose scale_cuda can't do the 10->8-bit conversion fails closed to CPU.
- hls.go: useCudaScale gate + `-hwaccel_output_format cuda` + a
  `scale_cuda=-2:H:format=yuv420p` filter branch. Output is 8-bit
  (format=yuv420p + `-profile:v main`), browser-safe.
- transcode_quality.go / player_session_registry.go / daemon.go: HasScaleCuda
  flag, populated + warmed at startup like the other ffmpeg capability probes.

Fail-closed: probe absent/fails -> keep the CPU scale path, no regression.
Verified live (real 4K SDR HEVC Main10 session emitted scale_cuda, 5.54x
realtime, nvenc at 100%) + 8 arg-builder unit tests for the gate.
This commit is contained in:
Deivid Soto 2026-06-10 21:44:58 +02:00
parent 671bee8317
commit cda2e1322c
6 changed files with 251 additions and 7 deletions

View file

@ -0,0 +1,122 @@
package engine
import (
"strings"
"testing"
)
// F4: buildHLSFFmpegArgsAt must use the full-GPU scale_cuda path ONLY for an
// SDR NVENC downscale with no burn-in on a host that probed scale_cuda — and
// keep the CPU `scale=` path for every case that needs CPU frames (HDR tonemap,
// burn-in, no downscale, non-NVENC, or scale_cuda unavailable).

// nvencCfg builds the baseline session config shared by the tests in this
// file: an NVENC runtime with the scale_cuda, libplacebo and HDR-tonemap flags
// all switched on. quality is the requested quality label and burn is the
// subtitle index to burn in (nil when no burn-in is requested). Individual
// tests flip single fields on the returned value to exercise one gate
// condition at a time.
func nvencCfg(quality string, burn *int) HLSSessionConfig {
	var cfg HLSSessionConfig

	cfg.SessionID = "test-cudascale"
	cfg.SourcePath = "/tmp/in.mkv"
	cfg.Quality = quality
	cfg.AudioIndex = -1
	cfg.BurnSubtitleIndex = burn

	// Every capability is enabled here so that each test only has to turn
	// off the one thing it is checking.
	cfg.Transcode = TranscodeRuntime{
		FFmpegPath:    "/usr/bin/ffmpeg",
		HWAccel:       HWAccelNVENC,
		HasScaleCuda:  true,
		HasLibplacebo: true,
		TonemapHDR:    true,
	}

	return cfg
}
// argsFor returns the argument list that buildHLSFFmpegArgsAt produces for cfg
// and probe, joined with single spaces so tests can assert on it with plain
// substring checks.
func argsFor(cfg HLSSessionConfig, probe *StreamProbe) string {
	argv := buildHLSFFmpegArgsAt(cfg, probe, "/tmp/tmpdir", 0, 0)
	return strings.Join(argv, " ")
}
// TestCudaScale_SDRDownscale_UsesGPU covers the positive case: a 2160p SDR
// source capped at 1080p on the baseline NVENC config must emit the scale_cuda
// filter and the CUDA output-format flag, and must not emit the CPU scale.
func TestCudaScale_SDRDownscale_UsesGPU(t *testing.T) {
	// HDR is left empty, which marks the source as SDR.
	src := &StreamProbe{Width: 3840, Height: 2160, DurationSec: 100}
	args := argsFor(nvencCfg("1080p", nil), src)

	// Each entry is checked independently so one run reports every mismatch.
	checks := []struct {
		needle string // fragment looked up in the joined argument string
		want   bool   // whether the fragment is expected to be present
		format string // failure message; receives the full argument string
	}{
		{"scale_cuda=-2:1080", true, "expected scale_cuda for SDR NVENC downscale; got:\n%s"},
		{"-hwaccel_output_format cuda", true, "expected -hwaccel_output_format cuda; got:\n%s"},
		{"scale=-2:1080", false, "CPU scale must NOT appear on the cuda path; got:\n%s"},
	}
	for _, c := range checks {
		if strings.Contains(args, c.needle) != c.want {
			t.Errorf(c.format, args)
		}
	}
}
// TestCudaScale_HDR_StaysOnCPU verifies that an HDR10 source never takes the
// scale_cuda path or pins frames to CUDA, and that the generated arguments
// still reference libplacebo for the tonemap.
func TestCudaScale_HDR_StaysOnCPU(t *testing.T) {
	src := &StreamProbe{
		Width:       3840,
		Height:      2160,
		HDR:         "HDR10",
		DurationSec: 100,
	}
	args := argsFor(nvencCfg("1080p", nil), src)
	present := func(fragment string) bool { return strings.Contains(args, fragment) }

	if present("scale_cuda") {
		t.Errorf("HDR must NOT use scale_cuda (needs the tonemap on CPU frames); got:\n%s", args)
	}
	if present("-hwaccel_output_format cuda") {
		t.Errorf("HDR must NOT pin frames to CUDA; got:\n%s", args)
	}
	if !present("libplacebo") {
		t.Errorf("HDR should still tonemap via libplacebo; got:\n%s", args)
	}
}
// TestCudaScale_BurnIn_StaysOnCPU verifies that requesting a subtitle burn-in
// keeps the pipeline off the CUDA scale path. Besides the scale_cuda filter it
// also checks that frames are not pinned to CUDA, matching the other CPU-path
// tests in this file: a CPU-side overlay cannot consume CUDA surfaces, so the
// output-format flag alone would already break the burn-in pipeline.
func TestCudaScale_BurnIn_StaysOnCPU(t *testing.T) {
	idx := 0 // any non-nil subtitle index counts as a burn-in request
	probe := &StreamProbe{Width: 3840, Height: 2160, DurationSec: 100}
	got := argsFor(nvencCfg("1080p", &idx), probe)

	if strings.Contains(got, "scale_cuda") {
		t.Errorf("burn-in requested must NOT use scale_cuda (overlay runs on CPU frames); got:\n%s", got)
	}
	if strings.Contains(got, "-hwaccel_output_format cuda") {
		t.Errorf("burn-in requested must NOT pin frames to CUDA (overlay runs on CPU frames); got:\n%s", got)
	}
}
// TestCudaScale_NoDownscale_StaysOnCPU verifies that a source already at or
// below the quality cap does not trigger the CUDA scale path: with nothing to
// downscale there is no point pinning frames to CUDA.
func TestCudaScale_NoDownscale_StaysOnCPU(t *testing.T) {
	src := &StreamProbe{Width: 1920, Height: 1080, DurationSec: 100}
	args := argsFor(nvencCfg("1080p", nil), src)

	// Either fragment alone means the cuda path was taken; report once.
	for _, forbidden := range []string{"scale_cuda", "-hwaccel_output_format cuda"} {
		if strings.Contains(args, forbidden) {
			t.Errorf("no downscale must NOT use the cuda scale path; got:\n%s", args)
			break
		}
	}
}
// TestCudaScale_ProbeAbsent_StaysOnCPU verifies the fail-closed behavior when
// the scale_cuda capability flag is off: the arguments must fall back to the
// CPU `scale=` filter. It also checks that frames are not pinned to CUDA,
// matching the other CPU-path tests in this file, because the CPU scale
// filter asserted below cannot consume CUDA surfaces.
func TestCudaScale_ProbeAbsent_StaysOnCPU(t *testing.T) {
	cfg := nvencCfg("1080p", nil)
	cfg.Transcode.HasScaleCuda = false // probe said no / non-CUDA host
	probe := &StreamProbe{Width: 3840, Height: 2160, DurationSec: 100}
	got := argsFor(cfg, probe)

	if strings.Contains(got, "scale_cuda") {
		t.Errorf("scale_cuda unavailable must fall back to CPU scale; got:\n%s", got)
	}
	if strings.Contains(got, "-hwaccel_output_format cuda") {
		t.Errorf("scale_cuda unavailable must NOT pin frames to CUDA; got:\n%s", got)
	}
	if !strings.Contains(got, "scale=-2:1080") {
		t.Errorf("expected CPU scale fallback; got:\n%s", got)
	}
}
// TestCudaScale_Software_StaysOnCPU verifies that with hardware acceleration
// disabled (HWAccelNone) the CUDA scale path is never selected, even though
// the scale_cuda capability flag is still set on the config.
func TestCudaScale_Software_StaysOnCPU(t *testing.T) {
	cfg := nvencCfg("1080p", nil)
	// Software encode (libx264) with no CUDA decode.
	cfg.Transcode.HWAccel = HWAccelNone

	args := argsFor(cfg, &StreamProbe{Width: 3840, Height: 2160, DurationSec: 100})

	usesScaleCuda := strings.Contains(args, "scale_cuda")
	pinsCuda := strings.Contains(args, "-hwaccel_output_format cuda")
	if usesScaleCuda || pinsCuda {
		t.Errorf("software encoder must NOT use the cuda scale path; got:\n%s", args)
	}
}
// TestCudaScale_QSV_StaysOnCPU verifies that a hardware encoder other than
// NVENC keeps the CPU scale. scale_cuda is NVIDIA-only, so this is a separate
// case from the pure-software one: hardware acceleration is on, just not NVENC.
func TestCudaScale_QSV_StaysOnCPU(t *testing.T) {
	cfg := nvencCfg("1080p", nil)
	cfg.Transcode.HWAccel = HWAccelQSV

	src := &StreamProbe{Width: 3840, Height: 2160, DurationSec: 100}
	args := argsFor(cfg, src)

	// A tagless switch with two case expressions fires when either holds.
	switch {
	case strings.Contains(args, "scale_cuda"), strings.Contains(args, "-hwaccel_output_format cuda"):
		t.Errorf("QSV must NOT use the cuda scale path; got:\n%s", args)
	}
}
// TestCudaScale_OriginalQuality_StaysOnCPU verifies that the "original"
// quality never selects the CUDA scale path: it carries no height cap
// (maxH == 0), so there is no downscale to move onto the GPU.
func TestCudaScale_OriginalQuality_StaysOnCPU(t *testing.T) {
	src := &StreamProbe{Width: 3840, Height: 2160, DurationSec: 100}
	args := argsFor(nvencCfg("original", nil), src)

	// Report once if any marker of the cuda path shows up.
	for _, forbidden := range []string{"scale_cuda", "-hwaccel_output_format cuda"} {
		if strings.Contains(args, forbidden) {
			t.Errorf("original quality (no cap) must NOT use the cuda scale path; got:\n%s", args)
			break
		}
	}
}