diff --git a/internal/cmd/daemon.go b/internal/cmd/daemon.go index c887435..9c595ed 100644 --- a/internal/cmd/daemon.go +++ b/internal/cmd/daemon.go @@ -196,6 +196,7 @@ func runDaemonStart() error { go func() { engine.FFmpegSupportsLibplacebo(ffmpegResolved) engine.FFmpegSupportsZscale(ffmpegResolved) + engine.FFmpegSupportsScaleCuda(ffmpegResolved) }() } diff --git a/internal/cmd/player_session_registry.go b/internal/cmd/player_session_registry.go index 90cdc37..e4a134f 100644 --- a/internal/cmd/player_session_registry.go +++ b/internal/cmd/player_session_registry.go @@ -98,5 +98,9 @@ func buildTranscodeRuntime(ctx context.Context, cfg config.Config) engine.Transc // libplacebo (GPU) is preferred over zscale when present — checked here so // the per-session arg builder can pick it for HDR sources. HasLibplacebo: engine.FFmpegSupportsLibplacebo(ffmpegPath), + // scale_cuda lets an NVENC SDR downscale stay fully on the GPU. Probed + // unconditionally (like libplacebo); fails closed to false on non-CUDA + // hosts, where the arg builder keeps the CPU scale path anyway. + HasScaleCuda: engine.FFmpegSupportsScaleCuda(ffmpegPath), } } diff --git a/internal/engine/hls.go b/internal/engine/hls.go index 7062b5d..58deb96 100644 --- a/internal/engine/hls.go +++ b/internal/engine/hls.go @@ -1359,6 +1359,31 @@ func buildHLSFFmpegArgsAt(cfg HLSSessionConfig, probe *StreamProbe, tmpDir strin // transcode telemetry (F3) without logging it. args := []string{"-y", "-hide_banner", "-loglevel", "warning", "-stats"} + // F4 — full-GPU NVENC downscale. When we're downscaling an SDR source with + // NVENC on a host whose ffmpeg can run scale_cuda, and NO subtitle is burned + // in, keep the decoded frame on the GPU through scale + encode (scale_cuda → + // h264_nvenc) instead of copying every frame to the CPU for `scale=`. That + // CPU round-trip is the wall on modest GPUs (a strong box still gains ~37%). 
+ // Strictly gated — the cases that need CPU frames stay on the CPU path: + // - HDR (the libplacebo Vulkan / zscale CPU tonemap can't consume a CUDA + // surface, and mixing CUDA scale with the Vulkan pass is fragile), + // - burn-in (the scale2ref+overlay composite runs on CPU frames), + // - non-NVENC encoders, and no-op when not actually downscaling. + // Output height cap for this session — resolved once here so the F4 gate and + // the filter chain below share ONE value (a drift between them would emit + // scale_cuda for a height that isn't actually a downscale). + qcap := resolveQualityCap(cfg.Quality) + maxH := qcap.MaxHeight + if maxH == 0 { + maxH = cfg.Transcode.MaxHeight + } + useCudaScale := profile.Codec == "h264_nvenc" && + profile.DecodeHwAccel == "cuda" && + cfg.Transcode.HasScaleCuda && + probe.HDR == "" && + cfg.burnSubtitleIndexOrNone() < 0 && + maxH > 0 && probe.Height > maxH + // Demuxer-side HW-decode hint. Sourced from the profile so a future // codec/hint mismatch is impossible — the encoder + decode hint are // computed once and stay coherent. Notably we do NOT add @@ -1369,6 +1394,12 @@ func buildHLSFFmpegArgsAt(cfg HLSSessionConfig, probe *StreamProbe, tmpDir strin // decode on the input side. if profile.DecodeHwAccel != "" { args = append(args, "-hwaccel", profile.DecodeHwAccel) + // F4: pin decoded frames as CUDA surfaces ONLY on the gated scale_cuda + // path, so scale_cuda + h264_nvenc avoid the CPU copy. Off otherwise — + // the CPU filter chain can't consume CUDA surfaces. + if useCudaScale { + args = append(args, "-hwaccel_output_format", "cuda") + } } // Seek before -i for fast keyframe-aligned start. The new ffmpeg writes @@ -1527,7 +1558,7 @@ func buildHLSFFmpegArgsAt(cfg HLSSessionConfig, probe *StreamProbe, tmpDir strin // on libx264) and stalls the session. 
The output height matches qcap.MaxHeight // when the source is downscaled, otherwise probe.Height; the output width is // the source width scaled by the same factor (the filter chain preserves AR). - qcap := resolveQualityCap(cfg.Quality) + // qcap + maxH were resolved once at the top (shared with the F4 gate). outputHeight := qcap.MaxHeight if outputHeight == 0 { outputHeight = cfg.Transcode.MaxHeight @@ -1595,10 +1626,7 @@ func buildHLSFFmpegArgsAt(cfg HLSSessionConfig, probe *StreamProbe, tmpDir strin // emit the exact computed width — which can be odd (e.g. 853×480) and // libx264 then refuses to open. We chain a second `scale=trunc(iw/2)*2:...` // after the cap to guarantee even dimensions before format/setparams. - maxH := qcap.MaxHeight - if maxH == 0 { - maxH = cfg.Transcode.MaxHeight - } + // (maxH was resolved once at the top, shared with the F4 cuda-scale gate.) // VAAPI needs frames as nv12 VAAPI surfaces before the encoder. We do // scale + format conversion on CPU then `hwupload` once at the end — // skips the mesa 25 + Raphael iGPU "Cannot allocate memory" log spam @@ -1643,12 +1671,21 @@ func buildHLSFFmpegArgsAt(cfg HLSSessionConfig, probe *StreamProbe, tmpDir strin // hwUploadTail — that has to run last, after any subtitle overlay, so it's // appended separately below. var vchain string - if maxH > 0 && probe.Height > maxH { + switch { + case useCudaScale: + // F4: scale on the CUDA surface and hand h264_nvenc a yuv420p CUDA frame + // directly — no CPU `format`/`setparams` tail (the frame never leaves the + // GPU; nvenc records BT.709 SDR metadata from the source). scale_cuda's + // `-2` already yields an even width, so the second even-rounding pass the + // CPU path needs is unnecessary. useCudaScale already implies a real + // downscale (probe.Height > maxH) on an SDR, non-burn-in NVENC source. 
+ vchain = fmt.Sprintf("scale_cuda=-2:%d:format=yuv420p", maxH) + case maxH > 0 && probe.Height > maxH: vchain = fmt.Sprintf( "scale=-2:%d:force_original_aspect_ratio=decrease,scale=trunc(iw/2)*2:trunc(ih/2)*2,%s", maxH, videoTail, ) - } else { + default: vchain = fmt.Sprintf( "scale=trunc(iw/2)*2:trunc(ih/2)*2,%s", videoTail, diff --git a/internal/engine/hls_cudascale_test.go b/internal/engine/hls_cudascale_test.go new file mode 100644 index 0000000..83b8105 --- /dev/null +++ b/internal/engine/hls_cudascale_test.go @@ -0,0 +1,122 @@ +package engine + +import ( + "strings" + "testing" +) + +// F4: buildHLSFFmpegArgsAt must use the full-GPU scale_cuda path ONLY for an +// SDR NVENC downscale with no burn-in on a host that probed scale_cuda — and +// keep the CPU `scale=` path for every case that needs CPU frames (HDR tonemap, +// burn-in, no downscale, non-NVENC, or scale_cuda unavailable). + +func nvencCfg(quality string, burn *int) HLSSessionConfig { + return HLSSessionConfig{ + SessionID: "test-cudascale", + SourcePath: "/tmp/in.mkv", + Quality: quality, + AudioIndex: -1, + BurnSubtitleIndex: burn, + Transcode: TranscodeRuntime{ + FFmpegPath: "/usr/bin/ffmpeg", + HWAccel: HWAccelNVENC, + HasScaleCuda: true, + HasLibplacebo: true, + TonemapHDR: true, + }, + } +} + +func argsFor(cfg HLSSessionConfig, probe *StreamProbe) string { + return strings.Join(buildHLSFFmpegArgsAt(cfg, probe, "/tmp/tmpdir", 0, 0), " ") +} + +func TestCudaScale_SDRDownscale_UsesGPU(t *testing.T) { + probe := &StreamProbe{Width: 3840, Height: 2160, DurationSec: 100} // SDR (HDR == "") + got := argsFor(nvencCfg("1080p", nil), probe) + if !strings.Contains(got, "scale_cuda=-2:1080") { + t.Errorf("expected scale_cuda for SDR NVENC downscale; got:\n%s", got) + } + if !strings.Contains(got, "-hwaccel_output_format cuda") { + t.Errorf("expected -hwaccel_output_format cuda; got:\n%s", got) + } + if strings.Contains(got, "scale=-2:1080") { + t.Errorf("CPU scale must NOT appear on the cuda path; 
got:\n%s", got) + } +} + +func TestCudaScale_HDR_StaysOnCPU(t *testing.T) { + probe := &StreamProbe{Width: 3840, Height: 2160, HDR: "HDR10", DurationSec: 100} + got := argsFor(nvencCfg("1080p", nil), probe) + if strings.Contains(got, "scale_cuda") { + t.Errorf("HDR must NOT use scale_cuda (needs the tonemap on CPU frames); got:\n%s", got) + } + if strings.Contains(got, "-hwaccel_output_format cuda") { + t.Errorf("HDR must NOT pin frames to CUDA; got:\n%s", got) + } + if !strings.Contains(got, "libplacebo") { + t.Errorf("HDR should still tonemap via libplacebo; got:\n%s", got) + } +} + +func TestCudaScale_BurnIn_StaysOnCPU(t *testing.T) { + idx := 0 + probe := &StreamProbe{Width: 3840, Height: 2160, DurationSec: 100} + got := argsFor(nvencCfg("1080p", &idx), probe) + if strings.Contains(got, "scale_cuda") { + t.Errorf("burn-in requested must NOT use scale_cuda (overlay runs on CPU frames); got:\n%s", got) + } +} + +func TestCudaScale_NoDownscale_StaysOnCPU(t *testing.T) { + // Source already at/below the cap → no downscale → no point pinning to CUDA. 
+ probe := &StreamProbe{Width: 1920, Height: 1080, DurationSec: 100} + got := argsFor(nvencCfg("1080p", nil), probe) + if strings.Contains(got, "scale_cuda") || strings.Contains(got, "-hwaccel_output_format cuda") { + t.Errorf("no downscale must NOT use the cuda scale path; got:\n%s", got) + } +} + +func TestCudaScale_ProbeAbsent_StaysOnCPU(t *testing.T) { + cfg := nvencCfg("1080p", nil) + cfg.Transcode.HasScaleCuda = false // probe said no / non-CUDA host + probe := &StreamProbe{Width: 3840, Height: 2160, DurationSec: 100} + got := argsFor(cfg, probe) + if strings.Contains(got, "scale_cuda") { + t.Errorf("scale_cuda unavailable must fall back to CPU scale; got:\n%s", got) + } + if !strings.Contains(got, "scale=-2:1080") { + t.Errorf("expected CPU scale fallback; got:\n%s", got) + } +} + +func TestCudaScale_Software_StaysOnCPU(t *testing.T) { + cfg := nvencCfg("1080p", nil) + cfg.Transcode.HWAccel = HWAccelNone // libx264, no CUDA decode + probe := &StreamProbe{Width: 3840, Height: 2160, DurationSec: 100} + got := argsFor(cfg, probe) + if strings.Contains(got, "scale_cuda") || strings.Contains(got, "-hwaccel_output_format cuda") { + t.Errorf("software encoder must NOT use the cuda scale path; got:\n%s", got) + } +} + +func TestCudaScale_QSV_StaysOnCPU(t *testing.T) { + // A non-NVENC HW encoder (HW decode, but not h264_nvenc/cuda) must keep the + // CPU scale — scale_cuda is NVIDIA-only. Distinct from the software case. + cfg := nvencCfg("1080p", nil) + cfg.Transcode.HWAccel = HWAccelQSV + probe := &StreamProbe{Width: 3840, Height: 2160, DurationSec: 100} + got := argsFor(cfg, probe) + if strings.Contains(got, "scale_cuda") || strings.Contains(got, "-hwaccel_output_format cuda") { + t.Errorf("QSV must NOT use the cuda scale path; got:\n%s", got) + } +} + +func TestCudaScale_OriginalQuality_StaysOnCPU(t *testing.T) { + // "original" → no height cap (maxH == 0) → no downscale → no cuda path. 
+ probe := &StreamProbe{Width: 3840, Height: 2160, DurationSec: 100} + got := argsFor(nvencCfg("original", nil), probe) + if strings.Contains(got, "scale_cuda") || strings.Contains(got, "-hwaccel_output_format cuda") { + t.Errorf("original quality (no cap) must NOT use the cuda scale path; got:\n%s", got) + } +} diff --git a/internal/engine/hwscale.go b/internal/engine/hwscale.go new file mode 100644 index 0000000..75d64ab --- /dev/null +++ b/internal/engine/hwscale.go @@ -0,0 +1,75 @@ +package engine + +import ( + "context" + "log" + "os/exec" + "strings" + "sync" + "time" +) + +// Hardware downscale filter probes (F4). Mirror the libplacebo probe in +// tonemap.go: presence in `ffmpeg -filters` does NOT prove the filter RUNS — +// scale_cuda needs a working CUDA runtime + device, which the prod debian-slim +// image may lack even with the filter compiled in. So we run the real filter on +// one synthetic frame and require a clean exit, cached per binary. + +var ( + scaleCudaCacheMu sync.Mutex + scaleCudaCache = map[string]bool{} +) + +// FFmpegSupportsScaleCuda reports whether this host can ACTUALLY run scale_cuda +// — a working CUDA device + the filter compiled in. Used to keep an NVENC +// downscale fully on the GPU (decode → scale_cuda → h264_nvenc) instead of +// round-tripping each frame to the CPU for `scale=`, which is the wall on modest +// GPUs. Fails closed: any error → false → the caller keeps the CPU-scale path +// (no regression, just no speedup). Cached per path EXCEPT a context timeout, +// which is transient (a busy box) and must not pin the slow path for the run. +func FFmpegSupportsScaleCuda(ffmpegPath string) bool { + if ffmpegPath == "" { + return false + } + scaleCudaCacheMu.Lock() + if v, ok := scaleCudaCache[ffmpegPath]; ok { + scaleCudaCacheMu.Unlock() + return v + } + scaleCudaCacheMu.Unlock() + + // 10 s: first-run CUDA device creation + filter init can take a beat on a + // cold/busy box. 
Probe the WORST-CASE real input: a 10-bit (p010) surface + // scaled down to 8-bit yuv420p. Most 4K SDR HEVC is Main10, so the gated + // path routinely hands scale_cuda a 10-bit frame; an 8-bit-only probe would + // pass on a host whose scale_cuda can't do the 10→8-bit conversion, and the + // real session would then fail with no CPU fallback. testsrc2 is CPU-side, + // so format=p010le + hwupload_cuda stands in for an HEVC Main10 decode under `-hwaccel cuda`. + ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second) + defer cancel() + out, err := exec.CommandContext(ctx, ffmpegPath, + "-hide_banner", "-loglevel", "error", "-nostats", + "-init_hw_device", "cuda=cu:0", "-filter_hw_device", "cu", + "-f", "lavfi", "-i", "testsrc2=size=256x256:rate=1:duration=1", + "-vf", "format=p010le,hwupload_cuda,scale_cuda=64:64:format=yuv420p,hwdownload,format=yuv420p", + "-frames:v", "1", "-f", "null", "-", + ).CombinedOutput() + supported := err == nil + + // Cache a stable yes/no, but not a transient deadline (see libplacebo probe). + if supported || ctx.Err() != context.DeadlineExceeded { + scaleCudaCacheMu.Lock() + scaleCudaCache[ffmpegPath] = supported + scaleCudaCacheMu.Unlock() + } + if supported { + log.Printf("[hwscale] ffmpeg scale_cuda works — NVENC SDR downscales stay on the GPU (no CPU round-trip)") + } else { + detail := strings.TrimSpace(lastLine(out)) + if detail == "" { + detail = err.Error() + } + log.Printf("[hwscale] ffmpeg scale_cuda unavailable — NVENC keeps the CPU scale path: %v", detail) + } + return supported +} diff --git a/internal/engine/transcode_quality.go b/internal/engine/transcode_quality.go index 64bbae4..fe05683 100644 --- a/internal/engine/transcode_quality.go +++ b/internal/engine/transcode_quality.go @@ -27,6 +27,11 @@ type TranscodeRuntime struct { // Preferred over the zscale chain for HDR sources — one GPU pass, higher // quality, and present where zscale is missing. 
HasLibplacebo bool + // HasScaleCuda: this host can run scale_cuda (CUDA device + filter). Lets an + // NVENC downscale of an SDR source stay fully on the GPU (decode → scale_cuda + // → h264_nvenc) instead of round-tripping each frame to the CPU for `scale=`. + // Probed functionally (FFmpegSupportsScaleCuda); false ⇒ keep the CPU scale. + HasScaleCuda bool } // qualityCap maps a session's Quality label to a (MaxHeight, VideoBitrate)