From e298ff6c05e2f8ee9c34395cca5b30ff69eefc38 Mon Sep 17 00:00:00 2001 From: Deivid Soto Date: Wed, 3 Jun 2026 10:48:30 +0200 Subject: [PATCH] fix(stream): don't cache transient libplacebo probe timeouts MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Second critico pass on the functional probe. - The probe does real Vulkan device init, which can transiently fail when the box is busy (notably the startup warm racing the encode benchmark). Caching that timeout as a permanent 'no' would pin HDR to the zscale CPU chain until daemon restart. Now a deadline is NOT cached; a success and any non-timeout failure (typically a clean non-zero exit: filter absent / no ICD) are stable results and stay cached. zscale stays cached as before (cheap deterministic grep, can't flake). - Surface the exec error when ffmpeg never produced stderr (timeout / ENOENT): the fallback log now shows err.Error() instead of a blank tail, so 'no Vulkan' is distinguishable from 'ffmpeg never ran'. - Dockerfile comment: clarify the Vulkan ICD (not GLX) is the load-bearing mount that 'graphics' adds; 'compute' alone doesn't mount it. Probe still returns true on a Vulkan host (verified); engine tests green. --- Dockerfile | 7 ++++--- internal/engine/tonemap.go | 33 ++++++++++++++++++++++++--------- 2 files changed, 28 insertions(+), 12 deletions(-) diff --git a/Dockerfile b/Dockerfile index f86dbb2..200da0f 100644 --- a/Dockerfile +++ b/Dockerfile @@ -95,9 +95,10 @@ ENV XDG_DATA_HOME=/data # NVIDIA passthrough defaults. `--gpus all` alone only grants the "utility" + # "compute" capabilities; nvenc needs "video", and "graphics" makes the runtime -# mount the NVIDIA Vulkan ICD (nvidia_icd.json + GLX libs) so ffmpeg's libplacebo -# filter (GPU HDR tonemap, paired with libvulkan1 above) can create a Vulkan -# device. 
Baking these here means a plain `docker run --gpus all` (or the compose +# mount the NVIDIA Vulkan ICD (nvidia_icd.json — the load-bearing piece — plus +# GLX/EGL libs) so ffmpeg's libplacebo filter (GPU HDR tonemap, paired with +# libvulkan1 above) can create a Vulkan device. "compute" alone does NOT mount +# the ICD. Baking these here means a plain `docker run --gpus all` (or the compose # device reservation) lights up HW transcode + GPU tonemap with zero extra flags. # Harmless when no GPU is attached. ENV NVIDIA_VISIBLE_DEVICES=all diff --git a/internal/engine/tonemap.go b/internal/engine/tonemap.go index 192bc9b..5a68f8d 100644 --- a/internal/engine/tonemap.go +++ b/internal/engine/tonemap.go @@ -57,11 +57,12 @@ var ( // tonemapped fine via zscale. // // So we run the real filter on one synthetic frame and require a clean exit: -// that forces Vulkan device creation + filtergraph negotiation (the implicit -// hwupload/hwdownload around the GPU filter). Pass → libplacebo works here; -// fail → fall back to the zscale chain. Cached per path; a probe failure is -// treated as "no". The probe is bounded so a wedged ffmpeg can't stall the -// first session. +// that forces Vulkan device creation + filtergraph negotiation (libplacebo +// auto-inserts the hwupload/hwdownload around itself). Pass → libplacebo works +// here; fail → fall back to the zscale chain. Cached per path — EXCEPT a +// context timeout, which is transient (a busy box during the startup warm) and +// must not pin HDR to zscale for the whole process. The probe is bounded so a +// wedged ffmpeg can't stall the first session. func FFmpegSupportsLibplacebo(ffmpegPath string) bool { if ffmpegPath == "" { return false @@ -87,13 +88,27 @@ func FFmpegSupportsLibplacebo(ffmpegPath string) bool { ).CombinedOutput() supported := err == nil - libplaceboCacheMu.Lock() - libplaceboCache[ffmpegPath] = supported - libplaceboCacheMu.Unlock() + // Cache the result — but NOT a timeout. 
A clean non-zero exit (filter + // absent, no Vulkan ICD) is a stable "no" worth remembering; a deadline is + // transient (the box was busy, e.g. the startup warm racing the encode + // benchmark) and caching it would force HDR onto the zscale CPU chain until + // restart. Worst case a perpetually-loaded box re-probes per session — rare, + // and it fails closed to zscale each time. + if supported || ctx.Err() != context.DeadlineExceeded { + libplaceboCacheMu.Lock() + libplaceboCache[ffmpegPath] = supported + libplaceboCacheMu.Unlock() + } if supported { log.Printf("[tonemap] ffmpeg libplacebo works (Vulkan OK) — HDR sources tonemapped on the GPU (preferred)") } else { - log.Printf("[tonemap] ffmpeg libplacebo unavailable (no Vulkan runtime or filter absent) — HDR falls back to zscale/none: %v", strings.TrimSpace(lastLine(out))) + // On an exec/timeout failure the stderr tail is empty — surface err + // itself so the log distinguishes "no Vulkan" from "ffmpeg never ran". + detail := strings.TrimSpace(lastLine(out)) + if detail == "" { + detail = err.Error() + } + log.Printf("[tonemap] ffmpeg libplacebo unavailable (no Vulkan runtime or filter absent) — HDR falls back to zscale/none: %v", detail) } return supported }