package engine

import (
	"context"
	"log"
	"os/exec"
	"strings"
	"sync"
	"time"
)

// Hardware downscale filter probes (F4). Mirror the libplacebo probe in
// tonemap.go: presence in `ffmpeg -filters` does NOT prove the filter RUNS —
// scale_cuda needs a working CUDA runtime + device, which the prod debian-slim
// image may lack even with the filter compiled in. So we run the real filter on
// one synthetic frame and require a clean exit, cached per binary.

// Memoized scale_cuda probe verdicts, shared by every caller in the process.
var (
	// scaleCudaCacheMu serializes all reads and writes of scaleCudaCache.
	scaleCudaCacheMu sync.Mutex
	// scaleCudaCache maps an ffmpeg binary path to the outcome of its probe.
	scaleCudaCache = make(map[string]bool)
)

// FFmpegSupportsScaleCuda reports whether the ffmpeg binary at ffmpegPath can
// really execute the scale_cuda filter on this host, rather than merely having
// it compiled in. Callers use the answer to decide whether an NVENC downscale
// may stay entirely on the GPU (decode → scale_cuda → h264_nvenc) instead of
// bouncing every frame through the CPU `scale=` filter.
//
// The check fails closed: an empty path or any probe error yields false, so
// the caller simply stays on the CPU-scale path. Verdicts are memoized per
// binary path, with one exception — a probe that failed because the 10 s
// deadline expired is treated as transient and is NOT stored, so a later call
// probes again rather than pinning the slow path.
func FFmpegSupportsScaleCuda(ffmpegPath string) bool {
	if ffmpegPath == "" {
		return false
	}

	// Fast path: an earlier probe of this binary already settled the answer.
	scaleCudaCacheMu.Lock()
	cached, hit := scaleCudaCache[ffmpegPath]
	scaleCudaCacheMu.Unlock()
	if hit {
		return cached
	}

	// The probe pushes one synthetic frame through the real filter. It
	// deliberately uses the hardest input the gated path is expected to see: a
	// 10-bit p010 surface reduced to 8-bit yuv420p. An 8-bit-only probe could
	// succeed on a host whose scale_cuda cannot do the 10→8-bit conversion.
	// testsrc2 produces frames on the CPU, so format=p010le + hwupload_cuda
	// stands in for a hardware Main10 decode.
	probeArgs := []string{
		"-hide_banner", "-loglevel", "error", "-nostats",
		"-init_hw_device", "cuda=cu:0", "-filter_hw_device", "cu",
		"-f", "lavfi", "-i", "testsrc2=size=256x256:rate=1:duration=1",
		"-vf", "format=p010le,hwupload_cuda,scale_cuda=64:64:format=yuv420p,hwdownload,format=yuv420p",
		"-frames:v", "1", "-f", "null", "-",
	}

	// 10 s budget: first-time CUDA device creation plus filter init can be
	// slow on a cold or busy machine.
	ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
	defer cancel()
	output, probeErr := exec.CommandContext(ctx, ffmpegPath, probeArgs...).CombinedOutput()
	works := probeErr == nil

	// Remember every definite outcome; skip only a failure caused by the
	// deadline, which says nothing stable about the host.
	timedOut := !works && ctx.Err() == context.DeadlineExceeded
	if !timedOut {
		scaleCudaCacheMu.Lock()
		scaleCudaCache[ffmpegPath] = works
		scaleCudaCacheMu.Unlock()
	}

	if works {
		log.Printf("[hwscale] ffmpeg scale_cuda works — NVENC SDR downscales stay on the GPU (no CPU round-trip)")
		return true
	}

	// Prefer ffmpeg's own final output line as the reason; fall back to the
	// process error when ffmpeg printed nothing usable.
	reason := strings.TrimSpace(lastLine(output))
	if reason == "" {
		reason = probeErr.Error()
	}
	log.Printf("[hwscale] ffmpeg scale_cuda unavailable — NVENC keeps the CPU scale path: %v", reason)
	return false
}