feat(library): detección de intro/créditos post-scan (skip segments)
Some checks failed
CI / Test (push) Failing after 6m18s
CI / Build (push) Successful in 1m32s
CI / Build-1 (push) Successful in 1m55s
CI / Build-2 (push) Successful in 1m33s
CI / Build-3 (push) Successful in 1m32s
CI / Build-4 (push) Successful in 1m35s
CI / Build-5 (push) Successful in 1m33s
CI / Lint (push) Failing after 2m50s
CI / Coverage (push) Successful in 2m58s
CI / Vet (push) Successful in 2m7s

Tras cada scan, localiza la intro (OP) y los créditos (ED) comparando
fingerprints chromaprint entre episodios de la misma temporada —
reimplementación limpia del enfoque de Intro Skipper: índice invertido
de uint32, alineamiento por shifts, Hamming ≤6/32, región contigua más
larga (15-120s intro / 15-450s créditos). Películas: inicio de créditos
por rachas de blackframe (solo keyframes, -skip_frame nokey) que llegan
al final del fichero.

- fpcalc se auto-descarga de las releases estáticas de acoustid
  (linux/macos/windows, ~2MB) con el mismo patrón que ffmpeg/ffprobe.
- Resultados cacheados como sidecar .skipseg.json (mtime + versión de
  algoritmo); solo los ficheros nuevos trabajan.
- Submit a /api/internal/agent/skip-segments DESPUÉS del library-sync,
  en dos fases (episodios primero, películas después) para que la
  fase rápida no espere a los blackframe lentos sobre NAS.
- Agrupación por (dir + título-pre-SxxEyy + season): los títulos
  parseados arrastran nombre de episodio y tags de release.
- Gotcha cazado en vivo: fpcalc -length sale sin drenar el pipe; hay
  que cerrar nuestro read-end o ffmpeg queda bloqueado para siempre.
- config: library.skip_detect (default true, backfill) y scan_interval
  default 24h → 1h (estilo Plex).
This commit is contained in:
Deivid Soto 2026-06-12 19:46:07 +02:00
parent 59da949a53
commit a710bc1626
11 changed files with 1223 additions and 5 deletions

View file

@ -0,0 +1,279 @@
package mediainfo
import (
"bufio"
"context"
"encoding/json"
"fmt"
"math/bits"
"os"
"os/exec"
"path/filepath"
"regexp"
"strconv"
"strings"
)
// Chromaprint-based shared-audio detection. Episodes of the same season share
// an identical intro (OP) and credits (ED) audio track; fingerprinting a window
// of each episode and finding the longest aligned low-hamming-distance region
// between two episodes localizes those segments. Clean-room implementation of
// the approach popularized by Jellyfin's Intro Skipper plugin.
//
// Fingerprint stream: chromaprint emits one uint32 per ~0.1238s of audio
// (11025 Hz mono, FFT 4096, 2/3 overlap → ~8.08 points/second).
const (
// ChromaprintSampleDur is seconds of audio per fingerprint point.
ChromaprintSampleDur = 0.1238
// maxHammingBits: two points are "similar" when the popcount of their XOR
// is at most this value (the comparison in FindSharedRegion is inclusive).
maxHammingBits = 6
// maxTimeSkipSec: gap tolerance, in seconds, when growing a contiguous
// similar region; FindSharedRegion converts it to a number of points.
maxTimeSkipSec = 3.5
)
// SkipSegmentRange is one detected skippable range inside a media file.
// It is serialized to JSON under the key names given in the struct tags.
type SkipSegmentRange struct {
// Category names the kind of segment.
Category string `json:"category"` // "intro" | "credits"
// StartSec is where the range begins, in seconds.
StartSec float64 `json:"startSec"`
// EndSec is where the range ends, in seconds.
EndSec float64 `json:"endSec"`
}
// FingerprintAudioWindow decodes [startSec, startSec+lengthSec] of the first
// audio track with ffmpeg and pipes the WAV into fpcalc -raw, returning the
// chromaprint point stream.
//
// ffmpegPath and fpcalcPath are the executables to run and mediaPath is the
// input file; ctx bounds both child processes. An error is returned when
// either process cannot be started, when fpcalc fails (the message then
// carries ffmpeg's stderr), or when fpcalc's output holds no usable
// fingerprint.
func FingerprintAudioWindow(ctx context.Context, ffmpegPath, fpcalcPath, mediaPath string, startSec, lengthSec float64) ([]uint32, error) {
	ff := exec.CommandContext(ctx, ffmpegPath,
		"-nostdin", "-loglevel", "error",
		"-ss", strconv.FormatFloat(startSec, 'f', 3, 64),
		"-i", mediaPath,
		"-t", strconv.FormatFloat(lengthSec, 'f', 3, 64),
		"-map", "0:a:0",
		"-ac", "2",
		"-f", "wav", "-",
	)
	fp := exec.CommandContext(ctx, fpcalcPath,
		"-raw", "-length", strconv.Itoa(int(lengthSec)), "-")

	pipe, err := ff.StdoutPipe()
	if err != nil {
		return nil, fmt.Errorf("ffmpeg pipe: %w", err)
	}
	fp.Stdin = pipe
	var ffErr strings.Builder
	ff.Stderr = &ffErr

	if err := ff.Start(); err != nil {
		return nil, fmt.Errorf("ffmpeg start: %w", err)
	}
	out, err := fp.Output()
	// fpcalc stops reading once it has processed -length seconds and may exit
	// WITHOUT draining the last buffered bytes. Close our read end so ffmpeg
	// gets EPIPE and exits — otherwise it blocks forever on a full pipe whose
	// only remaining reader is us (caught live: 5-min ctx kills, per file).
	_ = pipe.Close()
	// Always reap ffmpeg; early pipe close makes it exit non-zero — fine as
	// long as fpcalc produced output.
	_ = ff.Wait()
	if err != nil {
		return nil, fmt.Errorf("fpcalc: %w (ffmpeg: %s)", err, strings.TrimSpace(ffErr.String()))
	}
	return parseFpcalcFingerprint(string(out))
}

// parseFpcalcFingerprint extracts the point stream from the first
// "FINGERPRINT=" line of fpcalc -raw output. A line with nothing after the
// prefix is reported as an empty fingerprint rather than as a number-parse
// failure.
func parseFpcalcFingerprint(out string) ([]uint32, error) {
	for _, line := range strings.Split(out, "\n") {
		rest, ok := strings.CutPrefix(strings.TrimSpace(line), "FINGERPRINT=")
		if !ok {
			continue
		}
		rest = strings.TrimSpace(rest)
		if rest == "" {
			return nil, fmt.Errorf("fpcalc produced an empty fingerprint")
		}
		parts := strings.Split(rest, ",")
		points := make([]uint32, 0, len(parts))
		for _, p := range parts {
			// fpcalc may print signed ints; parse wide and truncate.
			v, perr := strconv.ParseInt(strings.TrimSpace(p), 10, 64)
			if perr != nil {
				return nil, fmt.Errorf("fpcalc output parse: %w", perr)
			}
			points = append(points, uint32(v))
		}
		return points, nil
	}
	return nil, fmt.Errorf("no FINGERPRINT line in fpcalc output")
}
// SharedRegion is the longest aligned similar-audio region between two
// fingerprint streams, in seconds relative to each stream's start.
type SharedRegion struct {
// AStart and AEnd bound the region inside the first stream.
AStart, AEnd float64
// BStart and BEnd bound the same region inside the second stream: the
// first stream's bounds displaced by the alignment shift.
BStart, BEnd float64
// Duration is the region length in seconds.
Duration float64
}
// FindSharedRegion locates the longest contiguous region (bounded by
// minDur/maxDur seconds) where streams a and b carry near-identical audio at
// some alignment. Returns nil when no qualifying region exists.
//
// Candidate alignments come from (near-)exact point matches between a and an
// inverted index of b. For each candidate shift the overlapping points are
// compared by Hamming distance, and matches no further apart than
// maxTimeSkipSec are merged into one run. A run shorter than minDur or longer
// than maxDur is discarded, not clipped.
//
// The result is deterministic: when several runs have the same length, the
// one starting earliest in a wins, then the one with the smallest shift.
func FindSharedRegion(a, b []uint32, minDur, maxDur float64) *SharedRegion {
	if len(a) == 0 || len(b) == 0 {
		return nil
	}

	// Inverted index of b: point value → last index seen.
	indexB := make(map[uint32]int, len(b))
	for i, v := range b {
		indexB[v] = i
	}

	// Candidate alignments: exact value matches (±2 on the value tolerates
	// quantization noise between encodes).
	shifts := make(map[int]struct{})
	for i, v := range a {
		for d := -2; d <= 2; d++ {
			if j, ok := indexB[v+uint32(d)]; ok {
				shifts[j-i] = struct{}{}
			}
		}
	}

	minPoints := int(minDur / ChromaprintSampleDur)
	// Go through a variable: converting the non-integer constant quotient to
	// int directly would be rejected at compile time.
	gapSec := float64(maxTimeSkipSec)
	gapPoints := int(gapSec / ChromaprintSampleDur)

	// Best run so far, kept in points so that ties compare exactly.
	var (
		found                         bool
		bestStart, bestEnd, bestShift int
	)
	// consider offers the run [start, end] found at shift as a candidate.
	consider := func(start, end, shift int) {
		if start < 0 {
			return
		}
		dur := float64(end-start) * ChromaprintSampleDur
		if !(dur >= minDur && dur <= maxDur) {
			return
		}
		better := !found
		if found {
			n, bestN := end-start, bestEnd-bestStart
			switch {
			case n != bestN:
				better = n > bestN
			case start != bestStart:
				// Map iteration order is random; break ties explicitly so the
				// same inputs always yield the same region.
				better = start < bestStart
			default:
				better = shift < bestShift
			}
		}
		if better {
			found, bestStart, bestEnd, bestShift = true, start, end, shift
		}
	}

	for shift := range shifts {
		// Overlap of a and b at this shift, as an index range [i0, i1) of a.
		i0 := 0
		if shift < 0 {
			i0 = -shift
		}
		i1 := len(a)
		if len(b)-shift < i1 {
			i1 = len(b) - shift
		}
		if i1-i0 < minPoints {
			continue
		}

		runStart, prev := -1, -1
		for i := i0; i < i1; i++ {
			if bits.OnesCount32(a[i]^b[i+shift]) > maxHammingBits {
				continue
			}
			if prev >= 0 && i-prev > gapPoints {
				// Gap too wide: close the current run and start a new one.
				consider(runStart, prev, shift)
				runStart = i
			} else if runStart < 0 {
				runStart = i
			}
			prev = i
		}
		consider(runStart, prev, shift)
	}

	if !found {
		return nil
	}
	return &SharedRegion{
		AStart:   float64(bestStart) * ChromaprintSampleDur,
		AEnd:     float64(bestEnd) * ChromaprintSampleDur,
		BStart:   float64(bestStart+bestShift) * ChromaprintSampleDur,
		BEnd:     float64(bestEnd+bestShift) * ChromaprintSampleDur,
		Duration: float64(bestEnd-bestStart) * ChromaprintSampleDur,
	}
}
// --- Black-frame credits detection (movies: no sibling episode to compare) ---

// blackframeRe captures the "t:" timestamp (seconds) from one log line of
// ffmpeg's blackframe filter.
var blackframeRe = regexp.MustCompile(`frame:\d+\s+pblack:\d+\s+pts:\d+\s+t:([\d.]+)`)

// DetectBlackFrameRuns scans [startSec, startSec+lengthSec] with ffmpeg's
// blackframe filter and returns the timestamps (absolute seconds) of frames
// that are ≥minBlackPct black. Used to find the start of end credits in movies
// (classic credits roll on black).
//
// The result is nil with a nil error when no frame qualified. An error is
// returned when ffmpeg cannot be started, exits non-zero, or its log output
// cannot be read to the end; partial timestamps are never returned.
func DetectBlackFrameRuns(ctx context.Context, ffmpegPath, mediaPath string, startSec, lengthSec float64, minBlackPct int) ([]float64, error) {
	// Keyframe-only decode: credits-on-black lasts minutes, so sampling one
	// frame every keyframe interval (~2-10s) finds the run at ~2% of the cost
	// of a full decode — the difference between seconds and minutes per 4K film.
	cmd := exec.CommandContext(ctx, ffmpegPath,
		"-nostdin", "-loglevel", "info",
		"-skip_frame", "nokey",
		"-ss", strconv.FormatFloat(startSec, 'f', 3, 64),
		"-i", mediaPath,
		"-t", strconv.FormatFloat(lengthSec, 'f', 3, 64),
		"-an", "-sn",
		"-vf", fmt.Sprintf("blackframe=amount=%d:threshold=32", minBlackPct),
		"-f", "null", "-",
	)
	stderr, err := cmd.StderrPipe()
	if err != nil {
		return nil, err
	}
	if err := cmd.Start(); err != nil {
		return nil, fmt.Errorf("ffmpeg blackframe start: %w", err)
	}

	var times []float64
	sc := bufio.NewScanner(stderr)
	sc.Buffer(make([]byte, 0, 64*1024), 1024*1024)
	for sc.Scan() {
		if m := blackframeRe.FindStringSubmatch(sc.Text()); m != nil {
			if t, perr := strconv.ParseFloat(m[1], 64); perr == nil {
				// -ss precedes -i, so ffmpeg's timestamps restart at zero.
				times = append(times, startSec+t)
			}
		}
	}
	scanErr := sc.Err()
	if scanErr != nil {
		// The scanner gave up early (e.g. a line longer than its buffer).
		// Keep draining so ffmpeg cannot block on a full stderr pipe, which
		// would make Wait hang until ctx expires.
		scratch := make([]byte, 32*1024)
		for {
			if _, rerr := stderr.Read(scratch); rerr != nil {
				break
			}
		}
	}
	if err := cmd.Wait(); err != nil {
		return nil, fmt.Errorf("ffmpeg blackframe: %w", err)
	}
	if scanErr != nil {
		return nil, fmt.Errorf("ffmpeg blackframe: reading output: %w", scanErr)
	}
	return times, nil
}
// --- Sidecar cache for detected segments ---
// skipSegmentsSidecarVersion bumps when the detection algorithm changes enough
// that cached results should be recomputed.
const skipSegmentsSidecarVersion = 1
// SkipSegmentsSidecar is the cached detection result for one media file.
// It is stored as JSON under the key names given in the struct tags.
type SkipSegmentsSidecar struct {
// Version is the algorithm version that produced this result;
// ReadCachedSkipSegments rejects a sidecar whose Version differs from
// skipSegmentsSidecarVersion.
Version int `json:"version"`
// DurationSec is the duration value the writer was given for the media file.
DurationSec float64 `json:"durationSec"`
// Segments holds the detected ranges; WriteCachedSkipSegments stores an
// empty list rather than null when given nil.
Segments []SkipSegmentRange `json:"segments"` // empty = analyzed, nothing found
}
// skipSegmentsCachePath returns the sidecar location for mediaPath: the media
// file's base name with ".skipseg.json" appended, inside the directory that
// sidecarDir reports for it.
func skipSegmentsCachePath(mediaPath string) string {
	dir := sidecarDir(mediaPath)
	fileName := filepath.Base(mediaPath) + ".skipseg.json"
	return filepath.Join(dir, fileName)
}
// ReadCachedSkipSegments loads the cached detection result for mediaPath.
// It reports (nil, false) unless sidecarFresh accepts the sidecar for this
// media file, the sidecar can be read and decoded as JSON, and its Version
// equals skipSegmentsSidecarVersion.
func ReadCachedSkipSegments(mediaPath string) (*SkipSegmentsSidecar, bool) {
	cachePath := skipSegmentsCachePath(mediaPath)
	if !sidecarFresh(cachePath, mediaPath) {
		return nil, false
	}

	raw, readErr := os.ReadFile(cachePath)
	if readErr != nil {
		return nil, false
	}

	cached := new(SkipSegmentsSidecar)
	if json.Unmarshal(raw, cached) != nil {
		return nil, false
	}
	// A result from another algorithm version counts as a cache miss.
	if cached.Version != skipSegmentsSidecarVersion {
		return nil, false
	}
	return cached, true
}
// WriteCachedSkipSegments stores a detection result as the JSON sidecar of
// mediaPath, creating the sidecar directory when needed. A nil segs is saved
// as an empty list so the file records "analyzed, nothing found". It returns
// the first encoding or filesystem error encountered.
func WriteCachedSkipSegments(mediaPath string, durationSec float64, segs []SkipSegmentRange) error {
	if segs == nil {
		segs = []SkipSegmentRange{}
	}

	payload, err := json.Marshal(SkipSegmentsSidecar{
		Version:     skipSegmentsSidecarVersion,
		DurationSec: durationSec,
		Segments:    segs,
	})
	if err != nil {
		return err
	}

	if mkErr := os.MkdirAll(sidecarDir(mediaPath), 0o755); mkErr != nil {
		return mkErr
	}
	return os.WriteFile(skipSegmentsCachePath(mediaPath), payload, 0o644)
}

View file

@ -0,0 +1,121 @@
package mediainfo
import (
"math"
"testing"
)
// lcg is a small linear congruential generator: a reproducible source of
// pseudo-random points for building synthetic fingerprints in tests.
type lcg struct{ state uint64 }

// next advances the generator one step and returns the upper 32 bits of the
// new state.
func (g *lcg) next() uint32 {
	const (
		multiplier = 6364136223846793005
		increment  = 1442695040888963407
	)
	g.state = g.state*multiplier + increment
	return uint32(g.state >> 32)
}
// TestFindSharedRegion_DetectsAlignedSegment plants the same segment at
// different offsets inside two otherwise unrelated streams and expects it to
// be found with both offsets and its length reported.
func TestFindSharedRegion_DetectsAlignedSegment(t *testing.T) {
	const (
		sharedLen = 700 // ≈ 86.7s — a typical anime OP
		leadA     = 80  // unique noise points before the segment in a
		leadB     = 480 // unique noise points before the segment in b
		total     = 2000
	)

	// draw pulls the next n points out of src.
	draw := func(src *lcg, n int) []uint32 {
		out := make([]uint32, n)
		for i := range out {
			out[i] = src.next()
		}
		return out
	}
	shared := draw(&lcg{state: 42}, sharedLen)

	// build lays out: lead noise, the shared segment, noise up to total.
	build := func(seed uint64, lead int) []uint32 {
		noise := &lcg{state: seed}
		stream := make([]uint32, 0, total)
		stream = append(stream, draw(noise, lead)...)
		stream = append(stream, shared...)
		return append(stream, draw(noise, total-len(stream))...)
	}
	a := build(1001, leadA)
	b := build(2002, leadB)

	r := FindSharedRegion(a, b, 15, 120)
	if r == nil {
		t.Fatal("expected a shared region, got nil")
	}

	if want := leadA * ChromaprintSampleDur; math.Abs(r.AStart-want) > 2 {
		t.Errorf("AStart = %.1f, want ≈ %.1f", r.AStart, want)
	}
	if want := leadB * ChromaprintSampleDur; math.Abs(r.BStart-want) > 2 {
		t.Errorf("BStart = %.1f, want ≈ %.1f", r.BStart, want)
	}
	if want := sharedLen * ChromaprintSampleDur; math.Abs(r.Duration-want) > 4 {
		t.Errorf("Duration = %.1f, want ≈ %.1f", r.Duration, want)
	}
}
// TestFindSharedRegion_NoMatchOnNoise feeds two unrelated pseudo-random
// streams and expects no region to be reported.
func TestFindSharedRegion_NoMatchOnNoise(t *testing.T) {
	const points = 1500
	srcA, srcB := &lcg{state: 7}, &lcg{state: 9}

	a := make([]uint32, 0, points)
	b := make([]uint32, 0, points)
	for len(a) < points {
		a = append(a, srcA.next())
		b = append(b, srcB.next())
	}

	r := FindSharedRegion(a, b, 15, 120)
	if r != nil {
		t.Fatalf("expected nil on unrelated noise, got %+v", r)
	}
}
// TestFindSharedRegion_FullMatchExceedsMaxDur compares a stream with an exact
// copy of itself (same episode, two releases). The only shared region is the
// whole window, which is longer than maxDur and must be rejected.
func TestFindSharedRegion_FullMatchExceedsMaxDur(t *testing.T) {
	src := &lcg{state: 5}
	a := make([]uint32, 0, 2000)
	for len(a) < cap(a) {
		a = append(a, src.next())
	}
	b := append([]uint32(nil), a...)

	r := FindSharedRegion(a, b, 15, 120)
	if r != nil {
		t.Fatalf("expected nil for identical streams (region > maxDur), got %+v", r)
	}
}
// TestFindSharedRegion_ToleratesBitNoise places a shared segment at the start
// of both streams, with exactly one bit flipped in every point of b's copy
// (re-encode noise), and expects the region to still be found near offset 0.
func TestFindSharedRegion_ToleratesBitNoise(t *testing.T) {
	const (
		sharedLen = 600
		total     = 1500
	)

	src := &lcg{state: 77}
	shared := make([]uint32, sharedLen)
	noisy := make([]uint32, sharedLen)
	for i := range shared {
		shared[i] = src.next()
		noisy[i] = shared[i] ^ (1 << uint(i%20)) // flip one bit
	}

	// pad copies prefix and fills the stream up to total with seeded noise.
	pad := func(prefix []uint32, seed uint64) []uint32 {
		noise := &lcg{state: seed}
		stream := append(make([]uint32, 0, total), prefix...)
		for len(stream) < total {
			stream = append(stream, noise.next())
		}
		return stream
	}
	a := pad(shared, 100)
	b := pad(noisy, 200)

	r := FindSharedRegion(a, b, 15, 120)
	if r == nil {
		t.Fatal("expected match despite 1-bit noise, got nil")
	}
	if r.AStart > 2 {
		t.Errorf("AStart = %.1f, want ≈ 0", r.AStart)
	}
}

View file

@ -0,0 +1,148 @@
package mediainfo
import (
"archive/tar"
"compress/gzip"
"fmt"
"io"
"net/http"
"os"
"os/exec"
"path/filepath"
"runtime"
"strings"
"time"
)
// fpcalc (chromaprint) powers skip-segment detection: the ffmpeg static builds
// we download from ffbinaries do NOT include the chromaprint muxer, so audio
// fingerprinting pipes decoded WAV from our ffmpeg into a standalone fpcalc
// binary. acoustid publishes small (~2MB) static builds per platform.
//
// fpcalcVersion is the chromaprint release to download; it appears in both
// the release tag and the asset name of the download URL.
const fpcalcVersion = "1.6.0"
// fpcalcDLClient performs the download; its timeout bounds the whole request.
var fpcalcDLClient = &http.Client{Timeout: 5 * time.Minute}
// maxFpcalcArchiveSize caps how many bytes are read from the downloaded
// archive and from the binary extracted out of it.
const maxFpcalcArchiveSize = 50 * 1024 * 1024 // 50MB
// fpcalcDownloadURL builds the release asset URL of fpcalcVersion for the
// platform this program runs on. isZip is true when the asset is a zip
// archive (Windows) rather than a tar.gz. Platforms without a matching asset
// (anything other than linux/amd64, linux/arm64, darwin, windows/amd64) get
// an error.
func fpcalcDownloadURL() (url string, isZip bool, err error) {
	base := fmt.Sprintf("https://github.com/acoustid/chromaprint/releases/download/v%s/chromaprint-fpcalc-%s-", fpcalcVersion, fpcalcVersion)

	goos, goarch := runtime.GOOS, runtime.GOARCH
	var asset string
	switch {
	case goos == "linux" && goarch == "amd64":
		asset = "linux-x86_64.tar.gz"
	case goos == "linux" && goarch == "arm64":
		asset = "linux-arm64.tar.gz"
	case goos == "darwin":
		// A single universal build serves every macOS architecture.
		asset = "macos-universal.tar.gz"
	case goos == "windows" && goarch == "amd64":
		asset, isZip = "windows-x86_64.zip", true
	default:
		return "", false, fmt.Errorf("no fpcalc build for platform %s/%s", runtime.GOOS, runtime.GOARCH)
	}
	return base + asset, isZip, nil
}
// FpcalcCachePath returns where the downloaded fpcalc binary is cached: inside
// the directory reported by FFprobeCacheDir, named "fpcalc" ("fpcalc.exe" on
// Windows). An error from FFprobeCacheDir is passed through unchanged.
func FpcalcCachePath() (string, error) {
	binDir, err := FFprobeCacheDir()
	if err != nil {
		return "", err
	}
	if runtime.GOOS == "windows" {
		return filepath.Join(binDir, "fpcalc.exe"), nil
	}
	return filepath.Join(binDir, "fpcalc"), nil
}
// ResolveFpcalc locates an fpcalc binary, trying in order: an "fpcalc" found
// on PATH, a file already present at the cache path, and finally a fresh
// download into the cache path. It returns the path of the binary to run.
func ResolveFpcalc() (string, error) {
	onPath, lookErr := exec.LookPath("fpcalc")
	if lookErr == nil {
		return onPath, nil
	}

	cached, err := FpcalcCachePath()
	if err != nil {
		return "", err
	}
	if _, statErr := os.Stat(cached); statErr != nil {
		// Nothing usable in the cache yet: fetch it.
		return downloadFpcalc(cached)
	}
	return cached, nil
}
// downloadFpcalc fetches the fpcalc release archive for the current platform,
// extracts the fpcalc binary from it and installs it at dest, creating the
// parent directory when needed. It returns dest on success.
//
// An archive larger than maxFpcalcArchiveSize is rejected rather than
// truncated. The binary is written to a temporary file next to dest and
// renamed into place, so an interrupted install never leaves a partial file at
// dest that a later lookup would mistake for a usable binary. Progress
// messages go to standard error.
func downloadFpcalc(dest string) (string, error) {
	url, isZip, err := fpcalcDownloadURL()
	if err != nil {
		return "", err
	}
	fmt.Fprintf(os.Stderr, "fpcalc not found — downloading chromaprint %s...\n", fpcalcVersion)

	resp, err := fpcalcDLClient.Get(url)
	if err != nil {
		return "", fmt.Errorf("fpcalc download failed: %w", err)
	}
	defer resp.Body.Close()
	if resp.StatusCode != http.StatusOK {
		return "", fmt.Errorf("fpcalc download failed: HTTP %d", resp.StatusCode)
	}

	// Read one byte past the cap so an oversized body is detected instead of
	// being silently cut short and handed to the extractor.
	data, err := io.ReadAll(io.LimitReader(resp.Body, maxFpcalcArchiveSize+1))
	if err != nil {
		return "", fmt.Errorf("fpcalc download read failed: %w", err)
	}
	if len(data) > maxFpcalcArchiveSize {
		return "", fmt.Errorf("fpcalc download failed: archive exceeds %d bytes", maxFpcalcArchiveSize)
	}

	name := "fpcalc"
	if runtime.GOOS == "windows" {
		name = "fpcalc.exe"
	}
	var binary []byte
	if isZip {
		binary, err = extractFromZip(data, name)
	} else {
		binary, err = extractFromTarGz(data, name)
	}
	if err != nil {
		return "", err
	}

	if err := os.MkdirAll(filepath.Dir(dest), 0o755); err != nil {
		return "", fmt.Errorf("cannot create cache directory: %w", err)
	}
	if err := installFpcalcBinary(dest, binary); err != nil {
		return "", fmt.Errorf("cannot write fpcalc binary: %w", err)
	}
	fmt.Fprintf(os.Stderr, "fpcalc installed to %s\n", dest)
	return dest, nil
}

// installFpcalcBinary writes binary to a temporary file in dest's directory,
// marks it executable and renames it to dest. The temporary file is removed
// when any step fails.
func installFpcalcBinary(dest string, binary []byte) (err error) {
	tmp, err := os.CreateTemp(filepath.Dir(dest), filepath.Base(dest)+".tmp-*")
	if err != nil {
		return err
	}
	tmpName := tmp.Name()
	defer func() {
		if err != nil {
			_ = os.Remove(tmpName)
		}
	}()

	if _, err = tmp.Write(binary); err != nil {
		_ = tmp.Close()
		return err
	}
	if err = tmp.Close(); err != nil {
		return err
	}
	// CreateTemp makes the file owner-only; the binary has to be executable.
	if err = os.Chmod(tmpName, 0o755); err != nil {
		return err
	}
	return os.Rename(tmpName, dest)
}
// extractFromTarGz returns the contents of the first regular file named target
// (matched on base name, whatever directory it sits in) inside the gzipped tar
// archive held in data. At most maxFpcalcArchiveSize bytes of the entry are
// read. It fails when data is not gzip, when the tar stream cannot be read, or
// when no such entry exists.
func extractFromTarGz(data []byte, target string) ([]byte, error) {
	zr, err := gzip.NewReader(strings.NewReader(string(data)))
	if err != nil {
		return nil, fmt.Errorf("cannot open downloaded archive: %w", err)
	}
	defer zr.Close()

	archive := tar.NewReader(zr)
	for {
		entry, nextErr := archive.Next()
		switch {
		case nextErr == io.EOF:
			// Walked the whole archive without a match.
			return nil, fmt.Errorf("%s not found in downloaded archive", target)
		case nextErr != nil:
			return nil, fmt.Errorf("cannot read archive: %w", nextErr)
		}
		if entry.Typeflag != tar.TypeReg || filepath.Base(entry.Name) != target {
			continue
		}
		return io.ReadAll(io.LimitReader(archive, maxFpcalcArchiveSize))
	}
}