unarr/internal/library/mediainfo/gallery_real_test.go
Deivid Soto d708ea2360 feat(subs): resilient subtitle extraction — sidecars, charset, torrent/debrid
Close the recurring "video has subtitles but the web player shows none" gap
with a source-agnostic pipeline:

- Discover EXTERNAL sidecar subs in the scan (Video.es.ass siblings + a Subs/
  bundle), parse lang/forced/SDH from the filename, skip VobSub (.sub+.idx).
  ffprobe-only scanning ignored these (ToonsHub/anime "MSubs" releases).
- Transcode sidecar charset -> UTF-8 before WebVTT (BOM/UTF-16/code-page by
  language). Chinese SCRIPT matters: chs/sc -> GBK, cht/tc/big5 -> Big5
  (decoding one as the other is mojibake).
- /sub now serves a standalone sidecar file (i=-1, p=file, &l=lang hint) and a
  remote debrid URL (ffmpeg reads http, no local stat) — not just embedded
  streams of a local file.
- probe.json emits a tokened vttUrl per TEXT track so torrent/debrid HLS streams
  (never library-scanned) get subtitles too. Embedded index is counted among
  embedded streams only, so -map 0:s:N stays aligned when sidecars are appended.

Tested against a real 347-file gallery: 26/26 sidecars and embedded ass/srt/
mov_text all extract to valid WebVTT; bitmap (pgs/dvd_subtitle) correctly stays
burn-in. Manual harness gated behind GALLERY_DIR.
2026-06-08 13:04:09 +02:00

206 lines
6.2 KiB
Go

package mediainfo
import (
"context"
"os"
"path/filepath"
"sort"
"strings"
"testing"
"time"
)
// TestGalleryReal is a manual end-to-end harness against a REAL media library.
// It is skipped unless GALLERY_DIR is set, so it stays inert wherever that
// variable is absent (e.g. CI). FFPROBE and FFMPEG optionally override the
// binaries and default to "ffprobe" and "ffmpeg".
//
//	GALLERY_DIR=/mnt/nas/peliculas go test ./internal/library/mediainfo/ \
//	    -run TestGalleryReal -v -timeout 30m
//
// It surveys every video file, classifying each subtitle track reported by
// ExtractMediaInfo as external sidecar, embedded text, or embedded non-text
// (logged as bitmap). It then extracts WebVTT for one representative embedded
// text track and one representative sidecar, and checks that each output is
// non-empty and starts with "WEBVTT".
func TestGalleryReal(t *testing.T) {
	dir := os.Getenv("GALLERY_DIR")
	if dir == "" {
		t.Skip("set GALLERY_DIR to run the real-gallery survey")
	}
	ffprobe := envOr("FFPROBE", "ffprobe")
	ffmpeg := envOr("FFMPEG", "ffmpeg")

	var videos []string
	walkErr := filepath.WalkDir(dir, func(p string, d os.DirEntry, err error) error {
		if err != nil {
			// An unreadable root means nothing was surveyed at all: surface it
			// instead of reporting a vacuous pass. Errors deeper in the tree
			// are skipped on purpose (best-effort survey).
			if p == dir {
				return err
			}
			return nil
		}
		if d.IsDir() {
			return nil
		}
		if strings.Contains(p, "/.unarr/") || strings.Contains(p, "/.Trash") || strings.Contains(p, "/@eaDir/") {
			return nil
		}
		if videoOf(strings.ToLower(filepath.Ext(p))) {
			videos = append(videos, p)
		}
		return nil
	})
	if walkErr != nil {
		t.Fatalf("cannot walk GALLERY_DIR %s: %v", dir, walkErr)
	}
	sort.Strings(videos)
	t.Logf("found %d video files under %s", len(videos), dir)

	type cat struct {
		embTextCodecs  map[string]int // codec → number of embedded text tracks
		embBitmap      map[string]int // codec → number of embedded non-text tracks
		extCodecs      map[string]int // codec → number of external sidecar tracks
		filesEmbText   []string
		filesEmbBitmap []string
		filesExt       []string
		errs           int
	}
	c := cat{embTextCodecs: map[string]int{}, embBitmap: map[string]int{}, extCodecs: map[string]int{}}

	// Representatives for the extraction checks, captured during the survey so
	// no file has to be probed a second time.
	embIdx := -1                // embedded index of the first text track in filesEmbText[0]
	var extPath, extLang string // first sidecar of filesExt[0]

	for _, v := range videos {
		ctx, cancel := context.WithTimeout(context.Background(), 60*time.Second)
		mi, err := ExtractMediaInfo(ctx, ffprobe, v)
		cancel()
		if err != nil {
			c.errs++
			t.Logf("PROBE ERR %s: %v", filepath.Base(v), err)
			continue
		}

		var sawEmbText, sawEmbBitmap, sawExt bool
		// embPos counts embedded tracks only (sidecars excluded).
		// NOTE(review): assumes mi.Subtitles lists embedded tracks in stream
		// order, so embPos matches the index ExtractSubtitleVTT expects —
		// confirm against ExtractMediaInfo.
		embPos := 0
		firstText := -1
		for _, s := range mi.Subtitles {
			codec := strings.ToLower(s.Codec)
			switch {
			case s.External:
				c.extCodecs[codec]++
				if !sawExt && len(c.filesExt) == 0 {
					extPath, extLang = s.Path, s.Lang
				}
				sawExt = true
			case IsTextSubtitleCodec(codec):
				c.embTextCodecs[codec]++
				if firstText < 0 {
					firstText = embPos
				}
				sawEmbText = true
				embPos++
			default:
				c.embBitmap[codec]++
				sawEmbBitmap = true
				embPos++
			}
		}
		if sawEmbText {
			if len(c.filesEmbText) == 0 {
				embIdx = firstText
			}
			c.filesEmbText = append(c.filesEmbText, v)
		}
		if sawEmbBitmap {
			c.filesEmbBitmap = append(c.filesEmbBitmap, v)
		}
		if sawExt {
			c.filesExt = append(c.filesExt, v)
		}
	}

	t.Logf("=== CENSUS ===")
	t.Logf("probe errors: %d", c.errs)
	// NOTE(review): the label below says "files w/ track", but the map values
	// are incremented once per track, not once per file.
	t.Logf("embedded TEXT codecs (files w/ track): %v", c.embTextCodecs)
	t.Logf("embedded BITMAP codecs (burn-in only): %v", c.embBitmap)
	t.Logf("external SIDECAR codecs: %v", c.extCodecs)
	t.Logf("files w/ embedded text: %d | w/ embedded bitmap: %d | w/ external sidecar: %d",
		len(c.filesEmbText), len(c.filesEmbBitmap), len(c.filesExt))

	// --- Real extraction checks ---
	validVTT := func(b []byte) bool {
		return len(b) > 0 && strings.HasPrefix(strings.TrimSpace(string(b)), "WEBVTT")
	}

	// Embedded text: extract the first TEXT track of the first such file.
	// Index 0 is not necessarily text when a non-text track precedes it, so
	// the position recorded during the survey is used instead.
	if len(c.filesEmbText) > 0 {
		f := c.filesEmbText[0]
		ctx, cancel := context.WithTimeout(context.Background(), 90*time.Second)
		out, err := ExtractSubtitleVTT(ctx, ffmpeg, f, embIdx)
		cancel()
		if err != nil || !validVTT(out) {
			t.Errorf("EMBEDDED extract FAILED for %s (index %d): err=%v len=%d", filepath.Base(f), embIdx, err, len(out))
		} else {
			t.Logf("EMBEDDED extract OK: %s → %d bytes WebVTT", filepath.Base(f), len(out))
		}
	}

	// External sidecar: extract the one captured during the survey via the
	// path-addressed function.
	if len(c.filesExt) > 0 {
		ctx, cancel := context.WithTimeout(context.Background(), 60*time.Second)
		out, err := ExtractExternalSubtitleVTT(ctx, ffmpeg, extPath, extLang)
		cancel()
		if err != nil || !validVTT(out) {
			t.Errorf("EXTERNAL extract FAILED for %s: err=%v len=%d", filepath.Base(extPath), err, len(out))
		} else {
			t.Logf("EXTERNAL extract OK: %s (lang=%s) → %d bytes WebVTT", filepath.Base(extPath), extLang, len(out))
		}
	}
}
// envOr returns the value of environment variable k, or def when k is unset
// or set to the empty string.
func envOr(k, def string) string {
	value := os.Getenv(k)
	if value == "" {
		return def
	}
	return value
}
// TestGalleryExtractAllSidecars extracts EVERY sidecar that
// DiscoverSidecarSubtitles reports for the videos under GALLERY_DIR and fails
// for each one whose output is not a non-empty document starting with
// "WEBVTT". Skipped unless GALLERY_DIR is set; FFMPEG optionally overrides the
// ffmpeg binary.
func TestGalleryExtractAllSidecars(t *testing.T) {
	dir := os.Getenv("GALLERY_DIR")
	if dir == "" {
		t.Skip("set GALLERY_DIR")
	}
	ffmpeg := envOr("FFMPEG", "ffmpeg")

	var subs []SubtitleTrack
	walkErr := filepath.WalkDir(dir, func(p string, d os.DirEntry, err error) error {
		if err != nil {
			// A failure on the root itself means nothing was scanned; report
			// it rather than ending with "all 0 extractions" as a pass.
			// Errors deeper in the tree are skipped on purpose.
			if p == dir {
				return err
			}
			return nil
		}
		if d.IsDir() || strings.Contains(p, "/.unarr/") || strings.Contains(p, "/.Trash") || strings.Contains(p, "/@eaDir/") {
			return nil
		}
		if videoOf(strings.ToLower(filepath.Ext(p))) {
			subs = append(subs, DiscoverSidecarSubtitles(p)...)
		}
		return nil
	})
	if walkErr != nil {
		t.Fatalf("cannot walk GALLERY_DIR %s: %v", dir, walkErr)
	}

	// Dedupe by path: the same sidecar can be reported for more than one video.
	seen := map[string]bool{}
	var uniq []SubtitleTrack
	for _, s := range subs {
		if seen[s.Path] {
			continue
		}
		seen[s.Path] = true
		uniq = append(uniq, s)
	}
	t.Logf("discovered %d unique sidecar subtitle files", len(uniq))

	fails := 0
	for _, s := range uniq {
		// Each extraction gets its own 60s budget, released as soon as it ends.
		ctx, cancel := context.WithTimeout(context.Background(), 60*time.Second)
		out, err := ExtractExternalSubtitleVTT(ctx, ffmpeg, s.Path, s.Lang)
		cancel()
		ok := len(out) > 0 && strings.HasPrefix(strings.TrimSpace(string(out)), "WEBVTT")
		if err != nil || !ok {
			fails++
			t.Errorf("FAIL %s (lang=%s codec=%s): err=%v len=%d", filepath.Base(s.Path), s.Lang, s.Codec, err, len(out))
		} else {
			t.Logf("OK %s (lang=%s codec=%s) → %d bytes", filepath.Base(s.Path), s.Lang, s.Codec, len(out))
		}
	}
	if fails > 0 {
		t.Errorf("%d/%d sidecar extractions failed", fails, len(uniq))
	} else {
		t.Logf("all %d sidecar extractions produced valid WebVTT", len(uniq))
	}
}
// videoOf reports whether ext is one of the video file extensions the gallery
// harness scans. The comparison is exact: ext must be lower-case and include
// the leading dot.
func videoOf(ext string) bool {
	for _, known := range [...]string{".mkv", ".mp4", ".avi", ".m4v", ".webm", ".mov", ".ts"} {
		if ext == known {
			return true
		}
	}
	return false
}