unarr/internal/library/mediainfo/sidecar_subs.go
Deivid Soto d708ea2360 feat(subs): resilient subtitle extraction — sidecars, charset, torrent/debrid
Close the recurring "video has subtitles but the web player shows none" gap
with a source-agnostic pipeline:

- Discover EXTERNAL sidecar subs in the scan (Video.es.ass siblings + a Subs/
  bundle), parse lang/forced/SDH from the filename, skip VobSub (.sub+.idx).
  ffprobe-only scanning ignored these (ToonsHub/anime "MSubs" releases).
- Transcode sidecar charset -> UTF-8 before WebVTT (BOM/UTF-16/code-page by
  language). Chinese SCRIPT matters: chs/sc -> GBK, cht/tc/big5 -> Big5
  (decoding one as the other is mojibake).
- /sub now serves a standalone sidecar file (i=-1, p=file, &l=lang hint) and a
  remote debrid URL (ffmpeg reads http, no local stat) — not just embedded
  streams of a local file.
- probe.json emits a tokened vttUrl per TEXT track so torrent/debrid HLS streams
  (never library-scanned) get subtitles too. Embedded index is counted among
  embedded streams only, so -map 0:s:N stays aligned when sidecars are appended.

Tested against a real 347-file gallery: 26/26 sidecars and embedded ass/srt/
mov_text all extract to valid WebVTT; bitmap (pgs/dvd_subtitle) correctly stays
burn-in. Manual harness gated behind GALLERY_DIR.
2026-06-08 13:04:09 +02:00

207 lines
8.2 KiB
Go

package mediainfo
import (
"os"
"path/filepath"
"strings"
)
// External (sidecar) subtitle discovery.
//
// A huge share of torrents — anime fansubs especially — ship subtitles as
// SEPARATE files, not embedded streams: a `.srt`/`.ass` named after the video,
// or a bundle inside a `Subs/` (or `Subtitles/`) subfolder. ffprobe on the video
// container never sees these, so the scan recorded zero subtitles for them
// (e.g. ToonsHub "MSubs" releases). This module finds those files so they become
// real, selectable tracks served via the /sub endpoint (path-based, i=-1).
//
// Only TEXT formats are surfaced (srt/ass/ssa/vtt, and a lone .sub). VobSub
// (.idx + .sub) is bitmap — no text form — so it's skipped here; bitmap subs are
// burn-in only and external bitmap burn-in isn't wired.
//
// The lookup tables below classify candidate files by containing folder, file
// extension and filename role token. All keys are lowercase; callers lowercase
// their input before the lookup.
var (
	// subFolderNames is the set of subfolder names treated as a release's
	// subtitle bundle. Files inside such a folder belong to the sibling media.
	subFolderNames = map[string]bool{
		"subs":      true,
		"subtitles": true,
		"sub":       true,
		"subtitle":  true,
	}

	// sidecarSubExts maps a subtitle file extension (with leading dot) to its
	// ffmpeg-style codec name. The codec drives the web's text-vs-bitmap
	// classification (isTextSubtitleCodec).
	sidecarSubExts = map[string]string{
		".srt": "subrip",
		".ass": "ass",
		".ssa": "ssa",
		".vtt": "webvtt",
		".sub": "subrip", // MicroDVD/text, unless a same-named .idx marks it as VobSub
	}

	// forcedTokens are filename tokens that mark a sidecar as a forced track.
	forcedTokens = map[string]bool{
		"forced":  true,
		"forzado": true,
		"forces":  true,
	}

	// sdhTokens are filename tokens that mark a sidecar as SDH / closed
	// captions. "hi" is present with an explicit false because it is also the
	// code for Hindi: lookups must test the stored value, not mere presence.
	sdhTokens = map[string]bool{
		"sdh": true,
		"cc":  true,
		"hi":  false,
	}
)
// sidecarLangAliases translates release-naming subtitle tokens (fansub/scene
// shorthand that the ISO 639-1/2 normaliser does not cover) into a language
// hint. It exists alongside NormalizeLang for two reasons:
//
//   - Chinese script decides the legacy charset: Simplified (chs/sc/gb) is GBK,
//     Traditional (cht/tc/big5) is Big5, and decoding one as the other yields
//     mojibake. The hint keeps the script ("zh" vs "zh-Hant") so
//     legacyEncodingForLang can pick the right code page. Anime fansubs
//     routinely ship both.
//   - Tokens such as lat/latino/vostfr are not ISO codes at all and would
//     otherwise end up as "und".
//
// The table is consulted only for sidecar filenames, never for ffprobe
// metadata, so it cannot clash with the global langNormalize (where "lat" means
// Latin). Plain ISO codes (eng/spa/…) are deliberately left to NormalizeLang.
var sidecarLangAliases = map[string]string{
	// Simplified Chinese → GBK.
	"chs":  "zh",
	"sc":   "zh",
	"gb":   "zh",
	"gbk":  "zh",
	"hans": "zh",
	// Traditional Chinese → Big5.
	"cht":  "zh-Hant",
	"tc":   "zh-Hant",
	"big5": "zh-Hant",
	"hant": "zh-Hant",
	// Spanish, including Latin-American release tags.
	"lat":     "es",
	"latino":  "es",
	"esp":     "es",
	"español": "es",
	"espanol": "es",
	// French release tags.
	"vostfr": "fr",
	"vff":    "fr",
	"vf":     "fr",
	// Portuguese (Brazil) release tags.
	"ptbr":  "pt",
	"pt-br": "pt",
	"bra":   "pt",
}
// DiscoverSidecarSubtitles finds external subtitle files for a local media file:
// siblings named after the video, plus everything in a Subs/Subtitles subfolder.
//
// It returns text tracks only, each with External=true and Path set to the
// subtitle's directory joined with its file name (so Path is absolute whenever
// mediaPath is). It returns nil for an empty path or anything containing "://"
// (a remote URL has no local directory), and contributes nothing for a
// directory that cannot be read — discovery is best-effort, like the rest of
// the scan.
//
// A sibling must start with the video's base name (case-insensitively) AND the
// base name must end at a token boundary: the next character has to be one of
// the separators "._- []()". Without the boundary check "Episode 1.mkv" would
// also claim "Episode 10.srt", and "Movie.mkv" would claim "Movie2.en.srt".
// Names that merely continue after a separator ("Movie 2.srt" next to
// "Movie.mkv") still match; that ambiguity cannot be resolved from names alone.
//
// NOTE: discovered sidecars are NOT deduped against embedded streams of the same
// language. That's deliberate — a `Movie.en.srt` next to a video that also has an
// embedded English stream is usually a DIFFERENT track (full vs SDH, retimed, or
// a better translation), so silently dropping either would hide a choice the user
// may want. Both surface as separate, distinctly-labelled entries.
func DiscoverSidecarSubtitles(mediaPath string) []SubtitleTrack {
	if mediaPath == "" || strings.Contains(mediaPath, "://") {
		return nil
	}
	dir := filepath.Dir(mediaPath)
	videoBase := strings.TrimSuffix(filepath.Base(mediaPath), filepath.Ext(mediaPath))
	videoBaseLower := strings.ToLower(videoBase)

	var out []SubtitleTrack
	seen := make(map[string]bool) // path dedupe across both passes

	// 1. Siblings in the media's own directory: "Movie.srt", "Movie.en.srt",
	//    "Movie.en.forced.ass", …
	isSibling := func(name string) bool {
		lower := strings.ToLower(name)
		if !strings.HasPrefix(lower, videoBaseLower) {
			return false
		}
		// The video base must be a whole leading token of the file name, not
		// just a shared run of characters with a different title.
		rest := lower[len(videoBaseLower):]
		return rest == "" || strings.IndexByte("._- []()", rest[0]) >= 0
	}
	addFromDir(dir, isSibling, videoBase, &out, seen)

	// 2. A Subs/Subtitles subfolder: take EVERY subtitle file (the whole folder
	//    belongs to this release). Filenames there are usually language-named
	//    ("2_English.srt", "spa.ass") with no video-base prefix.
	entries, err := os.ReadDir(dir)
	if err != nil {
		return out
	}
	for _, e := range entries {
		if e.IsDir() && subFolderNames[strings.ToLower(e.Name())] {
			addFromDir(filepath.Join(dir, e.Name()), func(string) bool { return true }, "", &out, seen)
		}
	}
	return out
}
// addFromDir scans one directory and appends a SubtitleTrack to *out for each
// text sidecar whose file name passes match.
//
//   - dir is the directory to list; if it cannot be read nothing is appended.
//   - match receives the file name (original case) and decides whether the file
//     belongs to the media being scanned.
//   - stripPrefix (the video base, may be "") is forwarded to parseSidecarName,
//     which removes it before parsing language/role tokens so
//     "Movie.en.forced.srt" parses as "en"+forced.
//   - seen holds the paths already emitted; it is read and updated here so the
//     same file is never appended twice across calls.
//
// Extensions are matched case-insensitively. A .sub that has a same-named .idx
// next to it is VobSub (bitmap) and is skipped.
func addFromDir(dir string, match func(name string) bool, stripPrefix string, out *[]SubtitleTrack, seen map[string]bool) {
	entries, err := os.ReadDir(dir)
	if err != nil {
		return // best-effort: an unreadable directory contributes nothing
	}

	// Pre-index .idx files (by lowercased base name) so a paired .sub is
	// recognised as VobSub and skipped.
	idxBases := make(map[string]bool)
	for _, e := range entries {
		if e.IsDir() {
			continue
		}
		if rawExt := filepath.Ext(e.Name()); strings.EqualFold(rawExt, ".idx") {
			idxBases[strings.ToLower(strings.TrimSuffix(e.Name(), rawExt))] = true
		}
	}

	for _, e := range entries {
		if e.IsDir() {
			continue
		}
		name := e.Name()
		rawExt := filepath.Ext(name)
		ext := strings.ToLower(rawExt)
		codec, ok := sidecarSubExts[ext]
		if !ok || !match(name) {
			continue
		}
		// VobSub: a .sub paired with a same-named .idx is bitmap, not text.
		// Trim the extension as it appears in the name (rawExt), not the
		// lowercased copy — otherwise "Movie.SUB" keeps its suffix, never
		// matches the "movie" index key, and the bitmap file is emitted as text.
		if ext == ".sub" && idxBases[strings.ToLower(strings.TrimSuffix(name, rawExt))] {
			continue
		}
		abs := filepath.Join(dir, name)
		if seen[abs] {
			continue
		}
		seen[abs] = true

		lang, forced, title := parseSidecarName(name, ext, stripPrefix)
		*out = append(*out, SubtitleTrack{
			Lang:     lang,
			Codec:    codec,
			Title:    title,
			Forced:   forced,
			External: true,
			Path:     abs,
		})
	}
}
// parseSidecarName extracts (lang, forced, title) from a subtitle filename.
//
// The file extension is dropped, then stripPrefix (the video base, may be "")
// is removed case-insensitively from the front. The remainder is split on the
// separators "._- []()" and each lowercased token is classified:
//
//   - a forcedTokens entry sets forced and adds the "Forced" role;
//   - an sdhTokens entry whose value is true adds the "SDH" role ("hi" is stored
//     as false, so it falls through and can be read as Hindi);
//   - otherwise, while no language has been found, the token is tried as a
//     release-naming alias and then through NormalizeLang.
//
// lang is "und" when no token maps to a language. title is the role labels
// joined by a space in first-seen order ("Forced", "SDH", "Forced SDH", …) or
// "". Each label appears at most once, so a name carrying two markers for the
// same role ("Movie.en.sdh.cc.srt") yields "SDH" rather than "SDH SDH".
//
// ext is not consulted: the extension is re-derived from name so that an
// upper-case suffix is still removed.
func parseSidecarName(name, ext, stripPrefix string) (lang string, forced bool, title string) {
	stem := strings.TrimSuffix(name, filepath.Ext(name))
	if stripPrefix != "" && len(stem) >= len(stripPrefix) &&
		strings.EqualFold(stem[:len(stripPrefix)], stripPrefix) {
		stem = stem[len(stripPrefix):]
	}

	lang = "und"
	sdh := false
	var roles []string
	isSeparator := func(r rune) bool {
		return r == '.' || r == '_' || r == '-' || r == ' ' || r == '[' || r == ']' || r == '(' || r == ')'
	}
	for _, tok := range strings.FieldsFunc(stem, isSeparator) {
		low := strings.ToLower(strings.TrimSpace(tok))
		if low == "" {
			continue
		}
		if forcedTokens[low] {
			if !forced {
				roles = append(roles, "Forced")
			}
			forced = true
			continue
		}
		// The map value, not key presence, decides: "hi" is stored as false.
		if sdhTokens[low] {
			if !sdh {
				roles = append(roles, "SDH")
			}
			sdh = true
			continue
		}
		if lang != "und" {
			continue // first token that maps to a real language wins
		}
		// Release-naming aliases (chs/lat/…) take precedence over the ISO
		// normaliser.
		if alias, ok := sidecarLangAliases[low]; ok {
			lang = alias
			continue
		}
		// NormalizeLang echoes unknown input back lowercased, so accept only a
		// mapped result (different from the raw token, or already a 2-letter
		// code) that is also a language we recognise.
		if norm := NormalizeLang(low); norm != "und" && (norm != low || len(low) == 2) && isKnownLang(norm) {
			lang = norm
		}
	}
	title = strings.Join(roles, " ")
	return lang, forced, title
}
// isKnownLang reports whether code appears among the values of langNormalize,
// i.e. it is a language code the normaliser can produce. This keeps arbitrary
// filename tokens ("web", "dl") from being mistaken for a language.
func isKnownLang(code string) bool {
	known := false
	for _, target := range langNormalize {
		if target == code {
			known = true
			break
		}
	}
	return known
}