Close the recurring "video has subtitles but the web player shows none" gap with a source-agnostic pipeline: - Discover EXTERNAL sidecar subs in the scan (Video.es.ass siblings + a Subs/ bundle), parse lang/forced/SDH from the filename, skip VobSub (.sub+.idx). ffprobe-only scanning ignored these (ToonsHub/anime "MSubs" releases). - Transcode sidecar charset -> UTF-8 before WebVTT (BOM/UTF-16/code-page by language). Chinese SCRIPT matters: chs/sc -> GBK, cht/tc/big5 -> Big5 (decoding one as the other is mojibake). - /sub now serves a standalone sidecar file (i=-1, p=file, &l=lang hint) and a remote debrid URL (ffmpeg reads http, no local stat) — not just embedded streams of a local file. - probe.json emits a tokened vttUrl per TEXT track so torrent/debrid HLS streams (never library-scanned) get subtitles too. Embedded index is counted among embedded streams only, so -map 0:s:N stays aligned when sidecars are appended. Tested against a real 347-file gallery: 26/26 sidecars and embedded ass/srt/ mov_text all extract to valid WebVTT; bitmap (pgs/dvd_subtitle) correctly stays burn-in. Manual harness gated behind GALLERY_DIR.
207 lines
8.2 KiB
Go
207 lines
8.2 KiB
Go
package mediainfo
|
|
|
|
import (
|
|
"os"
|
|
"path/filepath"
|
|
"strings"
|
|
)
|
|
|
|
// External (sidecar) subtitle discovery.
|
|
//
|
|
// A huge share of torrents — anime fansubs especially — ship subtitles as
|
|
// SEPARATE files, not embedded streams: a `.srt`/`.ass` named after the video,
|
|
// or a bundle inside a `Subs/` (or `Subtitles/`) subfolder. ffprobe on the video
|
|
// container never sees these, so the scan recorded zero subtitles for them
|
|
// (e.g. ToonsHub "MSubs" releases). This module finds those files so they become
|
|
// real, selectable tracks served via the /sub endpoint (path-based, i=-1).
|
|
//
|
|
// Only TEXT formats are surfaced (srt/ass/ssa/vtt, and a lone .sub). VobSub
|
|
// (.idx + .sub) is bitmap — no text form — so it's skipped here; bitmap subs are
|
|
// burn-in only and external bitmap burn-in isn't wired.
|
|
|
|
// subFolderNames is the set of directory names that conventionally hold a
// release's subtitle bundle. Keys are lower-case; the directory name is
// lower-cased before the lookup, so the match is case-insensitive. Every
// subtitle file inside such a folder is attributed to the media file that sits
// next to the folder.
var subFolderNames = map[string]bool{
	"sub":       true,
	"subs":      true,
	"subtitle":  true,
	"subtitles": true,
}
|
|
|
|
// sidecarSubExts maps a (lower-case, dot-prefixed) subtitle file extension to
// the ffmpeg-style codec name recorded on the track. An extension that is not a
// key here is not treated as a sidecar subtitle at all. The codec name drives
// the web's text-vs-bitmap classification (isTextSubtitleCodec).
var sidecarSubExts = map[string]string{
	".ass": "ass",
	".ssa": "ssa",
	".srt": "subrip",
	".vtt": "webvtt",

	// A lone .sub is assumed to be a text format (MicroDVD-style) and is
	// reported as "subrip". A .sub that has a same-named .idx next to it is
	// VobSub (bitmap) and is filtered out by the directory scan instead.
	".sub": "subrip",
}
|
|
|
|
// Filename markers (lower-case tokens) that refine a sidecar's role.
var (
	// forcedTokens marks a track as forced.
	forcedTokens = map[string]bool{
		"forced":  true,
		"forzado": true,
		"forces":  true,
	}

	// sdhTokens marks a track as SDH / closed captions. "hi" is listed with an
	// explicit false: it is also the ISO code for Hindi, so it must NOT be
	// treated as an SDH marker.
	sdhTokens = map[string]bool{
		"sdh": true,
		"cc":  true,
		"hi":  false,
	}
)
|
|
|
|
// sidecarLangAliases maps release-naming subtitle tokens (fansub/scene
// shorthand that the ISO 639-1/2 normaliser does not cover) to a language hint.
// Keys are lower-case tokens as they appear in sidecar filenames.
//
// Why this exists on top of NormalizeLang:
//   - Chinese SCRIPT matters for charset selection: Simplified (chs/sc/gb) is
//     GBK while Traditional (cht/tc/big5) is Big5, and decoding one as the other
//     yields mojibake. The script is therefore kept in the hint ("zh" vs
//     "zh-Hant") so legacyEncodingForLang can pick the right code page. Anime
//     fansubs routinely ship both.
//   - Tokens such as lat/latino/vostfr are not ISO codes at all and would
//     otherwise end up as "und".
//
// The table is applied ONLY to sidecar filenames, never to ffprobe metadata, so
// it cannot clash with the global langNormalize (where "lat" means Latin).
// Plain ISO codes (eng/spa/…) are intentionally left to NormalizeLang.
var sidecarLangAliases = map[string]string{
	// Simplified Chinese → GBK.
	"chs":  "zh",
	"sc":   "zh",
	"gb":   "zh",
	"gbk":  "zh",
	"hans": "zh",

	// Traditional Chinese → Big5.
	"cht":  "zh-Hant",
	"tc":   "zh-Hant",
	"big5": "zh-Hant",
	"hant": "zh-Hant",

	// Spanish, including Latin-American release tags.
	"lat":     "es",
	"latino":  "es",
	"esp":     "es",
	"español": "es",
	"espanol": "es",

	// French release tags.
	"vostfr": "fr",
	"vff":    "fr",
	"vf":     "fr",

	// Brazilian Portuguese.
	"ptbr":  "pt",
	"pt-br": "pt",
	"bra":   "pt",
}
|
|
|
|
// DiscoverSidecarSubtitles finds external subtitle files for a local media file:
|
|
// siblings named after the video, plus everything in a Subs/Subtitles subfolder.
|
|
// Returns text tracks only, each with External=true and an absolute Path. Safe on
|
|
// any path — returns nil if the directory can't be read (best-effort, like the
|
|
// rest of the scan). Never call for a remote URL source (no local directory).
|
|
//
|
|
// NOTE: discovered sidecars are NOT deduped against embedded streams of the same
|
|
// language. That's deliberate — a `Movie.en.srt` next to a video that also has an
|
|
// embedded English stream is usually a DIFFERENT track (full vs SDH, retimed, or
|
|
// a better translation), so silently dropping either would hide a choice the user
|
|
// may want. Both surface as separate, distinctly-labelled entries.
|
|
func DiscoverSidecarSubtitles(mediaPath string) []SubtitleTrack {
|
|
if mediaPath == "" || strings.Contains(mediaPath, "://") {
|
|
return nil
|
|
}
|
|
dir := filepath.Dir(mediaPath)
|
|
videoBase := strings.TrimSuffix(filepath.Base(mediaPath), filepath.Ext(mediaPath))
|
|
videoBaseLower := strings.ToLower(videoBase)
|
|
|
|
var out []SubtitleTrack
|
|
seen := make(map[string]bool) // absolute path dedupe
|
|
|
|
// 1. Siblings in the media's own directory whose name starts with the video
|
|
// base name: "Movie.srt", "Movie.en.srt", "Movie.en.forced.ass", …
|
|
addFromDir(dir, func(name string) bool {
|
|
return strings.HasPrefix(strings.ToLower(name), videoBaseLower)
|
|
}, videoBase, &out, seen)
|
|
|
|
// 2. A Subs/Subtitles subfolder: take EVERY subtitle file (the whole folder
|
|
// belongs to this release). Filenames there are usually language-named
|
|
// ("2_English.srt", "spa.ass") with no video-base prefix.
|
|
if entries, err := os.ReadDir(dir); err == nil {
|
|
for _, e := range entries {
|
|
if e.IsDir() && subFolderNames[strings.ToLower(e.Name())] {
|
|
addFromDir(filepath.Join(dir, e.Name()), func(string) bool { return true }, "", &out, seen)
|
|
}
|
|
}
|
|
}
|
|
return out
|
|
}
|
|
|
|
// addFromDir scans one directory, emitting a SubtitleTrack for each text sidecar
|
|
// whose name passes `match`. stripPrefix (the video base, may be "") is removed
|
|
// before parsing language/role tokens so "Movie.en.forced.srt" parses as "en"+forced.
|
|
func addFromDir(dir string, match func(name string) bool, stripPrefix string, out *[]SubtitleTrack, seen map[string]bool) {
|
|
entries, err := os.ReadDir(dir)
|
|
if err != nil {
|
|
return
|
|
}
|
|
// Pre-index .idx files so a paired .sub is recognised as VobSub (bitmap) and skipped.
|
|
idxBases := make(map[string]bool)
|
|
for _, e := range entries {
|
|
if !e.IsDir() && strings.EqualFold(filepath.Ext(e.Name()), ".idx") {
|
|
idxBases[strings.ToLower(strings.TrimSuffix(e.Name(), filepath.Ext(e.Name())))] = true
|
|
}
|
|
}
|
|
for _, e := range entries {
|
|
if e.IsDir() {
|
|
continue
|
|
}
|
|
name := e.Name()
|
|
ext := strings.ToLower(filepath.Ext(name))
|
|
codec, ok := sidecarSubExts[ext]
|
|
if !ok || !match(name) {
|
|
continue
|
|
}
|
|
// VobSub: a .sub paired with a same-named .idx is bitmap, not text. Skip.
|
|
if ext == ".sub" && idxBases[strings.ToLower(strings.TrimSuffix(name, ext))] {
|
|
continue
|
|
}
|
|
abs := filepath.Join(dir, name)
|
|
if seen[abs] {
|
|
continue
|
|
}
|
|
seen[abs] = true
|
|
|
|
lang, forced, title := parseSidecarName(name, ext, stripPrefix)
|
|
*out = append(*out, SubtitleTrack{
|
|
Lang: lang,
|
|
Codec: codec,
|
|
Title: title,
|
|
Forced: forced,
|
|
External: true,
|
|
Path: abs,
|
|
})
|
|
}
|
|
}
|
|
|
|
// parseSidecarName extracts (lang, forced, title) from a subtitle filename.
|
|
// stripPrefix (the video base) is removed first; the remainder is tokenised on
|
|
// common separators and scanned for a language code + role markers. Unknown →
|
|
// lang "und". The title is a human hint ("Forced", "SDH") or "".
|
|
func parseSidecarName(name, ext, stripPrefix string) (lang string, forced bool, title string) {
|
|
stem := strings.TrimSuffix(name, filepath.Ext(name))
|
|
if stripPrefix != "" && len(stem) >= len(stripPrefix) &&
|
|
strings.EqualFold(stem[:len(stripPrefix)], stripPrefix) {
|
|
stem = stem[len(stripPrefix):]
|
|
}
|
|
lang = "und"
|
|
var roles []string
|
|
for _, tok := range strings.FieldsFunc(stem, func(r rune) bool {
|
|
return r == '.' || r == '_' || r == '-' || r == ' ' || r == '[' || r == ']' || r == '(' || r == ')'
|
|
}) {
|
|
low := strings.ToLower(strings.TrimSpace(tok))
|
|
if low == "" {
|
|
continue
|
|
}
|
|
if forcedTokens[low] {
|
|
forced = true
|
|
roles = append(roles, "Forced")
|
|
continue
|
|
}
|
|
if v, isSDH := sdhTokens[low]; isSDH && v {
|
|
roles = append(roles, "SDH")
|
|
continue
|
|
}
|
|
// First token that maps to a real language wins. Try release-naming
|
|
// aliases (chs/lat/…) first, then the standard ISO normaliser. NormalizeLang
|
|
// echoes unknown input back lowercased, so accept only a mapped result
|
|
// (different from the raw token, or already a known 2-letter code).
|
|
if lang == "und" {
|
|
if alias, ok := sidecarLangAliases[low]; ok {
|
|
lang = alias
|
|
continue
|
|
}
|
|
if norm := NormalizeLang(low); norm != "und" && (norm != low || len(low) == 2) && isKnownLang(norm) {
|
|
lang = norm
|
|
continue
|
|
}
|
|
}
|
|
}
|
|
title = strings.Join(roles, " ")
|
|
return lang, forced, title
|
|
}
|
|
|
|
// isKnownLang reports whether code is a value present in langNormalize (i.e. a
|
|
// real ISO 639-1 we recognise) — guards against treating a random filename token
|
|
// ("web", "dl") as a language.
|
|
func isKnownLang(code string) bool {
|
|
for _, v := range langNormalize {
|
|
if v == code {
|
|
return true
|
|
}
|
|
}
|
|
return false
|
|
}
|