feat(library): detect corrupt/incomplete files during scan

ffprobe already runs on every scanned file; now we capture its stderr and
assess integrity from it. assessIntegrity flags a file "damaged" on the
markers that mean the container/bitstream is unusable: invalid_data,
ebml_corrupt, moov_missing, bitstream_corrupt, plus no_duration (a video
stream with non-positive duration = a truncated/incomplete download).

The verdict rides on MediaInfo.Integrity (IntegrityInfo{Damaged,Reason}),
maps onto LibrarySyncItem.{Integrity,IntegrityReason}, and syncs to the web
so a damaged file can be surfaced at rest instead of only blowing up at
playback.

Bumps the scan cache version (1 → 2) so existing entries re-probe once, and
the scanner always re-probes any cached entry already flagged as damaged, since
a re-download to the same path can land with an identical size+mtime.
This commit is contained in:
Deivid Soto 2026-06-02 19:42:00 +02:00
parent c86e50245e
commit f0ac905fdb
7 changed files with 122 additions and 4 deletions

View file

@ -84,7 +84,55 @@ func ExtractMediaInfo(ctx context.Context, ffprobePath, filePath string) (*Media
return nil, fmt.Errorf("ffprobe JSON parse failed: %w", err)
}
return parseFFprobeOutput(data)
mi, perr := parseFFprobeOutput(data)
if perr != nil {
return nil, perr
}
// A corrupt-but-parseable file (e.g. a half-downloaded MKV) returns valid
// stream JSON and a zero exit, yet ffprobe still logs structural errors to
// stderr (captured above). Flag it so the library can warn instead of
// silently shipping a file that won't play.
if integ := assessIntegrity(stderr.String(), mi); integ != nil {
mi.Integrity = integ
}
return mi, nil
}
// corruptionMarkers is the table of ffprobe stderr fragments treated as
// high-confidence evidence of a structurally damaged or incompletely
// downloaded file. Each fragment (sub) is stored lowercased and is paired with
// a STABLE reason code (code) that the web maps to localized copy.
//
// The list is intentionally conservative so healthy files are never flagged:
// every entry is expected to appear only on real container/bitstream damage,
// not on benign warnings (ffprobe runs at -v error). Entry order matters to
// assessIntegrity, which reports the first marker that matches.
var corruptionMarkers = []struct{ sub, code string }{
	{sub: "invalid data found when processing input", code: "invalid_data"},
	// Truncated/corrupt MKV.
	{sub: "as first byte of an ebml number", code: "ebml_corrupt"},
	// Truncated MP4.
	{sub: "moov atom not found", code: "moov_missing"},
	{sub: "invalid nal unit size", code: "bitstream_corrupt"},
	{sub: "non-existing pps", code: "bitstream_corrupt"},
	// NOTE: two messages are deliberately NOT matched because both
	// false-positive on good files, whereas the markers above are structural:
	//   - "error reading header": ffprobe emits it on transient NFS/network
	//     read hiccups, and a genuinely unreadable header also exits non-zero
	//     → ScanError → item skipped.
	//   - "truncating packet": printed for healthy MKV/TS with oversized
	//     subtitle/PGS packets.
}
// assessIntegrity decides whether a probed file looks damaged, using ffprobe's
// captured stderr together with the parsed media info.
//
// It returns a verdict with Damaged set and a stable Reason code (the web
// localizes it) when either:
//   - the lowercased stderr contains one of the corruptionMarkers substrings,
//     in which case Reason is the code of the first marker that matches; or
//   - mi carries a video stream whose duration is not positive, in which case
//     Reason is "no_duration".
//
// It returns nil when neither signal is present. A nil mi is accepted and only
// disables the duration check.
func assessIntegrity(stderr string, mi *MediaInfo) *IntegrityInfo {
	damaged := func(reason string) *IntegrityInfo {
		return &IntegrityInfo{Damaged: true, Reason: reason}
	}

	// Markers are stored lowercased, so normalize the log once up front.
	haystack := strings.ToLower(stderr)
	for i := range corruptionMarkers {
		if strings.Contains(haystack, corruptionMarkers[i].sub) {
			return damaged(corruptionMarkers[i].code)
		}
	}

	// Audio-only items legitimately omit a duration, so the duration check
	// only applies when a video stream is present.
	if mi == nil || mi.Video == nil {
		return nil
	}

	// A file with a video stream but no determinable duration is almost
	// always truncated (the moov/cues holding duration sit at the end of the
	// file).
	if mi.Video.Duration <= 0 {
		return damaged("no_duration")
	}
	return nil
}
// parseFFprobeOutput converts parsed ffprobe JSON into MediaInfo.

View file

@ -428,3 +428,42 @@ func TestParseFFprobeOutput_FrameRateNoSlash(t *testing.T) {
t.Errorf("frameRate = %v, want 0 (no slash)", mi.Video.FrameRate)
}
}
// TestAssessIntegrity drives assessIntegrity through a table of ffprobe stderr
// samples and media-info shapes, checking which ones yield a damaged verdict
// and with which reason code.
func TestAssessIntegrity(t *testing.T) {
	okVideo := &MediaInfo{Video: &VideoInfo{Codec: "h264", Width: 1920, Height: 1080, Duration: 5477}}
	zeroDurationVideo := &MediaInfo{Video: &VideoInfo{Codec: "h264", Width: 1920, Height: 1080, Duration: 0}}
	audioOnly := &MediaInfo{Audio: []AudioTrack{{Lang: "en", Codec: "aac"}}}

	// The real "In the Grey" case: ffprobe exits 0 but logs EBML errors.
	ebmlLog := "[matroska,webm @ 0x60e7] 0x00 at pos 2144995 invalid as first byte of an EBML number\n"

	cases := []struct {
		stderr        string
		info          *MediaInfo
		wantReason    string // empty means a nil verdict is expected
		mustBeDamaged bool   // additionally require Damaged to be set
		failFmt       string
	}{
		// Healthy file with no stderr → nil (not damaged).
		{"", okVideo, "", false, "healthy file flagged damaged: %+v"},
		// MKV EBML corruption → damaged with the ebml_corrupt code.
		{ebmlLog, okVideo, "ebml_corrupt", true, "EBML corruption not flagged correctly: %+v"},
		// Truncated MP4.
		{"moov atom not found\n", okVideo, "moov_missing", false, "moov-missing not flagged: %+v"},
		// Invalid data.
		{"Invalid data found when processing input\n", okVideo, "invalid_data", false, "invalid-data not flagged: %+v"},
		// No duration on a video stream → truncated.
		{"", zeroDurationVideo, "no_duration", false, "no-duration not flagged: %+v"},
		// Audio-only file with no duration is NOT flagged (legitimately omits it).
		{"", audioOnly, "", false, "audio-only file wrongly flagged: %+v"},
	}

	for _, tc := range cases {
		verdict := assessIntegrity(tc.stderr, tc.info)

		var failed bool
		switch {
		case tc.wantReason == "":
			failed = verdict != nil
		case verdict == nil || verdict.Reason != tc.wantReason:
			failed = true
		default:
			failed = tc.mustBeDamaged && !verdict.Damaged
		}

		if failed {
			t.Errorf(tc.failFmt, verdict)
		}
	}
}

View file

@ -6,6 +6,19 @@ type MediaInfo struct {
Audio []AudioTrack `json:"audio"`
Subtitles []SubtitleTrack `json:"subtitles"`
Languages []string `json:"languages"` // derived from audio tracks
// Integrity is non-nil only when the scan found signs of corruption / an
// incomplete download. Surfaced in the web library as a "damaged" warning
// so the user re-downloads instead of hitting a file that won't play.
Integrity *IntegrityInfo `json:"integrity,omitempty"`
}
// IntegrityInfo flags a file whose metadata probed OK enough to land in the
// library but that shows structural damage (ffprobe emitted EBML / "invalid
// data" errors, a missing moov atom, a corrupt bitstream, or a video stream
// with no positive duration) — the hallmark of an incomplete or corrupt
// download.
type IntegrityInfo struct {
// Damaged reports that a corruption signal was found. assessIntegrity sets
// it to true on every verdict it returns.
Damaged bool `json:"damaged"`
// Reason is a stable code identifying the signal, e.g. "invalid_data",
// "ebml_corrupt", "moov_missing", "bitstream_corrupt" or "no_duration".
// It is omitted from the JSON when empty.
Reason string `json:"reason,omitempty"`
}
// VideoInfo represents the primary video stream metadata.

View file

@ -145,11 +145,16 @@ func scanSingleFile(ctx context.Context, ffprobePath, filePath string, cacheIdx
// Parse season/episode
item.Season, item.Episode = ParseSeasonEpisode(item.FileName)
// Incremental: skip if file hasn't changed
// Incremental: skip if file hasn't changed. EXCEPT a previously-damaged
// file is always re-probed — a re-download to the same path can land with
// an identical size+mtime (some torrent clients preserve the torrent's
// mtime), so trusting the cached "damaged" verdict would pin a now-healthy
// file as broken forever. Re-probing damaged items is cheap (they're few).
if incremental && existing != nil {
if idx, ok := cacheIdx[filePath]; ok {
cached := existing.Items[idx]
if cached.FileSize == item.FileSize && cached.ModTime == item.ModTime && cached.MediaInfo != nil {
if cached.FileSize == item.FileSize && cached.ModTime == item.ModTime &&
cached.MediaInfo != nil && cached.MediaInfo.Integrity == nil {
item.MediaInfo = cached.MediaInfo
return item
}

View file

@ -36,6 +36,10 @@ func BuildSyncItems(cache *LibraryCache) []agent.LibrarySyncItem {
si.AudioTracks = item.MediaInfo.Audio
si.SubtitleTracks = item.MediaInfo.Subtitles
si.VideoInfo = item.MediaInfo.Video
if integ := item.MediaInfo.Integrity; integ != nil && integ.Damaged {
si.Integrity = "damaged"
si.IntegrityReason = integ.Reason
}
}
items = append(items, si)

View file

@ -26,4 +26,8 @@ type LibraryCache struct {
Items []LibraryItem `json:"items"`
}
const cacheVersion = 1
// cacheVersion is the scan cache version. Bump it whenever the scan logic
// changes in a way that should re-probe an existing library on next scan
// (incremental reuse keys off mtime+size, so a pure logic change is invisible
// without this).
//
// v2: file-integrity detection (ffprobe corruption / incomplete-download flag).
const cacheVersion = 2