From f0ac905fdba458ef087e24f74264eb3fb3c7654a Mon Sep 17 00:00:00 2001 From: Deivid Soto Date: Tue, 2 Jun 2026 19:42:00 +0200 Subject: [PATCH] feat(library): detect corrupt/incomplete files during scan MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ffprobe already runs on every scanned file; now we capture its stderr and assess integrity from it. assessIntegrity flags a file "damaged" on the markers that mean the container/bitstream is unusable: invalid_data, ebml_corrupt, moov_missing, bitstream_corrupt, plus no_duration (a video stream with non-positive duration = a truncated/incomplete download). The verdict rides on MediaInfo.Integrity (IntegrityInfo{Damaged,Reason}), maps onto LibrarySyncItem.{Integrity,IntegrityReason}, and syncs to the web so a damaged file can be surfaced at rest instead of only blowing up at playback. Bumps the scan cache version (1 → 2) so existing entries re-probe once, and the scanner always re-probes any cached entry previously flagged as damaged (a re-download to the same path can keep an identical size+mtime). --- internal/agent/types.go | 5 +++ internal/library/mediainfo/ffprobe.go | 50 +++++++++++++++++++++- internal/library/mediainfo/ffprobe_test.go | 39 +++++++++++++++++ internal/library/mediainfo/types.go | 13 ++++++ internal/library/scanner.go | 9 +++- internal/library/sync.go | 4 ++ internal/library/types.go | 6 ++- 7 files changed, 122 insertions(+), 4 deletions(-) diff --git a/internal/agent/types.go b/internal/agent/types.go index 13fcdee..8a18687 100644 --- a/internal/agent/types.go +++ b/internal/agent/types.go @@ -338,6 +338,11 @@ type LibrarySyncItem struct { AudioTracks any `json:"audioTracks,omitempty"` SubtitleTracks any `json:"subtitleTracks,omitempty"` VideoInfo any `json:"videoInfo,omitempty"` + // Integrity flags a damaged / incompletely-downloaded file ("damaged" or + // empty). IntegrityReason is a stable code (ebml_corrupt, moov_missing, + // no_duration, …) the web maps to a localized "re-download" message. 
+ Integrity string `json:"integrity,omitempty"` + IntegrityReason string `json:"integrityReason,omitempty"` } // LibrarySyncResponse is returned after syncing library items. diff --git a/internal/library/mediainfo/ffprobe.go b/internal/library/mediainfo/ffprobe.go index 0118617..c850029 100644 --- a/internal/library/mediainfo/ffprobe.go +++ b/internal/library/mediainfo/ffprobe.go @@ -84,7 +84,55 @@ func ExtractMediaInfo(ctx context.Context, ffprobePath, filePath string) (*Media return nil, fmt.Errorf("ffprobe JSON parse failed: %w", err) } - return parseFFprobeOutput(data) + mi, perr := parseFFprobeOutput(data) + if perr != nil { + return nil, perr + } + // A corrupt-but-parseable file (e.g. a half-downloaded MKV) returns valid + // stream JSON and a zero exit, yet ffprobe still logs structural errors to + // stderr (captured above). Flag it so the library can warn instead of + // silently shipping a file that won't play. + if integ := assessIntegrity(stderr.String(), mi); integ != nil { + mi.Integrity = integ + } + return mi, nil +} + +// corruptionMarkers are high-confidence ffprobe stderr substrings (lowercased) +// that indicate a structurally damaged / incompletely-downloaded file, paired +// with a STABLE code the web maps to localized copy. Kept conservative so +// healthy files are never flagged — each appears only on real container/ +// bitstream damage, not benign warnings (ffprobe runs at -v error). 
+var corruptionMarkers = []struct{ sub, code string }{ + {"invalid data found when processing input", "invalid_data"}, + {"as first byte of an ebml number", "ebml_corrupt"}, // truncated/corrupt MKV + {"moov atom not found", "moov_missing"}, // truncated MP4 + {"invalid nal unit size", "bitstream_corrupt"}, + {"non-existing pps", "bitstream_corrupt"}, + // NOTE: deliberately NOT matching "error reading header" (ffprobe emits it + // on transient NFS/network read hiccups — a genuinely unreadable header + // also exits non-zero → ScanError → item skipped) nor "truncating packet" + // (printed for healthy MKV/TS with oversized subtitle/PGS packets). Both + // false-positive on good files; the markers above are structural. +} + +// assessIntegrity inspects ffprobe's stderr plus the parsed result and returns +// a damaged verdict on a high-confidence corruption signal, else nil. The +// Reason is a stable code (see corruptionMarkers) the web localizes. +func assessIntegrity(stderr string, mi *MediaInfo) *IntegrityInfo { + low := strings.ToLower(stderr) + for _, m := range corruptionMarkers { + if strings.Contains(low, m.sub) { + return &IntegrityInfo{Damaged: true, Reason: m.code} + } + } + // A file that carries a video stream but no determinable duration is almost + // always truncated (the moov/cues holding duration sit at the end of the + // file). Audio-only items legitimately omit it, so gate on having video. + if mi != nil && mi.Video != nil && mi.Video.Duration <= 0 { + return &IntegrityInfo{Damaged: true, Reason: "no_duration"} + } + return nil } // parseFFprobeOutput converts parsed ffprobe JSON into MediaInfo. 
diff --git a/internal/library/mediainfo/ffprobe_test.go b/internal/library/mediainfo/ffprobe_test.go index e29eed1..e7e2a43 100644 --- a/internal/library/mediainfo/ffprobe_test.go +++ b/internal/library/mediainfo/ffprobe_test.go @@ -428,3 +428,42 @@ func TestParseFFprobeOutput_FrameRateNoSlash(t *testing.T) { t.Errorf("frameRate = %v, want 0 (no slash)", mi.Video.FrameRate) } } + +func TestAssessIntegrity(t *testing.T) { + healthy := &MediaInfo{Video: &VideoInfo{Codec: "h264", Width: 1920, Height: 1080, Duration: 5477}} + + // Healthy file with no stderr → nil (not damaged). + if got := assessIntegrity("", healthy); got != nil { + t.Errorf("healthy file flagged damaged: %+v", got) + } + + // MKV EBML corruption (the real "In the Grey" case): ffprobe exits 0 but + // logs EBML errors → damaged with the ebml_corrupt code. + ebml := "[matroska,webm @ 0x60e7] 0x00 at pos 2144995 invalid as first byte of an EBML number\n" + got := assessIntegrity(ebml, healthy) + if got == nil || !got.Damaged || got.Reason != "ebml_corrupt" { + t.Errorf("EBML corruption not flagged correctly: %+v", got) + } + + // Truncated MP4. + if got := assessIntegrity("moov atom not found\n", healthy); got == nil || got.Reason != "moov_missing" { + t.Errorf("moov-missing not flagged: %+v", got) + } + + // Invalid data. + if got := assessIntegrity("Invalid data found when processing input\n", healthy); got == nil || got.Reason != "invalid_data" { + t.Errorf("invalid-data not flagged: %+v", got) + } + + // No duration on a video stream → truncated. + noDur := &MediaInfo{Video: &VideoInfo{Codec: "h264", Width: 1920, Height: 1080, Duration: 0}} + if got := assessIntegrity("", noDur); got == nil || got.Reason != "no_duration" { + t.Errorf("no-duration not flagged: %+v", got) + } + + // Audio-only file with no duration is NOT flagged (legitimately omits it). 
+ audioOnly := &MediaInfo{Audio: []AudioTrack{{Lang: "en", Codec: "aac"}}} + if got := assessIntegrity("", audioOnly); got != nil { + t.Errorf("audio-only file wrongly flagged: %+v", got) + } +} diff --git a/internal/library/mediainfo/types.go b/internal/library/mediainfo/types.go index bf52f80..efa17ca 100644 --- a/internal/library/mediainfo/types.go +++ b/internal/library/mediainfo/types.go @@ -6,6 +6,19 @@ type MediaInfo struct { Audio []AudioTrack `json:"audio"` Subtitles []SubtitleTrack `json:"subtitles"` Languages []string `json:"languages"` // derived from audio tracks + // Integrity is non-nil only when the scan found signs of corruption / an + // incomplete download. Surfaced in the web library as a "damaged" warning + // so the user re-downloads instead of hitting a file that won't play. + Integrity *IntegrityInfo `json:"integrity,omitempty"` +} + +// IntegrityInfo flags a file whose metadata probed OK enough to land in the +// library but that shows structural damage (ffprobe emitted EBML / "invalid +// data" errors, a missing moov atom, or a video stream with no usable +// duration) — the hallmark of an incomplete or corrupt download. +type IntegrityInfo struct { + Damaged bool `json:"damaged"` + Reason string `json:"reason,omitempty"` } // VideoInfo represents the primary video stream metadata. diff --git a/internal/library/scanner.go b/internal/library/scanner.go index 9b9692e..3d6b5fe 100644 --- a/internal/library/scanner.go +++ b/internal/library/scanner.go @@ -145,11 +145,16 @@ func scanSingleFile(ctx context.Context, ffprobePath, filePath string, cacheIdx // Parse season/episode item.Season, item.Episode = ParseSeasonEpisode(item.FileName) - // Incremental: skip if file hasn't changed + // Incremental: skip if file hasn't changed. 
EXCEPT a previously-damaged + // file is always re-probed — a re-download to the same path can land with + // an identical size+mtime (some torrent clients preserve the torrent's + // mtime), so trusting the cached "damaged" verdict would pin a now-healthy + // file as broken forever. Re-probing damaged items is cheap (they're few). if incremental && existing != nil { if idx, ok := cacheIdx[filePath]; ok { cached := existing.Items[idx] - if cached.FileSize == item.FileSize && cached.ModTime == item.ModTime && cached.MediaInfo != nil { + if cached.FileSize == item.FileSize && cached.ModTime == item.ModTime && + cached.MediaInfo != nil && cached.MediaInfo.Integrity == nil { item.MediaInfo = cached.MediaInfo return item } diff --git a/internal/library/sync.go b/internal/library/sync.go index f3cd9e6..5d55a57 100644 --- a/internal/library/sync.go +++ b/internal/library/sync.go @@ -36,6 +36,10 @@ func BuildSyncItems(cache *LibraryCache) []agent.LibrarySyncItem { si.AudioTracks = item.MediaInfo.Audio si.SubtitleTracks = item.MediaInfo.Subtitles si.VideoInfo = item.MediaInfo.Video + if integ := item.MediaInfo.Integrity; integ != nil && integ.Damaged { + si.Integrity = "damaged" + si.IntegrityReason = integ.Reason + } } items = append(items, si) diff --git a/internal/library/types.go b/internal/library/types.go index ca89e8c..bd2591b 100644 --- a/internal/library/types.go +++ b/internal/library/types.go @@ -26,4 +26,8 @@ type LibraryCache struct { Items []LibraryItem `json:"items"` } -const cacheVersion = 1 +// Bump whenever the scan logic changes in a way that should re-probe an +// existing library on next scan (incremental reuse keys off mtime+size, so a +// pure logic change is invisible without this). v2: file-integrity detection +// (ffprobe corruption / incomplete-download flag). +const cacheVersion = 2