feat(library): content fingerprint + path-resilient sync + stream self-heal

Stop treating the absolute path as a file's identity so a base-path change
(host binary→docker remap, moved media folder, remount) no longer makes the
server duplicate and orphan library rows.

- fingerprint.go: ComputeFingerprint = sha256(size ‖ first 1MiB ‖ last 1MiB)
  (files of 2 MiB or less are hashed whole), a stable content identity that
  survives rename/move/base-path change. Cached in LibraryItem and reused on
  incremental scans when size+mtime are unchanged.
- sync: send fingerprint + relPath (relative to the scan root) + libraryRootKey
  per item, and agentId per request, in the library-sync payload, so the server
  can move a row in place and scope stale-cleanup per agent.
- daemon: force a FULL re-scan (with a user-facing WARNING) when the scan root
  changed since the last cache, so the server re-maps by fingerprint instead of
  duplicating. basePathChanged compares filepath.Clean'd roots.
- daemon: relocateUnreachable self-heals a stream request whose path is under an
  old root but whose file still exists under a current allowed root, so playback
  works immediately without waiting for the re-scan. Conservative: requires a
  3-segment tail and re-checks containment after resolving symlinks so it can
  neither serve the wrong file nor escape the allowed dirs.

See docs/plans/unarr-path-resilience.md in the web repo.
This commit is contained in:
Deivid Soto 2026-06-03 12:04:04 +02:00
parent e298ff6c05
commit b6ddeea129
9 changed files with 396 additions and 38 deletions

View file

@ -318,6 +318,7 @@ type DebridAccount struct {
type LibrarySyncRequest struct { type LibrarySyncRequest struct {
Items []LibrarySyncItem `json:"items"` Items []LibrarySyncItem `json:"items"`
ScanPath string `json:"scanPath"` ScanPath string `json:"scanPath"`
AgentID string `json:"agentId,omitempty"` // lets the server scope stale-cleanup per agent
IsLastBatch bool `json:"isLastBatch"` IsLastBatch bool `json:"isLastBatch"`
SyncStartedAt string `json:"syncStartedAt,omitempty"` // ISO-8601; same for all batches in a session SyncStartedAt string `json:"syncStartedAt,omitempty"` // ISO-8601; same for all batches in a session
} }
@ -346,8 +347,14 @@ type LibrarySyncItem struct {
// Integrity flags a damaged / incompletely-downloaded file ("damaged" or // Integrity flags a damaged / incompletely-downloaded file ("damaged" or
// empty). IntegrityReason is a stable code (ebml_corrupt, moov_missing, // empty). IntegrityReason is a stable code (ebml_corrupt, moov_missing,
// no_duration, …) the web maps to a localized "re-download" message. // no_duration, …) the web maps to a localized "re-download" message.
Integrity string `json:"integrity,omitempty"` Integrity string `json:"integrity,omitempty"`
IntegrityReason string `json:"integrityReason,omitempty"` IntegrityReason string `json:"integrityReason,omitempty"`
// Path resilience: a stable content identity + the file's location relative
// to its library root, so the server can move a row in place on a rename /
// base-path change instead of duplicating it.
Fingerprint string `json:"fingerprint,omitempty"`
RelPath string `json:"relPath,omitempty"`
LibraryRootKey string `json:"libraryRootKey,omitempty"`
} }
// LibrarySyncResponse is returned after syncing library items. // LibrarySyncResponse is returned after syncing library items.

View file

@ -598,12 +598,25 @@ func runDaemonStart() error {
}() }()
} }
allowedRoots := []string{cfg.Download.Dir, cfg.Library.ScanPath,
cfg.Organize.MoviesDir, cfg.Organize.TVShowsDir}
filePath := filepath.Clean(sr.FilePath) filePath := filepath.Clean(sr.FilePath)
if !isAllowedStreamPath(filePath, cfg.Download.Dir, cfg.Library.ScanPath, // Self-heal a base-path mismatch: the web may hand us a path under an old
cfg.Organize.MoviesDir, cfg.Organize.TVShowsDir) { // root (e.g. /mnt/nas/peliculas/… from before a binary→docker move) that
log.Printf("[%s] stream request rejected: path outside allowed dirs: %s", agent.ShortID(sr.TaskID), filePath) // is now outside our allowed dirs but whose file still exists under a
reportStreamError(fmt.Sprintf("path outside allowed dirs: %s", filePath)) // current root (/downloads/…). Remap the path's tail onto an allowed root
return // so playback works immediately; the next re-scan persists the fix to the
// DB. See docs/plans/unarr-path-resilience.md.
if !isAllowedStreamPath(filePath, allowedRoots...) {
if remapped := relocateUnreachable(filePath, allowedRoots); remapped != "" {
log.Printf("[%s] stream self-heal: remapped %s → %s", agent.ShortID(sr.TaskID), filePath, remapped)
filePath = remapped
} else {
log.Printf("[%s] stream request rejected: path outside allowed dirs: %s", agent.ShortID(sr.TaskID), filePath)
reportStreamError(fmt.Sprintf("path outside allowed dirs: %s", filePath))
return
}
} }
// os.Stat over NFS can transiently fail (ESTALE/EAGAIN/timeout) right // os.Stat over NFS can transiently fail (ESTALE/EAGAIN/timeout) right
// after a remount or under load. Retry a few times before giving up so // after a remount or under load. Retry a few times before giving up so
@ -619,6 +632,15 @@ func runDaemonStart() error {
time.Sleep(300 * time.Millisecond) time.Sleep(300 * time.Millisecond)
} }
} }
if statErr != nil {
// Last resort before failing: the file may simply have moved within
// an allowed root — try to relocate it by path tail.
if remapped := relocateUnreachable(filePath, allowedRoots); remapped != "" {
log.Printf("[%s] stream self-heal: relocated missing %s → %s", agent.ShortID(sr.TaskID), filePath, remapped)
filePath = remapped
info, statErr = os.Stat(filePath)
}
}
if statErr != nil { if statErr != nil {
log.Printf("[%s] stream request: file not found after retries: %s (%v)", agent.ShortID(sr.TaskID), filePath, statErr) log.Printf("[%s] stream request: file not found after retries: %s (%v)", agent.ShortID(sr.TaskID), filePath, statErr)
reportStreamError(fmt.Sprintf("file not found: %s", filePath)) reportStreamError(fmt.Sprintf("file not found: %s", filePath))
@ -977,6 +999,53 @@ func isAllowedStreamPath(filePath string, allowedDirs ...string) bool {
return false return false
} }
// relocateUnreachable tries to find a file the web asked us to stream under a
// path we can't serve (e.g. an old base path) by joining the longest suffix of
// that path onto each current allowed root and checking it exists. Returns the
// found absolute path or "" when nothing suitable exists.
//
// Conservative by design — it must never serve the WRONG file:
//   - Requires a tail of at least three real path segments
//     (collection/season/file), so a generic "Season 01/Episode.mkv" can't
//     match a different show by accident. Empty, "." and ".." components are
//     discarded before counting, because filepath.Join would drop or collapse
//     them and silently shorten the tail below the minimum.
//     Flat single-file-at-root layouts simply aren't self-healed here; the next
//     re-scan re-maps them instead.
//   - Re-checks containment AFTER resolving symlinks, so a symlink inside a root
//     pointing outside it can't be used to escape the allowed dirs
//     (isAllowedStreamPath alone is a lexical check that os.Stat would happily
//     follow out).
func relocateUnreachable(filePath string, allowedRoots []string) string {
	// Minimum number of real path components a tail must have to be trusted.
	const minTailSegments = 3

	// Split the cleaned path and keep only real components. A leading
	// separator yields an empty first element; counting it would let
	// "/Season 01/ep.mkv" through as a "3-segment" tail even though
	// filepath.Join reduces it to two.
	raw := strings.Split(filepath.ToSlash(filepath.Clean(filePath)), "/")
	segs := make([]string, 0, len(raw))
	for _, s := range raw {
		if s == "" || s == "." || s == ".." {
			continue
		}
		segs = append(segs, s)
	}

	// Longest tail first (most specific match wins). Tails shorter than
	// minTailSegments are never tried, so a short, ambiguous suffix can't match
	// the wrong file.
	for start := 0; start+minTailSegments <= len(segs); start++ {
		tail := filepath.Join(segs[start:]...)
		for _, root := range allowedRoots {
			if root == "" {
				continue
			}
			cand := filepath.Join(root, tail)
			if !isAllowedStreamPath(cand, root) {
				continue
			}
			fi, err := os.Stat(cand)
			if err != nil || fi.IsDir() {
				continue
			}
			// Re-validate containment against the symlink-resolved real paths so
			// a symlink under the root can't point the stream outside it.
			realCand, e1 := filepath.EvalSymlinks(cand)
			realRoot, e2 := filepath.EvalSymlinks(root)
			if e1 != nil || e2 != nil || !isAllowedStreamPath(realCand, realRoot) {
				continue
			}
			return cand
		}
	}
	return ""
}
func formatSpeedLog(bps int64) string { func formatSpeedLog(bps int64) string {
switch { switch {
case bps >= 1024*1024*1024: case bps >= 1024*1024*1024:
@ -993,6 +1062,23 @@ func formatSpeedLog(bps int64) string {
// runAutoScan runs a library scan + sync on a timer or on-demand via scanNow channel. // runAutoScan runs a library scan + sync on a timer or on-demand via scanNow channel.
// It scans all provided paths and syncs each independently so stale-item cleanup // It scans all provided paths and syncs each independently so stale-item cleanup
// is scoped to the correct directory prefix on the server. // is scoped to the correct directory prefix on the server.
// basePathChanged tells the caller whether the root recorded in the previously
// saved cache is absent from the current scan paths, i.e. the library was
// scanned from somewhere else last time. A nil cache, a cache with no items, or
// a cache with no recorded root is never reported as changed. Roots are
// compared after filepath.Clean, so trailing separators and redundant elements
// do not count as a change.
func basePathChanged(existing *library.LibraryCache, scanPaths []string) bool {
	// Nothing meaningful to compare against.
	if existing == nil {
		return false
	}
	if len(existing.Items) == 0 || existing.Path == "" {
		return false
	}

	previousRoot := filepath.Clean(existing.Path)
	stillScanned := false
	for i := range scanPaths {
		if filepath.Clean(scanPaths[i]) == previousRoot {
			stillScanned = true
			break
		}
	}
	return !stillScanned
}
func runAutoScan(ctx context.Context, cfg config.Config, interval time.Duration, ac *agent.Client, scanNow <-chan struct{}, scanPaths []string) { func runAutoScan(ctx context.Context, cfg config.Config, interval time.Duration, ac *agent.Client, scanNow <-chan struct{}, scanPaths []string) {
log.Printf("[auto-scan] enabled: every %s, paths: %v", interval, scanPaths) log.Printf("[auto-scan] enabled: every %s, paths: %v", interval, scanPaths)
@ -1018,10 +1104,23 @@ func runAutoScan(ctx context.Context, cfg config.Config, interval time.Duration,
workers = 8 workers = 8
} }
// If the library base path changed (e.g. the agent moved from the host
// binary to docker, remapping /mnt/nas/peliculas → /downloads, or the
// user moved their media folder), force a FULL re-scan instead of an
// incremental one. The fingerprint merge on the server then relocates
// existing rows in place rather than duplicating, and per-agent cleanup
// reaps the old prefix. See docs/plans/unarr-path-resilience.md.
forceFull := basePathChanged(existing, scanPaths)
if forceFull {
log.Printf("[auto-scan] WARNING: library base path changed (was %q, now %v) — "+
"running a FULL re-scan. This can take a while on large libraries; "+
"playback and matches are preserved.", existing.Path, scanPaths)
}
scanOpts := library.ScanOptions{ scanOpts := library.ScanOptions{
Workers: workers, Workers: workers,
FFprobePath: cfg.Library.FFprobePath, FFprobePath: cfg.Library.FFprobePath,
Incremental: existing != nil, Incremental: existing != nil && !forceFull,
} }
// Resolve ffmpeg once for the sidecar prewarm (extracts text subs → WebVTT // Resolve ffmpeg once for the sidecar prewarm (extracts text subs → WebVTT
@ -1077,6 +1176,7 @@ func runAutoScan(ctx context.Context, cfg config.Config, interval time.Duration,
_, err := ac.SyncLibrary(ctx, agent.LibrarySyncRequest{ _, err := ac.SyncLibrary(ctx, agent.LibrarySyncRequest{
Items: items[i:end], Items: items[i:end],
ScanPath: scanPath, ScanPath: scanPath,
AgentID: cfg.Agent.ID,
IsLastBatch: isLast, IsLastBatch: isLast,
SyncStartedAt: syncStartedAt, SyncStartedAt: syncStartedAt,
}) })

View file

@ -0,0 +1,74 @@
package cmd
import (
"os"
"path/filepath"
"runtime"
"testing"
)
// mkfile creates the file at path, along with any missing parent directories,
// and writes the single byte "x" into it. Any filesystem error fails the
// calling test immediately.
func mkfile(t *testing.T, path string) {
	t.Helper()
	parent := filepath.Dir(path)
	err := os.MkdirAll(parent, 0o755)
	if err == nil {
		// Only attempt the write once the directory tree is in place.
		err = os.WriteFile(path, []byte("x"), 0o644)
	}
	if err != nil {
		t.Fatal(err)
	}
}
// TestRelocateUnreachable exercises the lexical side of relocateUnreachable:
// a deep tail is relocated, while a shallow tail, a missing file and a
// traversal attempt are all refused.
func TestRelocateUnreachable(t *testing.T) {
	libRoot := t.TempDir()

	// One file three segments below the root, one only two segments below it
	// (too shallow to be matched by a short tail).
	deepFile := filepath.Join(libRoot, "Acme Show", "Season 01", "ep.mkv")
	shallowFile := filepath.Join(libRoot, "Season 01", "lonely.mkv")
	mkfile(t, deepFile)
	mkfile(t, shallowFile)

	allowed := []string{libRoot}

	// Base-path change: the 3-segment tail of an old-root path exists under the
	// new root, so the real file is returned.
	relocated := relocateUnreachable("/old/base/Acme Show/Season 01/ep.mkv", allowed)
	if relocated != deepFile {
		t.Errorf("relocate moved file: got %q want %q", relocated, deepFile)
	}

	// Only a 2-segment tail would match → ambiguous, must NOT relocate.
	shallow := relocateUnreachable("/old/Season 01/lonely.mkv", allowed)
	if shallow != "" {
		t.Errorf("2-segment tail should not match, got %q", shallow)
	}

	// File absent under every root → nothing to relocate to.
	missing := relocateUnreachable("/old/base/Acme Show/Season 01/missing.mkv", allowed)
	if missing != "" {
		t.Errorf("missing file should not relocate, got %q", missing)
	}

	// Traversal attempt: ".." segments are collapsed when the candidate is
	// built and the result is re-validated, so it can't escape the root.
	escaped := relocateUnreachable("/old/../../../etc/passwd", allowed)
	if escaped != "" {
		t.Errorf("traversal should not match, got %q", escaped)
	}
}
// TestRelocateUnreachableSymlinkEscape checks that a symlink living inside an
// allowed root but pointing outside it cannot be used to relocate a stream
// request onto a file beyond the allowed dirs.
func TestRelocateUnreachableSymlinkEscape(t *testing.T) {
	if runtime.GOOS == "windows" {
		t.Skip("symlink semantics differ on windows")
	}

	allowedRoot := t.TempDir()
	foreignDir := t.TempDir()

	// The real file lives OUTSIDE any allowed root.
	secret := filepath.Join(foreignDir, "sub", "secret.mkv")
	mkfile(t, secret)

	// A symlink inside the root points at the outside tree.
	link := filepath.Join(allowedRoot, "link")
	if err := os.Symlink(foreignDir, link); err != nil {
		t.Skipf("symlink unsupported: %v", err)
	}

	// The lexical candidate root/link/sub/secret.mkv exists (os.Stat follows
	// the symlink), but its resolved location is outside the root, so the
	// relocation must be refused.
	if got := relocateUnreachable("/old/link/sub/secret.mkv", []string{allowedRoot}); got != "" {
		t.Errorf("symlink escape must be rejected, got %q", got)
	}
}

View file

@ -205,6 +205,7 @@ func syncToServer(ctx context.Context, cfg config.Config, cache *library.Library
resp, err := ac.SyncLibrary(ctx, agent.LibrarySyncRequest{ resp, err := ac.SyncLibrary(ctx, agent.LibrarySyncRequest{
Items: batch, Items: batch,
ScanPath: cache.Path, ScanPath: cache.Path,
AgentID: cfg.Agent.ID,
IsLastBatch: isLast, IsLastBatch: isLast,
SyncStartedAt: syncStartedAt, SyncStartedAt: syncStartedAt,
}) })

View file

@ -0,0 +1,55 @@
package library
import (
"crypto/sha256"
"encoding/binary"
"encoding/hex"
"io"
"os"
)
// fpChunk is the number of bytes hashed from each end of a large file.
const fpChunk = 1 << 20 // 1 MiB

// ComputeFingerprint derives a content identity for the file at path that does
// not depend on where the file lives: the hex-encoded
// sha256(size ‖ first 1 MiB ‖ last 1 MiB), where size is encoded as 8
// little-endian bytes. Files of at most 2*fpChunk bytes are hashed in full
// after the size prefix, since head and tail would overlap anyway.
//
// size is the caller-supplied file size (typically from a prior os.Stat); it
// selects the small/large strategy and the tail offset. An error is returned
// when the file cannot be opened, read or seeked, or when a large file turns
// out to be shorter than size implies.
func ComputeFingerprint(path string, size int64) (string, error) {
	file, err := os.Open(path)
	if err != nil {
		return "", err
	}
	defer file.Close()

	digest := sha256.New()

	// Mix the size in first so files sharing head/tail bytes but differing in
	// length still get distinct fingerprints.
	var prefix [8]byte
	binary.LittleEndian.PutUint64(prefix[:], uint64(size))
	digest.Write(prefix[:])

	// Small file: stream the whole thing into the hash and finish.
	if size <= 2*fpChunk {
		if _, err := io.Copy(digest, file); err != nil {
			return "", err
		}
		return hex.EncodeToString(digest.Sum(nil)), nil
	}

	// Large file: two bounded reads, one from each end.
	chunk := make([]byte, fpChunk)

	if _, err := io.ReadFull(file, chunk); err != nil {
		return "", err
	}
	digest.Write(chunk)

	if _, err := file.Seek(size-fpChunk, io.SeekStart); err != nil {
		return "", err
	}
	if _, err := io.ReadFull(file, chunk); err != nil {
		return "", err
	}
	digest.Write(chunk)

	return hex.EncodeToString(digest.Sum(nil)), nil
}

View file

@ -0,0 +1,81 @@
package library
import (
"os"
"path/filepath"
"testing"
)
// writeFile stores data in dir/name with mode 0o644 and returns the full path
// of the file it wrote. A write failure aborts the calling test.
func writeFile(t *testing.T, dir, name string, data []byte) string {
	t.Helper()
	target := filepath.Join(dir, name)
	err := os.WriteFile(target, data, 0o644)
	if err != nil {
		t.Fatalf("write %s: %v", target, err)
	}
	return target
}
// fp stats path to learn its size and returns the file's fingerprint for that
// size. Either step failing aborts the calling test.
func fp(t *testing.T, path string) string {
	t.Helper()
	info, statErr := os.Stat(path)
	if statErr != nil {
		t.Fatalf("stat %s: %v", path, statErr)
	}
	sum, fpErr := ComputeFingerprint(path, info.Size())
	if fpErr != nil {
		t.Fatalf("fingerprint %s: %v", path, fpErr)
	}
	return sum
}
// TestComputeFingerprint checks the properties the fingerprint is relied on
// for: a 64-character hex digest, independence from the file's path, and
// sensitivity to the first byte, the last byte, and the file size.
func TestComputeFingerprint(t *testing.T) {
	tmp := t.TempDir()

	// 5 MiB payload: larger than 2*chunk, so the head+tail path is exercised.
	payload := make([]byte, 5<<20)
	for i := range payload {
		payload[i] = byte(i * 7)
	}
	// flipped returns a copy of payload with the byte at idx inverted.
	flipped := func(idx int) []byte {
		dup := append([]byte(nil), payload...)
		dup[idx] ^= 0xFF
		return dup
	}

	base := fp(t, writeFile(t, tmp, "a.bin", payload))
	if len(base) != 64 {
		t.Fatalf("want 64-hex, got %d", len(base))
	}

	// Move-invariance: identical bytes at a different path → same fingerprint.
	moved := fp(t, writeFile(t, tmp, "moved.bin", payload))
	if moved != base {
		t.Errorf("move changed fingerprint: %s != %s", base, moved)
	}

	// Tail sensitivity: flipping the last byte must change the fingerprint.
	tailFP := fp(t, writeFile(t, tmp, "tail.bin", flipped(len(payload)-1)))
	if tailFP == base {
		t.Error("tail mutation did not change fingerprint")
	}

	// Head sensitivity: same for the first byte.
	headFP := fp(t, writeFile(t, tmp, "head.bin", flipped(0)))
	if headFP == base {
		t.Error("head mutation did not change fingerprint")
	}

	// Size is mixed in: a small file and a large file never collide trivially.
	tiny := fp(t, writeFile(t, tmp, "small.bin", []byte("hello world")))
	if tiny == base {
		t.Error("small and big fingerprints collided")
	}
}
// TestRelToRoot pins relToRoot's contract: a forward-slashed relative path for
// files under the root, and "" for anything outside it, for the root itself,
// or when no root is given.
func TestRelToRoot(t *testing.T) {
	type testCase struct {
		root string
		full string
		want string
	}
	table := []testCase{
		{root: "/downloads", full: "/downloads/TV Shows/X/S01E09.mkv", want: "TV Shows/X/S01E09.mkv"},
		{root: "/downloads", full: "/mnt/other/file.mkv", want: ""}, // outside root
		{root: "/downloads", full: "/downloads", want: ""},          // equal → "."
		{root: "", full: "/x/y.mkv", want: ""},                      // no root
	}
	for i := range table {
		tc := table[i]
		got := relToRoot(tc.root, tc.full)
		if got == tc.want {
			continue
		}
		t.Errorf("relToRoot(%q,%q)=%q want %q", tc.root, tc.full, got, tc.want)
	}
}

View file

@ -130,6 +130,26 @@ func scanSingleFile(ctx context.Context, ffprobePath, filePath string, cacheIdx
ModTime: info.ModTime().UTC().Format(time.RFC3339), ModTime: info.ModTime().UTC().Format(time.RFC3339),
} }
// Look up the cached entry once — reused for both fingerprint reuse and the
// incremental ffprobe skip below.
var cached *LibraryItem
if existing != nil {
if idx, ok := cacheIdx[filePath]; ok {
cached = &existing.Items[idx]
}
}
unchanged := cached != nil &&
cached.FileSize == item.FileSize && cached.ModTime == item.ModTime
// Fingerprint: reuse the cached value when the file is unchanged and already
// has one; otherwise compute it (cheap, two bounded reads). Computed even on
// the incremental path so every synced item carries a stable identity.
if unchanged && cached.Fingerprint != "" {
item.Fingerprint = cached.Fingerprint
} else if fp, fpErr := ComputeFingerprint(filePath, item.FileSize); fpErr == nil {
item.Fingerprint = fp
}
// Parse filename for title, year, quality, codec // Parse filename for title, year, quality, codec
parsed := parser.Parse(item.FileName) parsed := parser.Parse(item.FileName)
item.Quality = parsed.Quality item.Quality = parsed.Quality
@ -150,15 +170,10 @@ func scanSingleFile(ctx context.Context, ffprobePath, filePath string, cacheIdx
// an identical size+mtime (some torrent clients preserve the torrent's // an identical size+mtime (some torrent clients preserve the torrent's
// mtime), so trusting the cached "damaged" verdict would pin a now-healthy // mtime), so trusting the cached "damaged" verdict would pin a now-healthy
// file as broken forever. Re-probing damaged items is cheap (they're few). // file as broken forever. Re-probing damaged items is cheap (they're few).
if incremental && existing != nil { if incremental && unchanged &&
if idx, ok := cacheIdx[filePath]; ok { cached.MediaInfo != nil && cached.MediaInfo.Integrity == nil {
cached := existing.Items[idx] item.MediaInfo = cached.MediaInfo
if cached.FileSize == item.FileSize && cached.ModTime == item.ModTime && return item
cached.MediaInfo != nil && cached.MediaInfo.Integrity == nil {
item.MediaInfo = cached.MediaInfo
return item
}
}
} }
// Run ffprobe // Run ffprobe

View file

@ -1,6 +1,25 @@
package library package library
import "github.com/torrentclaw/unarr/internal/agent" import (
"path/filepath"
"strings"
"github.com/torrentclaw/unarr/internal/agent"
)
// relToRoot returns the file's path relative to the scan root (forward-slashed),
// or "" when it doesn't live under root. The server stores this so streaming can
// later reconstruct the absolute path from the agent's *current* root.
//
// "" is returned when root is empty, when full is the root itself, when
// filepath.Rel fails, or when the relative path climbs out of root. "Climbs
// out" means the first path element is exactly ".." — a file or directory
// whose name merely starts with two dots (e.g. "...And Justice for All.mkv")
// is still inside the root and keeps its relative path.
func relToRoot(root, full string) string {
	if root == "" {
		return ""
	}
	rel, err := filepath.Rel(root, full)
	if err != nil || rel == "." {
		return ""
	}
	// filepath.Rel returns a cleaned path, so any escape from root shows up as
	// a leading ".." element: either the whole result or followed by a
	// separator. A plain HasPrefix(rel, "..") would also reject in-root names
	// that begin with dots.
	if rel == ".." || strings.HasPrefix(rel, ".."+string(filepath.Separator)) {
		return ""
	}
	return filepath.ToSlash(rel)
}
// BuildSyncItems converts cached library items to sync request items. // BuildSyncItems converts cached library items to sync request items.
// Shared between unarr scan (cmd/scan.go) and auto-scan (cmd/daemon.go). // Shared between unarr scan (cmd/scan.go) and auto-scan (cmd/daemon.go).
@ -11,14 +30,17 @@ func BuildSyncItems(cache *LibraryCache) []agent.LibrarySyncItem {
continue continue
} }
si := agent.LibrarySyncItem{ si := agent.LibrarySyncItem{
FilePath: item.FilePath, FilePath: item.FilePath,
FileName: item.FileName, FileName: item.FileName,
FileSize: item.FileSize, FileSize: item.FileSize,
Title: item.Title, Title: item.Title,
Year: item.Year, Year: item.Year,
ContentType: DeriveContentType(item), ContentType: DeriveContentType(item),
Season: item.Season, Season: item.Season,
Episode: item.Episode, Episode: item.Episode,
Fingerprint: item.Fingerprint,
RelPath: relToRoot(cache.Path, item.FilePath),
LibraryRootKey: "library",
} }
if item.MediaInfo != nil { if item.MediaInfo != nil {

View file

@ -4,18 +4,21 @@ import "github.com/torrentclaw/unarr/internal/library/mediainfo"
// LibraryItem represents a single scanned media file. // LibraryItem represents a single scanned media file.
type LibraryItem struct { type LibraryItem struct {
FilePath string `json:"filePath"` FilePath string `json:"filePath"`
FileName string `json:"fileName"` FileName string `json:"fileName"`
FileSize int64 `json:"fileSize"` FileSize int64 `json:"fileSize"`
ModTime string `json:"modTime"` // ISO 8601 ModTime string `json:"modTime"` // ISO 8601
Title string `json:"title"` // Fingerprint is a stable content identity (see fingerprint.go). Cached so
Year string `json:"year,omitempty"` // incremental scans reuse it when size+mtime are unchanged.
Season int `json:"season,omitempty"` Fingerprint string `json:"fingerprint,omitempty"`
Episode int `json:"episode,omitempty"` Title string `json:"title"`
Quality string `json:"quality,omitempty"` // "1080p" etc (from filename) Year string `json:"year,omitempty"`
Codec string `json:"codec,omitempty"` // "x265" etc (from filename) Season int `json:"season,omitempty"`
MediaInfo *mediainfo.MediaInfo `json:"mediaInfo,omitempty"` Episode int `json:"episode,omitempty"`
ScanError string `json:"scanError,omitempty"` Quality string `json:"quality,omitempty"` // "1080p" etc (from filename)
Codec string `json:"codec,omitempty"` // "x265" etc (from filename)
MediaInfo *mediainfo.MediaInfo `json:"mediaInfo,omitempty"`
ScanError string `json:"scanError,omitempty"`
} }
// LibraryCache is the on-disk cache of scanned library items. // LibraryCache is the on-disk cache of scanned library items.