feat(library): content fingerprint + path-resilient sync + stream self-heal

Stop treating the absolute path as a file's identity so a base-path change
(host binary→docker remap, moved media folder, remount) no longer makes the
server duplicate and orphan library rows.

- fingerprint.go: ComputeFingerprint = sha256(size ‖ first 1MiB ‖ last 1MiB)
  (files of 2 MiB or less are hashed whole), a stable content identity that
  survives rename/move/base-path change. Cached in LibraryItem and reused on
  incremental scans when size+mtime are unchanged.
- sync: send fingerprint + rel_path (relative to the scan root) + agent_id in
  the library-sync request, so the server can move a row in place and scope
  stale-cleanup per agent.
- daemon: force a FULL re-scan (with a user-facing WARNING) when the scan root
  changed since the last cache, so the server re-maps by fingerprint instead of
  duplicating. basePathChanged compares filepath.Clean'd roots.
- daemon: relocateUnreachable self-heals a stream request whose path is under an
  old root but whose file still exists under a current allowed root, so playback
  works immediately without waiting for the re-scan. Conservative: requires a
  3-segment tail and re-checks containment after resolving symlinks so it can
  neither serve the wrong file nor escape the allowed dirs.

See docs/plans/unarr-path-resilience.md in the web repo.
This commit is contained in:
Deivid Soto 2026-06-03 12:04:04 +02:00
parent e298ff6c05
commit b6ddeea129
9 changed files with 396 additions and 38 deletions

View file

@ -0,0 +1,55 @@
package library
import (
	"crypto/sha256"
	"encoding/binary"
	"encoding/hex"
	"fmt"
	"io"
	"os"
)
// fpChunk is how many bytes are hashed from the head and the tail of a file.
const fpChunk = 1 << 20 // 1 MiB

// ComputeFingerprint returns a stable content identity for a media file: the
// hex-encoded sha256 of (size ‖ first 1 MiB ‖ last 1 MiB), where size is
// written as an 8-byte little-endian integer. It survives renames, moves, and
// base-path changes (unlike the absolute path), so the server can recognise the
// same file at a new location and move its library row in place instead of
// duplicating it. See docs/plans/unarr-path-resilience.md in the web repo.
//
// size is the caller's view of the file length (normally taken from a prior
// stat). Files of at most 2*fpChunk bytes are hashed whole, because head and
// tail would overlap anyway. The amount of data read is always bounded by
// 2*fpChunk, even when size is stale and the file has grown since it was
// measured.
//
// On any open, read, or seek failure the returned string is empty and the
// error wraps the underlying cause (usable with errors.Is / errors.As).
func ComputeFingerprint(path string, size int64) (string, error) {
	f, err := os.Open(path)
	if err != nil {
		// *os.PathError already names the path; just add the operation.
		return "", fmt.Errorf("fingerprint: %w", err)
	}
	defer f.Close()

	h := sha256.New()

	// Mix the size in first so files sharing head/tail bytes but differing in
	// length do not collide.
	var sizeBuf [8]byte
	binary.LittleEndian.PutUint64(sizeBuf[:], uint64(size))
	h.Write(sizeBuf[:])

	if size <= 2*fpChunk {
		// Small file: hash it whole. The limit keeps the read bounded if the
		// file grew after the caller measured it.
		if _, err := io.Copy(h, io.LimitReader(f, 2*fpChunk)); err != nil {
			return "", fmt.Errorf("fingerprint %q: read: %w", path, err)
		}
		return hex.EncodeToString(h.Sum(nil)), nil
	}

	// Large file: one scratch buffer serves both the head and the tail read.
	buf := make([]byte, fpChunk)
	if _, err := io.ReadFull(f, buf); err != nil {
		return "", fmt.Errorf("fingerprint %q: read head: %w", path, err)
	}
	h.Write(buf)

	if _, err := f.Seek(size-fpChunk, io.SeekStart); err != nil {
		return "", fmt.Errorf("fingerprint %q: seek tail: %w", path, err)
	}
	if _, err := io.ReadFull(f, buf); err != nil {
		return "", fmt.Errorf("fingerprint %q: read tail: %w", path, err)
	}
	h.Write(buf)

	return hex.EncodeToString(h.Sum(nil)), nil
}

View file

@ -0,0 +1,81 @@
package library
import (
"os"
"path/filepath"
"testing"
)
// writeFile creates the file name inside dir with the given contents (mode
// 0o644) and returns its full path. Any write error aborts the calling test.
func writeFile(t *testing.T, dir, name string, data []byte) string {
	t.Helper()

	target := filepath.Join(dir, name)
	err := os.WriteFile(target, data, 0o644)
	if err != nil {
		t.Fatalf("write %s: %v", target, err)
	}
	return target
}
// fp stats path and returns ComputeFingerprint of it using the stat'd size.
// A stat or fingerprint failure aborts the calling test.
func fp(t *testing.T, path string) string {
	t.Helper()

	info, statErr := os.Stat(path)
	if statErr != nil {
		t.Fatalf("stat %s: %v", path, statErr)
	}

	sum, fpErr := ComputeFingerprint(path, info.Size())
	if fpErr != nil {
		t.Fatalf("fingerprint %s: %v", path, fpErr)
	}
	return sum
}
// TestComputeFingerprint checks the fingerprint's shape (64 hex characters),
// that identical bytes at a different path yield the same value, and that
// changing the first or last byte, or hashing a different small file, yields a
// different one.
func TestComputeFingerprint(t *testing.T) {
	tmp := t.TempDir()

	// 5 MiB payload (> 2*chunk) filled with a simple repeating pattern.
	payload := make([]byte, 5<<20)
	for i := range payload {
		payload[i] = byte(i * 7)
	}

	// flipped returns a copy of payload with the byte at idx inverted.
	flipped := func(idx int) []byte {
		out := make([]byte, len(payload))
		copy(out, payload)
		out[idx] ^= 0xFF
		return out
	}

	base := fp(t, writeFile(t, tmp, "a.bin", payload))
	if n := len(base); n != 64 {
		t.Fatalf("want 64-hex, got %d", n)
	}

	// Move-invariance: identical bytes at a different path → same fingerprint.
	moved := fp(t, writeFile(t, tmp, "moved.bin", payload))
	if moved != base {
		t.Errorf("move changed fingerprint: %s != %s", base, moved)
	}

	// Tail sensitivity: flipping the last byte must change the fingerprint.
	tailFP := fp(t, writeFile(t, tmp, "tail.bin", flipped(len(payload)-1)))
	if tailFP == base {
		t.Error("tail mutation did not change fingerprint")
	}

	// Head sensitivity: flipping the first byte must change it as well.
	headFP := fp(t, writeFile(t, tmp, "head.bin", flipped(0)))
	if headFP == base {
		t.Error("head mutation did not change fingerprint")
	}

	// A small file must not share the large file's fingerprint.
	tiny := fp(t, writeFile(t, tmp, "small.bin", []byte("hello world")))
	if tiny == base {
		t.Error("small and big fingerprints collided")
	}
}
// TestRelToRoot table-tests relToRoot: a file under the root yields its
// slash-separated relative path; a file outside the root, the root itself, and
// an empty root all yield "".
func TestRelToRoot(t *testing.T) {
	type testCase struct {
		root string
		full string
		want string
	}
	table := []testCase{
		{root: "/downloads", full: "/downloads/TV Shows/X/S01E09.mkv", want: "TV Shows/X/S01E09.mkv"},
		{root: "/downloads", full: "/mnt/other/file.mkv", want: ""}, // outside root
		{root: "/downloads", full: "/downloads", want: ""},          // equal → "."
		{root: "", full: "/x/y.mkv", want: ""},                      // no root
	}
	for i := range table {
		tc := table[i]
		got := relToRoot(tc.root, tc.full)
		if got == tc.want {
			continue
		}
		t.Errorf("relToRoot(%q,%q)=%q want %q", tc.root, tc.full, got, tc.want)
	}
}

View file

@ -130,6 +130,26 @@ func scanSingleFile(ctx context.Context, ffprobePath, filePath string, cacheIdx
ModTime: info.ModTime().UTC().Format(time.RFC3339),
}
// Look up the cached entry once — reused for both fingerprint reuse and the
// incremental ffprobe skip below.
var cached *LibraryItem
if existing != nil {
if idx, ok := cacheIdx[filePath]; ok {
cached = &existing.Items[idx]
}
}
unchanged := cached != nil &&
cached.FileSize == item.FileSize && cached.ModTime == item.ModTime
// Fingerprint: reuse the cached value when the file is unchanged and already
// has one; otherwise compute it (cheap, two bounded reads). Computed even on
// the incremental path so every synced item carries a stable identity.
if unchanged && cached.Fingerprint != "" {
item.Fingerprint = cached.Fingerprint
} else if fp, fpErr := ComputeFingerprint(filePath, item.FileSize); fpErr == nil {
item.Fingerprint = fp
}
// Parse filename for title, year, quality, codec
parsed := parser.Parse(item.FileName)
item.Quality = parsed.Quality
@ -150,15 +170,10 @@ func scanSingleFile(ctx context.Context, ffprobePath, filePath string, cacheIdx
// an identical size+mtime (some torrent clients preserve the torrent's
// mtime), so trusting the cached "damaged" verdict would pin a now-healthy
// file as broken forever. Re-probing damaged items is cheap (they're few).
if incremental && existing != nil {
if idx, ok := cacheIdx[filePath]; ok {
cached := existing.Items[idx]
if cached.FileSize == item.FileSize && cached.ModTime == item.ModTime &&
cached.MediaInfo != nil && cached.MediaInfo.Integrity == nil {
item.MediaInfo = cached.MediaInfo
return item
}
}
if incremental && unchanged &&
cached.MediaInfo != nil && cached.MediaInfo.Integrity == nil {
item.MediaInfo = cached.MediaInfo
return item
}
// Run ffprobe

View file

@ -1,6 +1,25 @@
package library
import "github.com/torrentclaw/unarr/internal/agent"
import (
"path/filepath"
"strings"
"github.com/torrentclaw/unarr/internal/agent"
)
// relToRoot returns the file's path relative to the scan root (forward-slashed),
// or "" when it doesn't live under root. The server stores this so streaming can
// later reconstruct the absolute path from the agent's *current* root.
//
// "" is returned when root is empty, when filepath.Rel cannot relate the two
// paths, when full is the root itself, and when reaching full from root
// requires climbing out through a ".." segment.
func relToRoot(root, full string) string {
	if root == "" {
		return ""
	}
	rel, err := filepath.Rel(root, full)
	if err != nil || rel == "." {
		return ""
	}
	// Only a real ".." path segment means the file is outside root. A plain
	// prefix test would also reject legitimate names that merely begin with
	// two dots, such as "..extras/x.mkv".
	if rel == ".." || strings.HasPrefix(rel, ".."+string(filepath.Separator)) {
		return ""
	}
	return filepath.ToSlash(rel)
}
// BuildSyncItems converts cached library items to sync request items.
// Shared between unarr scan (cmd/scan.go) and auto-scan (cmd/daemon.go).
@ -11,14 +30,17 @@ func BuildSyncItems(cache *LibraryCache) []agent.LibrarySyncItem {
continue
}
si := agent.LibrarySyncItem{
FilePath: item.FilePath,
FileName: item.FileName,
FileSize: item.FileSize,
Title: item.Title,
Year: item.Year,
ContentType: DeriveContentType(item),
Season: item.Season,
Episode: item.Episode,
FilePath: item.FilePath,
FileName: item.FileName,
FileSize: item.FileSize,
Title: item.Title,
Year: item.Year,
ContentType: DeriveContentType(item),
Season: item.Season,
Episode: item.Episode,
Fingerprint: item.Fingerprint,
RelPath: relToRoot(cache.Path, item.FilePath),
LibraryRootKey: "library",
}
if item.MediaInfo != nil {

View file

@ -4,18 +4,21 @@ import "github.com/torrentclaw/unarr/internal/library/mediainfo"
// LibraryItem represents a single scanned media file.
type LibraryItem struct {
FilePath string `json:"filePath"`
FileName string `json:"fileName"`
FileSize int64 `json:"fileSize"`
ModTime string `json:"modTime"` // ISO 8601
Title string `json:"title"`
Year string `json:"year,omitempty"`
Season int `json:"season,omitempty"`
Episode int `json:"episode,omitempty"`
Quality string `json:"quality,omitempty"` // "1080p" etc (from filename)
Codec string `json:"codec,omitempty"` // "x265" etc (from filename)
MediaInfo *mediainfo.MediaInfo `json:"mediaInfo,omitempty"`
ScanError string `json:"scanError,omitempty"`
FilePath string `json:"filePath"`
FileName string `json:"fileName"`
FileSize int64 `json:"fileSize"`
ModTime string `json:"modTime"` // ISO 8601
// Fingerprint is a stable content identity (see fingerprint.go). Cached so
// incremental scans reuse it when size+mtime are unchanged.
Fingerprint string `json:"fingerprint,omitempty"`
Title string `json:"title"`
Year string `json:"year,omitempty"`
Season int `json:"season,omitempty"`
Episode int `json:"episode,omitempty"`
Quality string `json:"quality,omitempty"` // "1080p" etc (from filename)
Codec string `json:"codec,omitempty"` // "x265" etc (from filename)
MediaInfo *mediainfo.MediaInfo `json:"mediaInfo,omitempty"`
ScanError string `json:"scanError,omitempty"`
}
// LibraryCache is the on-disk cache of scanned library items.