fix(debrid): fsync before verify to stop NFS write-back truncation

Debrid downloads to a network mount (the prod download dir is an NFS share
at /mnt/nas/peliculas) could fail verification with "size mismatch:
expected N, got M" where the file on disk was smaller than the bytes
streamed. Every Write() succeeded into the client page cache (progress
reached 100%) but the function returned before the asynchronous write-back
to the NAS completed, so the caller's verify() stat'd a half-flushed file
and rejected it. Concurrent downloads to the same mount made the contention
worse; a lone retry succeeded because there was no flush pressure.

- fsync the file (file.Sync) and check the error before reporting success,
  so a write-back failure surfaces here instead of silently truncating.
- Stop swallowing the file.Close() error via a guarded close: error paths
  still clean up through defer, the success path closes explicitly and
  inspects the error.
- Guard against a premature end-of-stream (downloaded < Content-Length).
- Safety-net re-stat after the flush: if the on-disk size is short, remove
  the corrupt partial and return a retryable error. This is also the only
  integrity check when the server sends no Content-Length.

Add TestDebridDownloadTruncatedStream.
This commit is contained in:
Deivid Soto 2026-06-15 17:11:48 +02:00
parent cff89fbe31
commit 47521effea
2 changed files with 88 additions and 1 deletions

View file

@ -188,7 +188,19 @@ func (d *DebridDownloader) Download(ctx context.Context, task *Task, outputDir s
if err != nil { if err != nil {
return nil, fmt.Errorf("open file: %w", err) return nil, fmt.Errorf("open file: %w", err)
} }
defer file.Close() // Guarded close: error paths below clean up the fd via defer, while the
// success path closes explicitly and inspects the error (a swallowed Close
// error hides write-back failures on network mounts — the root cause of the
// 2026-06-15 NFS truncation incident).
closed := false
closeFile := func() error {
if closed {
return nil
}
closed = true
return file.Close()
}
defer func() { _ = closeFile() }()
// Download with progress reporting // Download with progress reporting
downloaded := startOffset downloaded := startOffset
@ -260,6 +272,40 @@ func (d *DebridDownloader) Download(ctx context.Context, task *Task, outputDir s
} }
} }
// Guard against a premature end-of-stream: if the server advertised a length
// and we read fewer bytes, the transfer was truncated (e.g. a debrid CDN edge
// closing the connection). Don't hand a short file to verify as if complete.
if totalBytes > 0 && downloaded < totalBytes {
return nil, fmt.Errorf("incomplete download: got %s of %s", formatBytes(downloaded), formatBytes(totalBytes))
}
// Force the OS to flush the file to durable storage BEFORE we report success.
// Without this, every Write() can succeed into the page cache while the actual
// write-back to a network mount (the prod download dir is an NFS share at
// /mnt/nas/peliculas) lags or fails — verify() then stats a half-flushed file
// and rejects it ("size mismatch"). fsync surfaces a write-back error here,
// where it's actionable, instead of silently truncating the file.
if err := file.Sync(); err != nil {
return nil, fmt.Errorf("flush to disk failed (write-back/network-mount error): %w", err)
}
if err := closeFile(); err != nil {
return nil, fmt.Errorf("close file failed (write-back/network-mount error): %w", err)
}
// Safety net: after a durable flush, the on-disk size must match what we wrote.
// On a stalled mount a write-back error can still leave the file short even
// when Sync/Close returned nil. This is also the ONLY integrity check when the
// server sent no Content-Length (totalBytes == 0 → the guard above is skipped).
// Remove the corrupt partial so a retry starts clean, rather than passing a
// truncated file to verify().
if fi, statErr := os.Stat(destPath); statErr == nil && fi.Size() < downloaded {
if rmErr := os.Remove(destPath); rmErr != nil {
log.Printf("[%s] failed to remove corrupt partial %s: %v", agent.ShortID(task.ID), destPath, rmErr)
}
return nil, fmt.Errorf("post-write size mismatch: wrote %s but file is %s on disk — likely a stalled or failing storage mount (%s)",
formatBytes(downloaded), formatBytes(fi.Size()), outputDir)
}
log.Printf("[%s] debrid download complete: %s (%s)", agent.ShortID(task.ID), fileName, formatBytes(downloaded)) log.Printf("[%s] debrid download complete: %s (%s)", agent.ShortID(task.ID), fileName, formatBytes(downloaded))
return &Result{ return &Result{

View file

@ -96,6 +96,47 @@ func TestDebridDownloadSuccess(t *testing.T) {
} }
} }
// TestDebridDownloadTruncatedStream reproduces the 2026-06-15 incident shape: the
// server advertises a full Content-Length but delivers a short body (a CDN edge
// closing the connection early). The download must surface an error and return NO
// success Result — otherwise verify() would receive a half file. A Content-Length
// shortfall is caught by the read layer (io.ErrUnexpectedEOF); the post-loop length
// guard, Sync, and on-disk re-stat are the added defenses for the clean-EOF and
// write-back-loss variants the read layer can't see.
func TestDebridDownloadTruncatedStream(t *testing.T) {
full := 1024 * 100 // advertised length
delivered := full / 4 // bytes actually written before the handler returns
srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, _ *http.Request) {
w.Header().Set("Content-Length", fmt.Sprintf("%d", full))
w.WriteHeader(http.StatusOK)
w.Write([]byte(strings.Repeat("x", delivered)))
// Handler returns without writing the rest → client sees a short body.
}))
defer srv.Close()
d := NewDebridDownloader()
task := &Task{
ID: "debrid-trunc-001",
InfoHash: "abc123def456abc123def456abc123def456abc1",
Title: "Truncated",
DirectURL: srv.URL + "/file.mkv",
DirectFileName: "Truncated.1080p.mkv",
Status: StatusDownloading,
}
progressCh := make(chan Progress, 100)
result, err := d.Download(context.Background(), task, t.TempDir(), progressCh)
close(progressCh)
if err == nil {
t.Fatal("expected error for a truncated stream, got nil (a short file would be passed to verify as complete)")
}
if result != nil {
t.Errorf("expected nil result on a truncated stream, got %+v", result)
}
}
func TestDebridDownloadNoURL(t *testing.T) { func TestDebridDownloadNoURL(t *testing.T) {
d := NewDebridDownloader() d := NewDebridDownloader()
task := &Task{ID: "no-url-001", DirectURL: ""} task := &Task{ID: "no-url-001", DirectURL: ""}