From 47521effea0c9e36d18d52bc8848fd4d98aeaa81 Mon Sep 17 00:00:00 2001 From: Deivid Soto Date: Mon, 15 Jun 2026 17:11:48 +0200 Subject: [PATCH] fix(debrid): fsync before verify to stop NFS write-back truncation Debrid downloads to a network mount (the prod download dir is an NFS share at /mnt/nas/peliculas) could fail verification with "size mismatch: expected N, got M" where the file on disk was smaller than the bytes streamed. Every Write() succeeded into the client page cache (progress reached 100%) but the function returned before the asynchronous write-back to the NAS completed, so the caller's verify() stat'd a half-flushed file and rejected it. Concurrent downloads to the same mount made the contention worse; a lone retry succeeded because there was no flush pressure. - fsync the file (file.Sync) and check the error before reporting success, so a write-back failure surfaces here instead of silently truncating. - Stop swallowing the file.Close() error via a guarded close: error paths still clean up through defer, the success path closes explicitly and inspects the error. - Guard against a premature end-of-stream (downloaded < Content-Length). - Safety-net re-stat after the flush: if the on-disk size is short, remove the corrupt partial and return a retryable error. This is also the only integrity check when the server sends no Content-Length. Add TestDebridDownloadTruncatedStream. --- internal/engine/debrid.go | 48 +++++++++++++++++++++++++++++++++- internal/engine/debrid_test.go | 41 +++++++++++++++++++++++++++++ 2 files changed, 88 insertions(+), 1 deletion(-) diff --git a/internal/engine/debrid.go b/internal/engine/debrid.go index 49fe8e6..9867255 100644 --- a/internal/engine/debrid.go +++ b/internal/engine/debrid.go @@ -188,7 +188,19 @@ func (d *DebridDownloader) Download(ctx context.Context, task *Task, outputDir s if err != nil { return nil, fmt.Errorf("open file: %w", err) } - defer file.Close() + // Guarded close: error paths below clean up the fd via defer, while the + // success path closes explicitly and inspects the error (a swallowed Close + // error hides write-back failures on network mounts — the root cause of the + // 2026-06-15 NFS truncation incident). + closed := false + closeFile := func() error { + if closed { + return nil + } + closed = true + return file.Close() + } + defer func() { _ = closeFile() }() // Download with progress reporting downloaded := startOffset @@ -260,6 +272,40 @@ func (d *DebridDownloader) Download(ctx context.Context, task *Task, outputDir s } } + // Guard against a premature end-of-stream: if the server advertised a length + // and we read fewer bytes, the transfer was truncated (e.g. a debrid CDN edge + // closing the connection). Don't hand a short file to verify as if complete. + if totalBytes > 0 && downloaded < totalBytes { + return nil, fmt.Errorf("incomplete download: got %s of %s", formatBytes(downloaded), formatBytes(totalBytes)) + } + + // Force the OS to flush the file to durable storage BEFORE we report success. + // Without this, every Write() can succeed into the page cache while the actual + // write-back to a network mount (the prod download dir is an NFS share at + // /mnt/nas/peliculas) lags or fails — verify() then stats a half-flushed file + // and rejects it ("size mismatch"). fsync surfaces a write-back error here, + // where it's actionable, instead of silently truncating the file. + if err := file.Sync(); err != nil { + return nil, fmt.Errorf("flush to disk failed (write-back/network-mount error): %w", err) + } + if err := closeFile(); err != nil { + return nil, fmt.Errorf("close file failed (write-back/network-mount error): %w", err) + } + + // Safety net: after a durable flush, the on-disk size must match what we wrote. + // On a stalled mount a write-back error can still leave the file short even + // when Sync/Close returned nil. This is also the ONLY integrity check when the + // server sent no Content-Length (totalBytes == 0 → the guard above is skipped). + // Remove the corrupt partial so a retry starts clean, rather than passing a + // truncated file to verify(). + if fi, statErr := os.Stat(destPath); statErr == nil && fi.Size() < downloaded { + if rmErr := os.Remove(destPath); rmErr != nil { + log.Printf("[%s] failed to remove corrupt partial %s: %v", agent.ShortID(task.ID), destPath, rmErr) + } + return nil, fmt.Errorf("post-write size mismatch: wrote %s but file is %s on disk — likely a stalled or failing storage mount (%s)", + formatBytes(downloaded), formatBytes(fi.Size()), outputDir) + } + log.Printf("[%s] debrid download complete: %s (%s)", agent.ShortID(task.ID), fileName, formatBytes(downloaded)) return &Result{ diff --git a/internal/engine/debrid_test.go b/internal/engine/debrid_test.go index b7fc019..3ca9340 100644 --- a/internal/engine/debrid_test.go +++ b/internal/engine/debrid_test.go @@ -96,6 +96,47 @@ func TestDebridDownloadSuccess(t *testing.T) { } } +// TestDebridDownloadTruncatedStream reproduces the 2026-06-15 incident shape: the +// server advertises a full Content-Length but delivers a short body (a CDN edge +// closing the connection early). The download must surface an error and return NO +// success Result — otherwise verify() would receive a half file. A Content-Length +// shortfall is caught by the read layer (io.ErrUnexpectedEOF); the post-loop length +// guard, Sync, and on-disk re-stat are the added defenses for the clean-EOF and +// write-back-loss variants the read layer can't see. +func TestDebridDownloadTruncatedStream(t *testing.T) { + full := 1024 * 100 // advertised length + delivered := full / 4 // bytes actually written before the handler returns + + srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, _ *http.Request) { + w.Header().Set("Content-Length", fmt.Sprintf("%d", full)) + w.WriteHeader(http.StatusOK) + w.Write([]byte(strings.Repeat("x", delivered))) + // Handler returns without writing the rest → client sees a short body. + })) + defer srv.Close() + + d := NewDebridDownloader() + task := &Task{ + ID: "debrid-trunc-001", + InfoHash: "abc123def456abc123def456abc123def456abc1", + Title: "Truncated", + DirectURL: srv.URL + "/file.mkv", + DirectFileName: "Truncated.1080p.mkv", + Status: StatusDownloading, + } + + progressCh := make(chan Progress, 100) + result, err := d.Download(context.Background(), task, t.TempDir(), progressCh) + close(progressCh) + + if err == nil { + t.Fatal("expected error for a truncated stream, got nil (a short file would be passed to verify as complete)") + } + if result != nil { + t.Errorf("expected nil result on a truncated stream, got %+v", result) + } +} + func TestDebridDownloadNoURL(t *testing.T) { d := NewDebridDownloader() task := &Task{ID: "no-url-001", DirectURL: ""}