fix(debrid): fsync before verify to stop NFS write-back truncation
Debrid downloads to a network mount (the prod download dir is an NFS share at /mnt/nas/peliculas) could fail verification with "size mismatch: expected N, got M" where the file on disk was smaller than the bytes streamed. Every Write() succeeded into the client page cache (progress reached 100%) but the function returned before the asynchronous write-back to the NAS completed, so the caller's verify() stat'd a half-flushed file and rejected it. Concurrent downloads to the same mount made the contention worse; a lone retry succeeded because there was no flush pressure. - fsync the file (file.Sync) and check the error before reporting success, so a write-back failure surfaces here instead of silently truncating. - Stop swallowing the file.Close() error via a guarded close: error paths still clean up through defer, the success path closes explicitly and inspects the error. - Guard against a premature end-of-stream (downloaded < Content-Length). - Safety-net re-stat after the flush: if the on-disk size is short, remove the corrupt partial and return a retryable error. This is also the only integrity check when the server sends no Content-Length. Add TestDebridDownloadTruncatedStream.
This commit is contained in:
parent
cff89fbe31
commit
47521effea
2 changed files with 88 additions and 1 deletions
|
|
@ -188,7 +188,19 @@ func (d *DebridDownloader) Download(ctx context.Context, task *Task, outputDir s
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return nil, fmt.Errorf("open file: %w", err)
|
return nil, fmt.Errorf("open file: %w", err)
|
||||||
}
|
}
|
||||||
defer file.Close()
|
// Guarded close: error paths below clean up the fd via defer, while the
|
||||||
|
// success path closes explicitly and inspects the error (a swallowed Close
|
||||||
|
// error hides write-back failures on network mounts — the root cause of the
|
||||||
|
// 2026-06-15 NFS truncation incident).
|
||||||
|
closed := false
|
||||||
|
closeFile := func() error {
|
||||||
|
if closed {
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
closed = true
|
||||||
|
return file.Close()
|
||||||
|
}
|
||||||
|
defer func() { _ = closeFile() }()
|
||||||
|
|
||||||
// Download with progress reporting
|
// Download with progress reporting
|
||||||
downloaded := startOffset
|
downloaded := startOffset
|
||||||
|
|
@ -260,6 +272,40 @@ func (d *DebridDownloader) Download(ctx context.Context, task *Task, outputDir s
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Guard against a premature end-of-stream: if the server advertised a length
|
||||||
|
// and we read fewer bytes, the transfer was truncated (e.g. a debrid CDN edge
|
||||||
|
// closing the connection). Don't hand a short file to verify as if complete.
|
||||||
|
if totalBytes > 0 && downloaded < totalBytes {
|
||||||
|
return nil, fmt.Errorf("incomplete download: got %s of %s", formatBytes(downloaded), formatBytes(totalBytes))
|
||||||
|
}
|
||||||
|
|
||||||
|
// Force the OS to flush the file to durable storage BEFORE we report success.
|
||||||
|
// Without this, every Write() can succeed into the page cache while the actual
|
||||||
|
// write-back to a network mount (the prod download dir is an NFS share at
|
||||||
|
// /mnt/nas/peliculas) lags or fails — verify() then stats a half-flushed file
|
||||||
|
// and rejects it ("size mismatch"). fsync surfaces a write-back error here,
|
||||||
|
// where it's actionable, instead of silently truncating the file.
|
||||||
|
if err := file.Sync(); err != nil {
|
||||||
|
return nil, fmt.Errorf("flush to disk failed (write-back/network-mount error): %w", err)
|
||||||
|
}
|
||||||
|
if err := closeFile(); err != nil {
|
||||||
|
return nil, fmt.Errorf("close file failed (write-back/network-mount error): %w", err)
|
||||||
|
}
|
||||||
|
|
||||||
|
// Safety net: after a durable flush, the on-disk size must match what we wrote.
|
||||||
|
// On a stalled mount a write-back error can still leave the file short even
|
||||||
|
// when Sync/Close returned nil. This is also the ONLY integrity check when the
|
||||||
|
// server sent no Content-Length (totalBytes == 0 → the guard above is skipped).
|
||||||
|
// Remove the corrupt partial so a retry starts clean, rather than passing a
|
||||||
|
// truncated file to verify().
|
||||||
|
if fi, statErr := os.Stat(destPath); statErr == nil && fi.Size() < downloaded {
|
||||||
|
if rmErr := os.Remove(destPath); rmErr != nil {
|
||||||
|
log.Printf("[%s] failed to remove corrupt partial %s: %v", agent.ShortID(task.ID), destPath, rmErr)
|
||||||
|
}
|
||||||
|
return nil, fmt.Errorf("post-write size mismatch: wrote %s but file is %s on disk — likely a stalled or failing storage mount (%s)",
|
||||||
|
formatBytes(downloaded), formatBytes(fi.Size()), outputDir)
|
||||||
|
}
|
||||||
|
|
||||||
log.Printf("[%s] debrid download complete: %s (%s)", agent.ShortID(task.ID), fileName, formatBytes(downloaded))
|
log.Printf("[%s] debrid download complete: %s (%s)", agent.ShortID(task.ID), fileName, formatBytes(downloaded))
|
||||||
|
|
||||||
return &Result{
|
return &Result{
|
||||||
|
|
|
||||||
|
|
@ -96,6 +96,47 @@ func TestDebridDownloadSuccess(t *testing.T) {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// TestDebridDownloadTruncatedStream reproduces the 2026-06-15 incident shape: the
|
||||||
|
// server advertises a full Content-Length but delivers a short body (a CDN edge
|
||||||
|
// closing the connection early). The download must surface an error and return NO
|
||||||
|
// success Result — otherwise verify() would receive a half file. A Content-Length
|
||||||
|
// shortfall is caught by the read layer (io.ErrUnexpectedEOF); the post-loop length
|
||||||
|
// guard, Sync, and on-disk re-stat are the added defenses for the clean-EOF and
|
||||||
|
// write-back-loss variants the read layer can't see.
|
||||||
|
func TestDebridDownloadTruncatedStream(t *testing.T) {
|
||||||
|
full := 1024 * 100 // advertised length
|
||||||
|
delivered := full / 4 // bytes actually written before the handler returns
|
||||||
|
|
||||||
|
srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, _ *http.Request) {
|
||||||
|
w.Header().Set("Content-Length", fmt.Sprintf("%d", full))
|
||||||
|
w.WriteHeader(http.StatusOK)
|
||||||
|
w.Write([]byte(strings.Repeat("x", delivered)))
|
||||||
|
// Handler returns without writing the rest → client sees a short body.
|
||||||
|
}))
|
||||||
|
defer srv.Close()
|
||||||
|
|
||||||
|
d := NewDebridDownloader()
|
||||||
|
task := &Task{
|
||||||
|
ID: "debrid-trunc-001",
|
||||||
|
InfoHash: "abc123def456abc123def456abc123def456abc1",
|
||||||
|
Title: "Truncated",
|
||||||
|
DirectURL: srv.URL + "/file.mkv",
|
||||||
|
DirectFileName: "Truncated.1080p.mkv",
|
||||||
|
Status: StatusDownloading,
|
||||||
|
}
|
||||||
|
|
||||||
|
progressCh := make(chan Progress, 100)
|
||||||
|
result, err := d.Download(context.Background(), task, t.TempDir(), progressCh)
|
||||||
|
close(progressCh)
|
||||||
|
|
||||||
|
if err == nil {
|
||||||
|
t.Fatal("expected error for a truncated stream, got nil (a short file would be passed to verify as complete)")
|
||||||
|
}
|
||||||
|
if result != nil {
|
||||||
|
t.Errorf("expected nil result on a truncated stream, got %+v", result)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
func TestDebridDownloadNoURL(t *testing.T) {
|
func TestDebridDownloadNoURL(t *testing.T) {
|
||||||
d := NewDebridDownloader()
|
d := NewDebridDownloader()
|
||||||
task := &Task{ID: "no-url-001", DirectURL: ""}
|
task := &Task{ID: "no-url-001", DirectURL: ""}
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue