From 2398707cc103c9b8c9cbec6c858cdcd56ac0719d Mon Sep 17 00:00:00 2001 From: Deivid Soto Date: Wed, 8 Apr 2026 00:06:19 +0200 Subject: [PATCH] fix(ws): add ping/pong keepalive and read deadline to detect zombie connections MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Without a SetReadDeadline, a silently dead WebSocket (e.g. Cloudflare dropping the connection without a close frame) would block readLoop forever. The daemon would appear connected but never receive tasks, and never fall back to HTTP polling. - Send RFC 6455 pings every 30s (resets Cloudflare's idle timer) - SetReadDeadline of 45s, refreshed on every pong and text message - SetWriteDeadline of 10s on all writes to prevent blocked sends - On timeout, readLoop emits "disconnected" → HybridTransport falls back to HTTP and starts WS reconnection loop --- internal/agent/transport_ws.go | 44 ++++++++++++++++++++++++++++++++++ 1 file changed, 44 insertions(+) diff --git a/internal/agent/transport_ws.go b/internal/agent/transport_ws.go index 9d50f9e..4860ca5 100644 --- a/internal/agent/transport_ws.go +++ b/internal/agent/transport_ws.go @@ -226,10 +226,51 @@ func (t *WSTransport) send(msg any) error { if err != nil { return err } + _ = t.conn.SetWriteDeadline(time.Now().Add(10 * time.Second)) return t.conn.WriteMessage(websocket.TextMessage, data) } func (t *WSTransport) readLoop(conn *websocket.Conn) { + // Cloudflare idle timeout is 100s. We send pings every 30s and expect + // either a pong or a server message within 45s. If neither arrives, + // the read deadline fires and we detect the zombie connection. + const ( + pongWait = 45 * time.Second + pingPeriod = 30 * time.Second + ) + + _ = conn.SetReadDeadline(time.Now().Add(pongWait)) + conn.SetPongHandler(func(string) error { + _ = conn.SetReadDeadline(time.Now().Add(pongWait)) + return nil + }) + + // Ping ticker goroutine — stops when readLoop returns. + pingDone := make(chan struct{}) + go func() { + ticker := time.NewTicker(pingPeriod) + defer ticker.Stop() + for { + select { + case <-ticker.C: + t.mu.Lock() + if t.conn != nil { + _ = t.conn.SetWriteDeadline(time.Now().Add(10 * time.Second)) + err := t.conn.WriteMessage(websocket.PingMessage, nil) + _ = t.conn.SetWriteDeadline(time.Time{}) + if err != nil { + t.mu.Unlock() + return + } + } + t.mu.Unlock() + case <-pingDone: + return + } + } + }() + defer close(pingDone) + for { _, msg, err := conn.ReadMessage() if err != nil { @@ -244,6 +285,9 @@ func (t *WSTransport) readLoop(conn *websocket.Conn) { return } + // Any message (text or pong) proves the connection is alive. + _ = conn.SetReadDeadline(time.Now().Add(pongWait)) + var envelope struct { Type string `json:"type"` }