fix(agent): add retry with backoff and WebSocket connect for daemon registration

This commit is contained in:
Deivid Soto 2026-04-06 17:26:32 +02:00
parent 8388220dae
commit 6f81a2f3ea
4 changed files with 73 additions and 6 deletions

View file

@ -246,14 +246,14 @@ func (c *Client) handleResponse(resp *http.Response, dst any) error {
// Try to parse as JSON error
var errResp ErrorResponse
if json.Unmarshal(body, &errResp) == nil && errResp.Error != "" {
return fmt.Errorf("API error %d: %s", resp.StatusCode, errResp.Error)
return &HTTPError{StatusCode: resp.StatusCode, Message: errResp.Error}
}
// Non-JSON response (e.g. HTML error page) — truncate to something readable
msg := string(body)
if len(msg) > 120 || strings.Contains(msg, "<html") || strings.Contains(msg, "<!DOCTYPE") {
msg = fmt.Sprintf("server returned %s (non-JSON response, likely a server error)", resp.Status)
}
return fmt.Errorf("API error %d: %s", resp.StatusCode, msg)
return &HTTPError{StatusCode: resp.StatusCode, Message: msg}
}
if dst != nil {

View file

@ -2,10 +2,12 @@ package agent
import (
"context"
"errors"
"fmt"
"log"
"os"
"runtime"
"strings"
"sync/atomic"
"time"
)
@ -76,6 +78,7 @@ func NewDaemon(cfg DaemonConfig, transport Transport) *Daemon {
func (d *Daemon) Transport() Transport { return d.transport }
// Register registers the agent and fetches user info + features.
// Retries with exponential backoff on transient errors (429, 5xx, network).
func (d *Daemon) Register(ctx context.Context) error {
req := RegisterRequest{
AgentID: d.cfg.AgentID,
@ -90,9 +93,32 @@ func (d *Daemon) Register(ctx context.Context) error {
req.DiskTotalBytes = total
}
resp, err := d.transport.Register(ctx, req)
const maxRetries = 5
backoff := 5 * time.Second
var resp *RegisterResponse
var err error
for attempt := range maxRetries {
resp, err = d.transport.Register(ctx, req)
if err == nil {
break
}
// Only retry on transient errors (429, 5xx, network failures)
if !isTransientError(err) {
return fmt.Errorf("register: %w", err)
}
log.Printf("Register failed (attempt %d/%d): %v - retrying in %v", attempt+1, maxRetries, err, backoff)
timer := time.NewTimer(backoff)
select {
case <-ctx.Done():
timer.Stop()
return fmt.Errorf("register: %w", ctx.Err())
case <-timer.C:
}
backoff = min(backoff*2, 60*time.Second)
}
if err != nil {
return fmt.Errorf("register: %w", err)
return fmt.Errorf("register: %w (after %d retries)", err, maxRetries)
}
d.User = resp.User
@ -118,8 +144,14 @@ func (d *Daemon) Register(ctx context.Context) error {
return nil
}
// Run starts the main daemon loop. Blocks until ctx is cancelled.
// Run connects the transport, registers the agent, and starts the main loop.
// Blocks until ctx is cancelled. Callers must NOT call transport.Connect before Run.
func (d *Daemon) Run(ctx context.Context) error {
// Connect transport (establishes WebSocket if available, falls back to HTTP)
if err := d.transport.Connect(ctx); err != nil {
return fmt.Errorf("connect transport: %w", err)
}
// Register
if err := d.Register(ctx); err != nil {
return err
@ -265,6 +297,26 @@ func (d *Daemon) deregister() {
RemoveState()
}
// isTransientError reports whether err is worth retrying.
//
// Classification, in order:
//   - nil is never retried.
//   - context.Canceled anywhere in the chain is never retried: the caller has
//     given up, so another attempt cannot succeed.
//   - An *HTTPError anywhere in the chain decides by status code alone:
//     429 and 5xx are retried, every other status is not.
//   - context.DeadlineExceeded, or an error in the chain that exposes
//     Timeout() bool (the net.Error convention) and reports true, is retried.
//   - Otherwise the lowercased message is matched against known
//     network-failure phrases. This covers errors whose cause was formatted
//     into the message instead of wrapped with %w.
func isTransientError(err error) bool {
	if err == nil || errors.Is(err, context.Canceled) {
		return false
	}

	// Structured check: HTTPError carries the status code directly.
	var httpErr *HTTPError
	if errors.As(err, &httpErr) {
		return httpErr.StatusCode == 429 || httpErr.StatusCode >= 500
	}

	// Typed timeout check, so a timeout is recognised even when its message
	// does not contain the word "timeout".
	if errors.Is(err, context.DeadlineExceeded) {
		return true
	}
	var timeoutErr interface{ Timeout() bool }
	if errors.As(err, &timeoutErr) && timeoutErr.Timeout() {
		return true
	}

	// Fallback: network-level errors (no HTTP response received), identified
	// by message text.
	lower := strings.ToLower(err.Error())
	if strings.Contains(lower, "context canceled") {
		// Cancellation that lost its type because it was not wrapped with %w.
		return false
	}
	for _, keyword := range []string{"connection refused", "no such host", "timeout", "request failed"} {
		if strings.Contains(lower, keyword) {
			return true
		}
	}
	return false
}
func (d *Daemon) poll(ctx context.Context) {
resp, err := d.transport.ClaimTasks(ctx, d.cfg.AgentID)
if err != nil {

View file

@ -6,6 +6,7 @@ import "context"
// Both WebSocket (via CF Durable Object) and HTTP (direct to origin) implement this.
type Transport interface {
// Connect establishes the transport connection.
// Called internally by Daemon.Run — callers must NOT call Connect separately.
Connect(ctx context.Context) error
// Close tears down the connection gracefully.

View file

@ -1,6 +1,9 @@
package agent
import "time"
import (
"fmt"
"time"
)
// RegisterRequest is sent by the CLI on startup to register itself.
type RegisterRequest struct {
@ -147,6 +150,17 @@ type ErrorResponse struct {
Details any `json:"details,omitempty"`
}
// HTTPError is an API failure that keeps the HTTP status code alongside the
// message, so callers can branch on the code instead of parsing text.
// Retrieve it from a wrapped error chain with errors.As.
type HTTPError struct {
	// StatusCode is the HTTP status code of the failed response.
	StatusCode int
	// Message is the human-readable description of the failure.
	Message string
}

// Error implements the error interface, rendering the value as
// "API error <status code>: <message>".
func (e *HTTPError) Error() string {
	code, detail := e.StatusCode, e.Message
	return fmt.Sprintf("API error %d: %s", code, detail)
}
// AgentInfo holds metadata about the running agent for display.
type AgentInfo struct {
ID string