From 2e7cd7e8ed0dbc3b730a4b82867263ba7e1369f7 Mon Sep 17 00:00:00 2001 From: Deivid Soto Date: Wed, 27 May 2026 08:18:33 +0200 Subject: [PATCH] fix(upgrade): break auto-apply restart loop (0.9.8) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Two bugs in 0.9.6/0.9.7 caused an infinite restart loop after a Force update signal: the CLI never reported the upgrade outcome, so `upgrade_requested` stayed `true`; AND `applyAutoUpgrade` called `os.Exit(0)` even when the target version equalled the current one, so systemd respawned and saw the flag again. - new Client.ReportUpgradeResult → POST /api/internal/agent/upgrade-result - applyAutoUpgrade calls it on success / failure / no-op - no-op case detected up front (same version) — skips Execute + Exit, clears server flag instead --- CHANGELOG.md | 21 +++++++++++++++++++++ internal/agent/client.go | 18 ++++++++++++++++++ internal/agent/daemon.go | 38 +++++++++++++++++++++++++++++++++++++- internal/cmd/version.go | 2 +- 4 files changed, 77 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 50cfa98..7a2366f 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,6 +5,27 @@ All notable changes to this project will be documented in this file. The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/), and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). +## [0.9.8] - 2026-05-27 + +### Fixed + +- **auto-upgrade restart loop**: when the server signal arrived for a version + the daemon was already running (e.g. flag still set after a previous + upgrade), `applyAutoUpgrade` would call `upgrade.Execute` (which no-ops), + then `os.Exit(0)` anyway — systemd respawned, the flag was still set, the + cycle repeated. Now: no-op case is detected up front, the daemon clears + the server flag via `/api/internal/agent/upgrade-result` and stays alive. 
+- **upgrade flag stuck after success**: the CLI never reported the upgrade + outcome, so `upgrade_requested` stayed `true` in the DB forever. The + daemon now calls `/api/internal/agent/upgrade-result` on every `applyAutoUpgrade` + branch (success, failure, no-op) — server clears the flag, restart loops + end. + +### Added + +- New `Client.ReportUpgradeResult(ctx, agentID, success, version, errMsg)` HTTP + method wrapping `POST /api/internal/agent/upgrade-result`. + ## [0.9.7] - 2026-05-26 ### Added diff --git a/internal/agent/client.go b/internal/agent/client.go index 9aa3c2a..e60b0a4 100644 --- a/internal/agent/client.go +++ b/internal/agent/client.go @@ -91,6 +91,24 @@ func (c *Client) Deregister(ctx context.Context, agentID string) error { return nil } +// ReportUpgradeResult tells the server the outcome of a previously requested +// upgrade so the server can clear `upgrade_requested`. Without this call the +// flag stays sticky and the daemon would re-trigger applyAutoUpgrade on every +// sync after upgrade — even for "already on target version" no-ops. +func (c *Client) ReportUpgradeResult(ctx context.Context, agentID string, success bool, version, errMsg string) error { + req := struct { + AgentID string `json:"agentId"` + Success bool `json:"success"` + Version string `json:"version,omitempty"` + Error string `json:"error,omitempty"` + }{AgentID: agentID, Success: success, Version: version, Error: errMsg} + var resp StatusResponse + if err := c.doPost(ctx, "/api/internal/agent/upgrade-result", req, &resp); err != nil { + return fmt.Errorf("report upgrade result: %w", err) + } + return nil +} + // ReportStatus reports download progress. Returns server-side flags the CLI must act on. 
func (c *Client) ReportStatus(ctx context.Context, update StatusUpdate) (*StatusResponse, error) { var resp StatusResponse diff --git a/internal/agent/daemon.go b/internal/agent/daemon.go index e79fc0a..68a187f 100644 --- a/internal/agent/daemon.go +++ b/internal/agent/daemon.go @@ -294,8 +294,30 @@ func (d *Daemon) Deregister() { // supervisor (systemd Restart=always on Linux) respawns on the new binary. // Triggered by the server's upgrade signal — opt-in flag set by the user from // the web UI; the daemon never auto-upgrades on a passive version bump. +// +// Reports the outcome to /api/internal/agent/upgrade-result so the server +// clears `upgrade_requested`. Without this report the flag stays sticky and +// the daemon would loop on every sync — including the no-op case where it's +// already on the target version. func (d *Daemon) applyAutoUpgrade(targetVersion string) { currentClean := strings.TrimPrefix(d.cfg.Version, "v") + targetClean := strings.TrimPrefix(targetVersion, "v") + + // No-op: server signal arrived but we're already running the target. This + // happens when the daemon restarts after a previous auto-upgrade before + // ReportUpgradeResult cleared the flag, or when the operator manually + // installed the same version out-of-band. Skip Execute (which would also + // no-op) AND skip os.Exit, but DO clear the flag — otherwise we loop. 
+ if currentClean == targetClean { + log.Printf("[upgrade] already on v%s — clearing server flag", currentClean) + ctxR, cancelR := context.WithTimeout(context.Background(), 10*time.Second) + defer cancelR() + if err := d.client.ReportUpgradeResult(ctxR, d.cfg.AgentID, true, currentClean, ""); err != nil { + log.Printf("[upgrade] report-result failed (will retry on next signal): %v", err) + } + return + } + upgrader := &upgrade.Upgrader{ CurrentVersion: currentClean, OnProgress: func(msg string) { @@ -307,10 +329,24 @@ func (d *Daemon) applyAutoUpgrade(targetVersion string) { result := upgrader.Execute(ctx, targetVersion) if !result.Success { log.Printf("[upgrade] auto-upgrade failed: %v", result.Error) + errMsg := "" + if result.Error != nil { + errMsg = result.Error.Error() + } + ctxR, cancelR := context.WithTimeout(context.Background(), 10*time.Second) + defer cancelR() + if err := d.client.ReportUpgradeResult(ctxR, d.cfg.AgentID, false, targetClean, errMsg); err != nil { + log.Printf("[upgrade] report-result failed: %v", err) + } return } - log.Printf("[upgrade] upgraded v%s → v%s; exiting so service supervisor restarts on new binary", + log.Printf("[upgrade] upgraded v%s → v%s; reporting result + exiting so service supervisor restarts on new binary", result.OldVersion, result.NewVersion) + ctxR, cancelR := context.WithTimeout(context.Background(), 10*time.Second) + if err := d.client.ReportUpgradeResult(ctxR, d.cfg.AgentID, true, result.NewVersion, ""); err != nil { + log.Printf("[upgrade] report-result failed: %v", err) + } + cancelR() time.Sleep(500 * time.Millisecond) os.Exit(0) } diff --git a/internal/cmd/version.go b/internal/cmd/version.go index 1fd2df9..379f923 100644 --- a/internal/cmd/version.go +++ b/internal/cmd/version.go @@ -1,4 +1,4 @@ package cmd // Version is the CLI version. Overridden by goreleaser ldflags at release time. -var Version = "0.9.7" +var Version = "0.9.8"