fix(upgrade): break auto-apply restart loop (0.9.8)
Some checks failed
Release / release (push) Failing after 0s
Release / docker (push) Has been skipped
Release / virustotal (push) Failing after 0s

Two bugs in 0.9.6/0.9.7 caused an infinite restart loop after a Force update
signal: the CLI never reported the upgrade outcome, so `upgrade_requested`
stayed `true`; AND `applyAutoUpgrade` called `os.Exit(0)` even when the
target version equalled the current one, so systemd respawned and saw the
flag again.

  - new Client.ReportUpgradeResult → POST /api/internal/agent/upgrade-result
  - applyAutoUpgrade calls it on success / failure / no-op
  - no-op case detected up front (same version) — skips Execute + Exit,
    clears server flag instead
This commit is contained in:
Deivid Soto 2026-05-27 08:18:33 +02:00
parent 7e96976257
commit 2e7cd7e8ed
4 changed files with 77 additions and 2 deletions

View file

@ -91,6 +91,24 @@ func (c *Client) Deregister(ctx context.Context, agentID string) error {
return nil
}
// ReportUpgradeResult notifies the server how a previously requested upgrade
// ended by POSTing to /api/internal/agent/upgrade-result. The server uses this
// report to clear `upgrade_requested`; if the report never arrives the flag
// stays set and every later sync re-triggers applyAutoUpgrade, including the
// no-op case where the agent is already on the target version.
//
// version and errMsg are left out of the JSON payload when empty. Any error
// from doPost is returned wrapped with the "report upgrade result" prefix.
func (c *Client) ReportUpgradeResult(ctx context.Context, agentID string, success bool, version, errMsg string) error {
	type upgradeResult struct {
		AgentID string `json:"agentId"`
		Success bool   `json:"success"`
		Version string `json:"version,omitempty"`
		Error   string `json:"error,omitempty"`
	}

	payload := upgradeResult{
		AgentID: agentID,
		Success: success,
		Version: version,
		Error:   errMsg,
	}

	// ack is handed to doPost as the response target but is not inspected
	// afterwards; only the error matters to the caller.
	var ack StatusResponse
	err := c.doPost(ctx, "/api/internal/agent/upgrade-result", payload, &ack)
	if err == nil {
		return nil
	}
	return fmt.Errorf("report upgrade result: %w", err)
}
// ReportStatus reports download progress. Returns server-side flags the CLI must act on.
func (c *Client) ReportStatus(ctx context.Context, update StatusUpdate) (*StatusResponse, error) {
var resp StatusResponse

View file

@ -294,8 +294,30 @@ func (d *Daemon) Deregister() {
// supervisor (systemd Restart=always on Linux) respawns on the new binary.
// Triggered by the server's upgrade signal — opt-in flag set by the user from
// the web UI; the daemon never auto-upgrades on a passive version bump.
//
// Reports the outcome to /api/internal/agent/upgrade-result so the server
// clears `upgrade_requested`. Without this report the flag stays sticky and
// the daemon would loop on every sync — including the no-op case where it's
// already on the target version.
func (d *Daemon) applyAutoUpgrade(targetVersion string) {
currentClean := strings.TrimPrefix(d.cfg.Version, "v")
targetClean := strings.TrimPrefix(targetVersion, "v")
// No-op: the server signal arrived but we're already running the target. This
// happens when the daemon restarts after a previous auto-upgrade before
// ReportUpgradeResult cleared the flag, or when the operator manually
// installed the same version out-of-band. Skip Execute (which would also
// no-op) AND skip os.Exit, but DO clear the flag — otherwise we loop.
if currentClean == targetClean {
log.Printf("[upgrade] already on v%s — clearing server flag", currentClean)
ctxR, cancelR := context.WithTimeout(context.Background(), 10*time.Second)
defer cancelR()
if err := d.client.ReportUpgradeResult(ctxR, d.cfg.AgentID, true, currentClean, ""); err != nil {
log.Printf("[upgrade] report-result failed (will retry on next signal): %v", err)
}
return
}
upgrader := &upgrade.Upgrader{
CurrentVersion: currentClean,
OnProgress: func(msg string) {
@ -307,10 +329,24 @@ func (d *Daemon) applyAutoUpgrade(targetVersion string) {
result := upgrader.Execute(ctx, targetVersion)
if !result.Success {
log.Printf("[upgrade] auto-upgrade failed: %v", result.Error)
errMsg := ""
if result.Error != nil {
errMsg = result.Error.Error()
}
ctxR, cancelR := context.WithTimeout(context.Background(), 10*time.Second)
defer cancelR()
if err := d.client.ReportUpgradeResult(ctxR, d.cfg.AgentID, false, targetClean, errMsg); err != nil {
log.Printf("[upgrade] report-result failed: %v", err)
}
return
}
log.Printf("[upgrade] upgraded v%s → v%s; exiting so service supervisor restarts on new binary",
log.Printf("[upgrade] upgraded v%s → v%s; reporting result + exiting so service supervisor restarts on new binary",
result.OldVersion, result.NewVersion)
ctxR, cancelR := context.WithTimeout(context.Background(), 10*time.Second)
if err := d.client.ReportUpgradeResult(ctxR, d.cfg.AgentID, true, result.NewVersion, ""); err != nil {
log.Printf("[upgrade] report-result failed: %v", err)
}
cancelR()
time.Sleep(500 * time.Millisecond)
os.Exit(0)
}