From 98b166fa7b32b609c879ce325955d6cb48d15433 Mon Sep 17 00:00:00 2001 From: Kyle Isom Date: Thu, 11 Jun 2026 12:58:50 -0700 Subject: [PATCH] agent: recover down components on startup when no boot sequence MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit A unikernel VM has no runtime restart policy, so if it exits — including when an agent restart's cgroup kill takes it down — nothing restarts it, and it sits in drift. Recover() already handles this (and unikernels, via runtimeFor), but only ran inside RunBootSequence, which is gated on a [boot] sequence that worker nodes don't define. Now the agent also runs Recover once in the background on startup when there is no boot sequence, so desired=running components (VMs especially) come back after an agent or host restart without delaying registration. Co-Authored-By: Claude Opus 4.8 --- internal/agent/agent.go | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/internal/agent/agent.go b/internal/agent/agent.go index 4331c09..e1f7624 100644 --- a/internal/agent/agent.go +++ b/internal/agent/agent.go @@ -11,6 +11,7 @@ import ( "os/signal" "path/filepath" "syscall" + "time" mcpv1 "git.wntrmute.dev/mc/mcp/gen/mcp/v1" "git.wntrmute.dev/mc/mcp/internal/auth" @@ -155,6 +156,20 @@ func Run(cfg *config.AgentConfig, version string) error { logger.Error("boot sequence failed", "err", err) // Continue starting the gRPC server — partial boot is better than no agent. } + } else { + // No ordered boot sequence: still reconcile once, in the background, so + // desired=running components are brought back up after an agent or host + // restart without delaying registration. This matters most for + // unikernel VMs: unlike podman containers (which have a restart policy), + // a VM that exits — including when an agent restart's cgroup kill takes + // it down — has nothing to restart it. + go func() { + recCtx, recCancel := context.WithTimeout(context.Background(), 10*time.Minute) + defer recCancel() + if err := a.Recover(recCtx); err != nil { + logger.Error("startup recover failed", "err", err) + } + }() } // Start heartbeat client (registers with master and sends heartbeats).