agent: recover down components on startup when no boot sequence
A unikernel VM has no runtime restart policy, so if it exits — including when an agent restart's cgroup kill takes it down — nothing restarts it, and it sits in drift. Recover() already handles this (unikernels included, via runtimeFor), but it only ran inside RunBootSequence, which is gated on a [boot] sequence that worker nodes don't define. Now the agent also runs Recover once in the background on startup when there is no boot sequence, so desired=running components (VMs especially) come back after an agent or host restart without delaying registration.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
@@ -11,6 +11,7 @@ import (
|
|||||||
"os/signal"
|
"os/signal"
|
||||||
"path/filepath"
|
"path/filepath"
|
||||||
"syscall"
|
"syscall"
|
||||||
|
"time"
|
||||||
|
|
||||||
mcpv1 "git.wntrmute.dev/mc/mcp/gen/mcp/v1"
|
mcpv1 "git.wntrmute.dev/mc/mcp/gen/mcp/v1"
|
||||||
"git.wntrmute.dev/mc/mcp/internal/auth"
|
"git.wntrmute.dev/mc/mcp/internal/auth"
|
||||||
@@ -155,6 +156,20 @@ func Run(cfg *config.AgentConfig, version string) error {
|
|||||||
logger.Error("boot sequence failed", "err", err)
|
logger.Error("boot sequence failed", "err", err)
|
||||||
// Continue starting the gRPC server — partial boot is better than no agent.
|
// Continue starting the gRPC server — partial boot is better than no agent.
|
||||||
}
|
}
|
||||||
|
} else {
|
||||||
|
// No ordered boot sequence: still reconcile once, in the background, so
|
||||||
|
// desired=running components are brought back up after an agent or host
|
||||||
|
// restart without delaying registration. This matters most for
|
||||||
|
// unikernel VMs: unlike podman containers (which have a restart policy),
|
||||||
|
// a VM that exits — including when an agent restart's cgroup kill takes
|
||||||
|
// it down — has nothing to restart it.
|
||||||
|
go func() {
|
||||||
|
recCtx, recCancel := context.WithTimeout(context.Background(), 10*time.Minute)
|
||||||
|
defer recCancel()
|
||||||
|
if err := a.Recover(recCtx); err != nil {
|
||||||
|
logger.Error("startup recover failed", "err", err)
|
||||||
|
}
|
||||||
|
}()
|
||||||
}
|
}
|
||||||
|
|
||||||
// Start heartbeat client (registers with master and sends heartbeats).
|
// Start heartbeat client (registers with master and sends heartbeats).
|
||||||
|
|||||||
Reference in New Issue
Block a user