agent/recover: only treat running containers as up

Recover built its set of already-up containers from every listed container,
so a stopped/exited container or a dead unikernel VM (whose state dir is
still listed) counted as already running and was skipped — the exact reason
dead VMs stayed in drift instead of being recovered. Filter the set to
containers whose State is "running".

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
Kyle Isom
2026-06-11 13:07:41 -07:00
parent fd88ccb9d6
commit 90d47427ff

View File

@@ -21,16 +21,21 @@ func (a *Agent) Recover(ctx context.Context) error {
return fmt.Errorf("list services: %w", err) return fmt.Errorf("list services: %w", err)
} }
// Get the list of currently running containers from podman. // Get the list of containers across runtimes. Only those actually in the
running, err := a.listAllContainers(ctx) // "running" state count as up — a stopped/exited container or a dead
// unikernel VM (whose state dir still exists, so it is listed) must be
// recovered, not skipped.
listed, err := a.listAllContainers(ctx)
if err != nil { if err != nil {
a.Logger.Warn("cannot list containers, assuming none running", "err", err) a.Logger.Warn("cannot list containers, assuming none running", "err", err)
running = nil listed = nil
} }
runningSet := make(map[string]bool) runningSet := make(map[string]bool)
for _, c := range running { for _, c := range listed {
if c.State == "running" {
runningSet[c.Name] = true runningSet[c.Name] = true
} }
}
var recovered, skipped, already int var recovered, skipped, already int