package agent import ( "context" "fmt" "git.wntrmute.dev/mc/mcp/internal/registry" "git.wntrmute.dev/mc/mcp/internal/runtime" ) // Recover recreates containers from the agent's registry for all services // whose desired state is "running" but which don't have a running container // in podman. This is the recovery path after a podman database loss (e.g., // after a UID change or podman reset). // // Recover does NOT pull images — it assumes the images are cached locally. // If an image is missing, that component is skipped with a warning. func (a *Agent) Recover(ctx context.Context) error { services, err := registry.ListServices(a.DB) if err != nil { return fmt.Errorf("list services: %w", err) } // Get the list of containers across runtimes. Only those actually in the // "running" state count as up — a stopped/exited container or a dead // unikernel VM (whose state dir still exists, so it is listed) must be // recovered, not skipped. listed, err := a.listAllContainers(ctx) if err != nil { a.Logger.Warn("cannot list containers, assuming none running", "err", err) listed = nil } runningSet := make(map[string]bool) for _, c := range listed { if c.State == "running" { runningSet[c.Name] = true } } var recovered, skipped, already int for _, svc := range services { if !svc.Active { continue } comps, err := registry.ListComponents(a.DB, svc.Name) if err != nil { a.Logger.Warn("list components", "service", svc.Name, "err", err) continue } for _, comp := range comps { if comp.DesiredState != "running" { continue } containerName := svc.Name + "-" + comp.Name if comp.Name == svc.Name { containerName = svc.Name } // Skip if container is already running. if runningSet[containerName] { already++ continue } a.Logger.Info("recovering container", "service", svc.Name, "component", comp.Name, "image", comp.Image, ) rt := a.runtimeFor(comp.Runtime) // Remove any stale container with the same name. _ = rt.Remove(ctx, containerName) // Build the container spec from the registry. spec := runtime.ContainerSpec{ Name: containerName, Image: comp.Image, Network: comp.Network, User: comp.UserSpec, Restart: comp.Restart, Volumes: comp.Volumes, Cmd: comp.Cmd, MemoryMB: comp.MemoryMB, VCPUs: comp.VCPUs, } // Allocate ports from routes if the component has routes. if len(comp.Routes) > 0 && a.PortAlloc != nil { ports, env, allocErr := a.allocateRoutePorts(svc.Name, comp.Name, comp.Routes) if allocErr != nil { a.Logger.Warn("allocate route ports", "container", containerName, "err", allocErr) spec.Ports = comp.Ports } else { spec.Ports = append(comp.Ports, ports...) spec.Env = append(spec.Env, env...) } } else { spec.Ports = comp.Ports } if err := rt.Run(ctx, spec); err != nil { a.Logger.Error("recover container failed", "container", containerName, "err", err, ) skipped++ continue } // Re-register mc-proxy routes. if a.Proxy != nil && len(comp.Routes) > 0 { hostPorts, hpErr := registry.GetRouteHostPorts(a.DB, svc.Name, comp.Name) if hpErr == nil { if proxyErr := a.Proxy.RegisterRoutes(ctx, svc.Name, comp.Routes, hostPorts); proxyErr != nil { a.Logger.Warn("re-register routes", "service", svc.Name, "err", proxyErr) } } } // Provision TLS certs if needed. if a.Certs != nil && hasL7Routes(comp.Routes) { hostnames := l7Hostnames(svc.Name, comp.Routes) if certErr := a.Certs.EnsureCert(ctx, svc.Name, hostnames); certErr != nil { a.Logger.Warn("cert provisioning", "service", svc.Name, "err", certErr) } } recovered++ a.Logger.Info("container recovered", "container", containerName) } } a.Logger.Info("recovery complete", "recovered", recovered, "skipped", skipped, "already_running", already, ) return nil } // hasL7Routes and l7Hostnames are defined in deploy.go.