package agent

import (
	"context"
	"fmt"

	"git.wntrmute.dev/mc/mcp/internal/registry"
	"git.wntrmute.dev/mc/mcp/internal/runtime"
)

// Recover recreates containers from the agent's registry for all services
// whose desired state is "running" but which don't have a running container
// in podman. This is the recovery path after a podman database loss (e.g.,
// after a UID change or podman reset).
//
// Recover does NOT pull images — it assumes the images are cached locally.
// If an image is missing, that component is skipped with a warning.
func (a *Agent) Recover(ctx context.Context) error {
	services, err := registry.ListServices(a.DB)
	if err != nil {
		return fmt.Errorf("list services: %w", err)
	}

	// Snapshot what podman currently reports as running. A failure here is
	// downgraded to "assume nothing is running" so recovery can proceed.
	containers, listErr := a.Runtime.List(ctx)
	if listErr != nil {
		a.Logger.Warn("cannot list containers, assuming none running", "err", listErr)
		containers = nil
	}
	live := make(map[string]bool, len(containers))
	for _, c := range containers {
		live[c.Name] = true
	}

	var recovered, skipped, already int
	for _, svc := range services {
		if !svc.Active {
			continue
		}
		components, compErr := registry.ListComponents(a.DB, svc.Name)
		if compErr != nil {
			a.Logger.Warn("list components", "service", svc.Name, "err", compErr)
			continue
		}
		for _, comp := range components {
			if comp.DesiredState != "running" {
				continue
			}

			// A component named after its service uses the bare service
			// name; any other component gets "<service>-<component>".
			name := svc.Name
			if comp.Name != svc.Name {
				name = svc.Name + "-" + comp.Name
			}

			// Nothing to do when the container is already up.
			if live[name] {
				already++
				continue
			}

			a.Logger.Info("recovering container",
				"service", svc.Name,
				"component", comp.Name,
				"image", comp.Image,
			)

			// Clear out any stale container holding this name; the error
			// is deliberately ignored (the container may simply not exist).
			_ = a.Runtime.Remove(ctx, name)

			// Rebuild the container spec from registry state.
			spec := runtime.ContainerSpec{
				Name:    name,
				Image:   comp.Image,
				Network: comp.Network,
				User:    comp.UserSpec,
				Restart: comp.Restart,
				Volumes: comp.Volumes,
				Cmd:     comp.Cmd,
			}

			// Start from the registry's static ports; route-derived ports
			// and env are layered on only when allocation succeeds.
			spec.Ports = comp.Ports
			if len(comp.Routes) > 0 && a.PortAlloc != nil {
				extra, env, allocErr := a.allocateRoutePorts(svc.Name, comp.Name, comp.Routes)
				if allocErr != nil {
					a.Logger.Warn("allocate route ports", "container", name, "err", allocErr)
				} else {
					spec.Ports = append(comp.Ports, extra...)
					spec.Env = append(spec.Env, env...)
				}
			}

			if runErr := a.Runtime.Run(ctx, spec); runErr != nil {
				a.Logger.Error("recover container failed",
					"container", name,
					"err", runErr,
				)
				skipped++
				continue
			}

			// Re-register mc-proxy routes.
			if a.Proxy != nil && len(comp.Routes) > 0 {
				if hostPorts, hpErr := registry.GetRouteHostPorts(a.DB, svc.Name, comp.Name); hpErr == nil {
					if proxyErr := a.Proxy.RegisterRoutes(ctx, svc.Name, comp.Routes, hostPorts); proxyErr != nil {
						a.Logger.Warn("re-register routes", "service", svc.Name, "err", proxyErr)
					}
				}
			}

			// Provision TLS certs if needed.
			if a.Certs != nil && hasL7Routes(comp.Routes) {
				hostnames := l7Hostnames(svc.Name, comp.Routes)
				if certErr := a.Certs.EnsureCert(ctx, svc.Name, hostnames); certErr != nil {
					a.Logger.Warn("cert provisioning", "service", svc.Name, "err", certErr)
				}
			}

			recovered++
			a.Logger.Info("container recovered", "container", name)
		}
	}

	a.Logger.Info("recovery complete",
		"recovered", recovered,
		"skipped", skipped,
		"already_running", already,
	)
	return nil
}

// hasL7Routes and l7Hostnames are defined in deploy.go.