diff --git a/cmd/mcp-agent/main.go b/cmd/mcp-agent/main.go index 1814687..a14edf8 100644 --- a/cmd/mcp-agent/main.go +++ b/cmd/mcp-agent/main.go @@ -43,6 +43,7 @@ func main() { }) root.AddCommand(snapshotCmd()) + root.AddCommand(recoverCmd()) if err := root.Execute(); err != nil { log.Fatal(err) diff --git a/cmd/mcp-agent/recover.go b/cmd/mcp-agent/recover.go new file mode 100644 index 0000000..077bea4 --- /dev/null +++ b/cmd/mcp-agent/recover.go @@ -0,0 +1,68 @@ +package main + +import ( + "context" + "fmt" + "log/slog" + "os" + + "git.wntrmute.dev/mc/mcp/internal/agent" + "git.wntrmute.dev/mc/mcp/internal/config" + "git.wntrmute.dev/mc/mcp/internal/registry" + "git.wntrmute.dev/mc/mcp/internal/runtime" + "github.com/spf13/cobra" +) + +func recoverCmd() *cobra.Command { + return &cobra.Command{ + Use: "recover", + Short: "Recreate containers from the agent registry", + Long: `Recover recreates containers from the agent's SQLite registry for all +services whose desired state is "running" but which don't have a running +container in podman. + +This is the recovery path after a podman database loss (e.g., after a +UID change, podman reset, or reboot that cleared container state). + +Images must be cached locally — recover does not pull from MCR.`, + RunE: func(cmd *cobra.Command, args []string) error { + cfg, err := config.LoadAgentConfig(cfgPath) + if err != nil { + return fmt.Errorf("load config: %w", err) + } + + logger := slog.New(slog.NewTextHandler(os.Stderr, &slog.HandlerOptions{ + Level: slog.LevelInfo, + })) + + db, err := registry.Open(cfg.Database.Path) + if err != nil { + return fmt.Errorf("open registry: %w", err) + } + defer func() { _ = db.Close() }() + + proxy, err := agent.NewProxyRouter(cfg.MCProxy.Socket, cfg.MCProxy.CertDir, logger) + if err != nil { + logger.Warn("mc-proxy not available, routes will not be registered", "err", err) + } + + certs, err := agent.NewCertProvisioner(cfg.Metacrypt, cfg.MCProxy.CertDir, logger) + if err != nil { + logger.Warn("cert provisioner not available", "err", err) + } + + a := &agent.Agent{ + Config: cfg, + DB: db, + Runtime: &runtime.Podman{}, + Logger: logger, + PortAlloc: agent.NewPortAllocator(), + Proxy: proxy, + Certs: certs, + Version: version, + } + + return a.Recover(context.Background()) + }, + } +} diff --git a/internal/agent/recover.go b/internal/agent/recover.go new file mode 100644 index 0000000..a0aaa1e --- /dev/null +++ b/internal/agent/recover.go @@ -0,0 +1,139 @@ +package agent + +import ( + "context" + "fmt" + + "git.wntrmute.dev/mc/mcp/internal/registry" + "git.wntrmute.dev/mc/mcp/internal/runtime" +) + +// Recover recreates containers from the agent's registry for all services +// whose desired state is "running" but which don't have a running container +// in podman. This is the recovery path after a podman database loss (e.g., +// after a UID change or podman reset). +// +// Recover does NOT pull images — it assumes the images are cached locally. +// If an image is missing, that component is skipped with a warning. +func (a *Agent) Recover(ctx context.Context) error { + services, err := registry.ListServices(a.DB) + if err != nil { + return fmt.Errorf("list services: %w", err) + } + + // Get the list of currently running containers from podman. + running, err := a.Runtime.List(ctx) + if err != nil { + a.Logger.Warn("cannot list containers, assuming none running", "err", err) + running = nil + } + runningSet := make(map[string]bool) + for _, c := range running { + runningSet[c.Name] = true + } + + var recovered, skipped, already int + + for _, svc := range services { + if !svc.Active { + continue + } + + comps, err := registry.ListComponents(a.DB, svc.Name) + if err != nil { + a.Logger.Warn("list components", "service", svc.Name, "err", err) + continue + } + + for _, comp := range comps { + if comp.DesiredState != "running" { + continue + } + + containerName := svc.Name + "-" + comp.Name + if comp.Name == svc.Name { + containerName = svc.Name + } + + // Skip if container is already running. + if runningSet[containerName] { + already++ + continue + } + + a.Logger.Info("recovering container", + "service", svc.Name, + "component", comp.Name, + "image", comp.Image, + ) + + // Remove any stale container with the same name. + _ = a.Runtime.Remove(ctx, containerName) + + // Build the container spec from the registry. + spec := runtime.ContainerSpec{ + Name: containerName, + Image: comp.Image, + Network: comp.Network, + User: comp.UserSpec, + Restart: comp.Restart, + Volumes: comp.Volumes, + Cmd: comp.Cmd, + } + + // Allocate ports from routes if the component has routes. + if len(comp.Routes) > 0 && a.PortAlloc != nil { + ports, env, allocErr := a.allocateRoutePorts(svc.Name, comp.Name, comp.Routes) + if allocErr != nil { + a.Logger.Warn("allocate route ports", "container", containerName, "err", allocErr) + spec.Ports = comp.Ports + } else { + spec.Ports = append(comp.Ports, ports...) + spec.Env = append(spec.Env, env...) + } + } else { + spec.Ports = comp.Ports + } + + if err := a.Runtime.Run(ctx, spec); err != nil { + a.Logger.Error("recover container failed", + "container", containerName, + "err", err, + ) + skipped++ + continue + } + + // Re-register mc-proxy routes. + if a.Proxy != nil && len(comp.Routes) > 0 { + hostPorts, hpErr := registry.GetRouteHostPorts(a.DB, svc.Name, comp.Name) + if hpErr == nil { + if proxyErr := a.Proxy.RegisterRoutes(ctx, svc.Name, comp.Routes, hostPorts); proxyErr != nil { + a.Logger.Warn("re-register routes", "service", svc.Name, "err", proxyErr) + } + } + } + + // Provision TLS certs if needed. + if a.Certs != nil && hasL7Routes(comp.Routes) { + hostnames := l7Hostnames(svc.Name, comp.Routes) + if certErr := a.Certs.EnsureCert(ctx, svc.Name, hostnames); certErr != nil { + a.Logger.Warn("cert provisioning", "service", svc.Name, "err", certErr) + } + } + + recovered++ + a.Logger.Info("container recovered", "container", containerName) + } + } + + a.Logger.Info("recovery complete", + "recovered", recovered, + "skipped", skipped, + "already_running", already, + ) + + return nil +} + +// hasL7Routes and l7Hostnames are defined in deploy.go.