Compare commits
1 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
| 9d543998dc |
@@ -43,6 +43,7 @@ func main() {
|
||||
})
|
||||
|
||||
root.AddCommand(snapshotCmd())
|
||||
root.AddCommand(recoverCmd())
|
||||
|
||||
if err := root.Execute(); err != nil {
|
||||
log.Fatal(err)
|
||||
|
||||
68
cmd/mcp-agent/recover.go
Normal file
68
cmd/mcp-agent/recover.go
Normal file
@@ -0,0 +1,68 @@
|
||||
package main
|
||||
|
||||
import (
|
||||
"context"
|
||||
"fmt"
|
||||
"log/slog"
|
||||
"os"
|
||||
|
||||
"git.wntrmute.dev/mc/mcp/internal/agent"
|
||||
"git.wntrmute.dev/mc/mcp/internal/config"
|
||||
"git.wntrmute.dev/mc/mcp/internal/registry"
|
||||
"git.wntrmute.dev/mc/mcp/internal/runtime"
|
||||
"github.com/spf13/cobra"
|
||||
)
|
||||
|
||||
func recoverCmd() *cobra.Command {
|
||||
return &cobra.Command{
|
||||
Use: "recover",
|
||||
Short: "Recreate containers from the agent registry",
|
||||
Long: `Recover recreates containers from the agent's SQLite registry for all
|
||||
services whose desired state is "running" but which don't have a running
|
||||
container in podman.
|
||||
|
||||
This is the recovery path after a podman database loss (e.g., after a
|
||||
UID change, podman reset, or reboot that cleared container state).
|
||||
|
||||
Images must be cached locally — recover does not pull from MCR.`,
|
||||
RunE: func(cmd *cobra.Command, args []string) error {
|
||||
cfg, err := config.LoadAgentConfig(cfgPath)
|
||||
if err != nil {
|
||||
return fmt.Errorf("load config: %w", err)
|
||||
}
|
||||
|
||||
logger := slog.New(slog.NewTextHandler(os.Stderr, &slog.HandlerOptions{
|
||||
Level: slog.LevelInfo,
|
||||
}))
|
||||
|
||||
db, err := registry.Open(cfg.Database.Path)
|
||||
if err != nil {
|
||||
return fmt.Errorf("open registry: %w", err)
|
||||
}
|
||||
defer func() { _ = db.Close() }()
|
||||
|
||||
proxy, err := agent.NewProxyRouter(cfg.MCProxy.Socket, cfg.MCProxy.CertDir, logger)
|
||||
if err != nil {
|
||||
logger.Warn("mc-proxy not available, routes will not be registered", "err", err)
|
||||
}
|
||||
|
||||
certs, err := agent.NewCertProvisioner(cfg.Metacrypt, cfg.MCProxy.CertDir, logger)
|
||||
if err != nil {
|
||||
logger.Warn("cert provisioner not available", "err", err)
|
||||
}
|
||||
|
||||
a := &agent.Agent{
|
||||
Config: cfg,
|
||||
DB: db,
|
||||
Runtime: &runtime.Podman{},
|
||||
Logger: logger,
|
||||
PortAlloc: agent.NewPortAllocator(),
|
||||
Proxy: proxy,
|
||||
Certs: certs,
|
||||
Version: version,
|
||||
}
|
||||
|
||||
return a.Recover(context.Background())
|
||||
},
|
||||
}
|
||||
}
|
||||
139
internal/agent/recover.go
Normal file
139
internal/agent/recover.go
Normal file
@@ -0,0 +1,139 @@
|
||||
package agent
|
||||
|
||||
import (
|
||||
"context"
|
||||
"fmt"
|
||||
|
||||
"git.wntrmute.dev/mc/mcp/internal/registry"
|
||||
"git.wntrmute.dev/mc/mcp/internal/runtime"
|
||||
)
|
||||
|
||||
// Recover recreates containers from the agent's registry for all services
|
||||
// whose desired state is "running" but which don't have a running container
|
||||
// in podman. This is the recovery path after a podman database loss (e.g.,
|
||||
// after a UID change or podman reset).
|
||||
//
|
||||
// Recover does NOT pull images — it assumes the images are cached locally.
|
||||
// If an image is missing, that component is skipped with a warning.
|
||||
func (a *Agent) Recover(ctx context.Context) error {
|
||||
services, err := registry.ListServices(a.DB)
|
||||
if err != nil {
|
||||
return fmt.Errorf("list services: %w", err)
|
||||
}
|
||||
|
||||
// Get the list of currently running containers from podman.
|
||||
running, err := a.Runtime.List(ctx)
|
||||
if err != nil {
|
||||
a.Logger.Warn("cannot list containers, assuming none running", "err", err)
|
||||
running = nil
|
||||
}
|
||||
runningSet := make(map[string]bool)
|
||||
for _, c := range running {
|
||||
runningSet[c.Name] = true
|
||||
}
|
||||
|
||||
var recovered, skipped, already int
|
||||
|
||||
for _, svc := range services {
|
||||
if !svc.Active {
|
||||
continue
|
||||
}
|
||||
|
||||
comps, err := registry.ListComponents(a.DB, svc.Name)
|
||||
if err != nil {
|
||||
a.Logger.Warn("list components", "service", svc.Name, "err", err)
|
||||
continue
|
||||
}
|
||||
|
||||
for _, comp := range comps {
|
||||
if comp.DesiredState != "running" {
|
||||
continue
|
||||
}
|
||||
|
||||
containerName := svc.Name + "-" + comp.Name
|
||||
if comp.Name == svc.Name {
|
||||
containerName = svc.Name
|
||||
}
|
||||
|
||||
// Skip if container is already running.
|
||||
if runningSet[containerName] {
|
||||
already++
|
||||
continue
|
||||
}
|
||||
|
||||
a.Logger.Info("recovering container",
|
||||
"service", svc.Name,
|
||||
"component", comp.Name,
|
||||
"image", comp.Image,
|
||||
)
|
||||
|
||||
// Remove any stale container with the same name.
|
||||
_ = a.Runtime.Remove(ctx, containerName)
|
||||
|
||||
// Build the container spec from the registry.
|
||||
spec := runtime.ContainerSpec{
|
||||
Name: containerName,
|
||||
Image: comp.Image,
|
||||
Network: comp.Network,
|
||||
User: comp.UserSpec,
|
||||
Restart: comp.Restart,
|
||||
Volumes: comp.Volumes,
|
||||
Cmd: comp.Cmd,
|
||||
}
|
||||
|
||||
// Allocate ports from routes if the component has routes.
|
||||
if len(comp.Routes) > 0 && a.PortAlloc != nil {
|
||||
ports, env, allocErr := a.allocateRoutePorts(svc.Name, comp.Name, comp.Routes)
|
||||
if allocErr != nil {
|
||||
a.Logger.Warn("allocate route ports", "container", containerName, "err", allocErr)
|
||||
spec.Ports = comp.Ports
|
||||
} else {
|
||||
spec.Ports = append(comp.Ports, ports...)
|
||||
spec.Env = append(spec.Env, env...)
|
||||
}
|
||||
} else {
|
||||
spec.Ports = comp.Ports
|
||||
}
|
||||
|
||||
if err := a.Runtime.Run(ctx, spec); err != nil {
|
||||
a.Logger.Error("recover container failed",
|
||||
"container", containerName,
|
||||
"err", err,
|
||||
)
|
||||
skipped++
|
||||
continue
|
||||
}
|
||||
|
||||
// Re-register mc-proxy routes.
|
||||
if a.Proxy != nil && len(comp.Routes) > 0 {
|
||||
hostPorts, hpErr := registry.GetRouteHostPorts(a.DB, svc.Name, comp.Name)
|
||||
if hpErr == nil {
|
||||
if proxyErr := a.Proxy.RegisterRoutes(ctx, svc.Name, comp.Routes, hostPorts); proxyErr != nil {
|
||||
a.Logger.Warn("re-register routes", "service", svc.Name, "err", proxyErr)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Provision TLS certs if needed.
|
||||
if a.Certs != nil && hasL7Routes(comp.Routes) {
|
||||
hostnames := l7Hostnames(svc.Name, comp.Routes)
|
||||
if certErr := a.Certs.EnsureCert(ctx, svc.Name, hostnames); certErr != nil {
|
||||
a.Logger.Warn("cert provisioning", "service", svc.Name, "err", certErr)
|
||||
}
|
||||
}
|
||||
|
||||
recovered++
|
||||
a.Logger.Info("container recovered", "container", containerName)
|
||||
}
|
||||
}
|
||||
|
||||
a.Logger.Info("recovery complete",
|
||||
"recovered", recovered,
|
||||
"skipped", skipped,
|
||||
"already_running", already,
|
||||
)
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
// hasL7Routes and l7Hostnames are defined in deploy.go.
|
||||
Reference in New Issue
Block a user