Files
mcp/internal/agent/recover.go
Kyle Isom d56f224359 Add unikernel runtime: run services as Nanos VMs under QEMU/KVM
Implements the hypervisor design's Phase 1: a second runtime.Runtime
backend (QEMU) that runs each service component as a Nanos unikernel VM
instead of a podman container, selected per-component via a new
runtime = "unikernel" service-def field.

- internal/runtime/qemu.go: QEMURuntime. Pull extracts the ELF from the
  OCI image; Run does `ops build` + boots qemu-system-x86_64 with KVM,
  user-mode net port-forwards, QMP control socket and serial console log;
  Stop/Remove/Inspect/List/Logs map onto VM lifecycle + state dir.
- proto/registry/servicedef: add runtime, memory_mb, vcpus fields
  (registry migration 5).
- agent: holds both runtimes; runtimeFor() selects per component;
  listAllContainers() merges containers + VMs so drift/status see both.
  Unikernel runtime auto-enables on nodes with /dev/kvm + ops.

Validated end-to-end on straylight: a test service deploys via
`mcp deploy --direct`, boots as a Nanos unikernel, serves HTTP through
the agent port-forward, and reports running via `mcp status`/`mcp logs`.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
2026-06-11 00:54:49 -07:00

144 lines
3.8 KiB
Go

package agent
import (
"context"
"fmt"
"git.wntrmute.dev/mc/mcp/internal/registry"
"git.wntrmute.dev/mc/mcp/internal/runtime"
)
// Recover recreates workloads from the agent's registry for every component
// of an active service whose desired state is "running" but which does not
// appear in the runtime listing. This is the recovery path after a podman
// database loss (e.g., after a UID change or podman reset).
//
// The runtime used for each component is chosen with runtimeFor, so a
// component is recreated under whichever backend its registry entry names.
//
// Recover never calls Pull — it assumes the images are cached locally. A
// component whose Run fails is logged, counted as skipped, and does not stop
// recovery of the remaining components.
//
// The returned error is non-nil only when the service list cannot be read
// from the registry. Every other failure (listing workloads, listing
// components, port allocation, route registration, certificate provisioning)
// is logged and recovery continues.
func (a *Agent) Recover(ctx context.Context) error {
	services, err := registry.ListServices(a.DB)
	if err != nil {
		return fmt.Errorf("list services: %w", err)
	}

	// Snapshot the names of workloads the runtimes already know about. A
	// failed listing is treated as "nothing is running", which means every
	// desired-running component below is removed and recreated.
	running, err := a.listAllContainers(ctx)
	if err != nil {
		a.Logger.Warn("cannot list containers, assuming none running", "err", err)
		running = nil
	}
	runningSet := make(map[string]struct{}, len(running))
	for _, c := range running {
		runningSet[c.Name] = struct{}{}
	}

	var recovered, skipped, already int
	for _, svc := range services {
		if !svc.Active {
			continue
		}

		comps, err := registry.ListComponents(a.DB, svc.Name)
		if err != nil {
			a.Logger.Warn("list components", "service", svc.Name, "err", err)
			continue
		}

		for _, comp := range comps {
			if comp.DesiredState != "running" {
				continue
			}

			// A component named after its service uses the bare service
			// name; every other component is "<service>-<component>".
			containerName := svc.Name + "-" + comp.Name
			if comp.Name == svc.Name {
				containerName = svc.Name
			}

			// Nothing to do if the workload is already present.
			if _, ok := runningSet[containerName]; ok {
				already++
				continue
			}

			a.Logger.Info("recovering container",
				"service", svc.Name,
				"component", comp.Name,
				"image", comp.Image,
			)

			rt := a.runtimeFor(comp.Runtime)

			// Best-effort removal of any stale workload holding the name;
			// the error is deliberately ignored because there is usually
			// nothing to remove.
			_ = rt.Remove(ctx, containerName)

			// Build the spec from the registry record. Ports default to the
			// component's static ports and are extended below when route
			// ports can be allocated.
			spec := runtime.ContainerSpec{
				Name:     containerName,
				Image:    comp.Image,
				Network:  comp.Network,
				User:     comp.UserSpec,
				Restart:  comp.Restart,
				Volumes:  comp.Volumes,
				Cmd:      comp.Cmd,
				MemoryMB: comp.MemoryMB,
				VCPUs:    comp.VCPUs,
			}
			spec.Ports = comp.Ports

			// Allocate ports from routes if the component has routes. On
			// allocation failure the component is still started with only
			// its static ports.
			if len(comp.Routes) > 0 && a.PortAlloc != nil {
				ports, env, allocErr := a.allocateRoutePorts(svc.Name, comp.Name, comp.Routes)
				if allocErr != nil {
					a.Logger.Warn("allocate route ports", "container", containerName, "err", allocErr)
				} else {
					// The full slice expression caps capacity at length so
					// append always copies instead of writing into the
					// registry record's backing array.
					spec.Ports = append(comp.Ports[:len(comp.Ports):len(comp.Ports)], ports...)
					spec.Env = append(spec.Env, env...)
				}
			}

			if err := rt.Run(ctx, spec); err != nil {
				a.Logger.Error("recover container failed",
					"container", containerName,
					"err", err,
				)
				skipped++
				continue
			}

			// Re-register mc-proxy routes. Failures here are logged but do
			// not undo the recovery: the workload itself is already up.
			if a.Proxy != nil && len(comp.Routes) > 0 {
				hostPorts, hpErr := registry.GetRouteHostPorts(a.DB, svc.Name, comp.Name)
				if hpErr != nil {
					a.Logger.Warn("get route host ports",
						"service", svc.Name,
						"component", comp.Name,
						"err", hpErr,
					)
				} else if proxyErr := a.Proxy.RegisterRoutes(ctx, svc.Name, comp.Routes, hostPorts); proxyErr != nil {
					a.Logger.Warn("re-register routes", "service", svc.Name, "err", proxyErr)
				}
			}

			// Provision TLS certs if needed.
			if a.Certs != nil && hasL7Routes(comp.Routes) {
				hostnames := l7Hostnames(svc.Name, comp.Routes)
				if certErr := a.Certs.EnsureCert(ctx, svc.Name, hostnames); certErr != nil {
					a.Logger.Warn("cert provisioning", "service", svc.Name, "err", certErr)
				}
			}

			recovered++
			a.Logger.Info("container recovered", "container", containerName)
		}
	}

	a.Logger.Info("recovery complete",
		"recovered", recovered,
		"skipped", skipped,
		"already_running", already,
	)
	return nil
}
// hasL7Routes and l7Hostnames are defined in deploy.go.