Files
mcp/internal/agent/boot.go
Kyle Isom d56f224359 Add unikernel runtime: run services as Nanos VMs under QEMU/KVM
Implements the hypervisor design's Phase 1: a second runtime.Runtime
backend (QEMU) that runs each service component as a Nanos unikernel VM
instead of a podman container, selected per-component via a new
runtime = "unikernel" service-def field.

- internal/runtime/qemu.go: QEMURuntime. Pull extracts the ELF from the
  OCI image; Run does `ops build` + boots qemu-system-x86_64 with KVM,
  user-mode net port-forwards, QMP control socket and serial console log;
  Stop/Remove/Inspect/List/Logs map onto VM lifecycle + state dir.
- proto/registry/servicedef: add runtime, memory_mb, vcpus fields
  (registry migration 5).
- agent: holds both runtimes; runtimeFor() selects per component;
  listAllContainers() merges containers + VMs so drift/status see both.
  Unikernel runtime auto-enables on nodes with /dev/kvm + ops.

Validated end-to-end on straylight: a test service deploys via
`mcp deploy --direct`, boots as a Nanos unikernel, serves HTTP through
the agent port-forward, and reports running via `mcp status`/`mcp logs`.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
2026-06-11 00:54:49 -07:00

203 lines
5.6 KiB
Go

package agent
import (
	"context"
	"fmt"
	"net"
	"strconv"
	"strings"
	"time"

	"git.wntrmute.dev/mc/mcp/internal/config"
	"google.golang.org/grpc"
	"google.golang.org/grpc/credentials/insecure"
	healthpb "google.golang.org/grpc/health/grpc_health_v1"
)
// RunBootSequence executes the boot stages defined in the agent config, in
// order. Each stage's services are given a chance to become healthy before
// the next stage starts. This is used on the master node to start foundation
// services (MCIAS, MCNS) before core services (Metacrypt, MCR) before the
// master itself.
//
// The first stage is treated as the foundation stage: if it does not become
// healthy within its timeout, it is re-checked every 10 seconds until it
// does. Later stages that fail their health wait are logged and skipped past.
// A stage with no configured timeout uses 60 seconds.
//
// If no boot sequence is configured, this is a no-op and nil is returned.
// The only error returned is ctx.Err(), when ctx is cancelled while a stage
// is still waiting to become healthy.
func (a *Agent) RunBootSequence(ctx context.Context) error {
	stages := a.Config.Boot.Sequence
	if len(stages) == 0 {
		return nil
	}

	a.Logger.Info("boot sequence starting", "stages", len(stages))

	for i, stage := range stages {
		a.Logger.Info("boot stage starting",
			"stage", stage.Name,
			"services", stage.Services,
			"timeout", stage.Timeout.Duration,
			"health", stage.Health,
		)

		// Use the recover logic to start any services in this stage
		// that aren't already running. A failure here is not fatal: the
		// health wait below decides whether the stage is usable.
		if err := a.Recover(ctx); err != nil {
			a.Logger.Warn("boot stage recover failed", "stage", stage.Name, "err", err)
		}

		timeout := stage.Timeout.Duration
		if timeout == 0 {
			timeout = 60 * time.Second
		}

		isFoundation := i == 0
		if err := a.waitForHealthy(ctx, stage, timeout, isFoundation); err != nil {
			// A cancelled context will never become healthy; stop instead
			// of retrying or marching through the remaining stages.
			if ctxErr := ctx.Err(); ctxErr != nil {
				return ctxErr
			}

			if !isFoundation {
				// Non-foundation: log and proceed.
				a.Logger.Warn("boot stage not fully healthy, proceeding",
					"stage", stage.Name, "err", err)
			} else {
				// Foundation stage: block and retry until healthy or cancelled.
				a.Logger.Error("foundation stage failed — retrying indefinitely",
					"stage", stage.Name, "err", err)
				for err != nil {
					select {
					case <-ctx.Done():
						return ctx.Err()
					case <-time.After(10 * time.Second):
					}
					err = a.waitForHealthy(ctx, stage, timeout, true)
				}
			}
		}

		a.Logger.Info("boot stage complete", "stage", stage.Name)
	}

	a.Logger.Info("boot sequence complete")
	return nil
}
// waitForHealthy waits until every service in the stage passes the stage's
// health check, polling each service in turn every 2 seconds.
//
// timeout bounds the whole stage: a single deadline is computed up front and
// shared by all services, it is not applied per service. On expiry the
// returned error names the service being waited on and wraps the last probe
// error, if any. If ctx is cancelled while waiting, ctx.Err() is returned.
//
// isFoundation is accepted for the caller's convenience but is not consulted
// here; retry policy for the foundation stage lives in the caller.
func (a *Agent) waitForHealthy(ctx context.Context, stage config.BootStage, timeout time.Duration, isFoundation bool) error {
	deadline := time.Now().Add(timeout)

	for _, svc := range stage.Services {
		var lastErr error
		for {
			if time.Now().After(deadline) {
				if lastErr != nil {
					return fmt.Errorf("timeout waiting for %s: %w", svc, lastErr)
				}
				return fmt.Errorf("timeout waiting for %s", svc)
			}

			healthy, err := a.checkServiceHealth(ctx, svc, stage.Health)
			if err == nil && healthy {
				a.Logger.Info("service healthy", "service", svc, "check", stage.Health)
				break
			}
			lastErr = err

			// Wait before the next probe, but wake immediately on
			// cancellation so shutdown is not delayed by the poll interval.
			select {
			case <-ctx.Done():
				return ctx.Err()
			case <-time.After(2 * time.Second):
			}
		}
	}
	return nil
}
// checkServiceHealth resolves the host port for serviceName and probes it
// with the requested health check method. Only "grpc" selects the gRPC
// health check; "tcp", the empty string, and any unrecognised method all
// use a plain TCP connect. A port lookup failure is returned as the error.
func (a *Agent) checkServiceHealth(ctx context.Context, serviceName, method string) (bool, error) {
	hostPort, lookupErr := a.findServicePort(serviceName)
	if lookupErr != nil {
		return false, lookupErr
	}

	if method == "grpc" {
		return a.checkGRPC(ctx, hostPort)
	}
	// Everything else, including unknown methods, falls back to TCP.
	return a.checkTCP(ctx, hostPort)
}
// findServicePort returns the first usable host port published by a running
// workload belonging to serviceName.
//
// It scans the result of listAllContainers and considers an entry a match
// when its name is exactly serviceName or starts with serviceName + "-"
// (the "service-component" naming form). For the first match that has a
// port mapping parseHostPort can resolve to a positive port, that port is
// returned. An error is returned if listing fails or no such port exists.
//
// NOTE(review): this function takes no context, so the listing runs under
// context.Background() and cannot be cancelled by the caller.
func (a *Agent) findServicePort(serviceName string) (int, error) {
	containers, err := a.listAllContainers(context.Background())
	if err != nil {
		return 0, fmt.Errorf("list containers: %w", err)
	}

	componentPrefix := serviceName + "-"
	for _, c := range containers {
		if c.Name != serviceName && !strings.HasPrefix(c.Name, componentPrefix) {
			continue
		}
		// Port format: "127.0.0.1:28443->8443/tcp" or "8443/tcp"; entries
		// without a host-side port yield 0 and are skipped.
		for _, p := range c.Ports {
			if port := parseHostPort(p); port > 0 {
				return port, nil
			}
		}
	}
	return 0, fmt.Errorf("no port found for service %s", serviceName)
}
// parseHostPort extracts the host port from a podman-style port mapping
// string such as "127.0.0.1:28443->8443/tcp" or "0.0.0.0:53->53/tcp".
//
// The host side is everything before "->"; the port is whatever follows the
// last ':' on that side (so bracketed IPv6 hosts work), or the whole host
// side when no ':' is present. For a host port range ("8000-8010") the first
// port of the range is returned.
//
// It returns 0 when the mapping has no "->" (e.g. "8443/tcp", an unpublished
// port) or when the host port is not a plain decimal number in 1..65535.
func parseHostPort(mapping string) int {
	hostSide, _, published := strings.Cut(mapping, "->")
	if !published {
		return 0
	}

	// Drop the host/IP part, if any, keeping only what follows the last ':'.
	if i := strings.LastIndexByte(hostSide, ':'); i >= 0 {
		hostSide = hostSide[i+1:]
	}

	// A range such as "8000-8010" resolves to its first port.
	first, _, _ := strings.Cut(hostSide, "-")

	// bitSize 16 rejects signs, non-digits, and anything above 65535.
	port, err := strconv.ParseUint(first, 10, 16)
	if err != nil {
		return 0
	}
	return int(port)
}
// checkTCP reports whether a TCP connection to 127.0.0.1:port can be
// established. The dial is bounded by a 2 second timeout and is abandoned
// early if ctx is cancelled. On success the connection is closed immediately
// and (true, nil) is returned; on failure the dial error is returned.
func (a *Agent) checkTCP(ctx context.Context, port int) (bool, error) {
	addr := fmt.Sprintf("127.0.0.1:%d", port)

	dialer := net.Dialer{Timeout: 2 * time.Second}
	conn, err := dialer.DialContext(ctx, "tcp", addr)
	if err != nil {
		return false, err
	}
	// The connection was only a reachability probe; a close error carries
	// no information about the service's health.
	_ = conn.Close()
	return true, nil
}
// checkGRPC queries the standard gRPC health service at 127.0.0.1:port over
// a plaintext (insecure credentials) connection, with an empty service name
// in the request. The call is limited to 3 seconds on top of whatever
// deadline ctx already carries. It reports true only when the response
// status is SERVING; client construction and RPC errors are returned as-is.
func (a *Agent) checkGRPC(ctx context.Context, port int) (bool, error) {
	callCtx, stop := context.WithTimeout(ctx, 3*time.Second)
	defer stop()

	target := fmt.Sprintf("127.0.0.1:%d", port)
	cc, dialErr := grpc.NewClient(target, grpc.WithTransportCredentials(insecure.NewCredentials()))
	if dialErr != nil {
		return false, dialErr
	}
	defer func() { _ = cc.Close() }()

	reply, rpcErr := healthpb.NewHealthClient(cc).Check(callCtx, &healthpb.HealthCheckRequest{})
	if rpcErr != nil {
		return false, rpcErr
	}

	serving := reply.GetStatus() == healthpb.HealthCheckResponse_SERVING
	return serving, nil
}