Files
mcp/internal/agent/agent.go
Kyle Isom d56f224359 Add unikernel runtime: run services as Nanos VMs under QEMU/KVM
Implements the hypervisor design's Phase 1: a second runtime.Runtime
backend (QEMU) that runs each service component as a Nanos unikernel VM
instead of a podman container, selected per-component via a new
runtime = "unikernel" service-def field.

- internal/runtime/qemu.go: QEMURuntime. Pull extracts the ELF from the
  OCI image; Run does `ops build` + boots qemu-system-x86_64 with KVM,
  user-mode net port-forwards, QMP control socket and serial console log;
  Stop/Remove/Inspect/List/Logs map onto VM lifecycle + state dir.
- proto/registry/servicedef: add runtime, memory_mb, vcpus fields
  (registry migration 5).
- agent: holds both runtimes; runtimeFor() selects per component;
  listAllContainers() merges containers + VMs so drift/status see both.
  Unikernel runtime auto-enables on nodes with /dev/kvm + ops.

Validated end-to-end on straylight: a test service deploys via
`mcp deploy --direct`, boots as a Nanos unikernel, serves HTTP through
the agent port-forward, and reports running via `mcp status`/`mcp logs`.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
2026-06-11 00:54:49 -07:00

196 lines
5.0 KiB
Go

package agent
import (
"context"
"crypto/tls"
"database/sql"
"fmt"
"log/slog"
"net"
"os"
"os/signal"
"path/filepath"
"syscall"
mcpv1 "git.wntrmute.dev/mc/mcp/gen/mcp/v1"
"git.wntrmute.dev/mc/mcp/internal/auth"
"git.wntrmute.dev/mc/mcp/internal/config"
"git.wntrmute.dev/mc/mcp/internal/monitor"
"git.wntrmute.dev/mc/mcp/internal/registry"
"git.wntrmute.dev/mc/mcp/internal/runtime"
"google.golang.org/grpc"
"google.golang.org/grpc/credentials"
)
// Agent is the MCP node agent. It manages containers, stores the registry,
// monitors for drift, and serves the gRPC API.
type Agent struct {
mcpv1.UnimplementedMcpAgentServiceServer
Config *config.AgentConfig
DB *sql.DB
Runtime runtime.Runtime // container runtime (podman)
Unikernel runtime.Runtime // unikernel runtime (qemu/nanos); nil if unavailable
Monitor *monitor.Monitor
Logger *slog.Logger
PortAlloc *PortAllocator
Proxy *ProxyRouter
Certs *CertProvisioner
DNS *DNSRegistrar
Version string
}
// Run starts the agent: opens the database, sets up the gRPC server with
// TLS and auth, and blocks until SIGINT/SIGTERM.
func Run(cfg *config.AgentConfig, version string) error {
logger := slog.New(slog.NewTextHandler(os.Stderr, &slog.HandlerOptions{
Level: parseLogLevel(cfg.Log.Level),
}))
db, err := registry.Open(cfg.Database.Path)
if err != nil {
return fmt.Errorf("open registry: %w", err)
}
defer func() { _ = db.Close() }()
rt := &runtime.Podman{}
// The unikernel runtime is enabled only on nodes with KVM. Services with
// runtime = "unikernel" are placed by the master on KVM-capable nodes.
var uk runtime.Runtime
if unikernelSupported() {
uk = &runtime.QEMU{
ImageDir: filepath.Join(homeDir(cfg), "images"),
StateDir: filepath.Join(homeDir(cfg), "vm"),
HomeDir: homeDir(cfg),
}
logger.Info("unikernel runtime enabled (KVM detected)")
}
mon := monitor.New(db, rt, cfg.Monitor, cfg.Agent.NodeName, logger)
proxy, err := NewProxyRouter(cfg.MCProxy.Socket, cfg.MCProxy.CertDir, logger)
if err != nil {
return fmt.Errorf("connect to mc-proxy: %w", err)
}
certs, err := NewCertProvisioner(cfg.Metacrypt, cfg.MCProxy.CertDir, logger)
if err != nil {
return fmt.Errorf("create cert provisioner: %w", err)
}
dns, err := NewDNSRegistrar(cfg.MCNS, logger)
if err != nil {
return fmt.Errorf("create DNS registrar: %w", err)
}
a := &Agent{
Config: cfg,
DB: db,
Runtime: rt,
Unikernel: uk,
Monitor: mon,
Logger: logger,
PortAlloc: NewPortAllocator(),
Proxy: proxy,
Certs: certs,
DNS: dns,
Version: version,
}
tlsCert, err := tls.LoadX509KeyPair(cfg.Server.TLSCert, cfg.Server.TLSKey)
if err != nil {
return fmt.Errorf("load TLS cert: %w", err)
}
tlsConfig := &tls.Config{
Certificates: []tls.Certificate{tlsCert},
MinVersion: tls.VersionTLS13,
}
validator, err := auth.NewMCIASValidator(cfg.MCIAS.ServerURL, cfg.MCIAS.CACert)
if err != nil {
return fmt.Errorf("create MCIAS validator: %w", err)
}
server := grpc.NewServer(
grpc.Creds(credentials.NewTLS(tlsConfig)),
grpc.ChainUnaryInterceptor(
auth.AuthInterceptor(validator),
),
grpc.ChainStreamInterceptor(
auth.StreamAuthInterceptor(validator),
),
)
mcpv1.RegisterMcpAgentServiceServer(server, a)
lis, err := net.Listen("tcp", cfg.Server.GRPCAddr)
if err != nil {
return fmt.Errorf("listen %q: %w", cfg.Server.GRPCAddr, err)
}
logger.Info("agent starting",
"addr", cfg.Server.GRPCAddr,
"node", cfg.Agent.NodeName,
"runtime", cfg.Agent.ContainerRuntime,
)
// Run boot sequence before starting the gRPC server.
// On the master node, this starts foundation services (MCIAS, MCNS)
// before core services, ensuring dependencies are met.
if len(cfg.Boot.Sequence) > 0 {
bootCtx, bootCancel := context.WithCancel(context.Background())
defer bootCancel()
if err := a.RunBootSequence(bootCtx); err != nil {
logger.Error("boot sequence failed", "err", err)
// Continue starting the gRPC server — partial boot is better than no agent.
}
}
// Start heartbeat client (registers with master and sends heartbeats).
hbClient, hbErr := NewHeartbeatClient(*cfg, logger)
if hbErr != nil {
logger.Warn("heartbeat client failed to start", "err", hbErr)
} else if hbClient != nil {
hbClient.Start()
logger.Info("heartbeat client started", "master", cfg.Master.Address)
}
mon.Start()
ctx, stop := signal.NotifyContext(context.Background(), syscall.SIGINT, syscall.SIGTERM)
defer stop()
errCh := make(chan error, 1)
go func() {
errCh <- server.Serve(lis)
}()
select {
case <-ctx.Done():
logger.Info("shutting down")
if hbClient != nil {
hbClient.Stop()
}
mon.Stop()
server.GracefulStop()
_ = proxy.Close()
return nil
case err := <-errCh:
mon.Stop()
return fmt.Errorf("serve: %w", err)
}
}
func parseLogLevel(level string) slog.Level {
switch level {
case "debug":
return slog.LevelDebug
case "warn":
return slog.LevelWarn
case "error":
return slog.LevelError
default:
return slog.LevelInfo
}
}