Add unikernel runtime: run services as Nanos VMs under QEMU/KVM

Implements the hypervisor design's Phase 1: a second runtime.Runtime
backend (QEMU) that runs each service component as a Nanos unikernel VM
instead of a podman container, selected per-component via a new
runtime = "unikernel" service-def field.

- internal/runtime/qemu.go: QEMURuntime. Pull extracts the ELF from the
  OCI image; Run does `ops build` + boots qemu-system-x86_64 with KVM,
  user-mode net port-forwards, QMP control socket and serial console log;
  Stop/Remove/Inspect/List/Logs map onto VM lifecycle + state dir.
- proto/registry/servicedef: add runtime, memory_mb, vcpus fields
  (registry migration 5).
- agent: holds both runtimes; runtimeFor() selects per component;
  listAllContainers() merges containers + VMs so drift/status see both.
  Unikernel runtime auto-enables on nodes with /dev/kvm + ops.

Validated end-to-end on straylight: a test service deploys via
`mcp deploy --direct`, boots as a Nanos unikernel, serves HTTP through
the agent port-forward, and reports running via `mcp status`/`mcp logs`.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
Kyle Isom
2026-06-11 00:54:49 -07:00
parent 3b08caaa0a
commit d56f224359
30 changed files with 949 additions and 152 deletions

558
internal/runtime/qemu.go Normal file
View File

@@ -0,0 +1,558 @@
package runtime
import (
	"context"
	"crypto/sha256"
	"encoding/hex"
	"encoding/json"
	"fmt"
	"io"
	"net"
	"os"
	"os/exec"
	"path/filepath"
	"strconv"
	"strings"
	"syscall"
	"time"
)
// dialUnix opens a connection to the unix-domain socket at path, giving up
// once timeout has elapsed.
func dialUnix(path string, timeout time.Duration) (net.Conn, error) {
	dialer := net.Dialer{Timeout: timeout}
	return dialer.Dial("unix", path)
}
// QEMU implements the Runtime interface by running services as Nanos
// unikernel virtual machines under QEMU/KVM instead of containers.
//
// Each service component becomes a single-process VM with its own kernel.
// The lifecycle maps onto the same Runtime interface as Podman so the agent
// can treat unikernels and containers uniformly:
//
//	Pull    -> pull the OCI image, extract the ELF binary, cache it
//	Run     -> `ops build` the binary into a Nanos image, boot it under QEMU
//	Stop    -> graceful QMP powerdown, then SIGTERM/SIGKILL the QEMU process
//	Remove  -> stop and delete the VM state directory
//	Inspect -> read persisted metadata + check process liveness
//	List    -> enumerate VM state directories
//	Logs    -> stream the serial console log file
//
// Phase 1 uses QEMU user-mode networking with host port forwards, which is
// functionally equivalent to rootless podman's localhost port mappings:
// mc-proxy routes to 127.0.0.1:<hostport> exactly as it does for containers.
// Isolated bridge networking is a later phase.
//
// Every field is optional; an empty or zero value selects the default noted
// on that field.
type QEMU struct {
	// ImageDir holds built Nanos images (<name>.img) and extracted binaries
	// (<sanitized image ref>.bin).
	// Default: /srv/mcp/images
	ImageDir string
	// StateDir holds per-VM runtime state (pidfile, QMP socket, console log,
	// metadata), one subdirectory per VM name. Default: /srv/mcp/vm
	StateDir string
	// OpsPath is the path to the `ops` Nanos toolchain binary. Default: "ops".
	OpsPath string
	// QemuPath is the path to qemu-system-x86_64. Default: "qemu-system-x86_64".
	QemuPath string
	// Memory is the default guest memory in MB when a spec does not set one.
	// Values <= 0 fall back to 256.
	Memory int
	// HomeDir is set as $HOME for `ops` so it uses a stable ~/.ops directory.
	// When empty, `ops` runs with the agent's environment unchanged and built
	// images are looked up under the current user's home directory.
	HomeDir string
}
// imageDir returns the configured image cache directory, or the built-in
// default when ImageDir is unset.
func (q *QEMU) imageDir() string {
	if q.ImageDir == "" {
		return "/srv/mcp/images"
	}
	return q.ImageDir
}
// stateDir returns the configured VM state root, or the built-in default
// when StateDir is unset.
func (q *QEMU) stateDir() string {
	if q.StateDir == "" {
		return "/srv/mcp/vm"
	}
	return q.StateDir
}
// opsPath returns the `ops` executable to invoke; an unset OpsPath means the
// bare name "ops" is used.
func (q *QEMU) opsPath() string {
	if q.OpsPath == "" {
		return "ops"
	}
	return q.OpsPath
}
// qemuPath returns the QEMU executable to invoke; an unset QemuPath means the
// bare name "qemu-system-x86_64" is used.
func (q *QEMU) qemuPath() string {
	if q.QemuPath == "" {
		return "qemu-system-x86_64"
	}
	return q.QemuPath
}
// memory returns the default guest memory in MB: the configured Memory when
// it is positive, otherwise 256.
func (q *QEMU) memory() int {
	if q.Memory <= 0 {
		return 256
	}
	return q.Memory
}
// opsEnv builds the environment passed to `ops`: the current process
// environment, with a HOME entry appended when HomeDir is configured so that
// the ops cache and image directory stay in one place between invocations.
func (q *QEMU) opsEnv() []string {
	environ := os.Environ()
	if q.HomeDir == "" {
		return environ
	}
	return append(environ, "HOME="+q.HomeDir)
}
// sanitizeImage converts an image reference into a stem usable as a file
// name by replacing every "/" and ":" with "_".
//
//	"mcr.example:8443/mcdoc:v0.1.0" -> "mcr.example_8443_mcdoc_v0.1.0"
func sanitizeImage(image string) string {
	stem := []byte(image)
	for i, c := range stem {
		if c == '/' || c == ':' {
			stem[i] = '_'
		}
	}
	return string(stem)
}
// binaryName derives the in-image ELF binary name from an image reference by
// taking the repository basename: everything after the last "/", with any
// ":tag" and/or "@digest" suffix removed.
//
//	"host:8443/mcdoc:v0.1.0"      -> "mcdoc"
//	"host:8443/mcdoc@sha256:abcd" -> "mcdoc"
func binaryName(image string) string {
	name := image
	if i := strings.LastIndex(name, "/"); i >= 0 {
		name = name[i+1:]
	}
	// After the last "/" a ":" can only introduce a tag (or sit inside a
	// digest), and "@" introduces a digest; whichever comes first ends the
	// repository name.
	if i := strings.IndexAny(name, "@:"); i >= 0 {
		name = name[:i]
	}
	return name
}
// binPath returns where the ELF binary extracted from image is cached:
// <image dir>/<sanitized image reference>.bin.
func (q *QEMU) binPath(image string) string {
	file := sanitizeImage(image) + ".bin"
	return filepath.Join(q.imageDir(), file)
}
// imgPath returns where the built Nanos disk image for the VM called name is
// kept: <image dir>/<name>.img.
func (q *QEMU) imgPath(name string) string {
	file := name + ".img"
	return filepath.Join(q.imageDir(), file)
}
// vmDir returns the per-VM state directory for name, a direct child of the
// state root.
func (q *QEMU) vmDir(name string) string {
	root := q.stateDir()
	return filepath.Join(root, name)
}
// vmMeta is the persisted per-VM metadata written at Run time so that
// Inspect/List can report accurate information after an agent restart.
// It is stored as JSON in meta.json inside the VM's state directory.
type vmMeta struct {
	// Name, Image, User, Restart, Ports, Volumes and Cmd are copied verbatim
	// from the spec the VM was started with.
	Name    string   `json:"name"`
	Image   string   `json:"image"`
	User    string   `json:"user"`
	Restart string   `json:"restart"`
	Ports   []string `json:"ports"`
	Volumes []string `json:"volumes"`
	Cmd     []string `json:"cmd"`
	// MemoryMB and VCPUs are the effective values after defaults were applied.
	MemoryMB int `json:"memory_mb"`
	VCPUs    int `json:"vcpus"`
	// ImageHash is the hex-encoded SHA-256 of the built Nanos disk image.
	ImageHash string `json:"image_hash"`
	// Started is the UTC time recorded right after QEMU was launched.
	Started time.Time `json:"started"`
}
// Pull fetches the OCI image with podman and copies its ELF binary into the
// image cache, marking it executable. Only the binary is cached here; the
// Nanos image is produced later by Run so that the service's command
// arguments can be baked into it.
//
// The binary is expected at /usr/local/bin/<repository basename> inside the
// image. It is copied out of a created-but-never-started scratch container,
// which is force-removed before returning.
func (q *QEMU) Pull(ctx context.Context, image string) error {
	if err := os.MkdirAll(q.imageDir(), 0o750); err != nil {
		return fmt.Errorf("create image dir: %w", err)
	}

	// podman runs one podman subcommand bound to ctx and returns its
	// combined stdout/stderr.
	podman := func(args ...string) ([]byte, error) {
		return exec.CommandContext(ctx, "podman", args...).CombinedOutput() //nolint:gosec // args built programmatically
	}

	// Pulling through podman reuses the agent's registry auth.
	if out, err := podman("pull", image); err != nil {
		return fmt.Errorf("podman pull %q: %w: %s", image, err, out)
	}

	// Clear any leftover scratch container from an earlier attempt, then
	// create a fresh one to copy the binary out of.
	scratch := "ukextract-" + sanitizeImage(image)
	_ = exec.CommandContext(ctx, "podman", "rm", "-f", scratch).Run() //nolint:gosec
	if out, err := podman("create", "--name", scratch, image); err != nil {
		return fmt.Errorf("podman create %q: %w: %s", image, err, out)
	}
	// Cleanup deliberately does not use ctx so it still runs after cancellation.
	defer func() { _ = exec.Command("podman", "rm", "-f", scratch).Run() }() //nolint:gosec

	binary := binaryName(image)
	cached := q.binPath(image)
	if out, err := podman("cp", scratch+":/usr/local/bin/"+binary, cached); err != nil {
		return fmt.Errorf("extract binary %q from %q: %w: %s", binary, image, err, out)
	}

	if err := os.Chmod(cached, 0o755); err != nil { //nolint:gosec // unikernel ELF must be executable
		return fmt.Errorf("chmod extracted binary: %w", err)
	}
	return nil
}
// opsConfig is the subset of the `ops` build configuration we generate.
// Run marshals it to ops.json in the VM state directory and passes that file
// to `ops build` via -c.
type opsConfig struct {
	// Args carries the spec's command and arguments.
	Args []string `json:"Args,omitempty"`
	// Env holds the spec's KEY=VALUE entries split into a map.
	Env map[string]string `json:"Env,omitempty"`
	// RunConfig carries the port, memory and CPU settings.
	RunConfig opsRunConfig `json:"RunConfig"`
}
// opsRunConfig is the RunConfig section of the generated `ops` configuration.
// Run populates only Ports, Memory and CPUs; the remaining fields are left at
// their zero values and therefore omitted from the JSON.
type opsRunConfig struct {
	// Ports lists the guest-side ports of the spec's port mappings.
	Ports []string `json:"Ports,omitempty"`
	// Memory is the guest memory as "<MB>m", e.g. "256m".
	Memory string `json:"Memory,omitempty"`
	// CPUs is the number of guest vCPUs.
	CPUs      int      `json:"CPUs,omitempty"`
	Klibs     []string `json:"Klibs,omitempty"`
	Mounts    any      `json:"Mounts,omitempty"`
	NoTrace   []string `json:"NoTrace,omitempty"`
	GDBPort   int      `json:"GDBPort,omitempty"`
	Nanos     string   `json:"Nanos,omitempty"`
	Hostname  string   `json:"Hostname,omitempty"`
}
// guestPorts returns, for each spec port mapping, the guest-side port: the
// text after the final ":" or the whole entry when it has no ":".
// Handles "host:container", "ip:host:container" and a bare "port".
// A nil or empty input yields a nil slice.
func guestPorts(ports []string) []string {
	var guest []string
	for i := range ports {
		mapping := ports[i]
		guest = append(guest, mapping[strings.LastIndex(mapping, ":")+1:])
	}
	return guest
}
// hostForward renders one spec port mapping as a QEMU user-net hostfwd rule.
//
//	"ip:host:container" -> "tcp:ip:host-:container"
//	"host:container"    -> "tcp:127.0.0.1:host-:container"
//	anything else       -> "tcp:127.0.0.1:first-:first", where first is the
//	                       text before the first ":"
func hostForward(p string) string {
	fields := strings.Split(p, ":")

	// Start from the fallback shape and refine it for the two recognised forms.
	bindAddr, hostPort, guestPort := "127.0.0.1", fields[0], fields[0]
	switch len(fields) {
	case 3:
		bindAddr, hostPort, guestPort = fields[0], fields[1], fields[2]
	case 2:
		guestPort = fields[1]
	}
	return "tcp:" + bindAddr + ":" + hostPort + "-:" + guestPort
}
// Run builds the Nanos image for spec and boots it under QEMU/KVM.
//
// Steps, in order: create the VM state directory, write the generated ops
// config (ops.json), run `ops build` on the binary cached by Pull, move the
// built image from the ops image directory into the image dir, launch a
// daemonized QEMU, and finally persist the VM metadata.
//
// spec.MemoryMB and spec.VCPUs fall back to the runtime default and 1 when
// they are not positive. Env entries without "=" are skipped. An error is
// returned when the binary has not been pulled, when any step fails, or when
// the built image cannot be read back for hashing.
func (q *QEMU) Run(ctx context.Context, spec ContainerSpec) error {
	vmDir := q.vmDir(spec.Name)
	if err := os.MkdirAll(vmDir, 0o750); err != nil {
		return fmt.Errorf("create vm state dir: %w", err)
	}

	mem := spec.MemoryMB
	if mem <= 0 {
		mem = q.memory()
	}
	cpus := spec.VCPUs
	if cpus <= 0 {
		cpus = 1
	}

	// Build the Nanos image from the extracted binary, baking in command args.
	cfg := opsConfig{
		Args: spec.Cmd,
		RunConfig: opsRunConfig{
			Ports:  guestPorts(spec.Ports),
			Memory: strconv.Itoa(mem) + "m",
			CPUs:   cpus,
		},
	}
	if len(spec.Env) > 0 {
		cfg.Env = make(map[string]string, len(spec.Env))
		for _, e := range spec.Env {
			if key, value, ok := strings.Cut(e, "="); ok {
				cfg.Env[key] = value
			}
		}
	}

	cfgPath := filepath.Join(vmDir, "ops.json")
	cfgBytes, err := json.MarshalIndent(cfg, "", " ")
	if err != nil {
		return fmt.Errorf("marshal ops config: %w", err)
	}
	if err := os.WriteFile(cfgPath, cfgBytes, 0o640); err != nil { //nolint:gosec // mcp-group-readable config
		return fmt.Errorf("write ops config: %w", err)
	}

	img := q.imgPath(spec.Name)
	bin := q.binPath(spec.Image)
	if _, err := os.Stat(bin); err != nil {
		return fmt.Errorf("binary not found for %q (Pull first): %w", spec.Image, err)
	}

	build := exec.CommandContext(ctx, q.opsPath(), "build", bin, "-c", cfgPath, "-i", spec.Name) //nolint:gosec
	build.Env = q.opsEnv()
	if out, err := build.CombinedOutput(); err != nil {
		return fmt.Errorf("ops build %q: %w: %s", spec.Name, err, out)
	}

	// ops writes to ~/.ops/images/<name>.img; move it into our image dir.
	opsImg := filepath.Join(q.opsImagesDir(), spec.Name+".img")
	if _, err := os.Stat(opsImg); err == nil {
		if err := os.Rename(opsImg, img); err != nil {
			// Cross-device fallback: copy. Report both failures so the
			// real cause is not hidden behind the rename error.
			if cpErr := copyFile(opsImg, img); cpErr != nil {
				return fmt.Errorf("relocate built image: %w (copy fallback: %v)", err, cpErr)
			}
			_ = os.Remove(opsImg)
		}
	}

	// Hashing doubles as the check that a readable image is actually in
	// place; failing here is clearer than an opaque QEMU launch error.
	hash, err := fileSHA256(img)
	if err != nil {
		return fmt.Errorf("built image for %q not usable: %w", spec.Name, err)
	}

	// QEMU option strings are comma-separated; a literal comma inside a
	// value has to be doubled.
	qemuOpt := func(value string) string { return strings.ReplaceAll(value, ",", ",,") }

	// Assemble QEMU invocation: KVM-accelerated, headless, serial console to a
	// file, QMP control socket, virtio disk + NIC with user-mode port forwards.
	netdev := "user,id=n0"
	for _, p := range spec.Ports {
		netdev += ",hostfwd=" + hostForward(p)
	}
	args := []string{
		"-enable-kvm",
		"-m", strconv.Itoa(mem),
		"-smp", strconv.Itoa(cpus),
		"-display", "none",
		"-no-reboot",
		"-daemonize",
		"-pidfile", filepath.Join(vmDir, "qemu.pid"),
		"-serial", "file:" + filepath.Join(vmDir, "console.log"),
		"-qmp", "unix:" + filepath.Join(vmDir, "qmp.sock") + ",server,nowait",
		"-drive", "file=" + qemuOpt(img) + ",format=raw,if=virtio",
		"-device", "virtio-net-pci,netdev=n0",
		"-netdev", netdev,
	}

	// 9p passthrough for host /srv/<service> volumes (best-effort; Nanos must
	// support the 9p client for the guest to mount it). Only the host side of
	// each "host:container" entry is used.
	for i, v := range spec.Volumes {
		host, _, _ := strings.Cut(v, ":")
		tag := fmt.Sprintf("srv%d", i)
		args = append(args,
			"-virtfs", fmt.Sprintf("local,path=%s,mount_tag=%s,security_model=none,id=%s", qemuOpt(host), tag, tag),
		)
	}

	cmd := exec.CommandContext(ctx, q.qemuPath(), args...) //nolint:gosec
	if out, err := cmd.CombinedOutput(); err != nil {
		return fmt.Errorf("qemu launch %q: %w: %s", spec.Name, err, out)
	}

	meta := vmMeta{
		Name:      spec.Name,
		Image:     spec.Image,
		User:      spec.User,
		Restart:   spec.Restart,
		Ports:     spec.Ports,
		Volumes:   spec.Volumes,
		Cmd:       spec.Cmd,
		MemoryMB:  mem,
		VCPUs:     cpus,
		ImageHash: hash,
		Started:   time.Now().UTC(),
	}
	return q.writeMeta(spec.Name, meta)
}
// opsImagesDir returns the directory `ops build` writes images to:
// <home>/.ops/images, where <home> is HomeDir when set and otherwise the
// current user's home directory (a lookup failure leaves it empty).
func (q *QEMU) opsImagesDir() string {
	base := q.HomeDir
	if base == "" {
		base, _ = os.UserHomeDir()
	}
	return filepath.Join(base, ".ops", "images")
}
// writeMeta serialises m as indented JSON into meta.json inside the state
// directory of the VM called name.
func (q *QEMU) writeMeta(name string, m vmMeta) error {
	encoded, err := json.MarshalIndent(m, "", " ")
	if err != nil {
		return fmt.Errorf("marshal vm meta: %w", err)
	}
	target := filepath.Join(q.vmDir(name), "meta.json")
	return os.WriteFile(target, encoded, 0o640) //nolint:gosec // mcp-group-readable metadata
}
// readMeta loads and decodes meta.json from the state directory of the VM
// called name. A read failure is returned unchanged, so callers can still
// test it with os.IsNotExist; a decode failure is wrapped. The returned
// vmMeta is the zero value whenever the error is non-nil.
func (q *QEMU) readMeta(name string) (vmMeta, error) {
	b, err := os.ReadFile(filepath.Join(q.vmDir(name), "meta.json"))
	if err != nil {
		return vmMeta{}, err
	}
	// Decode before building the return values: in a combined
	// `return m, json.Unmarshal(b, &m)` the language does not specify
	// whether m is read before or after the call fills it in.
	var m vmMeta
	if err := json.Unmarshal(b, &m); err != nil {
		return vmMeta{}, fmt.Errorf("decode vm meta %q: %w", name, err)
	}
	return m, nil
}
// pidOf reports the pid of the live QEMU process for the VM called name.
// It returns 0 when the pidfile is missing or unreadable, does not contain a
// positive integer, or when signal 0 cannot be delivered to that pid.
func (q *QEMU) pidOf(name string) int {
	raw, err := os.ReadFile(filepath.Join(q.vmDir(name), "qemu.pid"))
	if err != nil {
		return 0
	}
	pid, convErr := strconv.Atoi(strings.TrimSpace(string(raw)))
	switch {
	case convErr != nil, pid <= 0:
		return 0
	case syscall.Kill(pid, 0) != nil:
		// Signal 0 only probes for existence; nothing is delivered.
		return 0
	}
	return pid
}
// Stop shuts down the VM called name. It is a no-op when the VM is not
// running.
//
// The sequence is: request a QMP system_powerdown and wait up to 10s for the
// QEMU process to exit; send SIGTERM and wait up to 2s; finally send SIGKILL.
// Each wait ends as soon as the process is gone. If ctx is cancelled while
// waiting, Stop returns ctx.Err() without escalating further. A failure to
// deliver SIGKILL to a process that still exists is returned as an error.
func (q *QEMU) Stop(ctx context.Context, name string) error {
	if ctx == nil {
		ctx = context.Background()
	}
	pid := q.pidOf(name)
	if pid == 0 {
		return nil
	}

	// waitExit polls the pidfile until the process is gone, the grace period
	// lapses, or ctx is cancelled. It reports whether the process exited.
	waitExit := func(grace time.Duration) (bool, error) {
		limit := time.NewTimer(grace)
		defer limit.Stop()
		poll := time.NewTicker(300 * time.Millisecond)
		defer poll.Stop()
		for {
			if q.pidOf(name) == 0 {
				return true, nil
			}
			select {
			case <-ctx.Done():
				return false, ctx.Err()
			case <-limit.C:
				return q.pidOf(name) == 0, nil
			case <-poll.C:
			}
		}
	}

	// Try a graceful QMP system_powerdown; failure just means we escalate.
	_ = q.qmpCommand(name, "system_powerdown")
	if gone, err := waitExit(10 * time.Second); gone || err != nil {
		return err
	}

	// Escalate.
	_ = syscall.Kill(pid, syscall.SIGTERM)
	if gone, err := waitExit(2 * time.Second); gone || err != nil {
		return err
	}
	if err := syscall.Kill(pid, syscall.SIGKILL); err != nil && err != syscall.ESRCH {
		return fmt.Errorf("kill qemu pid %d for %q: %w", pid, name, err)
	}
	return nil
}
// qmpCommand sends a single argument-less QMP command over the control
// socket of the VM called name and waits for QEMU's reply.
//
// The exchange is: read the server greeting, negotiate with
// qmp_capabilities, then execute command. The whole conversation is bounded
// by a deadline so a wedged QEMU cannot block the caller indefinitely.
// An error is returned when the socket cannot be reached, when any read or
// write fails, or when QEMU answers either request with an "error" reply.
func (q *QEMU) qmpCommand(name, command string) error {
	sock := filepath.Join(q.vmDir(name), "qmp.sock")
	conn, err := dialUnix(sock, 3*time.Second)
	if err != nil {
		return fmt.Errorf("qmp connect %q: %w", name, err)
	}
	defer func() { _ = conn.Close() }()
	if err := conn.SetDeadline(time.Now().Add(5 * time.Second)); err != nil {
		return fmt.Errorf("qmp set deadline: %w", err)
	}

	dec := json.NewDecoder(conn)
	var greeting map[string]any
	if err := dec.Decode(&greeting); err != nil {
		return fmt.Errorf("qmp greeting: %w", err)
	}

	// execute sends one command and reads messages until its reply arrives.
	// Messages carrying neither "return" nor "error" (asynchronous events)
	// are skipped.
	execute := func(cmd string) error {
		// Marshal rather than concatenate so the command name is always
		// valid JSON.
		req, err := json.Marshal(map[string]string{"execute": cmd})
		if err != nil {
			return fmt.Errorf("qmp encode %q: %w", cmd, err)
		}
		if _, err := conn.Write(req); err != nil {
			return fmt.Errorf("qmp send %q: %w", cmd, err)
		}
		for {
			var reply map[string]any
			if err := dec.Decode(&reply); err != nil {
				return fmt.Errorf("qmp reply to %q: %w", cmd, err)
			}
			if qerr, failed := reply["error"]; failed {
				return fmt.Errorf("qmp %q rejected: %v", cmd, qerr)
			}
			if _, done := reply["return"]; done {
				return nil
			}
		}
	}

	if err := execute("qmp_capabilities"); err != nil {
		return err
	}
	return execute(command)
}
// Remove tears down the VM called name: it attempts a Stop, force-kills the
// QEMU process if it is still alive afterwards, and then deletes the VM's
// state directory. Only the result of the directory removal is returned.
func (q *QEMU) Remove(ctx context.Context, name string) error {
	_ = q.Stop(ctx, name)

	pid := q.pidOf(name)
	if pid != 0 {
		_ = syscall.Kill(pid, syscall.SIGKILL)
	}

	dir := q.vmDir(name)
	return os.RemoveAll(dir)
}
// Inspect returns the observed state of the VM called name, combining its
// persisted metadata with a liveness check: "running" when a live QEMU pid
// is found, "stopped" otherwise. It fails when the metadata cannot be read.
func (q *QEMU) Inspect(ctx context.Context, name string) (ContainerInfo, error) {
	meta, err := q.readMeta(name)
	if err != nil {
		return ContainerInfo{}, fmt.Errorf("qemu inspect %q: %w", name, err)
	}
	if q.pidOf(name) == 0 {
		return q.infoFromMeta(meta, "stopped"), nil
	}
	return q.infoFromMeta(meta, "running"), nil
}
// infoFromMeta converts persisted VM metadata plus an observed state string
// into a ContainerInfo. Network is always reported as "user" and Version is
// derived from the image reference.
func (q *QEMU) infoFromMeta(m vmMeta, state string) ContainerInfo {
	var info ContainerInfo
	info.Name = m.Name
	info.Image = m.Image
	info.State = state
	info.Network = "user"
	info.User = m.User
	info.Restart = m.Restart
	info.Ports = m.Ports
	info.Volumes = m.Volumes
	info.Cmd = m.Cmd
	info.Version = ExtractVersion(m.Image)
	info.Started = m.Started
	return info
}
// List reports every VM that has a readable meta.json under the state
// directory. A missing state directory yields no VMs and no error; entries
// that are not directories or whose metadata cannot be read are skipped.
func (q *QEMU) List(ctx context.Context) ([]ContainerInfo, error) {
	entries, err := os.ReadDir(q.stateDir())
	switch {
	case err == nil:
		// fall through to the scan below
	case os.IsNotExist(err):
		return nil, nil
	default:
		return nil, fmt.Errorf("read vm state dir: %w", err)
	}

	var vms []ContainerInfo
	for _, entry := range entries {
		if !entry.IsDir() {
			continue
		}
		name := entry.Name()
		meta, metaErr := q.readMeta(name)
		if metaErr != nil {
			continue
		}
		state := "running"
		if q.pidOf(name) == 0 {
			state = "stopped"
		}
		vms = append(vms, q.infoFromMeta(meta, state))
	}
	return vms, nil
}
// Logs returns an unstarted `tail` command over the VM's serial console log.
// follow adds -f; a positive tail limits output to that many trailing lines,
// otherwise the whole file is printed from line 1. The timestamps and since
// arguments are accepted but not used by this runtime.
func (q *QEMU) Logs(ctx context.Context, name string, tail int, follow, timestamps bool, since string) *exec.Cmd {
	lineSpec := "+1"
	if tail > 0 {
		lineSpec = strconv.Itoa(tail)
	}

	var tailArgs []string
	if follow {
		tailArgs = append(tailArgs, "-f")
	}
	tailArgs = append(tailArgs, "-n", lineSpec, filepath.Join(q.vmDir(name), "console.log"))

	return exec.CommandContext(ctx, "tail", tailArgs...) //nolint:gosec
}
// Build is not implemented for the QEMU runtime and always returns an error.
// The supported path is to build an OCI image and then Pull it.
func (q *QEMU) Build(ctx context.Context, image, contextDir, dockerfile string) error {
	err := fmt.Errorf("qemu build: not implemented; build OCI image then Pull")
	return err
}
// Push always fails: the QEMU runtime has no push operation.
func (q *QEMU) Push(ctx context.Context, image string) error {
	err := fmt.Errorf("qemu push: not implemented")
	return err
}
// ImageExists reports whether the binary extracted from image is present in
// the image cache. A missing file is (false, nil); any other Stat failure
// (for example a permission error on the cache directory) is returned as an
// error instead of being reported as "not cached".
func (q *QEMU) ImageExists(ctx context.Context, image string) (bool, error) {
	_, err := os.Stat(q.binPath(image))
	switch {
	case err == nil:
		return true, nil
	case os.IsNotExist(err):
		return false, nil
	default:
		return false, fmt.Errorf("stat cached binary for %q: %w", image, err)
	}
}
// Login authenticates against registry by running `podman login`, so the
// credentials land in podman's store, which Pull also uses. The token is fed
// on stdin rather than passed as an argument.
func (q *QEMU) Login(ctx context.Context, registry, username, token string) error {
	login := exec.CommandContext(ctx, "podman", "login", "--username", username, "--password-stdin", registry) //nolint:gosec
	login.Stdin = strings.NewReader(token)

	out, err := login.CombinedOutput()
	if err == nil {
		return nil
	}
	return fmt.Errorf("podman login %q: %w: %s", registry, err, out)
}
// fileSHA256 returns the lowercase hex SHA-256 digest of the file at path.
// The file is streamed through the hash, so memory use does not grow with
// the size of the file (disk images can be large).
func fileSHA256(path string) (string, error) {
	f, err := os.Open(path) //nolint:gosec // hashing a known image path
	if err != nil {
		return "", err
	}
	defer func() { _ = f.Close() }()

	h := sha256.New()
	if _, err := io.Copy(h, f); err != nil {
		return "", err
	}
	return hex.EncodeToString(h.Sum(nil)), nil
}
// copyFile copies the contents of src to dst, creating dst with mode 0640 or
// truncating it if it already exists. The data is streamed rather than
// loaded into memory, and a failure while flushing dst on close is reported.
func copyFile(src, dst string) error {
	in, err := os.Open(src) //nolint:gosec // relocating a built image
	if err != nil {
		return err
	}
	defer func() { _ = in.Close() }()

	out, err := os.OpenFile(dst, os.O_WRONLY|os.O_CREATE|os.O_TRUNC, 0o640) //nolint:gosec // relocating a built image
	if err != nil {
		return err
	}
	if _, err := io.Copy(out, in); err != nil {
		_ = out.Close()
		return err
	}
	// Close can surface a deferred write error, so its result is the result.
	return out.Close()
}

View File

@@ -0,0 +1,43 @@
package runtime
import "testing"
// TestSanitizeImage checks that registry port, path and tag separators are
// all flattened to underscores.
func TestSanitizeImage(t *testing.T) {
	const want = "mcr.example_8443_mcdoc_v0.1.0"
	got := sanitizeImage("mcr.example:8443/mcdoc:v0.1.0")
	if got == want {
		return
	}
	t.Errorf("sanitizeImage = %q", got)
}
// TestBinaryName checks that the repository basename is extracted from image
// references with and without a registry host, port and tag.
func TestBinaryName(t *testing.T) {
	table := []struct{ image, want string }{
		{"host:8443/mcdoc:v0.1.0", "mcdoc"},
		{"mcdoc:v1", "mcdoc"},
		{"reg/uktest", "uktest"},
	}
	for _, tc := range table {
		got := binaryName(tc.image)
		if got == tc.want {
			continue
		}
		t.Errorf("binaryName(%q) = %q, want %q", tc.image, got, tc.want)
	}
}
// TestHostForward checks the hostfwd rule produced for each accepted port
// mapping shape: explicit bind address, host:container, and a bare port.
func TestHostForward(t *testing.T) {
	table := []struct{ mapping, want string }{
		{"100.88.197.9:18080:8080", "tcp:100.88.197.9:18080-:8080"},
		{"18080:8080", "tcp:127.0.0.1:18080-:8080"},
		{"8080", "tcp:127.0.0.1:8080-:8080"},
	}
	for _, tc := range table {
		got := hostForward(tc.mapping)
		if got == tc.want {
			continue
		}
		t.Errorf("hostForward(%q) = %q, want %q", tc.mapping, got, tc.want)
	}
}
// TestGuestPorts checks that the guest-side port is picked out of both a
// three-part mapping and a bare port.
func TestGuestPorts(t *testing.T) {
	want := []string{"8080", "9090"}
	got := guestPorts([]string{"100.88.197.9:18080:8080", "9090"})

	mismatch := len(got) != len(want)
	for i := 0; !mismatch && i < len(want); i++ {
		mismatch = got[i] != want[i]
	}
	if mismatch {
		t.Errorf("guestPorts = %v", got)
	}
}

View File

@@ -17,6 +17,10 @@ type ContainerSpec struct {
Volumes []string // "host:container" volume mounts
Cmd []string // command and arguments
Env []string // environment variables (KEY=VALUE)
// Unikernel-only fields (ignored by the container runtime).
MemoryMB int // guest memory in MB (default 256)
VCPUs int // guest vCPUs (default 1)
}
// ContainerInfo describes the observed state of a running or stopped container.