Files
mcp/internal/runtime/qemu.go
Kyle Isom 84dd897bcd unikernel: bake volume config into images + per-service user-mode net
Enables migrating real services (config/cert dirs, stateless) to
unikernels. Volume host dirs are copied into a per-VM staging tree
mirroring guest paths; the ops config goes in the staging root with the
top-level dirs in Dirs, so ops bakes them at the right absolute paths.
(Staging is required — an absolute /srv MapDirs source makes ops descend
into the agent's podman overlay storage and fail.) A component may set
network = "user" to use QEMU user-mode NAT instead of the isolated
bridge (Phase-1 networking for first migrations, before a gateway proxy).

Verified: mcat (the MCIAS policy tester) deployed as a Nanos unikernel
via 'mcp deploy', booting with its baked /srv/mcat config+certs, serving
HTTPS verified against the platform CA, configured against MCIAS.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
2026-06-11 10:16:30 -07:00

805 lines
24 KiB
Go

package runtime
import (
	"context"
	"crypto/sha256"
	"encoding/hex"
	"encoding/json"
	"fmt"
	"io"
	"net"
	"os"
	"os/exec"
	"path/filepath"
	"strconv"
	"strings"
	"syscall"
	"time"
)
// dialUnix opens a connection to the unix-domain socket at path, giving up
// once timeout has elapsed.
func dialUnix(path string, timeout time.Duration) (net.Conn, error) {
	d := net.Dialer{Timeout: timeout}
	return d.Dial("unix", path)
}
// QEMU implements the Runtime interface by running services as Nanos
// unikernel virtual machines under QEMU/KVM instead of containers.
//
// Each service component becomes a single-process VM with its own kernel.
// The lifecycle maps onto the same Runtime interface as Podman so the agent
// can treat unikernels and containers uniformly:
//
// Pull -> pull the OCI image, extract the ELF binary, cache it
// Run -> `ops build` the binary into a Nanos image, boot it under QEMU
// Stop -> graceful QMP powerdown, then SIGTERM/SIGKILL the QEMU process
// Remove -> stop and delete the VM state directory
// Inspect -> read persisted metadata + check process liveness
// List -> enumerate VM state directories
// Logs -> stream the serial console log file
//
// Networking has two modes. With Bridge empty, VMs use QEMU user-mode
// networking with host port forwards (Phase 1), which is functionally
// equivalent to rootless podman's localhost port mappings: mc-proxy routes to
// 127.0.0.1:<hostport> exactly as it does for containers. With Bridge set,
// VMs are attached to an isolated host-only bridge through a TAP device and
// given a static IP (Phase 2), unless the component's spec sets its network
// to "user".
//
// The zero value is usable: each path and size field falls back to the
// default documented on it when left unset.
type QEMU struct {
// ImageDir holds built Nanos images (<name>.img) and extracted binaries
// (<sanitized image ref>.bin).
// Default: /srv/mcp/images
ImageDir string
// StateDir holds per-VM runtime state (pidfile, QMP socket, console log,
// metadata), one subdirectory per VM, plus the bridge IP allocation table
// (ips.json). Default: /srv/mcp/vm
StateDir string
// OpsPath is the path to the `ops` Nanos toolchain binary. Default: "ops".
OpsPath string
// QemuPath is the path to qemu-system-x86_64. Default: "qemu-system-x86_64".
QemuPath string
// Memory is the default guest memory in MB when a spec does not set one.
// Values <= 0 fall back to 256.
Memory int
// HomeDir is set as $HOME for `ops` so it uses a stable ~/.ops directory.
// It is also where built images are looked up (<HomeDir>/.ops/images);
// when empty, the current user's home directory is used for that lookup.
HomeDir string
// Bridge, when set, switches VMs from QEMU user-mode networking to an
// isolated host-only bridge (Phase 2). Each VM gets a TAP device on the
// bridge and a static IP; a host firewall confines it to reaching only
// the bridge gateway (where mc-proxy listens). Empty = user-mode.
Bridge string // e.g. "mcp-br0"
// Gateway is the bridge's host IP (and the VMs' default route / mc-proxy
// address), e.g. "10.99.0.1". Only used in bridge mode.
Gateway string
// SubnetPrefix is the /24 network prefix VMs are numbered in, e.g.
// "10.99.0" (VMs get .2 .. .254). Only used in bridge mode.
SubnetPrefix string
}
// bridgeMode tells whether an isolated bridge has been configured, i.e.
// whether the Bridge field is non-empty.
func (q *QEMU) bridgeMode() bool {
	return len(q.Bridge) > 0
}
// imageDir returns the directory for built images and extracted binaries:
// the configured ImageDir, or /srv/mcp/images when none is set.
func (q *QEMU) imageDir() string {
	if q.ImageDir == "" {
		return "/srv/mcp/images"
	}
	return q.ImageDir
}
// stateDir returns the root of the per-VM state tree: the configured
// StateDir, or /srv/mcp/vm when none is set.
func (q *QEMU) stateDir() string {
	if q.StateDir == "" {
		return "/srv/mcp/vm"
	}
	return q.StateDir
}
// opsPath returns the `ops` executable to invoke: the configured OpsPath, or
// the bare name "ops" when none is set.
func (q *QEMU) opsPath() string {
	if q.OpsPath == "" {
		return "ops"
	}
	return q.OpsPath
}
// qemuPath returns the QEMU executable to invoke: the configured QemuPath, or
// the bare name "qemu-system-x86_64" when none is set.
func (q *QEMU) qemuPath() string {
	if q.QemuPath == "" {
		return "qemu-system-x86_64"
	}
	return q.QemuPath
}
// memory returns the default guest memory in MB: the configured Memory when
// it is positive, otherwise 256.
func (q *QEMU) memory() int {
	if q.Memory <= 0 {
		return 256
	}
	return q.Memory
}
// opsEnv builds the environment passed to `ops`: the agent's own environment,
// with a HOME entry appended when HomeDir is configured so the ops cache and
// image directory stay in one place across invocations.
func (q *QEMU) opsEnv() []string {
	inherited := os.Environ()
	if q.HomeDir == "" {
		return inherited
	}
	return append(inherited, "HOME="+q.HomeDir)
}
// sanitizeImage turns an image reference into a filesystem-safe stem by
// replacing the reference separators '/', ':' and '@' with '_'.
//
// "mcr.example:8443/mcdoc:v0.1.0" -> "mcr.example_8443_mcdoc_v0.1.0"
// "mcr.example/mcdoc@sha256:ab12" -> "mcr.example_mcdoc_sha256_ab12"
//
// '@' is replaced as well because the stem is reused as part of a podman
// container name, where '@' is not an accepted character; without this,
// digest-pinned references could not be pulled.
func sanitizeImage(image string) string {
	r := strings.NewReplacer("/", "_", ":", "_", "@", "_")
	return r.Replace(image)
}
// binaryName derives the in-image ELF binary name from an image reference by
// taking the repository basename: everything after the last '/' and before
// any ":tag" or "@digest" suffix.
//
// "host:8443/mcdoc:v0.1.0" -> "mcdoc"
// "host:8443/mcdoc@sha256:ab12" -> "mcdoc"
func binaryName(image string) string {
	name := image
	if i := strings.LastIndex(name, "/"); i >= 0 {
		name = name[i+1:]
	}
	// After the last '/', the repository name ends at the first ':' (tag) or
	// '@' (digest), whichever comes first. Looking for ':' alone would leave
	// "mcdoc@sha256" for a digest-pinned reference.
	if i := strings.IndexAny(name, ":@"); i >= 0 {
		name = name[:i]
	}
	return name
}
// binPath returns where the binary extracted from image is cached: a ".bin"
// file in the image directory named after the sanitized image reference.
func (q *QEMU) binPath(image string) string {
	file := sanitizeImage(image) + ".bin"
	return filepath.Join(q.imageDir(), file)
}
// imgPath returns where the built Nanos image for the VM called name is
// kept: "<name>.img" in the image directory.
func (q *QEMU) imgPath(name string) string {
	file := name + ".img"
	return filepath.Join(q.imageDir(), file)
}
// vmDir returns the state directory belonging to the VM called name, a
// subdirectory of the state root.
func (q *QEMU) vmDir(name string) string {
	root := q.stateDir()
	return filepath.Join(root, name)
}
// vmMeta is the persisted per-VM metadata written at Run time so that
// Inspect/List can report accurate information after an agent restart.
// It is stored as meta.json in the VM's state directory.
type vmMeta struct {
// Name, Image, User, Restart, Ports, Volumes and Cmd are copied verbatim
// from the spec the VM was started with.
Name string `json:"name"`
Image string `json:"image"`
User string `json:"user"`
Restart string `json:"restart"`
Ports []string `json:"ports"`
Volumes []string `json:"volumes"`
Cmd []string `json:"cmd"`
// MemoryMB and VCPUs are the effective values after defaults were applied.
MemoryMB int `json:"memory_mb"`
VCPUs int `json:"vcpus"`
// ImageHash is the hex SHA-256 of the built image file, or empty if the
// image could not be hashed.
ImageHash string `json:"image_hash"`
IP string `json:"ip,omitempty"` // bridge-mode static IP
// Started is the UTC time recorded right after QEMU was launched.
Started time.Time `json:"started"`
}
// Pull fetches the OCI image with podman and copies its ELF binary into the
// image cache. Only the binary is prepared here; the Nanos image is built at
// Run time, because the service's command arguments get baked into it.
//
// The binary is expected at /usr/local/bin/<repository basename> inside the
// image. It returns an error if the image directory cannot be created or if
// any podman step or the final chmod fails.
func (q *QEMU) Pull(ctx context.Context, image string) error {
	if err := os.MkdirAll(q.imageDir(), 0o750); err != nil {
		return fmt.Errorf("create image dir: %w", err)
	}

	// Let podman do the pull so the agent's registry auth is reused.
	pull := exec.CommandContext(ctx, "podman", "pull", image) //nolint:gosec // args built programmatically
	if out, err := pull.CombinedOutput(); err != nil {
		return fmt.Errorf("podman pull %q: %w: %s", image, err, out)
	}

	// A container that is created but never started is enough to copy files
	// out of. Clear any leftover with the same name first; failure to remove
	// is ignored since usually there is nothing to remove.
	scratch := "ukextract-" + sanitizeImage(image)
	_ = exec.CommandContext(ctx, "podman", "rm", "-f", scratch).Run() //nolint:gosec
	create := exec.CommandContext(ctx, "podman", "create", "--name", scratch, image) //nolint:gosec
	if out, err := create.CombinedOutput(); err != nil {
		return fmt.Errorf("podman create %q: %w: %s", image, err, out)
	}
	// Deliberately not tied to ctx, so the scratch container is removed even
	// when ctx has been cancelled by the time we return.
	defer func() { _ = exec.Command("podman", "rm", "-f", scratch).Run() }() //nolint:gosec

	elf := binaryName(image)
	cached := q.binPath(image)
	copyOut := exec.CommandContext(ctx, "podman", "cp", scratch+":/usr/local/bin/"+elf, cached) //nolint:gosec
	if out, err := copyOut.CombinedOutput(); err != nil {
		return fmt.Errorf("extract binary %q from %q: %w: %s", elf, image, err, out)
	}
	if err := os.Chmod(cached, 0o755); err != nil { //nolint:gosec // unikernel ELF must be executable
		return fmt.Errorf("chmod extracted binary: %w", err)
	}
	return nil
}
// opsConfig is the subset of the `ops` build configuration we generate.
// It is marshalled to ops.json and handed to `ops build -c`.
type opsConfig struct {
// Args are the command-line arguments baked into the image (the spec's Cmd).
Args []string `json:"Args,omitempty"`
// Env holds the environment baked into the image, parsed from the spec's
// KEY=VALUE entries.
Env map[string]string `json:"Env,omitempty"`
Dirs []string `json:"Dirs,omitempty"` // dirs (relative to the config file) baked into the image
// RunConfig carries the ports, memory, CPU and network settings.
RunConfig opsRunConfig `json:"RunConfig"`
}
// opsRunConfig mirrors the RunConfig section of the `ops` configuration.
// Within this file only Ports, Memory and CPUs are always populated, and
// IPAddress, NetMask and Gateway are populated in bridge mode; the remaining
// fields are left at their zero values and omitted from the JSON.
type opsRunConfig struct {
// Ports lists the guest ports the service listens on.
Ports []string `json:"Ports,omitempty"`
// Memory is the guest memory size, written as "<MB>m".
Memory string `json:"Memory,omitempty"`
// CPUs is the number of virtual CPUs.
CPUs int `json:"CPUs,omitempty"`
Klibs []string `json:"Klibs,omitempty"`
Mounts any `json:"Mounts,omitempty"`
NoTrace []string `json:"NoTrace,omitempty"`
GDBPort int `json:"GDBPort,omitempty"`
Nanos string `json:"Nanos,omitempty"`
Hostname string `json:"Hostname,omitempty"`
IPAddress string `json:"IPAddress,omitempty"` // static IP for bridge mode
// NetMask and Gateway accompany IPAddress in bridge mode.
NetMask string `json:"NetMask,omitempty"`
Gateway string `json:"Gateway,omitempty"`
}
// guestPorts returns, for every spec port mapping, the guest (container) side
// of it: the text after the last ':'. The accepted shapes are
// "host:container", "ip:host:container", and a bare "port", which is returned
// unchanged. An empty input yields a nil slice.
func guestPorts(ports []string) []string {
	var guests []string
	for i := range ports {
		mapping := ports[i]
		// LastIndex is -1 when there is no ':', so the slice starts at 0 and
		// the bare port comes back whole.
		guests = append(guests, mapping[strings.LastIndex(mapping, ":")+1:])
	}
	return guests
}
// hostForward renders a spec port mapping as a QEMU user-mode hostfwd value.
//
// "ip:host:container" -> "tcp:ip:host-:container"
// "host:container" -> "tcp:127.0.0.1:host-:container"
//
// Any other shape (notably a bare "port") forwards the first field to itself
// on 127.0.0.1.
func hostForward(p string) string {
	fields := strings.Split(p, ":")
	bind, hostPort, guestPort := "127.0.0.1", fields[0], fields[0]
	switch len(fields) {
	case 3:
		bind, hostPort, guestPort = fields[0], fields[1], fields[2]
	case 2:
		guestPort = fields[1]
	}
	return fmt.Sprintf("tcp:%s:%s-:%s", bind, hostPort, guestPort)
}
// Run builds the Nanos image for spec and boots it under QEMU/KVM as a
// daemonized process, then persists the VM's metadata.
//
// The image is built with `ops build` from the binary that Pull cached, with
// the spec's command, environment, guest ports, memory and CPU count baked
// in. Volumes are baked in too (see stageVolumes). Memory falls back to the
// runtime default and vCPUs to 1 when the spec leaves them unset.
//
// Networking is chosen per component: when a bridge is configured and the
// spec's Network is not "user", the VM gets a static IP and a TAP device on
// the bridge; otherwise it uses QEMU user-mode networking with one host port
// forward per spec port.
//
// It returns an error if any preparation step, the image build, or the QEMU
// launch fails. A TAP device created for the VM is removed again when the
// launch fails.
func (q *QEMU) Run(ctx context.Context, spec ContainerSpec) error {
	vmDir := q.vmDir(spec.Name)
	if err := os.MkdirAll(vmDir, 0o750); err != nil {
		return fmt.Errorf("create vm state dir: %w", err)
	}
	mem := spec.MemoryMB
	if mem <= 0 {
		mem = q.memory()
	}
	cpus := spec.VCPUs
	if cpus <= 0 {
		cpus = 1
	}

	// A component can opt out of the isolated bridge with network = "user"
	// (e.g. a stateless service that needs NAT egress to its dependencies
	// before the gateway proxy exists). Otherwise unikernels use the bridge
	// when one is configured.
	useBridge := q.bridgeMode() && spec.Network != "user"

	// In bridge mode, allocate a static IP on the isolated bridge for this VM.
	var vmIP string
	if useBridge {
		ip, err := q.allocateIP(spec.Name)
		if err != nil {
			return fmt.Errorf("allocate VM IP: %w", err)
		}
		vmIP = ip
	}

	// Build the Nanos image from the extracted binary, baking in command args.
	cfg := opsConfig{
		Args: spec.Cmd,
		RunConfig: opsRunConfig{
			Ports:  guestPorts(spec.Ports),
			Memory: strconv.Itoa(mem) + "m",
			CPUs:   cpus,
		},
	}
	if useBridge {
		cfg.RunConfig.IPAddress = vmIP
		cfg.RunConfig.NetMask = "255.255.255.0"
		cfg.RunConfig.Gateway = q.Gateway
	}

	// Bake each volume's host directory into the image (read-only). Unikernels
	// have no writable host mount yet, so this suits config/cert directories;
	// stateful services need a different storage story (9p/virtio-blk).
	//
	// The ops config goes in the staging root when volumes are baked, so its
	// relative Dirs resolve to the staged tree; otherwise it stays in the VM
	// state dir.
	cfgPath := filepath.Join(vmDir, "ops.json")
	bakeDir := ""
	if len(spec.Volumes) > 0 {
		bakeDir = filepath.Join(vmDir, "bake")
		dirs, err := q.stageVolumes(bakeDir, spec.Volumes)
		if err != nil {
			return err
		}
		cfg.Dirs = dirs
		cfgPath = filepath.Join(bakeDir, "ops.json")
	}

	if len(spec.Env) > 0 {
		cfg.Env = map[string]string{}
		for _, e := range spec.Env {
			// Entries without '=' carry no value and are skipped.
			if i := strings.Index(e, "="); i >= 0 {
				cfg.Env[e[:i]] = e[i+1:]
			}
		}
	}

	cfgBytes, err := json.MarshalIndent(cfg, "", " ")
	if err != nil {
		return fmt.Errorf("marshal ops config: %w", err)
	}
	if err := os.WriteFile(cfgPath, cfgBytes, 0o640); err != nil { //nolint:gosec // mcp-group-readable config
		return fmt.Errorf("write ops config: %w", err)
	}

	img := q.imgPath(spec.Name)
	bin := q.binPath(spec.Image)
	if _, err := os.Stat(bin); err != nil {
		return fmt.Errorf("binary not found for %q (Pull first): %w", spec.Image, err)
	}
	build := exec.CommandContext(ctx, q.opsPath(), "build", bin, "-c", cfgPath, "-i", spec.Name) //nolint:gosec
	build.Env = q.opsEnv()
	if bakeDir != "" {
		build.Dir = bakeDir
	}
	if out, err := build.CombinedOutput(); err != nil {
		return fmt.Errorf("ops build %q: %w: %s", spec.Name, err, out)
	}

	// ops writes to ~/.ops/images/<name>.img; move it into our image dir.
	opsImg := filepath.Join(q.opsImagesDir(), spec.Name+".img")
	if _, err := os.Stat(opsImg); err == nil {
		if err := os.Rename(opsImg, img); err != nil {
			// Cross-device fallback: copy. Report both failures, since the
			// copy error is the one that actually stopped us.
			if cpErr := copyFile(opsImg, img); cpErr != nil {
				return fmt.Errorf("relocate built image: rename: %w; copy: %w", err, cpErr)
			}
			_ = os.Remove(opsImg)
		}
	}
	// The hash is informational only; an unreadable image surfaces below as
	// a QEMU launch failure.
	hash, _ := fileSHA256(img)

	// Assemble QEMU invocation: KVM-accelerated, headless, serial console to a
	// file, QMP control socket, virtio disk + virtio NIC.
	//
	// Networking has two modes:
	// user-mode (Phase 1): host port forwards to localhost, like rootless
	// podman. mc-proxy routes to 127.0.0.1:<hostport>.
	// bridge (Phase 2): a TAP device on an isolated host-only bridge.
	// The VM has no route off the bridge; a host firewall confines it to
	// reaching only mc-proxy on the gateway. This makes mediation
	// mandatory rather than cooperative.
	var netDevice, netBackend string
	if useBridge {
		tap, tapErr := q.createTAP(spec.Name)
		if tapErr != nil {
			return fmt.Errorf("create TAP: %w", tapErr)
		}
		mac := deterministicMAC(spec.Name)
		netDevice = "virtio-net-pci,netdev=n0,mac=" + mac
		netBackend = fmt.Sprintf("tap,id=n0,ifname=%s,script=no,downscript=no", tap)
	} else {
		netDevice = "virtio-net-pci,netdev=n0"
		netBackend = "user,id=n0"
		for _, p := range spec.Ports {
			netBackend += ",hostfwd=" + hostForward(p)
		}
	}
	args := []string{
		"-enable-kvm",
		"-m", strconv.Itoa(mem),
		"-smp", strconv.Itoa(cpus),
		"-display", "none",
		"-no-reboot",
		"-daemonize",
		"-pidfile", filepath.Join(vmDir, "qemu.pid"),
		"-serial", "file:" + filepath.Join(vmDir, "console.log"),
		"-qmp", "unix:" + filepath.Join(vmDir, "qmp.sock") + ",server,nowait",
		"-drive", "file=" + img + ",format=raw,if=virtio",
		"-device", netDevice,
		"-netdev", netBackend,
	}
	cmd := exec.CommandContext(ctx, q.qemuPath(), args...) //nolint:gosec
	if out, err := cmd.CombinedOutput(); err != nil {
		// No VM is using the TAP device we just created; do not leave it
		// enslaved to the bridge.
		if useBridge {
			q.destroyTAP(spec.Name)
		}
		return fmt.Errorf("qemu launch %q: %w: %s", spec.Name, err, out)
	}

	meta := vmMeta{
		Name:      spec.Name,
		Image:     spec.Image,
		User:      spec.User,
		Restart:   spec.Restart,
		Ports:     spec.Ports,
		Volumes:   spec.Volumes,
		Cmd:       spec.Cmd,
		MemoryMB:  mem,
		VCPUs:     cpus,
		ImageHash: hash,
		IP:        vmIP,
		Started:   time.Now().UTC(),
	}
	return q.writeMeta(spec.Name, meta)
}

// stageVolumes rebuilds bakeDir as a staging tree that mirrors the guest
// paths of volumes (e.g. /srv/mcat -> <bakeDir>/srv/mcat) and returns the
// distinct top-level directory names it populated, in first-seen order so the
// generated ops config is stable between runs.
//
// `ops` resolves Dirs relative to the config file and bakes the tree at the
// matching absolute path. Staging is required: an absolute /srv source would
// make `ops` descend into the agent's podman overlay storage and fail.
//
// Each volume is "host", "host:guest" or "host:guest:options". Options are
// ignored because baked volumes are always read-only copies.
func (q *QEMU) stageVolumes(bakeDir string, volumes []string) ([]string, error) {
	// Start from a clean tree: os.CopyFS refuses to overwrite existing files.
	if err := os.RemoveAll(bakeDir); err != nil {
		return nil, fmt.Errorf("reset volume staging dir: %w", err)
	}
	var dirs []string
	seen := map[string]bool{}
	for _, v := range volumes {
		parts := strings.Split(v, ":")
		host := parts[0]
		guest := host
		if len(parts) >= 2 {
			guest = parts[1]
		}
		rel := strings.TrimPrefix(guest, "/")
		if rel == "" {
			return nil, fmt.Errorf("stage volume %q: guest path must not be the filesystem root", v)
		}
		stage := filepath.Join(bakeDir, rel)
		if err := os.MkdirAll(filepath.Dir(stage), 0o750); err != nil {
			return nil, fmt.Errorf("stage volume dir: %w", err)
		}
		if err := os.CopyFS(stage, os.DirFS(host)); err != nil {
			return nil, fmt.Errorf("stage volume %q: %w", host, err)
		}
		top, _, _ := strings.Cut(rel, "/")
		if !seen[top] {
			seen[top] = true
			dirs = append(dirs, top)
		}
	}
	return dirs, nil
}
// opsImagesDir returns the directory `ops build` writes images to:
// <home>/.ops/images, where <home> is HomeDir when configured and the current
// user's home directory otherwise.
func (q *QEMU) opsImagesDir() string {
	if q.HomeDir != "" {
		return filepath.Join(q.HomeDir, ".ops", "images")
	}
	// A lookup failure is ignored; the path then comes out relative.
	home, _ := os.UserHomeDir()
	return filepath.Join(home, ".ops", "images")
}
// writeMeta persists m as meta.json in the state directory of the VM called
// name. It returns an error if m cannot be encoded or the file cannot be
// written.
func (q *QEMU) writeMeta(name string, m vmMeta) error {
	encoded, err := json.MarshalIndent(m, "", " ")
	if err != nil {
		return fmt.Errorf("marshal vm meta: %w", err)
	}
	target := filepath.Join(q.vmDir(name), "meta.json")
	return os.WriteFile(target, encoded, 0o640) //nolint:gosec // mcp-group-readable metadata
}
// readMeta loads the metadata persisted for the VM called name from meta.json
// in its state directory. It returns an error if the file cannot be read or
// does not contain valid JSON.
func (q *QEMU) readMeta(name string) (vmMeta, error) {
	var m vmMeta
	b, err := os.ReadFile(filepath.Join(q.vmDir(name), "meta.json"))
	if err != nil {
		return m, err
	}
	// Decode before building the return values. Writing
	// `return m, json.Unmarshal(b, &m)` leaves it unspecified whether m is
	// read before or after the call fills it in.
	err = json.Unmarshal(b, &m)
	return m, err
}
// pidOf reports the pid of the live QEMU process for the VM called name, or 0
// when there is none. "None" covers a missing or unreadable pidfile, a
// pidfile that does not hold a positive integer, and a pid that cannot be
// signalled.
func (q *QEMU) pidOf(name string) int {
	raw, err := os.ReadFile(filepath.Join(q.vmDir(name), "qemu.pid"))
	if err != nil {
		return 0
	}
	pid, err := strconv.Atoi(strings.TrimSpace(string(raw)))
	switch {
	case err != nil, pid <= 0:
		return 0
	case syscall.Kill(pid, 0) != nil:
		// Signal 0 delivers nothing; it only checks that the process can be
		// signalled.
		return 0
	}
	return pid
}
// Stop shuts the VM down, escalating from a QMP system_powerdown to SIGTERM
// and then SIGKILL, waiting up to two seconds after each step.
//
// Stopping a VM that is not running is not an error. In bridge mode the VM's
// TAP device is removed on every path, because the device is created
// persistently and would otherwise outlive the QEMU process that used it.
//
// It returns an error only if the process is still alive after SIGKILL.
func (q *QEMU) Stop(ctx context.Context, name string) error {
	if q.bridgeMode() {
		// Best-effort; deleting a device that does not exist is harmless.
		defer q.destroyTAP(name)
	}
	pid := q.pidOf(name)
	if pid == 0 {
		return nil
	}
	// Try a graceful QMP system_powerdown, but Nanos has no ACPI shutdown
	// handler, so escalate to signals quickly rather than waiting long.
	// A QMP failure is ignored for the same reason: signals follow anyway.
	_ = q.qmpCommand(name, "system_powerdown")
	if q.waitGone(name, 2*time.Second) {
		return nil
	}
	_ = syscall.Kill(pid, syscall.SIGTERM)
	if q.waitGone(name, 2*time.Second) {
		return nil
	}
	_ = syscall.Kill(pid, syscall.SIGKILL)
	if !q.waitGone(name, 2*time.Second) {
		return fmt.Errorf("qemu stop %q: pid %d still running after SIGKILL", name, pid)
	}
	return nil
}
// waitGone checks every 200ms whether the VM's process has exited, for at
// most timeout, and reports whether it is gone. One last check is made after
// the timeout has elapsed.
func (q *QEMU) waitGone(name string, timeout time.Duration) bool {
	for stopAt := time.Now().Add(timeout); time.Now().Before(stopAt); time.Sleep(200 * time.Millisecond) {
		if q.pidOf(name) == 0 {
			return true
		}
	}
	return q.pidOf(name) == 0
}
// qmpCommand sends a single argument-less QMP command over the VM's control
// socket and waits for QEMU to acknowledge it.
//
// The whole exchange (greeting, capability negotiation, command, reply) is
// bounded by a three-second deadline so an unresponsive QEMU cannot block the
// caller indefinitely. It returns an error if the socket cannot be reached,
// the handshake fails, or QEMU rejects the command.
func (q *QEMU) qmpCommand(name, command string) error {
	sock := filepath.Join(q.vmDir(name), "qmp.sock")
	conn, err := dialUnix(sock, 3*time.Second)
	if err != nil {
		return err
	}
	defer func() { _ = conn.Close() }()
	if err := conn.SetDeadline(time.Now().Add(3 * time.Second)); err != nil {
		return fmt.Errorf("qmp %q: set deadline: %w", name, err)
	}

	dec := json.NewDecoder(conn)
	enc := json.NewEncoder(conn)

	// await reads messages until the reply to the last request arrives.
	// Asynchronous events may be interleaved with replies and are skipped.
	await := func(what string) error {
		for {
			var msg map[string]json.RawMessage
			if err := dec.Decode(&msg); err != nil {
				return fmt.Errorf("qmp %q: read %s reply: %w", name, what, err)
			}
			if detail, failed := msg["error"]; failed {
				return fmt.Errorf("qmp %q: %s rejected: %s", name, what, detail)
			}
			if _, ok := msg["return"]; ok {
				return nil
			}
		}
	}

	// QMP handshake: read greeting, send qmp_capabilities, then the command.
	var greeting map[string]any
	if err := dec.Decode(&greeting); err != nil {
		return fmt.Errorf("qmp %q: read greeting: %w", name, err)
	}
	if err := enc.Encode(map[string]string{"execute": "qmp_capabilities"}); err != nil {
		return fmt.Errorf("qmp %q: send qmp_capabilities: %w", name, err)
	}
	if err := await("qmp_capabilities"); err != nil {
		return err
	}
	// Encoding through encoding/json keeps the request well-formed whatever
	// the command string contains.
	if err := enc.Encode(map[string]string{"execute": command}); err != nil {
		return fmt.Errorf("qmp %q: send %s: %w", name, command, err)
	}
	return await(command)
}
// Remove tears a VM down completely: it stops the VM, kills any process that
// is still alive afterwards, releases the bridge resources (TAP device and
// static IP) when a bridge is configured, and deletes the VM's state
// directory. Only the result of deleting the directory is returned; the
// earlier steps are best-effort.
func (q *QEMU) Remove(ctx context.Context, name string) error {
	_ = q.Stop(ctx, name)
	if survivor := q.pidOf(name); survivor != 0 {
		_ = syscall.Kill(survivor, syscall.SIGKILL)
	}
	if q.bridgeMode() {
		q.destroyTAP(name)
		_ = q.releaseIP(name)
	}
	stateDir := q.vmDir(name)
	return os.RemoveAll(stateDir)
}
// Inspect describes the VM called name from its persisted metadata, marking
// it "running" when its QEMU process is alive and "stopped" otherwise. It
// returns an error if the metadata cannot be read.
func (q *QEMU) Inspect(ctx context.Context, name string) (ContainerInfo, error) {
	meta, err := q.readMeta(name)
	if err != nil {
		return ContainerInfo{}, fmt.Errorf("qemu inspect %q: %w", name, err)
	}
	if q.pidOf(name) == 0 {
		return q.infoFromMeta(meta, "stopped"), nil
	}
	return q.infoFromMeta(meta, "running"), nil
}
// infoFromMeta converts persisted VM metadata plus an observed state into the
// runtime-neutral ContainerInfo. Network is "user" for VMs without a static
// IP and "<bridge> <ip>" for VMs on the isolated bridge.
func (q *QEMU) infoFromMeta(m vmMeta, state string) ContainerInfo {
	info := ContainerInfo{
		Name:    m.Name,
		Image:   m.Image,
		State:   state,
		Network: "user",
		User:    m.User,
		Restart: m.Restart,
		Ports:   m.Ports,
		Volumes: m.Volumes,
		Cmd:     m.Cmd,
		Version: ExtractVersion(m.Image),
		Started: m.Started,
	}
	if m.IP != "" {
		// isolated bridge + the VM's static IP
		info.Network = q.Bridge + " " + m.IP
	}
	return info
}
// VMAddr returns the host-reachable "ip:port" backend of a bridge-mode VM,
// built from its static IP and its first guest port. It returns "" when the
// metadata cannot be read, when the VM has no IP (user-mode), or when it has
// no ports.
func (q *QEMU) VMAddr(name string) string {
	meta, err := q.readMeta(name)
	if err != nil {
		return ""
	}
	if meta.IP == "" {
		return ""
	}
	ports := guestPorts(meta.Ports)
	if len(ports) == 0 {
		return ""
	}
	return meta.IP + ":" + ports[0]
}
// List reports every VM that has readable metadata under the state
// directory, each marked "running" or "stopped" by process liveness.
// Non-directory entries and directories without readable metadata are
// skipped. A state directory that does not exist yields no VMs and no error.
func (q *QEMU) List(ctx context.Context) ([]ContainerInfo, error) {
	entries, err := os.ReadDir(q.stateDir())
	switch {
	case err == nil:
	case os.IsNotExist(err):
		return nil, nil
	default:
		return nil, fmt.Errorf("read vm state dir: %w", err)
	}

	var vms []ContainerInfo
	for _, entry := range entries {
		if !entry.IsDir() {
			continue
		}
		vm := entry.Name()
		meta, metaErr := q.readMeta(vm)
		if metaErr != nil {
			continue
		}
		state := "running"
		if q.pidOf(vm) == 0 {
			state = "stopped"
		}
		vms = append(vms, q.infoFromMeta(meta, state))
	}
	return vms, nil
}
// Logs returns an unstarted `tail` command over the VM's serial console log.
// A positive tail limits output to that many trailing lines; otherwise the
// whole file is printed. follow adds -f. The timestamps and since arguments
// are accepted but not used here.
func (q *QEMU) Logs(ctx context.Context, name string, tail int, follow, timestamps bool, since string) *exec.Cmd {
	var tailArgs []string
	if follow {
		tailArgs = append(tailArgs, "-f")
	}
	// "+1" makes tail start at the first line, i.e. print everything.
	count := "+1"
	if tail > 0 {
		count = strconv.Itoa(tail)
	}
	tailArgs = append(tailArgs, "-n", count, filepath.Join(q.vmDir(name), "console.log"))
	return exec.CommandContext(ctx, "tail", tailArgs...) //nolint:gosec
}
// Build is the hook for the `mcp build --unikernel` path. It is not
// implemented for the QEMU runtime and always returns an error telling the
// caller to build an OCI image and Pull it instead. Not the primary deploy
// path.
func (q *QEMU) Build(ctx context.Context, image, contextDir, dockerfile string) error {
	const reason = "qemu build: not implemented; build OCI image then Pull"
	return fmt.Errorf(reason)
}
// Push always fails: the QEMU runtime has no push implementation.
func (q *QEMU) Push(ctx context.Context, image string) error {
	const reason = "qemu push: not implemented"
	return fmt.Errorf(reason)
}
// ImageExists tells whether the binary extracted from image is present in the
// cache. Any failure to stat the cached file counts as "not cached"; the
// returned error is always nil.
func (q *QEMU) ImageExists(ctx context.Context, image string) (bool, error) {
	_, statErr := os.Stat(q.binPath(image))
	return statErr == nil, nil
}
// Login authenticates against registry by running `podman login`, so the
// credentials end up in the store podman itself uses. The token is fed on
// stdin rather than passed as an argument.
func (q *QEMU) Login(ctx context.Context, registry, username, token string) error {
	login := exec.CommandContext(ctx, "podman", "login", "--username", username, "--password-stdin", registry) //nolint:gosec
	login.Stdin = strings.NewReader(token)
	out, err := login.CombinedOutput()
	if err == nil {
		return nil
	}
	return fmt.Errorf("podman login %q: %w: %s", registry, err, out)
}
// ---- Isolated bridge networking (Phase 2) ----
// vmUser is the host user that TAP devices are created for (passed as the
// `user` of `ip tuntap add`).
// NOTE(review): presumably this is also the user the agent runs QEMU as,
// since the TAP owner must be able to open the device — confirm against the
// deployment.
const vmUser = "mcp"
// tapName maps a VM name to its TAP interface name. Interface names are
// limited to 15 characters (IFNAMSIZ), so "tap-<name>" is used when it fits,
// and "tap-" followed by the first 11 hex digits of the name's SHA-256
// otherwise.
func (q *QEMU) tapName(name string) string {
	const prefix = "tap-"
	if direct := prefix + name; len(direct) <= 15 {
		return direct
	}
	digest := sha256.Sum256([]byte(name))
	return prefix + hex.EncodeToString(digest[:])[:11]
}
// deterministicMAC returns a MAC address that is stable for a given name:
// the fixed prefix 52:54:00 followed by the first three bytes of the name's
// SHA-256 digest.
func deterministicMAC(name string) string {
	digest := sha256.Sum256([]byte(name))
	mac := "52:54:00"
	for _, octet := range digest[:3] {
		mac += fmt.Sprintf(":%02x", octet)
	}
	return mac
}
// ipsFile returns the path of the bridge IP allocation table, ips.json in the
// state root.
func (q *QEMU) ipsFile() string {
	root := q.stateDir()
	return filepath.Join(root, "ips.json")
}
// readIPs loads the VM-name -> IP allocation table. A table file that does
// not exist yet is an empty table, not an error. On a read or decode failure
// it returns an empty table together with the error.
func (q *QEMU) readIPs() (map[string]string, error) {
	raw, err := os.ReadFile(q.ipsFile()) //nolint:gosec // fixed state-dir path
	switch {
	case err == nil:
	case os.IsNotExist(err):
		return map[string]string{}, nil
	default:
		return map[string]string{}, err
	}

	table := map[string]string{}
	if err := json.Unmarshal(raw, &table); err != nil {
		return map[string]string{}, err
	}
	return table, nil
}
// writeIPs saves the VM-name -> IP allocation table, creating the state root
// first if necessary. It returns the first error met while creating the
// directory, encoding the table, or writing the file.
func (q *QEMU) writeIPs(m map[string]string) error {
	if err := os.MkdirAll(q.stateDir(), 0o750); err != nil {
		return err
	}
	encoded, err := json.MarshalIndent(m, "", " ")
	if err == nil {
		err = os.WriteFile(q.ipsFile(), encoded, 0o640) //nolint:gosec // mcp-group-readable
	}
	return err
}
// allocateIP returns the static bridge IP of the VM called name, assigning
// the lowest free address in <SubnetPrefix>.2 .. <SubnetPrefix>.254 and
// recording it when the VM has none yet. It fails when the table cannot be
// read or written, or when every address is taken.
//
// Nothing here locks the table; allocation is serialized by the agent's
// single-threaded deploy path.
func (q *QEMU) allocateIP(name string) (string, error) {
	table, err := q.readIPs()
	if err != nil {
		return "", err
	}
	if existing, ok := table[name]; ok {
		return existing, nil
	}

	taken := map[string]bool{}
	for _, addr := range table {
		taken[addr] = true
	}
	for host := 2; host <= 254; host++ {
		candidate := fmt.Sprintf("%s.%d", q.SubnetPrefix, host)
		if taken[candidate] {
			continue
		}
		table[name] = candidate
		return candidate, q.writeIPs(table)
	}
	return "", fmt.Errorf("no free IPs in %s.0/24", q.SubnetPrefix)
}
// releaseIP drops the allocation held by the VM called name and rewrites the
// table. The table is rewritten even when the VM held no allocation.
func (q *QEMU) releaseIP(name string) error {
	table, err := q.readIPs()
	if err == nil {
		delete(table, name)
		err = q.writeIPs(table)
	}
	return err
}
// createTAP sets up the TAP device for the VM called name and returns the
// device name: it creates the device for the VM user, attaches it to the
// host-only bridge, and brings it up. A device of the same name left over
// from an earlier run is deleted first, and a partially set-up device is
// deleted again when any step fails. Requires CAP_NET_ADMIN (granted to the
// agent on unikernel-capable nodes).
func (q *QEMU) createTAP(name string) (string, error) {
	dev := q.tapName(name)
	// Best-effort: used both to clear a stale device and to undo a failed
	// setup.
	removeDev := func() {
		_ = exec.Command("ip", "link", "del", dev).Run() //nolint:gosec // best-effort cleanup of a stale device
	}

	removeDev()
	for _, step := range [][]string{
		{"tuntap", "add", "dev", dev, "mode", "tap", "user", vmUser},
		{"link", "set", dev, "master", q.Bridge},
		{"link", "set", dev, "up"},
	} {
		out, err := exec.Command("ip", step...).CombinedOutput() //nolint:gosec // args built programmatically
		if err == nil {
			continue
		}
		removeDev()
		return "", fmt.Errorf("ip %v: %w: %s", step, err, out)
	}
	return dev, nil
}
// destroyTAP deletes the TAP device belonging to the VM called name. Failure
// is ignored, so calling it for a device that does not exist is harmless.
func (q *QEMU) destroyTAP(name string) {
	dev := q.tapName(name)
	_ = exec.Command("ip", "link", "del", dev).Run() //nolint:gosec // best-effort teardown
}
// fileSHA256 returns the hex-encoded SHA-256 digest of the file at path.
// The file is streamed through the hash rather than loaded into memory, since
// it is used on disk images. It returns an error if the file cannot be opened
// or read.
func fileSHA256(path string) (string, error) {
	f, err := os.Open(path) //nolint:gosec // hashing a known image path
	if err != nil {
		return "", err
	}
	defer func() { _ = f.Close() }()

	h := sha256.New()
	if _, err := io.Copy(h, f); err != nil {
		return "", err
	}
	return hex.EncodeToString(h.Sum(nil)), nil
}
// copyFile copies the contents of src to dst, creating dst with mode 0640 or
// truncating it if it already exists. The data is streamed rather than loaded
// into memory, since it is used on disk images. It returns an error if either
// file cannot be opened, the copy fails, or dst cannot be closed cleanly.
func copyFile(src, dst string) (err error) {
	in, err := os.Open(src) //nolint:gosec // relocating a built image
	if err != nil {
		return err
	}
	defer func() { _ = in.Close() }()

	out, err := os.OpenFile(dst, os.O_WRONLY|os.O_CREATE|os.O_TRUNC, 0o640) //nolint:gosec // relocating a built image
	if err != nil {
		return err
	}
	// A failed Close can mean buffered data never reached the disk, so it
	// counts as a copy failure unless an earlier error is already reported.
	defer func() {
		if closeErr := out.Close(); err == nil {
			err = closeErr
		}
	}()

	_, err = io.Copy(out, in)
	return err
}