Enables migrating real services (config/cert dirs, stateless) to unikernels. Volume host dirs are copied into a per-VM staging tree mirroring guest paths; the ops config goes in the staging root with the top-level dirs in Dirs, so ops bakes them at the right absolute paths. (Staging is required — an absolute /srv MapDirs source makes ops descend into the agent's podman overlay storage and fail.) A component may set network = "user" to use QEMU user-mode NAT instead of the isolated bridge (Phase-1 networking for first migrations, before a gateway proxy). Verified: mcat (the MCIAS policy tester) deployed as a Nanos unikernel via 'mcp deploy', booting with its baked /srv/mcat config+certs, serving HTTPS verified against the platform CA, configured against MCIAS. Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
805 lines
24 KiB
Go
805 lines
24 KiB
Go
package runtime
|
|
|
|
import (
|
|
"context"
|
|
"crypto/sha256"
|
|
"encoding/hex"
|
|
"encoding/json"
|
|
"fmt"
|
|
"net"
|
|
"os"
|
|
"os/exec"
|
|
"path/filepath"
|
|
"strconv"
|
|
"strings"
|
|
"syscall"
|
|
"time"
|
|
)
|
|
|
|
// dialUnix opens a connection to the unix-domain socket at path, giving up
// once timeout has elapsed.
func dialUnix(path string, timeout time.Duration) (net.Conn, error) {
	dialer := net.Dialer{Timeout: timeout}
	return dialer.Dial("unix", path)
}
|
|
|
|
// QEMU is the Runtime implementation that runs each service component as a
// Nanos unikernel virtual machine under QEMU/KVM rather than as a container.
//
// Every component is a single-process VM with its own kernel. The methods
// mirror the Podman runtime so the agent can drive both the same way:
//
//	Pull    -> pull the OCI image, extract the ELF binary, cache it
//	Run     -> `ops build` the binary into a Nanos image, boot it under QEMU
//	Stop    -> graceful QMP powerdown, then SIGTERM/SIGKILL the QEMU process
//	Remove  -> stop and delete the VM state directory
//	Inspect -> read persisted metadata + check process liveness
//	List    -> enumerate VM state directories
//	Logs    -> stream the serial console log file
//
// Networking: with Bridge unset, VMs use QEMU user-mode networking with host
// port forwards, so mc-proxy routes to 127.0.0.1:<hostport> exactly as it
// does for rootless podman containers. With Bridge set, VMs attach to an
// isolated host-only bridge instead.
type QEMU struct {
	// ImageDir is where built Nanos images and extracted binaries live.
	// Empty means /srv/mcp/images.
	ImageDir string
	// StateDir is the parent of the per-VM state directories (pidfile, QMP
	// socket, console log, metadata). Empty means /srv/mcp/vm.
	StateDir string
	// OpsPath locates the `ops` Nanos toolchain binary. Empty means "ops".
	OpsPath string
	// QemuPath locates qemu-system-x86_64. Empty means "qemu-system-x86_64".
	QemuPath string
	// Memory is the guest memory in MB used when a spec does not set one.
	Memory int
	// HomeDir is exported as $HOME to `ops` so it uses a stable ~/.ops
	// directory.
	HomeDir string

	// Bridge names the isolated host-only bridge (Phase 2). When set, each VM
	// gets a TAP device on the bridge and a static IP, and a host firewall
	// confines it to the bridge gateway (where mc-proxy listens). Empty
	// selects user-mode networking.
	Bridge string // e.g. "mcp-br0"
	// Gateway is the bridge's host IP, which is also the VMs' default route
	// and the mc-proxy address, e.g. "10.99.0.1".
	Gateway string
	// SubnetPrefix is the /24 network prefix VMs are numbered in, e.g.
	// "10.99.0" (VMs get .2 .. .254).
	SubnetPrefix string
}
|
|
|
|
// bridgeMode reports whether isolated bridge networking is configured.
|
|
func (q *QEMU) bridgeMode() bool { return q.Bridge != "" }
|
|
|
|
func (q *QEMU) imageDir() string {
|
|
if q.ImageDir != "" {
|
|
return q.ImageDir
|
|
}
|
|
return "/srv/mcp/images"
|
|
}
|
|
|
|
func (q *QEMU) stateDir() string {
|
|
if q.StateDir != "" {
|
|
return q.StateDir
|
|
}
|
|
return "/srv/mcp/vm"
|
|
}
|
|
|
|
func (q *QEMU) opsPath() string {
|
|
if q.OpsPath != "" {
|
|
return q.OpsPath
|
|
}
|
|
return "ops"
|
|
}
|
|
|
|
func (q *QEMU) qemuPath() string {
|
|
if q.QemuPath != "" {
|
|
return q.QemuPath
|
|
}
|
|
return "qemu-system-x86_64"
|
|
}
|
|
|
|
func (q *QEMU) memory() int {
|
|
if q.Memory > 0 {
|
|
return q.Memory
|
|
}
|
|
return 256
|
|
}
|
|
|
|
// opsEnv returns the environment for invoking `ops`, pinning $HOME so its
|
|
// cache and image directory are stable across invocations.
|
|
func (q *QEMU) opsEnv() []string {
|
|
env := os.Environ()
|
|
if q.HomeDir != "" {
|
|
env = append(env, "HOME="+q.HomeDir)
|
|
}
|
|
return env
|
|
}
|
|
|
|
// sanitizeImage converts an image reference into a filesystem-safe stem by
// replacing every "/" and ":" with "_".
//
//	"mcr.example:8443/mcdoc:v0.1.0" -> "mcr.example_8443_mcdoc_v0.1.0"
func sanitizeImage(image string) string {
	stem := strings.ReplaceAll(image, "/", "_")
	return strings.ReplaceAll(stem, ":", "_")
}
|
|
|
|
// binaryName derives the in-image ELF binary name from an image reference:
// the last path component of the repository, without any tag or digest.
//
//	"host:8443/mcdoc:v0.1.0"        -> "mcdoc"
//	"host:8443/mcdoc@sha256:abcd"   -> "mcdoc"
//	"mcdoc"                         -> "mcdoc"
func binaryName(image string) string {
	name := image
	// Drop the registry host and any repository path; a ":" in the host part
	// (the port) goes with it.
	if i := strings.LastIndex(name, "/"); i >= 0 {
		name = name[i+1:]
	}
	// What follows the repository name is a tag (":v1"), a digest
	// ("@sha256:..."), or both; cut at whichever starts first so a digest
	// reference does not yield "name@sha256".
	if i := strings.IndexAny(name, "@:"); i >= 0 {
		name = name[:i]
	}
	return name
}
|
|
|
|
func (q *QEMU) binPath(image string) string {
|
|
return filepath.Join(q.imageDir(), sanitizeImage(image)+".bin")
|
|
}
|
|
|
|
func (q *QEMU) imgPath(name string) string {
|
|
return filepath.Join(q.imageDir(), name+".img")
|
|
}
|
|
|
|
func (q *QEMU) vmDir(name string) string {
|
|
return filepath.Join(q.stateDir(), name)
|
|
}
|
|
|
|
// vmMeta is the per-VM record Run persists as meta.json in the VM state
// directory. Inspect and List read it back, which is how they can still
// describe a VM after the agent has restarted.
type vmMeta struct {
	// Fields copied from the spec the VM was started with.
	Name    string   `json:"name"`
	Image   string   `json:"image"`
	User    string   `json:"user"`
	Restart string   `json:"restart"`
	Ports   []string `json:"ports"`
	Volumes []string `json:"volumes"`
	Cmd     []string `json:"cmd"`

	// Effective resources after defaults were applied.
	MemoryMB int `json:"memory_mb"`
	VCPUs    int `json:"vcpus"`

	// ImageHash is the hex SHA-256 of the built disk image (empty if hashing
	// failed).
	ImageHash string `json:"image_hash"`
	// IP is the static bridge address; empty for user-mode VMs.
	IP string `json:"ip,omitempty"` // bridge-mode static IP
	// Started is the UTC time the VM was launched.
	Started time.Time `json:"started"`
}
|
|
|
|
// Pull pulls the OCI image and extracts its ELF binary into the image cache.
|
|
// The binary is the input to `ops build`; the Nanos image itself is built at
|
|
// Run time so the service's command arguments can be baked in.
|
|
func (q *QEMU) Pull(ctx context.Context, image string) error {
|
|
if err := os.MkdirAll(q.imageDir(), 0o750); err != nil {
|
|
return fmt.Errorf("create image dir: %w", err)
|
|
}
|
|
|
|
// Pull the OCI image via podman (reuses the agent's registry auth).
|
|
if out, err := exec.CommandContext(ctx, "podman", "pull", image).CombinedOutput(); err != nil { //nolint:gosec // args built programmatically
|
|
return fmt.Errorf("podman pull %q: %w: %s", image, err, out)
|
|
}
|
|
|
|
// Create (do not start) a container to copy the binary out of.
|
|
tmp := "ukextract-" + sanitizeImage(image)
|
|
_ = exec.CommandContext(ctx, "podman", "rm", "-f", tmp).Run() //nolint:gosec
|
|
if out, err := exec.CommandContext(ctx, "podman", "create", "--name", tmp, image).CombinedOutput(); err != nil { //nolint:gosec
|
|
return fmt.Errorf("podman create %q: %w: %s", image, err, out)
|
|
}
|
|
defer func() { _ = exec.Command("podman", "rm", "-f", tmp).Run() }() //nolint:gosec
|
|
|
|
bin := binaryName(image)
|
|
src := tmp + ":/usr/local/bin/" + bin
|
|
dst := q.binPath(image)
|
|
if out, err := exec.CommandContext(ctx, "podman", "cp", src, dst).CombinedOutput(); err != nil { //nolint:gosec
|
|
return fmt.Errorf("extract binary %q from %q: %w: %s", bin, image, err, out)
|
|
}
|
|
if err := os.Chmod(dst, 0o755); err != nil { //nolint:gosec // unikernel ELF must be executable
|
|
return fmt.Errorf("chmod extracted binary: %w", err)
|
|
}
|
|
return nil
|
|
}
|
|
|
|
// opsConfig is the subset of the `ops` build configuration we generate.
|
|
type opsConfig struct {
|
|
Args []string `json:"Args,omitempty"`
|
|
Env map[string]string `json:"Env,omitempty"`
|
|
Dirs []string `json:"Dirs,omitempty"` // dirs (relative to the config file) baked into the image
|
|
RunConfig opsRunConfig `json:"RunConfig"`
|
|
}
|
|
|
|
// opsRunConfig is the RunConfig section of the generated ops configuration.
// Every field is omitted from the JSON when it holds its zero value.
type opsRunConfig struct {
	// Guest resources and exposed ports.
	Ports  []string `json:"Ports,omitempty"`
	Memory string   `json:"Memory,omitempty"`
	CPUs   int      `json:"CPUs,omitempty"`

	// Kernel and debugging options.
	Klibs   []string `json:"Klibs,omitempty"`
	Mounts  any      `json:"Mounts,omitempty"`
	NoTrace []string `json:"NoTrace,omitempty"`
	GDBPort int      `json:"GDBPort,omitempty"`
	Nanos   string   `json:"Nanos,omitempty"`

	// Guest network identity; the address fields are set by Run for
	// bridge-mode VMs only.
	Hostname  string `json:"Hostname,omitempty"`
	IPAddress string `json:"IPAddress,omitempty"` // static IP for bridge mode
	NetMask   string `json:"NetMask,omitempty"`
	Gateway   string `json:"Gateway,omitempty"`
}
|
|
|
|
// guestPorts returns the guest (container) port of every spec port mapping,
// i.e. whatever follows the last ":" of each entry. It accepts
// "host:container", "ip:host:container", and a bare "port", and returns nil
// for an empty input.
func guestPorts(ports []string) []string {
	var guests []string
	for i := range ports {
		mapping := ports[i]
		if cut := strings.LastIndex(mapping, ":"); cut >= 0 {
			mapping = mapping[cut+1:]
		}
		guests = append(guests, mapping)
	}
	return guests
}
|
|
|
|
// hostForward renders one spec port mapping as a QEMU hostfwd value.
//
//	"ip:host:container" -> "tcp:ip:host-:container"
//	"host:container"    -> "tcp:127.0.0.1:host-:container"
//
// Any other shape (a bare port included) forwards the first field to itself
// on 127.0.0.1.
func hostForward(p string) string {
	fields := strings.Split(p, ":")

	// Start from the fallback shape and refine for the two known forms.
	addr, hostPort, guestPort := "127.0.0.1", fields[0], fields[0]
	if len(fields) == 3 {
		addr, hostPort, guestPort = fields[0], fields[1], fields[2]
	} else if len(fields) == 2 {
		guestPort = fields[1]
	}
	return fmt.Sprintf("tcp:%s:%s-:%s", addr, hostPort, guestPort)
}
|
|
|
|
// Run builds the Nanos image (if needed) and boots it under QEMU/KVM.
|
|
func (q *QEMU) Run(ctx context.Context, spec ContainerSpec) error {
|
|
if err := os.MkdirAll(q.vmDir(spec.Name), 0o750); err != nil {
|
|
return fmt.Errorf("create vm state dir: %w", err)
|
|
}
|
|
|
|
mem := spec.MemoryMB
|
|
if mem <= 0 {
|
|
mem = q.memory()
|
|
}
|
|
cpus := spec.VCPUs
|
|
if cpus <= 0 {
|
|
cpus = 1
|
|
}
|
|
|
|
// A component can opt out of the isolated bridge with network = "user"
|
|
// (e.g. a stateless service that needs NAT egress to its dependencies
|
|
// before the gateway proxy exists). Otherwise unikernels use the bridge
|
|
// when one is configured.
|
|
useBridge := q.bridgeMode() && spec.Network != "user"
|
|
|
|
// In bridge mode, allocate a static IP on the isolated bridge for this VM.
|
|
var vmIP string
|
|
if useBridge {
|
|
ip, allocErr := q.allocateIP(spec.Name)
|
|
if allocErr != nil {
|
|
return fmt.Errorf("allocate VM IP: %w", allocErr)
|
|
}
|
|
vmIP = ip
|
|
}
|
|
|
|
// Build the Nanos image from the extracted binary, baking in command args.
|
|
cfg := opsConfig{
|
|
Args: spec.Cmd,
|
|
RunConfig: opsRunConfig{
|
|
Ports: guestPorts(spec.Ports),
|
|
Memory: strconv.Itoa(mem) + "m",
|
|
CPUs: cpus,
|
|
},
|
|
}
|
|
if useBridge {
|
|
cfg.RunConfig.IPAddress = vmIP
|
|
cfg.RunConfig.NetMask = "255.255.255.0"
|
|
cfg.RunConfig.Gateway = q.Gateway
|
|
}
|
|
// Bake each volume's host directory into the image (read-only). Unikernels
|
|
// have no writable host mount yet, so this suits config/cert directories;
|
|
// stateful services need a different storage story (9p/virtio-blk).
|
|
//
|
|
// Volume contents are copied into a per-VM staging tree that mirrors the
|
|
// guest paths (e.g. /srv/mcat -> <bake>/srv/mcat). We then put the ops
|
|
// config in the staging root and list the top-level dirs in `Dirs`: `ops`
|
|
// resolves Dirs relative to the config file and bakes the tree at the
|
|
// matching absolute path. (Staging is required: an absolute /srv source
|
|
// would make `ops` descend into the agent's podman overlay storage and
|
|
// fail.)
|
|
bakeDir := ""
|
|
if len(spec.Volumes) > 0 {
|
|
bakeDir = filepath.Join(q.vmDir(spec.Name), "bake")
|
|
_ = os.RemoveAll(bakeDir)
|
|
topLevel := map[string]bool{}
|
|
for _, v := range spec.Volumes {
|
|
parts := strings.SplitN(v, ":", 2)
|
|
host := parts[0]
|
|
guest := host
|
|
if len(parts) == 2 {
|
|
guest = parts[1]
|
|
}
|
|
rel := strings.TrimPrefix(guest, "/")
|
|
stage := filepath.Join(bakeDir, rel)
|
|
if err := os.MkdirAll(filepath.Dir(stage), 0o750); err != nil {
|
|
return fmt.Errorf("stage volume dir: %w", err)
|
|
}
|
|
if err := os.CopyFS(stage, os.DirFS(host)); err != nil {
|
|
return fmt.Errorf("stage volume %q: %w", host, err)
|
|
}
|
|
if i := strings.Index(rel, "/"); i >= 0 {
|
|
topLevel[rel[:i]] = true
|
|
} else {
|
|
topLevel[rel] = true
|
|
}
|
|
}
|
|
for d := range topLevel {
|
|
cfg.Dirs = append(cfg.Dirs, d)
|
|
}
|
|
}
|
|
if len(spec.Env) > 0 {
|
|
cfg.Env = map[string]string{}
|
|
for _, e := range spec.Env {
|
|
if i := strings.Index(e, "="); i >= 0 {
|
|
cfg.Env[e[:i]] = e[i+1:]
|
|
}
|
|
}
|
|
}
|
|
// Place the ops config in the staging root when baking volumes, so its
|
|
// relative Dirs resolve to the staged tree; otherwise keep it in the VM
|
|
// state dir.
|
|
cfgPath := filepath.Join(q.vmDir(spec.Name), "ops.json")
|
|
if bakeDir != "" {
|
|
cfgPath = filepath.Join(bakeDir, "ops.json")
|
|
}
|
|
cfgBytes, err := json.MarshalIndent(cfg, "", " ")
|
|
if err != nil {
|
|
return fmt.Errorf("marshal ops config: %w", err)
|
|
}
|
|
if err := os.WriteFile(cfgPath, cfgBytes, 0o640); err != nil { //nolint:gosec // mcp-group-readable config
|
|
return fmt.Errorf("write ops config: %w", err)
|
|
}
|
|
|
|
img := q.imgPath(spec.Name)
|
|
bin := q.binPath(spec.Image)
|
|
if _, err := os.Stat(bin); err != nil {
|
|
return fmt.Errorf("binary not found for %q (Pull first): %w", spec.Image, err)
|
|
}
|
|
build := exec.CommandContext(ctx, q.opsPath(), "build", bin, "-c", cfgPath, "-i", spec.Name) //nolint:gosec
|
|
build.Env = q.opsEnv()
|
|
if bakeDir != "" {
|
|
build.Dir = bakeDir
|
|
}
|
|
if out, err := build.CombinedOutput(); err != nil {
|
|
return fmt.Errorf("ops build %q: %w: %s", spec.Name, err, out)
|
|
}
|
|
// ops writes to ~/.ops/images/<name>.img; move it into our image dir.
|
|
opsImg := filepath.Join(q.opsImagesDir(), spec.Name+".img")
|
|
if _, err := os.Stat(opsImg); err == nil {
|
|
if err := os.Rename(opsImg, img); err != nil {
|
|
// Cross-device fallback: copy.
|
|
if cpErr := copyFile(opsImg, img); cpErr != nil {
|
|
return fmt.Errorf("relocate built image: %w", err)
|
|
}
|
|
_ = os.Remove(opsImg)
|
|
}
|
|
}
|
|
|
|
hash, _ := fileSHA256(img)
|
|
|
|
// Assemble QEMU invocation: KVM-accelerated, headless, serial console to a
|
|
// file, QMP control socket, virtio disk + virtio NIC.
|
|
//
|
|
// Networking has two modes:
|
|
// user-mode (Phase 1): host port forwards to localhost, like rootless
|
|
// podman. mc-proxy routes to 127.0.0.1:<hostport>.
|
|
// bridge (Phase 2): a TAP device on an isolated host-only bridge.
|
|
// The VM has no route off the bridge; a host firewall confines it to
|
|
// reaching only mc-proxy on the gateway. This makes mediation
|
|
// mandatory rather than cooperative.
|
|
var netDevice, netBackend string
|
|
if useBridge {
|
|
tap, tapErr := q.createTAP(spec.Name)
|
|
if tapErr != nil {
|
|
return fmt.Errorf("create TAP: %w", tapErr)
|
|
}
|
|
mac := deterministicMAC(spec.Name)
|
|
netDevice = "virtio-net-pci,netdev=n0,mac=" + mac
|
|
netBackend = fmt.Sprintf("tap,id=n0,ifname=%s,script=no,downscript=no", tap)
|
|
} else {
|
|
netDevice = "virtio-net-pci,netdev=n0"
|
|
netBackend = "user,id=n0"
|
|
for _, p := range spec.Ports {
|
|
netBackend += ",hostfwd=" + hostForward(p)
|
|
}
|
|
}
|
|
args := []string{
|
|
"-enable-kvm",
|
|
"-m", strconv.Itoa(mem),
|
|
"-smp", strconv.Itoa(cpus),
|
|
"-display", "none",
|
|
"-no-reboot",
|
|
"-daemonize",
|
|
"-pidfile", filepath.Join(q.vmDir(spec.Name), "qemu.pid"),
|
|
"-serial", "file:" + filepath.Join(q.vmDir(spec.Name), "console.log"),
|
|
"-qmp", "unix:" + filepath.Join(q.vmDir(spec.Name), "qmp.sock") + ",server,nowait",
|
|
"-drive", "file=" + img + ",format=raw,if=virtio",
|
|
"-device", netDevice,
|
|
"-netdev", netBackend,
|
|
}
|
|
cmd := exec.CommandContext(ctx, q.qemuPath(), args...) //nolint:gosec
|
|
if out, err := cmd.CombinedOutput(); err != nil {
|
|
return fmt.Errorf("qemu launch %q: %w: %s", spec.Name, err, out)
|
|
}
|
|
|
|
meta := vmMeta{
|
|
Name: spec.Name,
|
|
Image: spec.Image,
|
|
User: spec.User,
|
|
Restart: spec.Restart,
|
|
Ports: spec.Ports,
|
|
Volumes: spec.Volumes,
|
|
Cmd: spec.Cmd,
|
|
MemoryMB: mem,
|
|
VCPUs: cpus,
|
|
ImageHash: hash,
|
|
IP: vmIP,
|
|
Started: time.Now().UTC(),
|
|
}
|
|
return q.writeMeta(spec.Name, meta)
|
|
}
|
|
|
|
func (q *QEMU) opsImagesDir() string {
|
|
home := q.HomeDir
|
|
if home == "" {
|
|
home, _ = os.UserHomeDir()
|
|
}
|
|
return filepath.Join(home, ".ops", "images")
|
|
}
|
|
|
|
func (q *QEMU) writeMeta(name string, m vmMeta) error {
|
|
b, err := json.MarshalIndent(m, "", " ")
|
|
if err != nil {
|
|
return fmt.Errorf("marshal vm meta: %w", err)
|
|
}
|
|
return os.WriteFile(filepath.Join(q.vmDir(name), "meta.json"), b, 0o640) //nolint:gosec // mcp-group-readable metadata
|
|
}
|
|
|
|
func (q *QEMU) readMeta(name string) (vmMeta, error) {
|
|
var m vmMeta
|
|
b, err := os.ReadFile(filepath.Join(q.vmDir(name), "meta.json"))
|
|
if err != nil {
|
|
return m, err
|
|
}
|
|
return m, json.Unmarshal(b, &m)
|
|
}
|
|
|
|
// pidOf returns the running QEMU pid for a VM, or 0 if not running.
|
|
func (q *QEMU) pidOf(name string) int {
|
|
b, err := os.ReadFile(filepath.Join(q.vmDir(name), "qemu.pid"))
|
|
if err != nil {
|
|
return 0
|
|
}
|
|
pid, err := strconv.Atoi(strings.TrimSpace(string(b)))
|
|
if err != nil || pid <= 0 {
|
|
return 0
|
|
}
|
|
if err := syscall.Kill(pid, 0); err != nil {
|
|
return 0
|
|
}
|
|
return pid
|
|
}
|
|
|
|
// Stop gracefully powers down the VM, falling back to SIGTERM/SIGKILL.
|
|
func (q *QEMU) Stop(ctx context.Context, name string) error {
|
|
pid := q.pidOf(name)
|
|
if pid == 0 {
|
|
return nil
|
|
}
|
|
// Try a graceful QMP system_powerdown, but Nanos has no ACPI shutdown
|
|
// handler, so escalate to signals quickly rather than waiting long.
|
|
_ = q.qmpCommand(name, "system_powerdown")
|
|
if q.waitGone(name, 2*time.Second) {
|
|
return nil
|
|
}
|
|
_ = syscall.Kill(pid, syscall.SIGTERM)
|
|
if q.waitGone(name, 2*time.Second) {
|
|
return nil
|
|
}
|
|
_ = syscall.Kill(pid, syscall.SIGKILL)
|
|
q.waitGone(name, 2*time.Second)
|
|
if q.bridgeMode() {
|
|
q.destroyTAP(name)
|
|
}
|
|
return nil
|
|
}
|
|
|
|
// waitGone polls until the VM process exits or the timeout elapses,
|
|
// returning true if it has exited.
|
|
func (q *QEMU) waitGone(name string, timeout time.Duration) bool {
|
|
deadline := time.Now().Add(timeout)
|
|
for time.Now().Before(deadline) {
|
|
if q.pidOf(name) == 0 {
|
|
return true
|
|
}
|
|
time.Sleep(200 * time.Millisecond)
|
|
}
|
|
return q.pidOf(name) == 0
|
|
}
|
|
|
|
// qmpCommand sends a single QMP command over the VM's control socket.
|
|
func (q *QEMU) qmpCommand(name, command string) error {
|
|
sock := filepath.Join(q.vmDir(name), "qmp.sock")
|
|
conn, err := dialUnix(sock, 3*time.Second)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
defer func() { _ = conn.Close() }()
|
|
// QMP handshake: read greeting, send qmp_capabilities, then the command.
|
|
dec := json.NewDecoder(conn)
|
|
var greeting map[string]any
|
|
_ = dec.Decode(&greeting)
|
|
_, _ = conn.Write([]byte(`{"execute":"qmp_capabilities"}`))
|
|
var ack map[string]any
|
|
_ = dec.Decode(&ack)
|
|
_, err = conn.Write([]byte(`{"execute":"` + command + `"}`))
|
|
return err
|
|
}
|
|
|
|
// Remove stops the VM and deletes its state directory.
|
|
func (q *QEMU) Remove(ctx context.Context, name string) error {
|
|
_ = q.Stop(ctx, name)
|
|
if pid := q.pidOf(name); pid != 0 {
|
|
_ = syscall.Kill(pid, syscall.SIGKILL)
|
|
}
|
|
if q.bridgeMode() {
|
|
q.destroyTAP(name)
|
|
_ = q.releaseIP(name)
|
|
}
|
|
return os.RemoveAll(q.vmDir(name))
|
|
}
|
|
|
|
// Inspect reports the observed state of a VM.
|
|
func (q *QEMU) Inspect(ctx context.Context, name string) (ContainerInfo, error) {
|
|
m, err := q.readMeta(name)
|
|
if err != nil {
|
|
return ContainerInfo{}, fmt.Errorf("qemu inspect %q: %w", name, err)
|
|
}
|
|
state := "stopped"
|
|
if q.pidOf(name) != 0 {
|
|
state = "running"
|
|
}
|
|
return q.infoFromMeta(m, state), nil
|
|
}
|
|
|
|
func (q *QEMU) infoFromMeta(m vmMeta, state string) ContainerInfo {
|
|
network := "user"
|
|
if m.IP != "" {
|
|
network = q.Bridge + " " + m.IP // isolated bridge + the VM's static IP
|
|
}
|
|
return ContainerInfo{
|
|
Name: m.Name,
|
|
Image: m.Image,
|
|
State: state,
|
|
Network: network,
|
|
User: m.User,
|
|
Restart: m.Restart,
|
|
Ports: m.Ports,
|
|
Volumes: m.Volumes,
|
|
Cmd: m.Cmd,
|
|
Version: ExtractVersion(m.Image),
|
|
Started: m.Started,
|
|
}
|
|
}
|
|
|
|
// VMAddr returns the host-reachable "ip:port" backend for a bridge-mode VM
|
|
// component, given its first guest port. Returns "" if the VM has no IP
|
|
// (user-mode) or no ports.
|
|
func (q *QEMU) VMAddr(name string) string {
|
|
m, err := q.readMeta(name)
|
|
if err != nil || m.IP == "" {
|
|
return ""
|
|
}
|
|
gp := guestPorts(m.Ports)
|
|
if len(gp) == 0 {
|
|
return ""
|
|
}
|
|
return m.IP + ":" + gp[0]
|
|
}
|
|
|
|
// List enumerates all VMs known from the state directory.
|
|
func (q *QEMU) List(ctx context.Context) ([]ContainerInfo, error) {
|
|
entries, err := os.ReadDir(q.stateDir())
|
|
if err != nil {
|
|
if os.IsNotExist(err) {
|
|
return nil, nil
|
|
}
|
|
return nil, fmt.Errorf("read vm state dir: %w", err)
|
|
}
|
|
var infos []ContainerInfo
|
|
for _, e := range entries {
|
|
if !e.IsDir() {
|
|
continue
|
|
}
|
|
m, err := q.readMeta(e.Name())
|
|
if err != nil {
|
|
continue
|
|
}
|
|
state := "stopped"
|
|
if q.pidOf(e.Name()) != 0 {
|
|
state = "running"
|
|
}
|
|
infos = append(infos, q.infoFromMeta(m, state))
|
|
}
|
|
return infos, nil
|
|
}
|
|
|
|
// Logs streams the VM's serial console log.
|
|
func (q *QEMU) Logs(ctx context.Context, name string, tail int, follow, timestamps bool, since string) *exec.Cmd {
|
|
console := filepath.Join(q.vmDir(name), "console.log")
|
|
args := []string{}
|
|
if follow {
|
|
args = append(args, "-f")
|
|
}
|
|
if tail > 0 {
|
|
args = append(args, "-n", strconv.Itoa(tail))
|
|
} else {
|
|
args = append(args, "-n", "+1")
|
|
}
|
|
args = append(args, console)
|
|
return exec.CommandContext(ctx, "tail", args...) //nolint:gosec
|
|
}
|
|
|
|
// Build builds a Nanos image from a context directory's binary. Used by the
|
|
// `mcp build --unikernel` path. Not the primary deploy path.
|
|
func (q *QEMU) Build(ctx context.Context, image, contextDir, dockerfile string) error {
|
|
return fmt.Errorf("qemu build: not implemented; build OCI image then Pull")
|
|
}
|
|
|
|
// Push is not implemented for the QEMU runtime.
|
|
func (q *QEMU) Push(ctx context.Context, image string) error {
|
|
return fmt.Errorf("qemu push: not implemented")
|
|
}
|
|
|
|
// ImageExists reports whether the extracted binary for the image is cached.
|
|
func (q *QEMU) ImageExists(ctx context.Context, image string) (bool, error) {
|
|
if _, err := os.Stat(q.binPath(image)); err == nil {
|
|
return true, nil
|
|
}
|
|
return false, nil
|
|
}
|
|
|
|
// Login delegates registry auth to podman (shared credential store).
|
|
func (q *QEMU) Login(ctx context.Context, registry, username, token string) error {
|
|
cmd := exec.CommandContext(ctx, "podman", "login", "--username", username, "--password-stdin", registry) //nolint:gosec
|
|
cmd.Stdin = strings.NewReader(token)
|
|
if out, err := cmd.CombinedOutput(); err != nil {
|
|
return fmt.Errorf("podman login %q: %w: %s", registry, err, out)
|
|
}
|
|
return nil
|
|
}
|
|
|
|
// ---- Isolated bridge networking (Phase 2) ----
|
|
|
|
// vmUser is the host account given ownership of each TAP device created for
// a VM; it is the user that runs QEMU.
const vmUser = "mcp"
|
|
|
|
// tapName derives a TAP interface name for a VM, respecting the 15-char
|
|
// IFNAMSIZ limit. Long names fall back to a hash suffix.
|
|
func (q *QEMU) tapName(name string) string {
|
|
cand := "tap-" + name
|
|
if len(cand) <= 15 {
|
|
return cand
|
|
}
|
|
sum := sha256.Sum256([]byte(name))
|
|
return "tap-" + hex.EncodeToString(sum[:])[:11]
|
|
}
|
|
|
|
// deterministicMAC derives a stable MAC address from name: the fixed
// 52:54:00 prefix followed by the first three bytes of the name's SHA-256.
func deterministicMAC(name string) string {
	digest := sha256.Sum256([]byte(name))
	suffix := fmt.Sprintf("%02x:%02x:%02x", digest[0], digest[1], digest[2])
	return "52:54:00:" + suffix
}
|
|
|
|
func (q *QEMU) ipsFile() string {
|
|
return filepath.Join(q.stateDir(), "ips.json")
|
|
}
|
|
|
|
func (q *QEMU) readIPs() (map[string]string, error) {
|
|
m := map[string]string{}
|
|
b, err := os.ReadFile(q.ipsFile()) //nolint:gosec // fixed state-dir path
|
|
if err != nil {
|
|
if os.IsNotExist(err) {
|
|
return m, nil
|
|
}
|
|
return m, err
|
|
}
|
|
if err := json.Unmarshal(b, &m); err != nil {
|
|
return map[string]string{}, err
|
|
}
|
|
return m, nil
|
|
}
|
|
|
|
func (q *QEMU) writeIPs(m map[string]string) error {
|
|
if err := os.MkdirAll(q.stateDir(), 0o750); err != nil {
|
|
return err
|
|
}
|
|
b, err := json.MarshalIndent(m, "", " ")
|
|
if err != nil {
|
|
return err
|
|
}
|
|
return os.WriteFile(q.ipsFile(), b, 0o640) //nolint:gosec // mcp-group-readable
|
|
}
|
|
|
|
// allocateIP assigns (or returns the existing) static bridge IP for a VM.
|
|
// Allocation is serialized by the agent's single-threaded deploy path.
|
|
func (q *QEMU) allocateIP(name string) (string, error) {
|
|
ips, err := q.readIPs()
|
|
if err != nil {
|
|
return "", err
|
|
}
|
|
if ip, ok := ips[name]; ok {
|
|
return ip, nil
|
|
}
|
|
used := map[string]bool{}
|
|
for _, ip := range ips {
|
|
used[ip] = true
|
|
}
|
|
for n := 2; n <= 254; n++ {
|
|
ip := fmt.Sprintf("%s.%d", q.SubnetPrefix, n)
|
|
if !used[ip] {
|
|
ips[name] = ip
|
|
return ip, q.writeIPs(ips)
|
|
}
|
|
}
|
|
return "", fmt.Errorf("no free IPs in %s.0/24", q.SubnetPrefix)
|
|
}
|
|
|
|
func (q *QEMU) releaseIP(name string) error {
|
|
ips, err := q.readIPs()
|
|
if err != nil {
|
|
return err
|
|
}
|
|
delete(ips, name)
|
|
return q.writeIPs(ips)
|
|
}
|
|
|
|
// createTAP creates a TAP device owned by the VM user and enslaves it to the
|
|
// host-only bridge. Requires CAP_NET_ADMIN (granted to the agent on
|
|
// unikernel-capable nodes).
|
|
func (q *QEMU) createTAP(name string) (string, error) {
|
|
tap := q.tapName(name)
|
|
_ = exec.Command("ip", "link", "del", tap).Run() //nolint:gosec // best-effort cleanup of a stale device
|
|
steps := [][]string{
|
|
{"tuntap", "add", "dev", tap, "mode", "tap", "user", vmUser},
|
|
{"link", "set", tap, "master", q.Bridge},
|
|
{"link", "set", tap, "up"},
|
|
}
|
|
for _, args := range steps {
|
|
if out, err := exec.Command("ip", args...).CombinedOutput(); err != nil { //nolint:gosec // args built programmatically
|
|
_ = exec.Command("ip", "link", "del", tap).Run() //nolint:gosec
|
|
return "", fmt.Errorf("ip %v: %w: %s", args, err, out)
|
|
}
|
|
}
|
|
return tap, nil
|
|
}
|
|
|
|
func (q *QEMU) destroyTAP(name string) {
|
|
_ = exec.Command("ip", "link", "del", q.tapName(name)).Run() //nolint:gosec // best-effort teardown
|
|
}
|
|
|
|
func fileSHA256(path string) (string, error) {
|
|
b, err := os.ReadFile(path) //nolint:gosec // hashing a known image path
|
|
if err != nil {
|
|
return "", err
|
|
}
|
|
sum := sha256.Sum256(b)
|
|
return hex.EncodeToString(sum[:]), nil
|
|
}
|
|
|
|
// copyFile copies the contents of src into dst, creating dst with mode 0640
// or truncating it if it already exists.
//
// The data is streamed rather than loaded whole, since the files are disk
// images. A failure to close dst is reported, because a write error can
// surface only at close time.
func copyFile(src, dst string) (err error) {
	in, err := os.Open(src) //nolint:gosec // relocating a built image
	if err != nil {
		return err
	}
	defer func() { _ = in.Close() }() // read-only handle

	out, err := os.OpenFile(dst, os.O_WRONLY|os.O_CREATE|os.O_TRUNC, 0o640) //nolint:gosec // relocating a built image
	if err != nil {
		return err
	}
	defer func() {
		if closeErr := out.Close(); err == nil {
			err = closeErr
		}
	}()

	// (*os.File).ReadFrom copies from in until EOF.
	_, err = out.ReadFrom(in)
	return err
}
|