package runtime import ( "context" "crypto/sha256" "encoding/hex" "encoding/json" "fmt" "net" "os" "os/exec" "path/filepath" "strconv" "strings" "syscall" "time" ) // dialUnix connects to a unix-domain socket with a timeout. func dialUnix(path string, timeout time.Duration) (net.Conn, error) { return net.DialTimeout("unix", path, timeout) } // QEMU implements the Runtime interface by running services as Nanos // unikernel virtual machines under QEMU/KVM instead of containers. // // Each service component becomes a single-process VM with its own kernel. // The lifecycle maps onto the same Runtime interface as Podman so the agent // can treat unikernels and containers uniformly: // // Pull -> pull the OCI image, extract the ELF binary, cache it // Run -> `ops build` the binary into a Nanos image, boot it under QEMU // Stop -> graceful QMP powerdown, then SIGTERM/SIGKILL the QEMU process // Remove -> stop and delete the VM state directory // Inspect -> read persisted metadata + check process liveness // List -> enumerate VM state directories // Logs -> stream the serial console log file // // Phase 1 uses QEMU user-mode networking with host port forwards, which is // functionally equivalent to rootless podman's localhost port mappings: // mc-proxy routes to 127.0.0.1: exactly as it does for containers. // Isolated bridge networking is a later phase. type QEMU struct { // ImageDir holds built Nanos images and extracted binaries. // Default: /srv/mcp/images ImageDir string // StateDir holds per-VM runtime state (pidfile, QMP socket, console log, // metadata). Default: /srv/mcp/vm StateDir string // OpsPath is the path to the `ops` Nanos toolchain binary. Default: "ops". OpsPath string // QemuPath is the path to qemu-system-x86_64. Default: "qemu-system-x86_64". QemuPath string // Memory is the default guest memory in MB when a spec does not set one. Memory int // HomeDir is set as $HOME for `ops` so it uses a stable ~/.ops directory. 
HomeDir string } func (q *QEMU) imageDir() string { if q.ImageDir != "" { return q.ImageDir } return "/srv/mcp/images" } func (q *QEMU) stateDir() string { if q.StateDir != "" { return q.StateDir } return "/srv/mcp/vm" } func (q *QEMU) opsPath() string { if q.OpsPath != "" { return q.OpsPath } return "ops" } func (q *QEMU) qemuPath() string { if q.QemuPath != "" { return q.QemuPath } return "qemu-system-x86_64" } func (q *QEMU) memory() int { if q.Memory > 0 { return q.Memory } return 256 } // opsEnv returns the environment for invoking `ops`, pinning $HOME so its // cache and image directory are stable across invocations. func (q *QEMU) opsEnv() []string { env := os.Environ() if q.HomeDir != "" { env = append(env, "HOME="+q.HomeDir) } return env } // sanitizeImage turns an image reference into a filesystem-safe stem. // // "mcr.example:8443/mcdoc:v0.1.0" -> "mcr.example_8443_mcdoc_v0.1.0" func sanitizeImage(image string) string { r := strings.NewReplacer("/", "_", ":", "_") return r.Replace(image) } // binaryName derives the in-image ELF binary name from an image reference by // taking the repository basename. "host:8443/mcdoc:v0.1.0" -> "mcdoc". func binaryName(image string) string { name := image if i := strings.LastIndex(name, "/"); i >= 0 { name = name[i+1:] } if i := strings.Index(name, ":"); i >= 0 { name = name[:i] } return name } func (q *QEMU) binPath(image string) string { return filepath.Join(q.imageDir(), sanitizeImage(image)+".bin") } func (q *QEMU) imgPath(name string) string { return filepath.Join(q.imageDir(), name+".img") } func (q *QEMU) vmDir(name string) string { return filepath.Join(q.stateDir(), name) } // vmMeta is the persisted per-VM metadata written at Run time so that // Inspect/List can report accurate information after an agent restart. 
type vmMeta struct { Name string `json:"name"` Image string `json:"image"` User string `json:"user"` Restart string `json:"restart"` Ports []string `json:"ports"` Volumes []string `json:"volumes"` Cmd []string `json:"cmd"` MemoryMB int `json:"memory_mb"` VCPUs int `json:"vcpus"` ImageHash string `json:"image_hash"` Started time.Time `json:"started"` } // Pull pulls the OCI image and extracts its ELF binary into the image cache. // The binary is the input to `ops build`; the Nanos image itself is built at // Run time so the service's command arguments can be baked in. func (q *QEMU) Pull(ctx context.Context, image string) error { if err := os.MkdirAll(q.imageDir(), 0o750); err != nil { return fmt.Errorf("create image dir: %w", err) } // Pull the OCI image via podman (reuses the agent's registry auth). if out, err := exec.CommandContext(ctx, "podman", "pull", image).CombinedOutput(); err != nil { //nolint:gosec // args built programmatically return fmt.Errorf("podman pull %q: %w: %s", image, err, out) } // Create (do not start) a container to copy the binary out of. tmp := "ukextract-" + sanitizeImage(image) _ = exec.CommandContext(ctx, "podman", "rm", "-f", tmp).Run() //nolint:gosec if out, err := exec.CommandContext(ctx, "podman", "create", "--name", tmp, image).CombinedOutput(); err != nil { //nolint:gosec return fmt.Errorf("podman create %q: %w: %s", image, err, out) } defer func() { _ = exec.Command("podman", "rm", "-f", tmp).Run() }() //nolint:gosec bin := binaryName(image) src := tmp + ":/usr/local/bin/" + bin dst := q.binPath(image) if out, err := exec.CommandContext(ctx, "podman", "cp", src, dst).CombinedOutput(); err != nil { //nolint:gosec return fmt.Errorf("extract binary %q from %q: %w: %s", bin, image, err, out) } if err := os.Chmod(dst, 0o755); err != nil { //nolint:gosec // unikernel ELF must be executable return fmt.Errorf("chmod extracted binary: %w", err) } return nil } // opsConfig is the subset of the `ops` build configuration we generate. 
type opsConfig struct { Args []string `json:"Args,omitempty"` Env map[string]string `json:"Env,omitempty"` RunConfig opsRunConfig `json:"RunConfig"` } type opsRunConfig struct { Ports []string `json:"Ports,omitempty"` Memory string `json:"Memory,omitempty"` CPUs int `json:"CPUs,omitempty"` Klibs []string `json:"Klibs,omitempty"` Mounts any `json:"Mounts,omitempty"` NoTrace []string `json:"NoTrace,omitempty"` GDBPort int `json:"GDBPort,omitempty"` Nanos string `json:"Nanos,omitempty"` Hostname string `json:"Hostname,omitempty"` } // guestPorts extracts the guest (container) port from each spec port mapping. // Accepts "host:container", "ip:host:container", or a bare "port". func guestPorts(ports []string) []string { var gp []string for _, p := range ports { parts := strings.Split(p, ":") gp = append(gp, parts[len(parts)-1]) } return gp } // hostForward builds the QEMU hostfwd value for a spec port mapping. // "ip:host:container" -> "tcp:ip:host-:container" // "host:container" -> "tcp:127.0.0.1:host-:container" func hostForward(p string) string { parts := strings.Split(p, ":") switch len(parts) { case 3: return fmt.Sprintf("tcp:%s:%s-:%s", parts[0], parts[1], parts[2]) case 2: return fmt.Sprintf("tcp:127.0.0.1:%s-:%s", parts[0], parts[1]) default: return fmt.Sprintf("tcp:127.0.0.1:%s-:%s", parts[0], parts[0]) } } // Run builds the Nanos image (if needed) and boots it under QEMU/KVM. func (q *QEMU) Run(ctx context.Context, spec ContainerSpec) error { if err := os.MkdirAll(q.vmDir(spec.Name), 0o750); err != nil { return fmt.Errorf("create vm state dir: %w", err) } mem := spec.MemoryMB if mem <= 0 { mem = q.memory() } cpus := spec.VCPUs if cpus <= 0 { cpus = 1 } // Build the Nanos image from the extracted binary, baking in command args. 
cfg := opsConfig{ Args: spec.Cmd, RunConfig: opsRunConfig{ Ports: guestPorts(spec.Ports), Memory: strconv.Itoa(mem) + "m", CPUs: cpus, }, } if len(spec.Env) > 0 { cfg.Env = map[string]string{} for _, e := range spec.Env { if i := strings.Index(e, "="); i >= 0 { cfg.Env[e[:i]] = e[i+1:] } } } cfgPath := filepath.Join(q.vmDir(spec.Name), "ops.json") cfgBytes, err := json.MarshalIndent(cfg, "", " ") if err != nil { return fmt.Errorf("marshal ops config: %w", err) } if err := os.WriteFile(cfgPath, cfgBytes, 0o640); err != nil { //nolint:gosec // mcp-group-readable config return fmt.Errorf("write ops config: %w", err) } img := q.imgPath(spec.Name) bin := q.binPath(spec.Image) if _, err := os.Stat(bin); err != nil { return fmt.Errorf("binary not found for %q (Pull first): %w", spec.Image, err) } build := exec.CommandContext(ctx, q.opsPath(), "build", bin, "-c", cfgPath, "-i", spec.Name) //nolint:gosec build.Env = q.opsEnv() if out, err := build.CombinedOutput(); err != nil { return fmt.Errorf("ops build %q: %w: %s", spec.Name, err, out) } // ops writes to ~/.ops/images/.img; move it into our image dir. opsImg := filepath.Join(q.opsImagesDir(), spec.Name+".img") if _, err := os.Stat(opsImg); err == nil { if err := os.Rename(opsImg, img); err != nil { // Cross-device fallback: copy. if cpErr := copyFile(opsImg, img); cpErr != nil { return fmt.Errorf("relocate built image: %w", err) } _ = os.Remove(opsImg) } } hash, _ := fileSHA256(img) // Assemble QEMU invocation: KVM-accelerated, headless, serial console to a // file, QMP control socket, virtio disk + NIC with user-mode port forwards. 
netdev := "user,id=n0" for _, p := range spec.Ports { netdev += ",hostfwd=" + hostForward(p) } args := []string{ "-enable-kvm", "-m", strconv.Itoa(mem), "-smp", strconv.Itoa(cpus), "-display", "none", "-no-reboot", "-daemonize", "-pidfile", filepath.Join(q.vmDir(spec.Name), "qemu.pid"), "-serial", "file:" + filepath.Join(q.vmDir(spec.Name), "console.log"), "-qmp", "unix:" + filepath.Join(q.vmDir(spec.Name), "qmp.sock") + ",server,nowait", "-drive", "file=" + img + ",format=raw,if=virtio", "-device", "virtio-net-pci,netdev=n0", "-netdev", netdev, } // 9p passthrough for host /srv/ volumes (best-effort; Nanos must // support the 9p client for the guest to mount it). for i, v := range spec.Volumes { parts := strings.SplitN(v, ":", 2) host := parts[0] tag := fmt.Sprintf("srv%d", i) args = append(args, "-virtfs", fmt.Sprintf("local,path=%s,mount_tag=%s,security_model=none,id=%s", host, tag, tag), ) } cmd := exec.CommandContext(ctx, q.qemuPath(), args...) //nolint:gosec if out, err := cmd.CombinedOutput(); err != nil { return fmt.Errorf("qemu launch %q: %w: %s", spec.Name, err, out) } meta := vmMeta{ Name: spec.Name, Image: spec.Image, User: spec.User, Restart: spec.Restart, Ports: spec.Ports, Volumes: spec.Volumes, Cmd: spec.Cmd, MemoryMB: mem, VCPUs: cpus, ImageHash: hash, Started: time.Now().UTC(), } return q.writeMeta(spec.Name, meta) } func (q *QEMU) opsImagesDir() string { home := q.HomeDir if home == "" { home, _ = os.UserHomeDir() } return filepath.Join(home, ".ops", "images") } func (q *QEMU) writeMeta(name string, m vmMeta) error { b, err := json.MarshalIndent(m, "", " ") if err != nil { return fmt.Errorf("marshal vm meta: %w", err) } return os.WriteFile(filepath.Join(q.vmDir(name), "meta.json"), b, 0o640) //nolint:gosec // mcp-group-readable metadata } func (q *QEMU) readMeta(name string) (vmMeta, error) { var m vmMeta b, err := os.ReadFile(filepath.Join(q.vmDir(name), "meta.json")) if err != nil { return m, err } return m, json.Unmarshal(b, &m) } // pidOf 
returns the running QEMU pid for a VM, or 0 if not running. func (q *QEMU) pidOf(name string) int { b, err := os.ReadFile(filepath.Join(q.vmDir(name), "qemu.pid")) if err != nil { return 0 } pid, err := strconv.Atoi(strings.TrimSpace(string(b))) if err != nil || pid <= 0 { return 0 } if err := syscall.Kill(pid, 0); err != nil { return 0 } return pid } // Stop gracefully powers down the VM, falling back to SIGTERM/SIGKILL. func (q *QEMU) Stop(ctx context.Context, name string) error { pid := q.pidOf(name) if pid == 0 { return nil } // Try a graceful QMP system_powerdown. _ = q.qmpCommand(name, "system_powerdown") deadline := time.Now().Add(10 * time.Second) for time.Now().Before(deadline) { if q.pidOf(name) == 0 { return nil } time.Sleep(300 * time.Millisecond) } // Escalate. _ = syscall.Kill(pid, syscall.SIGTERM) time.Sleep(2 * time.Second) if q.pidOf(name) != 0 { _ = syscall.Kill(pid, syscall.SIGKILL) } return nil } // qmpCommand sends a single QMP command over the VM's control socket. func (q *QEMU) qmpCommand(name, command string) error { sock := filepath.Join(q.vmDir(name), "qmp.sock") conn, err := dialUnix(sock, 3*time.Second) if err != nil { return err } defer func() { _ = conn.Close() }() // QMP handshake: read greeting, send qmp_capabilities, then the command. dec := json.NewDecoder(conn) var greeting map[string]any _ = dec.Decode(&greeting) _, _ = conn.Write([]byte(`{"execute":"qmp_capabilities"}`)) var ack map[string]any _ = dec.Decode(&ack) _, err = conn.Write([]byte(`{"execute":"` + command + `"}`)) return err } // Remove stops the VM and deletes its state directory. func (q *QEMU) Remove(ctx context.Context, name string) error { _ = q.Stop(ctx, name) if pid := q.pidOf(name); pid != 0 { _ = syscall.Kill(pid, syscall.SIGKILL) } return os.RemoveAll(q.vmDir(name)) } // Inspect reports the observed state of a VM. 
func (q *QEMU) Inspect(ctx context.Context, name string) (ContainerInfo, error) { m, err := q.readMeta(name) if err != nil { return ContainerInfo{}, fmt.Errorf("qemu inspect %q: %w", name, err) } state := "stopped" if q.pidOf(name) != 0 { state = "running" } return q.infoFromMeta(m, state), nil } func (q *QEMU) infoFromMeta(m vmMeta, state string) ContainerInfo { return ContainerInfo{ Name: m.Name, Image: m.Image, State: state, Network: "user", User: m.User, Restart: m.Restart, Ports: m.Ports, Volumes: m.Volumes, Cmd: m.Cmd, Version: ExtractVersion(m.Image), Started: m.Started, } } // List enumerates all VMs known from the state directory. func (q *QEMU) List(ctx context.Context) ([]ContainerInfo, error) { entries, err := os.ReadDir(q.stateDir()) if err != nil { if os.IsNotExist(err) { return nil, nil } return nil, fmt.Errorf("read vm state dir: %w", err) } var infos []ContainerInfo for _, e := range entries { if !e.IsDir() { continue } m, err := q.readMeta(e.Name()) if err != nil { continue } state := "stopped" if q.pidOf(e.Name()) != 0 { state = "running" } infos = append(infos, q.infoFromMeta(m, state)) } return infos, nil } // Logs streams the VM's serial console log. func (q *QEMU) Logs(ctx context.Context, name string, tail int, follow, timestamps bool, since string) *exec.Cmd { console := filepath.Join(q.vmDir(name), "console.log") args := []string{} if follow { args = append(args, "-f") } if tail > 0 { args = append(args, "-n", strconv.Itoa(tail)) } else { args = append(args, "-n", "+1") } args = append(args, console) return exec.CommandContext(ctx, "tail", args...) //nolint:gosec } // Build builds a Nanos image from a context directory's binary. Used by the // `mcp build --unikernel` path. Not the primary deploy path. func (q *QEMU) Build(ctx context.Context, image, contextDir, dockerfile string) error { return fmt.Errorf("qemu build: not implemented; build OCI image then Pull") } // Push is not implemented for the QEMU runtime. 
func (q *QEMU) Push(ctx context.Context, image string) error {
	return fmt.Errorf("qemu push: not implemented")
}

// ImageExists reports whether the extracted binary for the image is cached.
//
// A missing file yields (false, nil). Any other stat failure (for example a
// permission error) is returned as an error, so it is not mistaken for
// "not cached".
func (q *QEMU) ImageExists(ctx context.Context, image string) (bool, error) {
	_, err := os.Stat(q.binPath(image))
	switch {
	case err == nil:
		return true, nil
	case os.IsNotExist(err):
		return false, nil
	default:
		return false, fmt.Errorf("stat cached binary for %q: %w", image, err)
	}
}

// Login delegates registry auth to podman (shared credential store).
// The token is passed on stdin via --password-stdin rather than as an
// argument.
func (q *QEMU) Login(ctx context.Context, registry, username, token string) error {
	cmd := exec.CommandContext(ctx, "podman", "login", "--username", username, "--password-stdin", registry) //nolint:gosec
	cmd.Stdin = strings.NewReader(token)
	if out, err := cmd.CombinedOutput(); err != nil {
		return fmt.Errorf("podman login %q: %w: %s", registry, err, out)
	}
	return nil
}

// fileSHA256 returns the lowercase hex SHA-256 digest of the file at path.
// The whole file is read into memory before hashing.
func fileSHA256(path string) (string, error) {
	b, err := os.ReadFile(path) //nolint:gosec // hashing a known image path
	if err != nil {
		return "", err
	}
	sum := sha256.Sum256(b)
	return hex.EncodeToString(sum[:]), nil
}

// copyFile copies src to dst by reading the whole file into memory and
// writing it out with mode 0o640. The source's mode is not preserved.
func copyFile(src, dst string) error {
	in, err := os.ReadFile(src) //nolint:gosec // relocating a built image
	if err != nil {
		return err
	}
	return os.WriteFile(dst, in, 0o640) //nolint:gosec // relocating a built image
}