diff --git a/internal/agent/agent.go b/internal/agent/agent.go index b13e70a..c016548 100644 --- a/internal/agent/agent.go +++ b/internal/agent/agent.go @@ -59,12 +59,23 @@ func Run(cfg *config.AgentConfig, version string) error { // runtime = "unikernel" are placed by the master on KVM-capable nodes. var uk runtime.Runtime if unikernelSupported() { - uk = &runtime.QEMU{ + qemu := &runtime.QEMU{ ImageDir: filepath.Join(homeDir(cfg), "images"), StateDir: filepath.Join(homeDir(cfg), "vm"), HomeDir: homeDir(cfg), } - logger.Info("unikernel runtime enabled (KVM detected)") + // If the isolated host-only bridge exists, switch unikernels to + // bridge networking (Phase 2: mandatory mediation). Otherwise they + // use QEMU user-mode port forwards (Phase 1). + if _, err := os.Stat("/sys/class/net/" + unikernelBridge); err == nil { + qemu.Bridge = unikernelBridge + qemu.Gateway = unikernelGateway + qemu.SubnetPrefix = unikernelSubnetPrefix + logger.Info("unikernel runtime enabled (KVM + isolated bridge)", "bridge", unikernelBridge) + } else { + logger.Info("unikernel runtime enabled (KVM, user-mode networking)") + } + uk = qemu } mon := monitor.New(db, rt, cfg.Monitor, cfg.Agent.NodeName, logger) diff --git a/internal/agent/runtime.go b/internal/agent/runtime.go index 00996b0..88a1515 100644 --- a/internal/agent/runtime.go +++ b/internal/agent/runtime.go @@ -10,6 +10,15 @@ import ( "git.wntrmute.dev/mc/mcp/internal/runtime" ) +// Isolated unikernel bridge parameters. The bridge (mcp-br0) is created by +// the node's NixOS config; when present, the agent runs unikernels on it with +// a host firewall confining each VM to reaching only the gateway (mc-proxy). +const ( + unikernelBridge = "mcp-br0" + unikernelGateway = "10.99.0.1" + unikernelSubnetPrefix = "10.99.0" +) + // unikernelSupported reports whether this node can run Nanos unikernels: // it needs KVM (/dev/kvm) and the `ops` toolchain on PATH. func unikernelSupported() bool { diff --git a/internal/runtime/qemu.go b/internal/runtime/qemu.go index 861a596..5a51779 100644 --- a/internal/runtime/qemu.go +++ b/internal/runtime/qemu.go @@ -55,8 +55,23 @@ type QEMU struct { Memory int // HomeDir is set as $HOME for `ops` so it uses a stable ~/.ops directory. HomeDir string + + // Bridge, when set, switches VMs from QEMU user-mode networking to an + // isolated host-only bridge (Phase 2). Each VM gets a TAP device on the + // bridge and a static IP; a host firewall confines it to reaching only + // the bridge gateway (where mc-proxy listens). Empty = user-mode. + Bridge string // e.g. "mcp-br0" + // Gateway is the bridge's host IP (and the VMs' default route / mc-proxy + // address), e.g. "10.99.0.1". + Gateway string + // SubnetPrefix is the /24 network prefix VMs are numbered in, e.g. + // "10.99.0" (VMs get .2 .. .254). + SubnetPrefix string } +// bridgeMode reports whether isolated bridge networking is configured. +func (q *QEMU) bridgeMode() bool { return q.Bridge != "" } + func (q *QEMU) imageDir() string { if q.ImageDir != "" { return q.ImageDir @@ -148,6 +163,7 @@ type vmMeta struct { MemoryMB int `json:"memory_mb"` VCPUs int `json:"vcpus"` ImageHash string `json:"image_hash"` + IP string `json:"ip,omitempty"` // bridge-mode static IP Started time.Time `json:"started"` } @@ -192,15 +208,18 @@ type opsConfig struct { } type opsRunConfig struct { - Ports []string `json:"Ports,omitempty"` - Memory string `json:"Memory,omitempty"` - CPUs int `json:"CPUs,omitempty"` - Klibs []string `json:"Klibs,omitempty"` - Mounts any `json:"Mounts,omitempty"` - NoTrace []string `json:"NoTrace,omitempty"` - GDBPort int `json:"GDBPort,omitempty"` - Nanos string `json:"Nanos,omitempty"` - Hostname string `json:"Hostname,omitempty"` + Ports []string `json:"Ports,omitempty"` + Memory string `json:"Memory,omitempty"` + CPUs int `json:"CPUs,omitempty"` + Klibs []string `json:"Klibs,omitempty"` + Mounts any `json:"Mounts,omitempty"` + NoTrace []string `json:"NoTrace,omitempty"` + GDBPort int `json:"GDBPort,omitempty"` + Nanos string `json:"Nanos,omitempty"` + Hostname string `json:"Hostname,omitempty"` + IPAddress string `json:"IPAddress,omitempty"` // static IP for bridge mode + NetMask string `json:"NetMask,omitempty"` + Gateway string `json:"Gateway,omitempty"` } // guestPorts extracts the guest (container) port from each spec port mapping. @@ -244,6 +263,16 @@ func (q *QEMU) Run(ctx context.Context, spec ContainerSpec) error { cpus = 1 } + // In bridge mode, allocate a static IP on the isolated bridge for this VM. + var vmIP string + if q.bridgeMode() { + ip, allocErr := q.allocateIP(spec.Name) + if allocErr != nil { + return fmt.Errorf("allocate VM IP: %w", allocErr) + } + vmIP = ip + } + // Build the Nanos image from the extracted binary, baking in command args. cfg := opsConfig{ Args: spec.Cmd, @@ -253,6 +282,11 @@ func (q *QEMU) Run(ctx context.Context, spec ContainerSpec) error { CPUs: cpus, }, } + if q.bridgeMode() { + cfg.RunConfig.IPAddress = vmIP + cfg.RunConfig.NetMask = "255.255.255.0" + cfg.RunConfig.Gateway = q.Gateway + } if len(spec.Env) > 0 { cfg.Env = map[string]string{} for _, e := range spec.Env { @@ -295,10 +329,30 @@ func (q *QEMU) Run(ctx context.Context, spec ContainerSpec) error { hash, _ := fileSHA256(img) // Assemble QEMU invocation: KVM-accelerated, headless, serial console to a - // file, QMP control socket, virtio disk + NIC with user-mode port forwards. - netdev := "user,id=n0" - for _, p := range spec.Ports { - netdev += ",hostfwd=" + hostForward(p) + // file, QMP control socket, virtio disk + virtio NIC. + // + // Networking has two modes: + // user-mode (Phase 1): host port forwards to localhost, like rootless + // podman. mc-proxy routes to 127.0.0.1:. + // bridge (Phase 2): a TAP device on an isolated host-only bridge. + // The VM has no route off the bridge; a host firewall confines it to + // reaching only mc-proxy on the gateway. This makes mediation + // mandatory rather than cooperative. + var netDevice, netBackend string + if q.bridgeMode() { + tap, tapErr := q.createTAP(spec.Name) + if tapErr != nil { + return fmt.Errorf("create TAP: %w", tapErr) + } + mac := deterministicMAC(spec.Name) + netDevice = "virtio-net-pci,netdev=n0,mac=" + mac + netBackend = fmt.Sprintf("tap,id=n0,ifname=%s,script=no,downscript=no", tap) + } else { + netDevice = "virtio-net-pci,netdev=n0" + netBackend = "user,id=n0" + for _, p := range spec.Ports { + netBackend += ",hostfwd=" + hostForward(p) + } } args := []string{ "-enable-kvm", @@ -311,8 +365,8 @@ func (q *QEMU) Run(ctx context.Context, spec ContainerSpec) error { "-serial", "file:" + filepath.Join(q.vmDir(spec.Name), "console.log"), "-qmp", "unix:" + filepath.Join(q.vmDir(spec.Name), "qmp.sock") + ",server,nowait", "-drive", "file=" + img + ",format=raw,if=virtio", - "-device", "virtio-net-pci,netdev=n0", - "-netdev", netdev, + "-device", netDevice, + "-netdev", netBackend, } // 9p passthrough for host /srv/ volumes (best-effort; Nanos must // support the 9p client for the guest to mount it). @@ -341,6 +395,7 @@ func (q *QEMU) Run(ctx context.Context, spec ContainerSpec) error { MemoryMB: mem, VCPUs: cpus, ImageHash: hash, + IP: vmIP, Started: time.Now().UTC(), } return q.writeMeta(spec.Name, meta) @@ -393,24 +448,37 @@ func (q *QEMU) Stop(ctx context.Context, name string) error { if pid == 0 { return nil } - // Try a graceful QMP system_powerdown. + // Try a graceful QMP system_powerdown, but Nanos has no ACPI shutdown + // handler, so escalate to signals quickly rather than waiting long. _ = q.qmpCommand(name, "system_powerdown") - deadline := time.Now().Add(10 * time.Second) - for time.Now().Before(deadline) { - if q.pidOf(name) == 0 { - return nil - } - time.Sleep(300 * time.Millisecond) + if q.waitGone(name, 2*time.Second) { + return nil } - // Escalate. _ = syscall.Kill(pid, syscall.SIGTERM) - time.Sleep(2 * time.Second) - if q.pidOf(name) != 0 { - _ = syscall.Kill(pid, syscall.SIGKILL) + if q.waitGone(name, 2*time.Second) { + return nil + } + _ = syscall.Kill(pid, syscall.SIGKILL) + q.waitGone(name, 2*time.Second) + if q.bridgeMode() { + q.destroyTAP(name) } return nil } +// waitGone polls until the VM process exits or the timeout elapses, +// returning true if it has exited. +func (q *QEMU) waitGone(name string, timeout time.Duration) bool { + deadline := time.Now().Add(timeout) + for time.Now().Before(deadline) { + if q.pidOf(name) == 0 { + return true + } + time.Sleep(200 * time.Millisecond) + } + return q.pidOf(name) == 0 +} + // qmpCommand sends a single QMP command over the VM's control socket. func (q *QEMU) qmpCommand(name, command string) error { sock := filepath.Join(q.vmDir(name), "qmp.sock") @@ -436,6 +504,10 @@ func (q *QEMU) Remove(ctx context.Context, name string) error { if pid := q.pidOf(name); pid != 0 { _ = syscall.Kill(pid, syscall.SIGKILL) } + if q.bridgeMode() { + q.destroyTAP(name) + _ = q.releaseIP(name) + } return os.RemoveAll(q.vmDir(name)) } @@ -453,11 +525,15 @@ func (q *QEMU) Inspect(ctx context.Context, name string) (ContainerInfo, error) } func (q *QEMU) infoFromMeta(m vmMeta, state string) ContainerInfo { + network := "user" + if m.IP != "" { + network = q.Bridge + " " + m.IP // isolated bridge + the VM's static IP + } return ContainerInfo{ Name: m.Name, Image: m.Image, State: state, - Network: "user", + Network: network, User: m.User, Restart: m.Restart, Ports: m.Ports, @@ -468,6 +544,21 @@ func (q *QEMU) infoFromMeta(m vmMeta, state string) ContainerInfo { } } +// VMAddr returns the host-reachable "ip:port" backend for a bridge-mode VM +// component, given its first guest port. Returns "" if the VM has no IP +// (user-mode) or no ports. +func (q *QEMU) VMAddr(name string) string { + m, err := q.readMeta(name) + if err != nil || m.IP == "" { + return "" + } + gp := guestPorts(m.Ports) + if len(gp) == 0 { + return "" + } + return m.IP + ":" + gp[0] +} + // List enumerates all VMs known from the state directory. func (q *QEMU) List(ctx context.Context) ([]ContainerInfo, error) { entries, err := os.ReadDir(q.stateDir()) @@ -540,6 +631,115 @@ func (q *QEMU) Login(ctx context.Context, registry, username, token string) erro return nil } +// ---- Isolated bridge networking (Phase 2) ---- + +// vmUser is the host user that owns TAP devices and runs QEMU. +const vmUser = "mcp" + +// tapName derives a TAP interface name for a VM, respecting the 15-char +// IFNAMSIZ limit. Long names fall back to a hash suffix. +func (q *QEMU) tapName(name string) string { + cand := "tap-" + name + if len(cand) <= 15 { + return cand + } + sum := sha256.Sum256([]byte(name)) + return "tap-" + hex.EncodeToString(sum[:])[:11] +} + +// deterministicMAC derives a stable locally-administered MAC from a name. +func deterministicMAC(name string) string { + sum := sha256.Sum256([]byte(name)) + return fmt.Sprintf("52:54:00:%02x:%02x:%02x", sum[0], sum[1], sum[2]) +} + +func (q *QEMU) ipsFile() string { + return filepath.Join(q.stateDir(), "ips.json") +} + +func (q *QEMU) readIPs() (map[string]string, error) { + m := map[string]string{} + b, err := os.ReadFile(q.ipsFile()) //nolint:gosec // fixed state-dir path + if err != nil { + if os.IsNotExist(err) { + return m, nil + } + return m, err + } + if err := json.Unmarshal(b, &m); err != nil { + return map[string]string{}, err + } + return m, nil +} + +func (q *QEMU) writeIPs(m map[string]string) error { + if err := os.MkdirAll(q.stateDir(), 0o750); err != nil { + return err + } + b, err := json.MarshalIndent(m, "", " ") + if err != nil { + return err + } + return os.WriteFile(q.ipsFile(), b, 0o640) //nolint:gosec // mcp-group-readable +} + +// allocateIP assigns (or returns the existing) static bridge IP for a VM. +// Allocation is serialized by the agent's single-threaded deploy path. +func (q *QEMU) allocateIP(name string) (string, error) { + ips, err := q.readIPs() + if err != nil { + return "", err + } + if ip, ok := ips[name]; ok { + return ip, nil + } + used := map[string]bool{} + for _, ip := range ips { + used[ip] = true + } + for n := 2; n <= 254; n++ { + ip := fmt.Sprintf("%s.%d", q.SubnetPrefix, n) + if !used[ip] { + ips[name] = ip + return ip, q.writeIPs(ips) + } + } + return "", fmt.Errorf("no free IPs in %s.0/24", q.SubnetPrefix) +} + +func (q *QEMU) releaseIP(name string) error { + ips, err := q.readIPs() + if err != nil { + return err + } + delete(ips, name) + return q.writeIPs(ips) +} + +// createTAP creates a TAP device owned by the VM user and enslaves it to the +// host-only bridge. Requires CAP_NET_ADMIN (granted to the agent on +// unikernel-capable nodes). +func (q *QEMU) createTAP(name string) (string, error) { + tap := q.tapName(name) + _ = exec.Command("ip", "link", "del", tap).Run() //nolint:gosec // best-effort cleanup of a stale device + steps := [][]string{ + {"tuntap", "add", "dev", tap, "mode", "tap", "user", vmUser}, + {"link", "set", tap, "master", q.Bridge}, + {"link", "set", tap, "up"}, + } + for _, args := range steps { + if out, err := exec.Command("ip", args...).CombinedOutput(); err != nil { //nolint:gosec // args built programmatically + _ = exec.Command("ip", "link", "del", tap).Run() //nolint:gosec + return "", fmt.Errorf("ip %v: %w: %s", args, err, out) + } + } + return tap, nil +} + +func (q *QEMU) destroyTAP(name string) { + _ = exec.Command("ip", "link", "del", q.tapName(name)).Run() //nolint:gosec // best-effort teardown +} + func fileSHA256(path string) (string, error) { b, err := os.ReadFile(path) //nolint:gosec // hashing a known image path if err != nil {