unikernel: isolated host-only bridge networking (Phase 2)

When the mcp-br0 bridge exists, the agent runs unikernels on it instead
of QEMU user-mode networking: each VM gets a TAP device on the bridge
and a static 10.99.0.0/24 IP (baked into the Nanos image via ops
RunConfig). With the host firewall dropping off-bridge VM traffic and no
NAT, a VM can reach only the gateway -- making mc-proxy mediation
mandatory by topology rather than convention.

- runtime/qemu.go: bridge mode (createTAP/destroyTAP, IP allocator,
  deterministic MAC, static-IP ops config, VMAddr for proxy backends).
- agent auto-enables bridge mode when /sys/class/net/mcp-br0 exists.

Verified on straylight: uktest unikernel boots on mcp-br0 at 10.99.0.2,
serves via the gateway, TAP enslaved to the bridge; bridge has no uplink
and off-bridge forwarding is dropped.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
Kyle Isom
2026-06-11 01:07:49 -07:00
parent d56f224359
commit 47ec4e60ad
3 changed files with 249 additions and 29 deletions

View File

@@ -59,12 +59,23 @@ func Run(cfg *config.AgentConfig, version string) error {
// runtime = "unikernel" are placed by the master on KVM-capable nodes. // runtime = "unikernel" are placed by the master on KVM-capable nodes.
var uk runtime.Runtime var uk runtime.Runtime
if unikernelSupported() { if unikernelSupported() {
uk = &runtime.QEMU{ qemu := &runtime.QEMU{
ImageDir: filepath.Join(homeDir(cfg), "images"), ImageDir: filepath.Join(homeDir(cfg), "images"),
StateDir: filepath.Join(homeDir(cfg), "vm"), StateDir: filepath.Join(homeDir(cfg), "vm"),
HomeDir: homeDir(cfg), HomeDir: homeDir(cfg),
} }
logger.Info("unikernel runtime enabled (KVM detected)") // If the isolated host-only bridge exists, switch unikernels to
// bridge networking (Phase 2: mandatory mediation). Otherwise they
// use QEMU user-mode port forwards (Phase 1).
if _, err := os.Stat("/sys/class/net/" + unikernelBridge); err == nil {
qemu.Bridge = unikernelBridge
qemu.Gateway = unikernelGateway
qemu.SubnetPrefix = unikernelSubnetPrefix
logger.Info("unikernel runtime enabled (KVM + isolated bridge)", "bridge", unikernelBridge)
} else {
logger.Info("unikernel runtime enabled (KVM, user-mode networking)")
}
uk = qemu
} }
mon := monitor.New(db, rt, cfg.Monitor, cfg.Agent.NodeName, logger) mon := monitor.New(db, rt, cfg.Monitor, cfg.Agent.NodeName, logger)

View File

@@ -10,6 +10,15 @@ import (
"git.wntrmute.dev/mc/mcp/internal/runtime" "git.wntrmute.dev/mc/mcp/internal/runtime"
) )
// Isolated unikernel bridge parameters. The bridge (mcp-br0) is created by
// the node's NixOS config; when present, the agent runs unikernels on it with
// a host firewall confining each VM to reaching only the gateway (mc-proxy).
//
// The agent detects the bridge by checking for /sys/class/net/<bridge> and,
// if it exists, copies these values into the QEMU runtime's Bridge, Gateway
// and SubnetPrefix fields.
const (
// unikernelBridge is the name of the host-only bridge interface.
unikernelBridge = "mcp-br0"
// unikernelGateway is the bridge's host-side address, used as the VMs'
// gateway.
unikernelGateway = "10.99.0.1"
// unikernelSubnetPrefix is the first three octets of the /24 that VM
// addresses are allocated from; it must stay consistent with
// unikernelGateway.
unikernelSubnetPrefix = "10.99.0"
)
// unikernelSupported reports whether this node can run Nanos unikernels: // unikernelSupported reports whether this node can run Nanos unikernels:
// it needs KVM (/dev/kvm) and the `ops` toolchain on PATH. // it needs KVM (/dev/kvm) and the `ops` toolchain on PATH.
func unikernelSupported() bool { func unikernelSupported() bool {

View File

@@ -55,8 +55,23 @@ type QEMU struct {
Memory int Memory int
// HomeDir is set as $HOME for `ops` so it uses a stable ~/.ops directory. // HomeDir is set as $HOME for `ops` so it uses a stable ~/.ops directory.
HomeDir string HomeDir string
// Bridge, when set, switches VMs from QEMU user-mode networking to an
// isolated host-only bridge (Phase 2). Each VM gets a TAP device on the
// bridge and a static IP; a host firewall confines it to reaching only
// the bridge gateway (where mc-proxy listens). Empty = user-mode.
Bridge string // e.g. "mcp-br0"
// Gateway is the bridge's host IP (and the VMs' default route / mc-proxy
// address), e.g. "10.99.0.1".
Gateway string
// SubnetPrefix is the /24 network prefix VMs are numbered in, e.g.
// "10.99.0" (VMs get .2 .. .254).
SubnetPrefix string
} }
// bridgeMode reports whether VMs should be attached to the isolated bridge
// rather than QEMU user-mode networking. It is true exactly when the Bridge
// field is non-empty.
func (q *QEMU) bridgeMode() bool {
	return len(q.Bridge) > 0
}
func (q *QEMU) imageDir() string { func (q *QEMU) imageDir() string {
if q.ImageDir != "" { if q.ImageDir != "" {
return q.ImageDir return q.ImageDir
@@ -148,6 +163,7 @@ type vmMeta struct {
MemoryMB int `json:"memory_mb"` MemoryMB int `json:"memory_mb"`
VCPUs int `json:"vcpus"` VCPUs int `json:"vcpus"`
ImageHash string `json:"image_hash"` ImageHash string `json:"image_hash"`
IP string `json:"ip,omitempty"` // bridge-mode static IP
Started time.Time `json:"started"` Started time.Time `json:"started"`
} }
@@ -201,6 +217,9 @@ type opsRunConfig struct {
GDBPort int `json:"GDBPort,omitempty"` GDBPort int `json:"GDBPort,omitempty"`
Nanos string `json:"Nanos,omitempty"` Nanos string `json:"Nanos,omitempty"`
Hostname string `json:"Hostname,omitempty"` Hostname string `json:"Hostname,omitempty"`
IPAddress string `json:"IPAddress,omitempty"` // static IP for bridge mode
NetMask string `json:"NetMask,omitempty"`
Gateway string `json:"Gateway,omitempty"`
} }
// guestPorts extracts the guest (container) port from each spec port mapping. // guestPorts extracts the guest (container) port from each spec port mapping.
@@ -244,6 +263,16 @@ func (q *QEMU) Run(ctx context.Context, spec ContainerSpec) error {
cpus = 1 cpus = 1
} }
// In bridge mode, allocate a static IP on the isolated bridge for this VM.
var vmIP string
if q.bridgeMode() {
ip, allocErr := q.allocateIP(spec.Name)
if allocErr != nil {
return fmt.Errorf("allocate VM IP: %w", allocErr)
}
vmIP = ip
}
// Build the Nanos image from the extracted binary, baking in command args. // Build the Nanos image from the extracted binary, baking in command args.
cfg := opsConfig{ cfg := opsConfig{
Args: spec.Cmd, Args: spec.Cmd,
@@ -253,6 +282,11 @@ func (q *QEMU) Run(ctx context.Context, spec ContainerSpec) error {
CPUs: cpus, CPUs: cpus,
}, },
} }
if q.bridgeMode() {
cfg.RunConfig.IPAddress = vmIP
cfg.RunConfig.NetMask = "255.255.255.0"
cfg.RunConfig.Gateway = q.Gateway
}
if len(spec.Env) > 0 { if len(spec.Env) > 0 {
cfg.Env = map[string]string{} cfg.Env = map[string]string{}
for _, e := range spec.Env { for _, e := range spec.Env {
@@ -295,10 +329,30 @@ func (q *QEMU) Run(ctx context.Context, spec ContainerSpec) error {
hash, _ := fileSHA256(img) hash, _ := fileSHA256(img)
// Assemble QEMU invocation: KVM-accelerated, headless, serial console to a // Assemble QEMU invocation: KVM-accelerated, headless, serial console to a
// file, QMP control socket, virtio disk + NIC with user-mode port forwards. // file, QMP control socket, virtio disk + virtio NIC.
netdev := "user,id=n0" //
// Networking has two modes:
// user-mode (Phase 1): host port forwards to localhost, like rootless
// podman. mc-proxy routes to 127.0.0.1:<hostport>.
// bridge (Phase 2): a TAP device on an isolated host-only bridge.
// The VM has no route off the bridge; a host firewall confines it to
// reaching only mc-proxy on the gateway. This makes mediation
// mandatory rather than cooperative.
var netDevice, netBackend string
if q.bridgeMode() {
tap, tapErr := q.createTAP(spec.Name)
if tapErr != nil {
return fmt.Errorf("create TAP: %w", tapErr)
}
mac := deterministicMAC(spec.Name)
netDevice = "virtio-net-pci,netdev=n0,mac=" + mac
netBackend = fmt.Sprintf("tap,id=n0,ifname=%s,script=no,downscript=no", tap)
} else {
netDevice = "virtio-net-pci,netdev=n0"
netBackend = "user,id=n0"
for _, p := range spec.Ports { for _, p := range spec.Ports {
netdev += ",hostfwd=" + hostForward(p) netBackend += ",hostfwd=" + hostForward(p)
}
} }
args := []string{ args := []string{
"-enable-kvm", "-enable-kvm",
@@ -311,8 +365,8 @@ func (q *QEMU) Run(ctx context.Context, spec ContainerSpec) error {
"-serial", "file:" + filepath.Join(q.vmDir(spec.Name), "console.log"), "-serial", "file:" + filepath.Join(q.vmDir(spec.Name), "console.log"),
"-qmp", "unix:" + filepath.Join(q.vmDir(spec.Name), "qmp.sock") + ",server,nowait", "-qmp", "unix:" + filepath.Join(q.vmDir(spec.Name), "qmp.sock") + ",server,nowait",
"-drive", "file=" + img + ",format=raw,if=virtio", "-drive", "file=" + img + ",format=raw,if=virtio",
"-device", "virtio-net-pci,netdev=n0", "-device", netDevice,
"-netdev", netdev, "-netdev", netBackend,
} }
// 9p passthrough for host /srv/<service> volumes (best-effort; Nanos must // 9p passthrough for host /srv/<service> volumes (best-effort; Nanos must
// support the 9p client for the guest to mount it). // support the 9p client for the guest to mount it).
@@ -341,6 +395,7 @@ func (q *QEMU) Run(ctx context.Context, spec ContainerSpec) error {
MemoryMB: mem, MemoryMB: mem,
VCPUs: cpus, VCPUs: cpus,
ImageHash: hash, ImageHash: hash,
IP: vmIP,
Started: time.Now().UTC(), Started: time.Now().UTC(),
} }
return q.writeMeta(spec.Name, meta) return q.writeMeta(spec.Name, meta)
@@ -393,22 +448,35 @@ func (q *QEMU) Stop(ctx context.Context, name string) error {
if pid == 0 { if pid == 0 {
return nil return nil
} }
// Try a graceful QMP system_powerdown. // Try a graceful QMP system_powerdown, but Nanos has no ACPI shutdown
// handler, so escalate to signals quickly rather than waiting long.
_ = q.qmpCommand(name, "system_powerdown") _ = q.qmpCommand(name, "system_powerdown")
deadline := time.Now().Add(10 * time.Second) if q.waitGone(name, 2*time.Second) {
return nil
}
_ = syscall.Kill(pid, syscall.SIGTERM)
if q.waitGone(name, 2*time.Second) {
return nil
}
_ = syscall.Kill(pid, syscall.SIGKILL)
q.waitGone(name, 2*time.Second)
if q.bridgeMode() {
q.destroyTAP(name)
}
return nil
}
// waitGone polls until the VM process exits or the timeout elapses,
// returning true if it has exited.
func (q *QEMU) waitGone(name string, timeout time.Duration) bool {
deadline := time.Now().Add(timeout)
for time.Now().Before(deadline) { for time.Now().Before(deadline) {
if q.pidOf(name) == 0 { if q.pidOf(name) == 0 {
return nil return true
} }
time.Sleep(300 * time.Millisecond) time.Sleep(200 * time.Millisecond)
} }
// Escalate. return q.pidOf(name) == 0
_ = syscall.Kill(pid, syscall.SIGTERM)
time.Sleep(2 * time.Second)
if q.pidOf(name) != 0 {
_ = syscall.Kill(pid, syscall.SIGKILL)
}
return nil
} }
// qmpCommand sends a single QMP command over the VM's control socket. // qmpCommand sends a single QMP command over the VM's control socket.
@@ -436,6 +504,10 @@ func (q *QEMU) Remove(ctx context.Context, name string) error {
if pid := q.pidOf(name); pid != 0 { if pid := q.pidOf(name); pid != 0 {
_ = syscall.Kill(pid, syscall.SIGKILL) _ = syscall.Kill(pid, syscall.SIGKILL)
} }
if q.bridgeMode() {
q.destroyTAP(name)
_ = q.releaseIP(name)
}
return os.RemoveAll(q.vmDir(name)) return os.RemoveAll(q.vmDir(name))
} }
@@ -453,11 +525,15 @@ func (q *QEMU) Inspect(ctx context.Context, name string) (ContainerInfo, error)
} }
func (q *QEMU) infoFromMeta(m vmMeta, state string) ContainerInfo { func (q *QEMU) infoFromMeta(m vmMeta, state string) ContainerInfo {
network := "user"
if m.IP != "" {
network = q.Bridge + " " + m.IP // isolated bridge + the VM's static IP
}
return ContainerInfo{ return ContainerInfo{
Name: m.Name, Name: m.Name,
Image: m.Image, Image: m.Image,
State: state, State: state,
Network: "user", Network: network,
User: m.User, User: m.User,
Restart: m.Restart, Restart: m.Restart,
Ports: m.Ports, Ports: m.Ports,
@@ -468,6 +544,21 @@ func (q *QEMU) infoFromMeta(m vmMeta, state string) ContainerInfo {
} }
} }
// VMAddr returns the "ip:port" backend address of the named VM, built from
// the static IP recorded in its metadata and the first guest port of its port
// mappings. It returns "" when the metadata cannot be read, when no IP is
// recorded (user-mode networking), or when the VM declares no ports.
func (q *QEMU) VMAddr(name string) string {
	if meta, err := q.readMeta(name); err == nil && meta.IP != "" {
		if ports := guestPorts(meta.Ports); len(ports) > 0 {
			return meta.IP + ":" + ports[0]
		}
	}
	return ""
}
// List enumerates all VMs known from the state directory. // List enumerates all VMs known from the state directory.
func (q *QEMU) List(ctx context.Context) ([]ContainerInfo, error) { func (q *QEMU) List(ctx context.Context) ([]ContainerInfo, error) {
entries, err := os.ReadDir(q.stateDir()) entries, err := os.ReadDir(q.stateDir())
@@ -540,6 +631,115 @@ func (q *QEMU) Login(ctx context.Context, registry, username, token string) erro
return nil return nil
} }
// ---- Isolated bridge networking (Phase 2) ----
// vmUser is the host user that owns TAP devices and runs QEMU. It is passed
// as the `user` argument to `ip tuntap add` when a VM's TAP device is
// created.
const vmUser = "mcp"
// tapName maps a VM name to the name of its TAP interface. Interface names
// are limited to 15 characters, so "tap-<name>" is used verbatim when it
// fits; otherwise the name is replaced by the first 11 hex digits of its
// SHA-256 digest, which keeps the result at exactly 15 characters.
func (q *QEMU) tapName(name string) string {
	const (
		prefix = "tap-"
		maxLen = 15
	)
	if direct := prefix + name; len(direct) <= maxLen {
		return direct
	}
	digest := sha256.Sum256([]byte(name))
	return prefix + hex.EncodeToString(digest[:])[:maxLen-len(prefix)]
}
// deterministicMAC returns a stable MAC address for a VM name: the fixed
// locally-administered prefix 52:54:00 followed by the first three bytes of
// the name's SHA-256 digest, formatted as lowercase colon-separated hex.
func deterministicMAC(name string) string {
	digest := sha256.Sum256([]byte(name))
	mac := "52:54:00"
	for _, octet := range digest[:3] {
		mac += fmt.Sprintf(":%02x", octet)
	}
	return mac
}
// ipsFile returns the path of the JSON file, inside the state directory,
// that records which bridge IP is assigned to which VM name.
func (q *QEMU) ipsFile() string {
	const fileName = "ips.json"
	return filepath.Join(q.stateDir(), fileName)
}
// readIPs loads the VM-name -> IP allocation table from ips.json.
//
// A missing file is not an error and yields an empty table. The returned map
// is always non-nil and safe to assign into: a file holding JSON null would
// otherwise decode to a nil map and make callers panic on write. On a read or
// decode error the returned map is empty and must not be relied upon.
func (q *QEMU) readIPs() (map[string]string, error) {
	b, err := os.ReadFile(q.ipsFile()) //nolint:gosec // fixed state-dir path
	if err != nil {
		if os.IsNotExist(err) {
			return map[string]string{}, nil
		}
		return map[string]string{}, err
	}
	ips := map[string]string{}
	if err := json.Unmarshal(b, &ips); err != nil {
		return map[string]string{}, fmt.Errorf("decode %s: %w", q.ipsFile(), err)
	}
	if ips == nil {
		// JSON null resets the target map to nil; hand back a usable one.
		ips = map[string]string{}
	}
	return ips, nil
}
// writeIPs persists the VM-name -> IP allocation table to ips.json, creating
// the state directory if needed.
//
// The data is written to a temporary file in the same directory and then
// renamed over ips.json, so an interrupted write cannot leave a truncated
// file that would make every later allocation fail to decode. A nil map is
// stored as an empty object rather than JSON null.
func (q *QEMU) writeIPs(m map[string]string) error {
	dir := q.stateDir()
	if err := os.MkdirAll(dir, 0o750); err != nil {
		return err
	}
	if m == nil {
		m = map[string]string{}
	}
	b, err := json.MarshalIndent(m, "", " ")
	if err != nil {
		return err
	}
	tmp, err := os.CreateTemp(dir, ".ips-*.tmp")
	if err != nil {
		return err
	}
	// abort discards the temporary file after a failed step.
	abort := func(cause error) error {
		_ = tmp.Close()
		_ = os.Remove(tmp.Name())
		return cause
	}
	if _, err := tmp.Write(b); err != nil {
		return abort(err)
	}
	// CreateTemp uses mode 0600; restore the group-readable mode.
	if err := tmp.Chmod(0o640); err != nil { //nolint:gosec // mcp-group-readable
		return abort(err)
	}
	if err := tmp.Sync(); err != nil {
		return abort(err)
	}
	if err := tmp.Close(); err != nil {
		_ = os.Remove(tmp.Name())
		return err
	}
	if err := os.Rename(tmp.Name(), q.ipsFile()); err != nil {
		_ = os.Remove(tmp.Name())
		return err
	}
	return nil
}
// allocateIP assigns (or returns the existing) static bridge IP for a VM.
//
// An address already recorded for name is returned unchanged. Otherwise the
// lowest free host number in .2 .. .254 of SubnetPrefix is chosen, skipping
// the gateway address, and the updated table is persisted before returning.
// On any error the returned IP is "".
//
// The read-modify-write of the allocation file is not locked; per the
// original author's note, allocation is serialized by the agent's
// single-threaded deploy path.
func (q *QEMU) allocateIP(name string) (string, error) {
	ips, err := q.readIPs()
	if err != nil {
		return "", err
	}
	if ip, ok := ips[name]; ok {
		return ip, nil
	}
	if ips == nil {
		// Guard the assignment below against a nil table.
		ips = map[string]string{}
	}
	used := make(map[string]struct{}, len(ips)+1)
	// Never hand the gateway's own address to a VM.
	used[q.Gateway] = struct{}{}
	for _, ip := range ips {
		used[ip] = struct{}{}
	}
	for n := 2; n <= 254; n++ {
		ip := fmt.Sprintf("%s.%d", q.SubnetPrefix, n)
		if _, taken := used[ip]; taken {
			continue
		}
		ips[name] = ip
		if err := q.writeIPs(ips); err != nil {
			return "", fmt.Errorf("persist IP allocation for %q: %w", name, err)
		}
		return ip, nil
	}
	return "", fmt.Errorf("no free IPs in %s.0/24", q.SubnetPrefix)
}
// releaseIP removes the bridge IP allocation recorded for a VM so the
// address can be reused. It is a no-op when the VM has no allocation; the
// allocation file is only rewritten when an entry is actually removed.
func (q *QEMU) releaseIP(name string) error {
	ips, err := q.readIPs()
	if err != nil {
		return err
	}
	if _, ok := ips[name]; !ok {
		return nil
	}
	delete(ips, name)
	return q.writeIPs(ips)
}
// createTAP sets up the TAP interface for a VM and returns its name. Any
// existing device of that name is deleted first, then the device is created
// for vmUser, attached to the bridge and brought up. If a step fails, the
// device is deleted again and the error carries the failing `ip` arguments
// and the command's combined output.
//
// Requires CAP_NET_ADMIN (granted to the agent on unikernel-capable nodes).
func (q *QEMU) createTAP(name string) (string, error) {
	dev := q.tapName(name)
	removeDev := func() {
		_ = exec.Command("ip", "link", "del", dev).Run() //nolint:gosec // best-effort cleanup of a stale or partial device
	}
	removeDev()
	for _, argv := range [][]string{
		{"tuntap", "add", "dev", dev, "mode", "tap", "user", vmUser},
		{"link", "set", dev, "master", q.Bridge},
		{"link", "set", dev, "up"},
	} {
		output, err := exec.Command("ip", argv...).CombinedOutput() //nolint:gosec // args built programmatically
		if err == nil {
			continue
		}
		removeDev()
		return "", fmt.Errorf("ip %v: %w: %s", argv, err, output)
	}
	return dev, nil
}
// destroyTAP deletes the VM's TAP interface. Teardown is best-effort: the
// result of `ip link del` is ignored, so a missing device is not an error.
func (q *QEMU) destroyTAP(name string) {
	dev := q.tapName(name)
	_ = exec.Command("ip", "link", "del", dev).Run() //nolint:gosec // best-effort teardown
}
func fileSHA256(path string) (string, error) { func fileSHA256(path string) (string, error) {
b, err := os.ReadFile(path) //nolint:gosec // hashing a known image path b, err := os.ReadFile(path) //nolint:gosec // hashing a known image path
if err != nil { if err != nil {