unikernel: isolated host-only bridge networking (Phase 2)
When the mcp-br0 bridge exists, the agent runs unikernels on it instead of QEMU user-mode networking: each VM gets a TAP device on the bridge and a static 10.99.0.0/24 IP (baked into the Nanos image via ops RunConfig). With the host firewall dropping off-bridge VM traffic and no NAT, a VM can reach only the gateway -- making mc-proxy mediation mandatory by topology rather than convention. - runtime/qemu.go: bridge mode (createTAP/destroyTAP, IP allocator, deterministic MAC, static-IP ops config, VMAddr for proxy backends). - agent auto-enables bridge mode when /sys/class/net/mcp-br0 exists. Verified on straylight: uktest unikernel boots on mcp-br0 at 10.99.0.2, serves via the gateway, TAP enslaved to the bridge; bridge has no uplink and off-bridge forwarding is dropped. Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
@@ -59,12 +59,23 @@ func Run(cfg *config.AgentConfig, version string) error {
|
||||
// runtime = "unikernel" are placed by the master on KVM-capable nodes.
|
||||
var uk runtime.Runtime
|
||||
if unikernelSupported() {
|
||||
uk = &runtime.QEMU{
|
||||
qemu := &runtime.QEMU{
|
||||
ImageDir: filepath.Join(homeDir(cfg), "images"),
|
||||
StateDir: filepath.Join(homeDir(cfg), "vm"),
|
||||
HomeDir: homeDir(cfg),
|
||||
}
|
||||
logger.Info("unikernel runtime enabled (KVM detected)")
|
||||
// If the isolated host-only bridge exists, switch unikernels to
|
||||
// bridge networking (Phase 2: mandatory mediation). Otherwise they
|
||||
// use QEMU user-mode port forwards (Phase 1).
|
||||
if _, err := os.Stat("/sys/class/net/" + unikernelBridge); err == nil {
|
||||
qemu.Bridge = unikernelBridge
|
||||
qemu.Gateway = unikernelGateway
|
||||
qemu.SubnetPrefix = unikernelSubnetPrefix
|
||||
logger.Info("unikernel runtime enabled (KVM + isolated bridge)", "bridge", unikernelBridge)
|
||||
} else {
|
||||
logger.Info("unikernel runtime enabled (KVM, user-mode networking)")
|
||||
}
|
||||
uk = qemu
|
||||
}
|
||||
|
||||
mon := monitor.New(db, rt, cfg.Monitor, cfg.Agent.NodeName, logger)
|
||||
|
||||
@@ -10,6 +10,15 @@ import (
|
||||
"git.wntrmute.dev/mc/mcp/internal/runtime"
|
||||
)
|
||||
|
||||
// Isolated unikernel bridge parameters. The bridge (mcp-br0) is created by
|
||||
// the node's NixOS config; when present, the agent runs unikernels on it with
|
||||
// a host firewall confining each VM to reaching only the gateway (mc-proxy).
|
||||
const (
|
||||
// unikernelBridge is the bridge interface name; the agent enables bridge
// mode when /sys/class/net/<unikernelBridge> exists.
unikernelBridge = "mcp-br0"
|
||||
// unikernelGateway is the bridge's host-side address, handed to the QEMU
// runtime as its Gateway.
unikernelGateway = "10.99.0.1"
|
||||
// unikernelSubnetPrefix is the first three octets of the /24 that VM
// addresses are allocated from (the runtime numbers VMs .2 through .254).
unikernelSubnetPrefix = "10.99.0"
|
||||
)
|
||||
|
||||
// unikernelSupported reports whether this node can run Nanos unikernels:
|
||||
// it needs KVM (/dev/kvm) and the `ops` toolchain on PATH.
|
||||
func unikernelSupported() bool {
|
||||
|
||||
@@ -55,8 +55,23 @@ type QEMU struct {
|
||||
Memory int
|
||||
// HomeDir is set as $HOME for `ops` so it uses a stable ~/.ops directory.
|
||||
HomeDir string
|
||||
|
||||
// Bridge, when set, switches VMs from QEMU user-mode networking to an
|
||||
// isolated host-only bridge (Phase 2). Each VM gets a TAP device on the
|
||||
// bridge and a static IP; a host firewall confines it to reaching only
|
||||
// the bridge gateway (where mc-proxy listens). Empty = user-mode.
|
||||
Bridge string // e.g. "mcp-br0"
|
||||
// Gateway is the bridge's host IP (and the VMs' default route / mc-proxy
|
||||
// address), e.g. "10.99.0.1".
|
||||
Gateway string
|
||||
// SubnetPrefix is the /24 network prefix VMs are numbered in, e.g.
|
||||
// "10.99.0" (VMs get .2 .. .254).
|
||||
SubnetPrefix string
|
||||
}
|
||||
|
||||
// bridgeMode reports whether isolated bridge networking is configured.
|
||||
func (q *QEMU) bridgeMode() bool { return q.Bridge != "" }
|
||||
|
||||
func (q *QEMU) imageDir() string {
|
||||
if q.ImageDir != "" {
|
||||
return q.ImageDir
|
||||
@@ -148,6 +163,7 @@ type vmMeta struct {
|
||||
MemoryMB int `json:"memory_mb"`
|
||||
VCPUs int `json:"vcpus"`
|
||||
ImageHash string `json:"image_hash"`
|
||||
IP string `json:"ip,omitempty"` // bridge-mode static IP
|
||||
Started time.Time `json:"started"`
|
||||
}
|
||||
|
||||
@@ -192,15 +208,18 @@ type opsConfig struct {
|
||||
}
|
||||
|
||||
// opsRunConfig is the RunConfig section of the ops configuration file used
// when building a Nanos image. Every field is optional and omitted from the
// JSON when empty.
//
// IPAddress, NetMask and Gateway are only populated in bridge mode, where the
// VM's static network configuration is baked into the image; in user-mode
// networking they stay empty.
type opsRunConfig struct {
	Ports     []string `json:"Ports,omitempty"`
	Memory    string   `json:"Memory,omitempty"`
	CPUs      int      `json:"CPUs,omitempty"`
	Klibs     []string `json:"Klibs,omitempty"`
	Mounts    any      `json:"Mounts,omitempty"`
	NoTrace   []string `json:"NoTrace,omitempty"`
	GDBPort   int      `json:"GDBPort,omitempty"`
	Nanos     string   `json:"Nanos,omitempty"`
	Hostname  string   `json:"Hostname,omitempty"`
	IPAddress string   `json:"IPAddress,omitempty"` // static IP for bridge mode
	NetMask   string   `json:"NetMask,omitempty"`
	Gateway   string   `json:"Gateway,omitempty"`
}
|
||||
|
||||
// guestPorts extracts the guest (container) port from each spec port mapping.
|
||||
@@ -244,6 +263,16 @@ func (q *QEMU) Run(ctx context.Context, spec ContainerSpec) error {
|
||||
cpus = 1
|
||||
}
|
||||
|
||||
// In bridge mode, allocate a static IP on the isolated bridge for this VM.
|
||||
var vmIP string
|
||||
if q.bridgeMode() {
|
||||
ip, allocErr := q.allocateIP(spec.Name)
|
||||
if allocErr != nil {
|
||||
return fmt.Errorf("allocate VM IP: %w", allocErr)
|
||||
}
|
||||
vmIP = ip
|
||||
}
|
||||
|
||||
// Build the Nanos image from the extracted binary, baking in command args.
|
||||
cfg := opsConfig{
|
||||
Args: spec.Cmd,
|
||||
@@ -253,6 +282,11 @@ func (q *QEMU) Run(ctx context.Context, spec ContainerSpec) error {
|
||||
CPUs: cpus,
|
||||
},
|
||||
}
|
||||
if q.bridgeMode() {
|
||||
cfg.RunConfig.IPAddress = vmIP
|
||||
cfg.RunConfig.NetMask = "255.255.255.0"
|
||||
cfg.RunConfig.Gateway = q.Gateway
|
||||
}
|
||||
if len(spec.Env) > 0 {
|
||||
cfg.Env = map[string]string{}
|
||||
for _, e := range spec.Env {
|
||||
@@ -295,10 +329,30 @@ func (q *QEMU) Run(ctx context.Context, spec ContainerSpec) error {
|
||||
hash, _ := fileSHA256(img)
|
||||
|
||||
// Assemble QEMU invocation: KVM-accelerated, headless, serial console to a
|
||||
// file, QMP control socket, virtio disk + NIC with user-mode port forwards.
|
||||
netdev := "user,id=n0"
|
||||
for _, p := range spec.Ports {
|
||||
netdev += ",hostfwd=" + hostForward(p)
|
||||
// file, QMP control socket, virtio disk + virtio NIC.
|
||||
//
|
||||
// Networking has two modes:
|
||||
// user-mode (Phase 1): host port forwards to localhost, like rootless
|
||||
// podman. mc-proxy routes to 127.0.0.1:<hostport>.
|
||||
// bridge (Phase 2): a TAP device on an isolated host-only bridge.
|
||||
// The VM has no route off the bridge; a host firewall confines it to
|
||||
// reaching only mc-proxy on the gateway. This makes mediation
|
||||
// mandatory rather than cooperative.
|
||||
var netDevice, netBackend string
|
||||
if q.bridgeMode() {
|
||||
tap, tapErr := q.createTAP(spec.Name)
|
||||
if tapErr != nil {
|
||||
return fmt.Errorf("create TAP: %w", tapErr)
|
||||
}
|
||||
mac := deterministicMAC(spec.Name)
|
||||
netDevice = "virtio-net-pci,netdev=n0,mac=" + mac
|
||||
netBackend = fmt.Sprintf("tap,id=n0,ifname=%s,script=no,downscript=no", tap)
|
||||
} else {
|
||||
netDevice = "virtio-net-pci,netdev=n0"
|
||||
netBackend = "user,id=n0"
|
||||
for _, p := range spec.Ports {
|
||||
netBackend += ",hostfwd=" + hostForward(p)
|
||||
}
|
||||
}
|
||||
args := []string{
|
||||
"-enable-kvm",
|
||||
@@ -311,8 +365,8 @@ func (q *QEMU) Run(ctx context.Context, spec ContainerSpec) error {
|
||||
"-serial", "file:" + filepath.Join(q.vmDir(spec.Name), "console.log"),
|
||||
"-qmp", "unix:" + filepath.Join(q.vmDir(spec.Name), "qmp.sock") + ",server,nowait",
|
||||
"-drive", "file=" + img + ",format=raw,if=virtio",
|
||||
"-device", "virtio-net-pci,netdev=n0",
|
||||
"-netdev", netdev,
|
||||
"-device", netDevice,
|
||||
"-netdev", netBackend,
|
||||
}
|
||||
// 9p passthrough for host /srv/<service> volumes (best-effort; Nanos must
|
||||
// support the 9p client for the guest to mount it).
|
||||
@@ -341,6 +395,7 @@ func (q *QEMU) Run(ctx context.Context, spec ContainerSpec) error {
|
||||
MemoryMB: mem,
|
||||
VCPUs: cpus,
|
||||
ImageHash: hash,
|
||||
IP: vmIP,
|
||||
Started: time.Now().UTC(),
|
||||
}
|
||||
return q.writeMeta(spec.Name, meta)
|
||||
@@ -393,24 +448,37 @@ func (q *QEMU) Stop(ctx context.Context, name string) error {
|
||||
if pid == 0 {
|
||||
return nil
|
||||
}
|
||||
// Try a graceful QMP system_powerdown.
|
||||
// Try a graceful QMP system_powerdown, but Nanos has no ACPI shutdown
|
||||
// handler, so escalate to signals quickly rather than waiting long.
|
||||
_ = q.qmpCommand(name, "system_powerdown")
|
||||
deadline := time.Now().Add(10 * time.Second)
|
||||
for time.Now().Before(deadline) {
|
||||
if q.pidOf(name) == 0 {
|
||||
return nil
|
||||
}
|
||||
time.Sleep(300 * time.Millisecond)
|
||||
if q.waitGone(name, 2*time.Second) {
|
||||
return nil
|
||||
}
|
||||
// Escalate.
|
||||
_ = syscall.Kill(pid, syscall.SIGTERM)
|
||||
time.Sleep(2 * time.Second)
|
||||
if q.pidOf(name) != 0 {
|
||||
_ = syscall.Kill(pid, syscall.SIGKILL)
|
||||
if q.waitGone(name, 2*time.Second) {
|
||||
return nil
|
||||
}
|
||||
_ = syscall.Kill(pid, syscall.SIGKILL)
|
||||
q.waitGone(name, 2*time.Second)
|
||||
if q.bridgeMode() {
|
||||
q.destroyTAP(name)
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
// waitGone polls until the VM process exits or the timeout elapses,
|
||||
// returning true if it has exited.
|
||||
func (q *QEMU) waitGone(name string, timeout time.Duration) bool {
|
||||
deadline := time.Now().Add(timeout)
|
||||
for time.Now().Before(deadline) {
|
||||
if q.pidOf(name) == 0 {
|
||||
return true
|
||||
}
|
||||
time.Sleep(200 * time.Millisecond)
|
||||
}
|
||||
return q.pidOf(name) == 0
|
||||
}
|
||||
|
||||
// qmpCommand sends a single QMP command over the VM's control socket.
|
||||
func (q *QEMU) qmpCommand(name, command string) error {
|
||||
sock := filepath.Join(q.vmDir(name), "qmp.sock")
|
||||
@@ -436,6 +504,10 @@ func (q *QEMU) Remove(ctx context.Context, name string) error {
|
||||
if pid := q.pidOf(name); pid != 0 {
|
||||
_ = syscall.Kill(pid, syscall.SIGKILL)
|
||||
}
|
||||
if q.bridgeMode() {
|
||||
q.destroyTAP(name)
|
||||
_ = q.releaseIP(name)
|
||||
}
|
||||
return os.RemoveAll(q.vmDir(name))
|
||||
}
|
||||
|
||||
@@ -453,11 +525,15 @@ func (q *QEMU) Inspect(ctx context.Context, name string) (ContainerInfo, error)
|
||||
}
|
||||
|
||||
func (q *QEMU) infoFromMeta(m vmMeta, state string) ContainerInfo {
|
||||
network := "user"
|
||||
if m.IP != "" {
|
||||
network = q.Bridge + " " + m.IP // isolated bridge + the VM's static IP
|
||||
}
|
||||
return ContainerInfo{
|
||||
Name: m.Name,
|
||||
Image: m.Image,
|
||||
State: state,
|
||||
Network: "user",
|
||||
Network: network,
|
||||
User: m.User,
|
||||
Restart: m.Restart,
|
||||
Ports: m.Ports,
|
||||
@@ -468,6 +544,21 @@ func (q *QEMU) infoFromMeta(m vmMeta, state string) ContainerInfo {
|
||||
}
|
||||
}
|
||||
|
||||
// VMAddr returns the host-reachable "ip:port" backend for a bridge-mode VM
|
||||
// component, given its first guest port. Returns "" if the VM has no IP
|
||||
// (user-mode) or no ports.
|
||||
func (q *QEMU) VMAddr(name string) string {
|
||||
m, err := q.readMeta(name)
|
||||
if err != nil || m.IP == "" {
|
||||
return ""
|
||||
}
|
||||
gp := guestPorts(m.Ports)
|
||||
if len(gp) == 0 {
|
||||
return ""
|
||||
}
|
||||
return m.IP + ":" + gp[0]
|
||||
}
|
||||
|
||||
// List enumerates all VMs known from the state directory.
|
||||
func (q *QEMU) List(ctx context.Context) ([]ContainerInfo, error) {
|
||||
entries, err := os.ReadDir(q.stateDir())
|
||||
@@ -540,6 +631,115 @@ func (q *QEMU) Login(ctx context.Context, registry, username, token string) erro
|
||||
return nil
|
||||
}
|
||||
|
||||
// ---- Isolated bridge networking (Phase 2) ----
|
||||
|
||||
// vmUser is the host user that owns TAP devices and runs QEMU. createTAP
// passes it as the `user` argument of `ip tuntap add`.
|
||||
const vmUser = "mcp"
|
||||
|
||||
// tapName derives a TAP interface name for a VM, respecting the 15-char
|
||||
// IFNAMSIZ limit. Long names fall back to a hash suffix.
|
||||
func (q *QEMU) tapName(name string) string {
|
||||
cand := "tap-" + name
|
||||
if len(cand) <= 15 {
|
||||
return cand
|
||||
}
|
||||
sum := sha256.Sum256([]byte(name))
|
||||
return "tap-" + hex.EncodeToString(sum[:])[:11]
|
||||
}
|
||||
|
||||
// deterministicMAC returns a MAC address that is a pure function of name:
// the fixed prefix 52:54:00 followed by the first three bytes of the name's
// SHA-256 digest, formatted as lowercase hex octets. The same name always
// yields the same address.
func deterministicMAC(name string) string {
	digest := sha256.Sum256([]byte(name))
	tail := digest[:3]
	return fmt.Sprintf("52:54:00:%02x:%02x:%02x", tail[0], tail[1], tail[2])
}
|
||||
|
||||
func (q *QEMU) ipsFile() string {
|
||||
return filepath.Join(q.stateDir(), "ips.json")
|
||||
}
|
||||
|
||||
func (q *QEMU) readIPs() (map[string]string, error) {
|
||||
m := map[string]string{}
|
||||
b, err := os.ReadFile(q.ipsFile()) //nolint:gosec // fixed state-dir path
|
||||
if err != nil {
|
||||
if os.IsNotExist(err) {
|
||||
return m, nil
|
||||
}
|
||||
return m, err
|
||||
}
|
||||
if err := json.Unmarshal(b, &m); err != nil {
|
||||
return map[string]string{}, err
|
||||
}
|
||||
return m, nil
|
||||
}
|
||||
|
||||
func (q *QEMU) writeIPs(m map[string]string) error {
|
||||
if err := os.MkdirAll(q.stateDir(), 0o750); err != nil {
|
||||
return err
|
||||
}
|
||||
b, err := json.MarshalIndent(m, "", " ")
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
return os.WriteFile(q.ipsFile(), b, 0o640) //nolint:gosec // mcp-group-readable
|
||||
}
|
||||
|
||||
// allocateIP assigns (or returns the existing) static bridge IP for a VM.
|
||||
// Allocation is serialized by the agent's single-threaded deploy path.
|
||||
func (q *QEMU) allocateIP(name string) (string, error) {
|
||||
ips, err := q.readIPs()
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
if ip, ok := ips[name]; ok {
|
||||
return ip, nil
|
||||
}
|
||||
used := map[string]bool{}
|
||||
for _, ip := range ips {
|
||||
used[ip] = true
|
||||
}
|
||||
for n := 2; n <= 254; n++ {
|
||||
ip := fmt.Sprintf("%s.%d", q.SubnetPrefix, n)
|
||||
if !used[ip] {
|
||||
ips[name] = ip
|
||||
return ip, q.writeIPs(ips)
|
||||
}
|
||||
}
|
||||
return "", fmt.Errorf("no free IPs in %s.0/24", q.SubnetPrefix)
|
||||
}
|
||||
|
||||
func (q *QEMU) releaseIP(name string) error {
|
||||
ips, err := q.readIPs()
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
delete(ips, name)
|
||||
return q.writeIPs(ips)
|
||||
}
|
||||
|
||||
// createTAP creates a TAP device owned by the VM user and enslaves it to the
|
||||
// host-only bridge. Requires CAP_NET_ADMIN (granted to the agent on
|
||||
// unikernel-capable nodes).
|
||||
func (q *QEMU) createTAP(name string) (string, error) {
|
||||
tap := q.tapName(name)
|
||||
_ = exec.Command("ip", "link", "del", tap).Run() //nolint:gosec // best-effort cleanup of a stale device
|
||||
steps := [][]string{
|
||||
{"tuntap", "add", "dev", tap, "mode", "tap", "user", vmUser},
|
||||
{"link", "set", tap, "master", q.Bridge},
|
||||
{"link", "set", tap, "up"},
|
||||
}
|
||||
for _, args := range steps {
|
||||
if out, err := exec.Command("ip", args...).CombinedOutput(); err != nil { //nolint:gosec // args built programmatically
|
||||
_ = exec.Command("ip", "link", "del", tap).Run() //nolint:gosec
|
||||
return "", fmt.Errorf("ip %v: %w: %s", args, err, out)
|
||||
}
|
||||
}
|
||||
return tap, nil
|
||||
}
|
||||
|
||||
func (q *QEMU) destroyTAP(name string) {
|
||||
_ = exec.Command("ip", "link", "del", q.tapName(name)).Run() //nolint:gosec // best-effort teardown
|
||||
}
|
||||
|
||||
func fileSHA256(path string) (string, error) {
|
||||
b, err := os.ReadFile(path) //nolint:gosec // hashing a known image path
|
||||
if err != nil {
|
||||
|
||||
Reference in New Issue
Block a user