unikernel: bake volume config into images + per-service user-mode net

Enables migrating real services (stateless ones that need only config/cert
directories) to unikernels. Each volume's host directory is copied into a
per-VM staging tree that mirrors the guest paths; the ops config is written
to the staging root and lists the top-level directories in Dirs, so ops
bakes them at the right absolute paths.
(Staging is required — an absolute /srv MapDirs source makes ops descend
into the agent's podman overlay storage and fail.) A component may set
network = "user" to use QEMU user-mode NAT instead of the isolated
bridge (Phase-1 networking for first migrations, before a gateway proxy).

Verified: mcat (the MCIAS policy tester) deployed as a Nanos unikernel
via 'mcp deploy', booting with its baked /srv/mcat config+certs, serving
HTTPS verified against the platform CA, configured against MCIAS.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
Kyle Isom
2026-06-11 10:16:30 -07:00
parent d2431f281d
commit 84dd897bcd

View File

@@ -204,6 +204,7 @@ func (q *QEMU) Pull(ctx context.Context, image string) error {
// opsConfig is the configuration document that Run marshals to JSON and
// passes to `ops build` via -c when building a unikernel image. Fields
// tagged omitempty are left out of the JSON when they are unset.
type opsConfig struct {
// Args is omitted when empty.
// NOTE(review): presumably the guest program's arguments — confirm against ops.
Args []string `json:"Args,omitempty"`
// Env holds environment variables for the guest; Run fills it from the
// container spec's Env entries.
Env map[string]string `json:"Env,omitempty"`
// Dirs lists the top-level staged volume directories; Run places the
// config file in the staging root so these relative names resolve there.
Dirs []string `json:"Dirs,omitempty"` // dirs (relative to the config file) baked into the image
// RunConfig carries the VM run settings (CPU count and, in bridge mode,
// the static IP address, netmask and gateway). It is always emitted.
RunConfig opsRunConfig `json:"RunConfig"`
}
@@ -263,9 +264,15 @@ func (q *QEMU) Run(ctx context.Context, spec ContainerSpec) error {
cpus = 1
}
// A component can opt out of the isolated bridge with network = "user"
// (e.g. a stateless service that needs NAT egress to its dependencies
// before the gateway proxy exists). Otherwise unikernels use the bridge
// when one is configured.
useBridge := q.bridgeMode() && spec.Network != "user"
// In bridge mode, allocate a static IP on the isolated bridge for this VM.
var vmIP string
if q.bridgeMode() {
if useBridge {
ip, allocErr := q.allocateIP(spec.Name)
if allocErr != nil {
return fmt.Errorf("allocate VM IP: %w", allocErr)
@@ -282,11 +289,52 @@ func (q *QEMU) Run(ctx context.Context, spec ContainerSpec) error {
CPUs: cpus,
},
}
if q.bridgeMode() {
if useBridge {
cfg.RunConfig.IPAddress = vmIP
cfg.RunConfig.NetMask = "255.255.255.0"
cfg.RunConfig.Gateway = q.Gateway
}
// Bake each volume's host directory into the image (read-only). Unikernels
// have no writable host mount yet, so this suits config/cert directories;
// stateful services need a different storage story (9p/virtio-blk).
//
// Volume contents are copied into a per-VM staging tree that mirrors the
// guest paths (e.g. /srv/mcat -> <bake>/srv/mcat). We then put the ops
// config in the staging root and list the top-level dirs in `Dirs`: `ops`
// resolves Dirs relative to the config file and bakes the tree at the
// matching absolute path. (Staging is required: an absolute /srv source
// would make `ops` descend into the agent's podman overlay storage and
// fail.)
bakeDir := ""
if len(spec.Volumes) > 0 {
bakeDir = filepath.Join(q.vmDir(spec.Name), "bake")
_ = os.RemoveAll(bakeDir)
topLevel := map[string]bool{}
for _, v := range spec.Volumes {
parts := strings.SplitN(v, ":", 2)
host := parts[0]
guest := host
if len(parts) == 2 {
guest = parts[1]
}
rel := strings.TrimPrefix(guest, "/")
stage := filepath.Join(bakeDir, rel)
if err := os.MkdirAll(filepath.Dir(stage), 0o750); err != nil {
return fmt.Errorf("stage volume dir: %w", err)
}
if err := os.CopyFS(stage, os.DirFS(host)); err != nil {
return fmt.Errorf("stage volume %q: %w", host, err)
}
if i := strings.Index(rel, "/"); i >= 0 {
topLevel[rel[:i]] = true
} else {
topLevel[rel] = true
}
}
for d := range topLevel {
cfg.Dirs = append(cfg.Dirs, d)
}
}
if len(spec.Env) > 0 {
cfg.Env = map[string]string{}
for _, e := range spec.Env {
@@ -295,7 +343,13 @@ func (q *QEMU) Run(ctx context.Context, spec ContainerSpec) error {
}
}
}
// Place the ops config in the staging root when baking volumes, so its
// relative Dirs resolve to the staged tree; otherwise keep it in the VM
// state dir.
cfgPath := filepath.Join(q.vmDir(spec.Name), "ops.json")
if bakeDir != "" {
cfgPath = filepath.Join(bakeDir, "ops.json")
}
cfgBytes, err := json.MarshalIndent(cfg, "", " ")
if err != nil {
return fmt.Errorf("marshal ops config: %w", err)
@@ -311,6 +365,9 @@ func (q *QEMU) Run(ctx context.Context, spec ContainerSpec) error {
}
build := exec.CommandContext(ctx, q.opsPath(), "build", bin, "-c", cfgPath, "-i", spec.Name) //nolint:gosec
build.Env = q.opsEnv()
if bakeDir != "" {
build.Dir = bakeDir
}
if out, err := build.CombinedOutput(); err != nil {
return fmt.Errorf("ops build %q: %w: %s", spec.Name, err, out)
}
@@ -339,7 +396,7 @@ func (q *QEMU) Run(ctx context.Context, spec ContainerSpec) error {
// reaching only mc-proxy on the gateway. This makes mediation
// mandatory rather than cooperative.
var netDevice, netBackend string
if q.bridgeMode() {
if useBridge {
tap, tapErr := q.createTAP(spec.Name)
if tapErr != nil {
return fmt.Errorf("create TAP: %w", tapErr)
@@ -368,17 +425,6 @@ func (q *QEMU) Run(ctx context.Context, spec ContainerSpec) error {
"-device", netDevice,
"-netdev", netBackend,
}
// 9p passthrough for host /srv/<service> volumes (best-effort; Nanos must
// support the 9p client for the guest to mount it).
for i, v := range spec.Volumes {
parts := strings.SplitN(v, ":", 2)
host := parts[0]
tag := fmt.Sprintf("srv%d", i)
args = append(args,
"-virtfs", fmt.Sprintf("local,path=%s,mount_tag=%s,security_model=none,id=%s", host, tag, tag),
)
}
cmd := exec.CommandContext(ctx, q.qemuPath(), args...) //nolint:gosec
if out, err := cmd.CombinedOutput(); err != nil {
return fmt.Errorf("qemu launch %q: %w: %s", spec.Name, err, out)