unikernel: bake volume config into images + per-service user-mode net
Enables migrating real services that are stateless and need only config/cert directories to unikernels. Each volume's host directory is copied into a per-VM staging tree that mirrors the guest paths; the ops config is written to the staging root and lists the top-level directories in Dirs, so ops bakes them into the image at the correct absolute paths. (Staging is required: an absolute /srv MapDirs source makes ops descend into the agent's podman overlay storage and fail.) A component may set network = "user" to use QEMU user-mode NAT instead of the isolated bridge (Phase-1 networking for the first migrations, before a gateway proxy exists). Verified: mcat (the MCIAS policy tester) was deployed as a Nanos unikernel via 'mcp deploy'; it booted with its baked /srv/mcat config and certs, served HTTPS that verified against the platform CA, and was configured against MCIAS. Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
@@ -204,6 +204,7 @@ func (q *QEMU) Pull(ctx context.Context, image string) error {
|
|||||||
type opsConfig struct {
|
type opsConfig struct {
|
||||||
Args []string `json:"Args,omitempty"`
|
Args []string `json:"Args,omitempty"`
|
||||||
Env map[string]string `json:"Env,omitempty"`
|
Env map[string]string `json:"Env,omitempty"`
|
||||||
|
Dirs []string `json:"Dirs,omitempty"` // dirs (relative to the config file) baked into the image
|
||||||
RunConfig opsRunConfig `json:"RunConfig"`
|
RunConfig opsRunConfig `json:"RunConfig"`
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -263,9 +264,15 @@ func (q *QEMU) Run(ctx context.Context, spec ContainerSpec) error {
|
|||||||
cpus = 1
|
cpus = 1
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// A component can opt out of the isolated bridge with network = "user"
|
||||||
|
// (e.g. a stateless service that needs NAT egress to its dependencies
|
||||||
|
// before the gateway proxy exists). Otherwise unikernels use the bridge
|
||||||
|
// when one is configured.
|
||||||
|
useBridge := q.bridgeMode() && spec.Network != "user"
|
||||||
|
|
||||||
// In bridge mode, allocate a static IP on the isolated bridge for this VM.
|
// In bridge mode, allocate a static IP on the isolated bridge for this VM.
|
||||||
var vmIP string
|
var vmIP string
|
||||||
if q.bridgeMode() {
|
if useBridge {
|
||||||
ip, allocErr := q.allocateIP(spec.Name)
|
ip, allocErr := q.allocateIP(spec.Name)
|
||||||
if allocErr != nil {
|
if allocErr != nil {
|
||||||
return fmt.Errorf("allocate VM IP: %w", allocErr)
|
return fmt.Errorf("allocate VM IP: %w", allocErr)
|
||||||
@@ -282,11 +289,52 @@ func (q *QEMU) Run(ctx context.Context, spec ContainerSpec) error {
|
|||||||
CPUs: cpus,
|
CPUs: cpus,
|
||||||
},
|
},
|
||||||
}
|
}
|
||||||
if q.bridgeMode() {
|
if useBridge {
|
||||||
cfg.RunConfig.IPAddress = vmIP
|
cfg.RunConfig.IPAddress = vmIP
|
||||||
cfg.RunConfig.NetMask = "255.255.255.0"
|
cfg.RunConfig.NetMask = "255.255.255.0"
|
||||||
cfg.RunConfig.Gateway = q.Gateway
|
cfg.RunConfig.Gateway = q.Gateway
|
||||||
}
|
}
|
||||||
|
// Bake each volume's host directory into the image (read-only). Unikernels
|
||||||
|
// have no writable host mount yet, so this suits config/cert directories;
|
||||||
|
// stateful services need a different storage story (9p/virtio-blk).
|
||||||
|
//
|
||||||
|
// Volume contents are copied into a per-VM staging tree that mirrors the
|
||||||
|
// guest paths (e.g. /srv/mcat -> <bake>/srv/mcat). We then put the ops
|
||||||
|
// config in the staging root and list the top-level dirs in `Dirs`: `ops`
|
||||||
|
// resolves Dirs relative to the config file and bakes the tree at the
|
||||||
|
// matching absolute path. (Staging is required: an absolute /srv source
|
||||||
|
// would make `ops` descend into the agent's podman overlay storage and
|
||||||
|
// fail.)
|
||||||
|
bakeDir := ""
|
||||||
|
if len(spec.Volumes) > 0 {
|
||||||
|
bakeDir = filepath.Join(q.vmDir(spec.Name), "bake")
|
||||||
|
_ = os.RemoveAll(bakeDir)
|
||||||
|
topLevel := map[string]bool{}
|
||||||
|
for _, v := range spec.Volumes {
|
||||||
|
parts := strings.SplitN(v, ":", 2)
|
||||||
|
host := parts[0]
|
||||||
|
guest := host
|
||||||
|
if len(parts) == 2 {
|
||||||
|
guest = parts[1]
|
||||||
|
}
|
||||||
|
rel := strings.TrimPrefix(guest, "/")
|
||||||
|
stage := filepath.Join(bakeDir, rel)
|
||||||
|
if err := os.MkdirAll(filepath.Dir(stage), 0o750); err != nil {
|
||||||
|
return fmt.Errorf("stage volume dir: %w", err)
|
||||||
|
}
|
||||||
|
if err := os.CopyFS(stage, os.DirFS(host)); err != nil {
|
||||||
|
return fmt.Errorf("stage volume %q: %w", host, err)
|
||||||
|
}
|
||||||
|
if i := strings.Index(rel, "/"); i >= 0 {
|
||||||
|
topLevel[rel[:i]] = true
|
||||||
|
} else {
|
||||||
|
topLevel[rel] = true
|
||||||
|
}
|
||||||
|
}
|
||||||
|
for d := range topLevel {
|
||||||
|
cfg.Dirs = append(cfg.Dirs, d)
|
||||||
|
}
|
||||||
|
}
|
||||||
if len(spec.Env) > 0 {
|
if len(spec.Env) > 0 {
|
||||||
cfg.Env = map[string]string{}
|
cfg.Env = map[string]string{}
|
||||||
for _, e := range spec.Env {
|
for _, e := range spec.Env {
|
||||||
@@ -295,7 +343,13 @@ func (q *QEMU) Run(ctx context.Context, spec ContainerSpec) error {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
// Place the ops config in the staging root when baking volumes, so its
|
||||||
|
// relative Dirs resolve to the staged tree; otherwise keep it in the VM
|
||||||
|
// state dir.
|
||||||
cfgPath := filepath.Join(q.vmDir(spec.Name), "ops.json")
|
cfgPath := filepath.Join(q.vmDir(spec.Name), "ops.json")
|
||||||
|
if bakeDir != "" {
|
||||||
|
cfgPath = filepath.Join(bakeDir, "ops.json")
|
||||||
|
}
|
||||||
cfgBytes, err := json.MarshalIndent(cfg, "", " ")
|
cfgBytes, err := json.MarshalIndent(cfg, "", " ")
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return fmt.Errorf("marshal ops config: %w", err)
|
return fmt.Errorf("marshal ops config: %w", err)
|
||||||
@@ -311,6 +365,9 @@ func (q *QEMU) Run(ctx context.Context, spec ContainerSpec) error {
|
|||||||
}
|
}
|
||||||
build := exec.CommandContext(ctx, q.opsPath(), "build", bin, "-c", cfgPath, "-i", spec.Name) //nolint:gosec
|
build := exec.CommandContext(ctx, q.opsPath(), "build", bin, "-c", cfgPath, "-i", spec.Name) //nolint:gosec
|
||||||
build.Env = q.opsEnv()
|
build.Env = q.opsEnv()
|
||||||
|
if bakeDir != "" {
|
||||||
|
build.Dir = bakeDir
|
||||||
|
}
|
||||||
if out, err := build.CombinedOutput(); err != nil {
|
if out, err := build.CombinedOutput(); err != nil {
|
||||||
return fmt.Errorf("ops build %q: %w: %s", spec.Name, err, out)
|
return fmt.Errorf("ops build %q: %w: %s", spec.Name, err, out)
|
||||||
}
|
}
|
||||||
@@ -339,7 +396,7 @@ func (q *QEMU) Run(ctx context.Context, spec ContainerSpec) error {
|
|||||||
// reaching only mc-proxy on the gateway. This makes mediation
|
// reaching only mc-proxy on the gateway. This makes mediation
|
||||||
// mandatory rather than cooperative.
|
// mandatory rather than cooperative.
|
||||||
var netDevice, netBackend string
|
var netDevice, netBackend string
|
||||||
if q.bridgeMode() {
|
if useBridge {
|
||||||
tap, tapErr := q.createTAP(spec.Name)
|
tap, tapErr := q.createTAP(spec.Name)
|
||||||
if tapErr != nil {
|
if tapErr != nil {
|
||||||
return fmt.Errorf("create TAP: %w", tapErr)
|
return fmt.Errorf("create TAP: %w", tapErr)
|
||||||
@@ -368,17 +425,6 @@ func (q *QEMU) Run(ctx context.Context, spec ContainerSpec) error {
|
|||||||
"-device", netDevice,
|
"-device", netDevice,
|
||||||
"-netdev", netBackend,
|
"-netdev", netBackend,
|
||||||
}
|
}
|
||||||
// 9p passthrough for host /srv/<service> volumes (best-effort; Nanos must
|
|
||||||
// support the 9p client for the guest to mount it).
|
|
||||||
for i, v := range spec.Volumes {
|
|
||||||
parts := strings.SplitN(v, ":", 2)
|
|
||||||
host := parts[0]
|
|
||||||
tag := fmt.Sprintf("srv%d", i)
|
|
||||||
args = append(args,
|
|
||||||
"-virtfs", fmt.Sprintf("local,path=%s,mount_tag=%s,security_model=none,id=%s", host, tag, tag),
|
|
||||||
)
|
|
||||||
}
|
|
||||||
|
|
||||||
cmd := exec.CommandContext(ctx, q.qemuPath(), args...) //nolint:gosec
|
cmd := exec.CommandContext(ctx, q.qemuPath(), args...) //nolint:gosec
|
||||||
if out, err := cmd.CombinedOutput(); err != nil {
|
if out, err := cmd.CombinedOutput(); err != nil {
|
||||||
return fmt.Errorf("qemu launch %q: %w: %s", spec.Name, err, out)
|
return fmt.Errorf("qemu launch %q: %w: %s", spec.Name, err, out)
|
||||||
|
|||||||
Reference in New Issue
Block a user