diff --git a/internal/runtime/qemu.go b/internal/runtime/qemu.go index 5a51779..dd25702 100644 --- a/internal/runtime/qemu.go +++ b/internal/runtime/qemu.go @@ -204,6 +204,7 @@ func (q *QEMU) Pull(ctx context.Context, image string) error { type opsConfig struct { Args []string `json:"Args,omitempty"` Env map[string]string `json:"Env,omitempty"` + Dirs []string `json:"Dirs,omitempty"` // dirs (relative to the config file) baked into the image RunConfig opsRunConfig `json:"RunConfig"` } @@ -263,9 +264,15 @@ func (q *QEMU) Run(ctx context.Context, spec ContainerSpec) error { cpus = 1 } + // A component can opt out of the isolated bridge with network = "user" + // (e.g. a stateless service that needs NAT egress to its dependencies + // before the gateway proxy exists). Otherwise unikernels use the bridge + // when one is configured. + useBridge := q.bridgeMode() && spec.Network != "user" + // In bridge mode, allocate a static IP on the isolated bridge for this VM. var vmIP string - if q.bridgeMode() { + if useBridge { ip, allocErr := q.allocateIP(spec.Name) if allocErr != nil { return fmt.Errorf("allocate VM IP: %w", allocErr) @@ -282,11 +289,52 @@ func (q *QEMU) Run(ctx context.Context, spec ContainerSpec) error { CPUs: cpus, }, } - if q.bridgeMode() { + if useBridge { cfg.RunConfig.IPAddress = vmIP cfg.RunConfig.NetMask = "255.255.255.0" cfg.RunConfig.Gateway = q.Gateway } + // Bake each volume's host directory into the image (read-only). Unikernels + // have no writable host mount yet, so this suits config/cert directories; + // stateful services need a different storage story (9p/virtio-blk). + // + // Volume contents are copied into a per-VM staging tree that mirrors the + // guest paths (e.g. /srv/mcat -> /srv/mcat). We then put the ops + // config in the staging root and list the top-level dirs in `Dirs`: `ops` + // resolves Dirs relative to the config file and bakes the tree at the + // matching absolute path. (Staging is required: an absolute /srv source + // would make `ops` descend into the agent's podman overlay storage and + // fail.) + bakeDir := "" + if len(spec.Volumes) > 0 { + bakeDir = filepath.Join(q.vmDir(spec.Name), "bake") + _ = os.RemoveAll(bakeDir) + topLevel := map[string]bool{} + for _, v := range spec.Volumes { + parts := strings.SplitN(v, ":", 2) + host := parts[0] + guest := host + if len(parts) == 2 { + guest = parts[1] + } + rel := strings.TrimPrefix(guest, "/") + stage := filepath.Join(bakeDir, rel) + if err := os.MkdirAll(filepath.Dir(stage), 0o750); err != nil { + return fmt.Errorf("stage volume dir: %w", err) + } + if err := os.CopyFS(stage, os.DirFS(host)); err != nil { + return fmt.Errorf("stage volume %q: %w", host, err) + } + if i := strings.Index(rel, "/"); i >= 0 { + topLevel[rel[:i]] = true + } else { + topLevel[rel] = true + } + } + for d := range topLevel { + cfg.Dirs = append(cfg.Dirs, d) + } + } if len(spec.Env) > 0 { cfg.Env = map[string]string{} for _, e := range spec.Env { @@ -295,7 +343,13 @@ func (q *QEMU) Run(ctx context.Context, spec ContainerSpec) error { } } } + // Place the ops config in the staging root when baking volumes, so its + // relative Dirs resolve to the staged tree; otherwise keep it in the VM + // state dir. cfgPath := filepath.Join(q.vmDir(spec.Name), "ops.json") + if bakeDir != "" { + cfgPath = filepath.Join(bakeDir, "ops.json") + } cfgBytes, err := json.MarshalIndent(cfg, "", " ") if err != nil { return fmt.Errorf("marshal ops config: %w", err) @@ -311,6 +365,9 @@ func (q *QEMU) Run(ctx context.Context, spec ContainerSpec) error { } build := exec.CommandContext(ctx, q.opsPath(), "build", bin, "-c", cfgPath, "-i", spec.Name) //nolint:gosec build.Env = q.opsEnv() + if bakeDir != "" { + build.Dir = bakeDir + } if out, err := build.CombinedOutput(); err != nil { return fmt.Errorf("ops build %q: %w: %s", spec.Name, err, out) } @@ -339,7 +396,7 @@ func (q *QEMU) Run(ctx context.Context, spec ContainerSpec) error { // reaching only mc-proxy on the gateway. This makes mediation // mandatory rather than cooperative. var netDevice, netBackend string - if q.bridgeMode() { + if useBridge { tap, tapErr := q.createTAP(spec.Name) if tapErr != nil { return fmt.Errorf("create TAP: %w", tapErr) @@ -368,17 +425,6 @@ func (q *QEMU) Run(ctx context.Context, spec ContainerSpec) error { "-device", netDevice, "-netdev", netBackend, } - // 9p passthrough for host /srv/ volumes (best-effort; Nanos must - // support the 9p client for the guest to mount it). - for i, v := range spec.Volumes { - parts := strings.SplitN(v, ":", 2) - host := parts[0] - tag := fmt.Sprintf("srv%d", i) - args = append(args, - "-virtfs", fmt.Sprintf("local,path=%s,mount_tag=%s,security_model=none,id=%s", host, tag, tag), - ) - } - cmd := exec.CommandContext(ctx, q.qemuPath(), args...) //nolint:gosec if out, err := cmd.CombinedOutput(); err != nil { return fmt.Errorf("qemu launch %q: %w: %s", spec.Name, err, out)