Update ARCHITECTURE.md with design audit findings

Incorporates all 14 items from DESIGN_AUDIT.md: node registry in CLI config, container naming convention (<service>-<component>), active state semantics, adopt by service prefix, EventInfo service field, version from image tag, snapshot/backup timer, exec-style alert commands, overlay-only bind address, RPC audit logging, /srv/ ownership, rootless podman UID mapping docs. Three minor fixes from final review: stale adopt syntax in bootstrap section, explicit container naming in deploy flow, clarify that list/ps query all registered nodes. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-03-26 11:03:25 -07:00
parent 12d8d733be
commit a1bbc008b5
1 changed files with 145 additions and 70 deletions
--- a/ARCHITECTURE.md
+++ b/ARCHITECTURE.md
@@ -171,6 +171,18 @@ Services with a single component (e.g., mc-proxy) simply have one

 The unique identity of a component is `node/service/component`.

+### Container Naming Convention
+
+Containers are named `<service>-<component>`:
+
+- `metacrypt-api`, `metacrypt-web`
+- `mcr-api`, `mcr-web`
+- `mc-proxy` (single-component service)
+
+This convention enables `mcp adopt <service>` to match all containers
+for a service by prefix and derive component names automatically
+(`metacrypt-api` → component `api`, `metacrypt-web` → component `web`).
+
 ---

 ## CLI
@@ -183,18 +195,18 @@ mcp login                              Authenticate to MCIAS, store token
 mcp deploy <service>                   Deploy all components from service definition
 mcp deploy <service>/<component>       Deploy a single component
 mcp deploy <service> -f <file>         Deploy from explicit file
-mcp stop <service>                     Stop all components
-mcp start <service>                    Start all components
+mcp stop <service>                     Stop all components, set active=false
+mcp start <service>                    Start all components, set active=true
 mcp restart <service>                  Restart all components

-mcp list                               List services from agent registry (no runtime query)
-mcp ps                                 Live check: query runtime, show running containers
-                                         with uptime and version
+mcp list                               List services from all agents (registry, no runtime query)
+mcp ps                                 Live check: query runtime on all agents, show running
+                                         containers with uptime and version
 mcp status [service]                   Full picture: live query + drift + recent events
 mcp sync                               Push service definitions to agent (update desired
                                         state without deploying)

-mcp adopt <container> <service>        Claim an unmanaged container into a service
+mcp adopt <service>                    Adopt all <service>-* containers into a service

 mcp service show <service>             Print current spec from agent registry
 mcp service edit <service>             Open service definition in $EDITOR
@@ -243,6 +255,24 @@ volumes = ["/srv/metacrypt:/srv/metacrypt"]
 cmd = ["server", "--config", "/srv/metacrypt/metacrypt.toml"]
 ```

+### Active State
+
+The `active` field is the operator's desired state for the service:
+
+- `active = true` → CLI tells agent: all components should be `running`.
+- `active = false` → CLI tells agent: all components should be `stopped`.
+
+Lifecycle commands update the service definition file:
+
+- `mcp stop <service>` sets `active = false` in the local file and tells
+  the agent to stop all components.
+- `mcp start <service>` sets `active = true` and tells the agent to start.
+- `mcp sync` pushes all service definitions — the agent stops anything
+  marked inactive and keeps active services running.
+
+The service definition file is always the source of truth. Lifecycle
+commands modify it so the file stays in sync with the operator's intent.
+
 ### Deploy Resolution

 `mcp deploy <service>` resolves the component spec through a precedence
@@ -423,7 +453,7 @@ message ComponentInfo {
  string image = 2;
  string desired_state = 3;        // "running", "stopped", "ignore"
  string observed_state = 4;       // "running", "stopped", "exited", "removed", "unknown"
-  string version = 5;
+  string version = 5;              // extracted from image tag
  google.protobuf.Timestamp started = 6;
 }

@@ -443,10 +473,11 @@ message DriftInfo {
 }

 message EventInfo {
-  string component = 1;
-  string prev_state = 2;
-  string new_state = 3;
-  google.protobuf.Timestamp timestamp = 4;
+  string service = 1;
+  string component = 2;
+  string prev_state = 3;
+  string new_state = 4;
+  google.protobuf.Timestamp timestamp = 5;
 }

 message ServiceStatusResponse {
@@ -464,14 +495,18 @@ message LiveCheckResponse {
 // --- Adopt ---

 message AdoptRequest {
+  string service = 1;              // service name; matches <service>-* containers
+}
+
+message AdoptResult {
  string container = 1;            // runtime container name
-  string service = 2;              // service to adopt into
-  string component = 3;            // component name within the service
+  string component = 2;            // derived component name
+  bool success = 3;
+  string error = 4;
 }

 message AdoptResponse {
-  bool success = 1;
-  string error = 2;
+  repeated AdoptResult results = 1;
 }

 // --- File transfer ---
@@ -540,12 +575,14 @@ When the agent receives a `Deploy` RPC:
   a. Pull the image: `podman pull <image>`
   b. Stop and remove the existing container (if any):
      `podman stop <name>` and `podman rm <name>`
-   c. Start the new container:
-      `podman run -d --name <name> [flags] <image> [cmd]`
+   c. Start the new container (named `<service>-<component>`):
+      `podman run -d --name <service>-<component> [flags] <image> [cmd]`
   d. Verify the container is running: `podman inspect <name>`
   e. Update observed state in the registry.
 3. Set desired state to `running` for deployed components.
-4. Return success/failure per component.
+4. Extract version from the image tag (e.g., `mcr.../metacrypt:v1.7.0`
+   → `v1.7.0`) and record it in the registry.
+5. Return success/failure per component.

 The flags passed to `podman run` are derived from the `ComponentSpec`:

@@ -652,18 +689,22 @@ On first sync, every container on rift will appear with desired state
 `ignore` -- MCP didn't deploy them and doesn't know their intended service
 grouping.

-`mcp adopt <container> <service>` claims an unmanaged container:
+`mcp adopt <service>` claims unmanaged containers by prefix:

-1. If the service doesn't exist in the registry, create it.
-2. Assign the container as a component of the specified service.
-3. Set desired state to `running` (or `stopped` if the container is
+1. Find all containers matching `<service>-*` (plus `<service>` itself
+   for single-component services).
+2. Create the service in the registry if it doesn't exist.
+3. Add each container as a component, stripping the service name prefix
+   to derive the component name: `metacrypt-api` → `api`,
+   `metacrypt-web` → `web`.
+4. Set desired state to `running` (or `stopped` if the container is
   currently stopped).

-This lets the operator incrementally bring existing containers under MCP
-management without redeploying them. The typical bootstrap flow:
-`mcp sync` to discover containers, `mcp adopt` to group them into services,
-`mcp service export` to generate service definition files from the adopted
-state.
+This lets the operator bring existing containers under MCP management
+without redeploying them. The typical bootstrap flow: `mcp sync` to
+discover containers, `mcp adopt` to group them into services,
+`mcp service export` to generate service definition files from the
+adopted state.

 ### Monitoring

@@ -673,22 +714,9 @@ function of the agent, not a separate process.

 #### Event Log

-Every state transition is recorded in the `events` table:
-
-```sql
-CREATE TABLE events (
-    id         INTEGER PRIMARY KEY AUTOINCREMENT,
-    component  TEXT NOT NULL,
-    service    TEXT NOT NULL,
-    prev_state TEXT NOT NULL,
-    new_state  TEXT NOT NULL,
-    timestamp  TEXT NOT NULL DEFAULT (datetime('now'))
-);
-
-CREATE INDEX idx_events_component_time ON events(component, timestamp);
-```
-
-Events accumulate over time and support rate queries:
+Every state transition is recorded in the `events` table (see Database
+Schema for the full DDL). Events accumulate over time and support rate
+queries:

 ```sql
 -- How many times has metacrypt-api exited in the last hour?
@@ -728,7 +756,7 @@ transitions. This prevents notification spam from a flapping service.
 ```toml
 [monitor]
 interval       = "60s"
-alert_command  = ""               # command to run on alert; empty = log only
+alert_command  = []               # argv to exec on alert; empty = log only
 cooldown       = "15m"            # suppress repeat alerts per component
 flap_threshold = 3                # state changes within flap_window = flapping
 flap_window    = "10m"
@@ -737,8 +765,9 @@ retention      = "30d"            # event log retention

 #### Alert Command

-When an alert fires, the agent executes the configured command with
-context passed via environment variables:
+When an alert fires, the agent executes the configured command using
+exec-style invocation (no shell). The command is an argv array; context
+is passed via environment variables on the child process:

 | Variable | Value |
 |----------|-------|
@@ -754,17 +783,21 @@ context passed via environment variables:
 The alert command is the operator's choice. MCP does not ship with or
 depend on any notification system.

-```bash
+```toml
 # Push notification
-alert_command = "ntfy publish mcp '$MCP_ALERT_TYPE: $MCP_SERVICE/$MCP_COMPONENT ($MCP_DESIRED -> $MCP_OBSERVED)'"
+alert_command = ["/usr/local/bin/ntfy", "publish", "mcp-alerts"]

-# Webhook
-alert_command = "curl -s -X POST https://hooks.example.com/mcp ..."
+# Custom script (reads MCP_* env vars)
+alert_command = ["/usr/local/bin/mcp-notify"]

 # Syslog
-alert_command = "logger -t mcp '$MCP_ALERT_TYPE: $MCP_SERVICE/$MCP_COMPONENT on $MCP_NODE'"
+alert_command = ["/usr/bin/logger", "-t", "mcp"]
 ```

+The command receives all context via environment variables. No shell
+expansion occurs, eliminating command injection via crafted container
+names or other metadata.
+
 ---

 ## Database Schema
@@ -871,8 +904,19 @@ service_name = "mcp"

 [auth]
 token_path = "/home/kyle/.config/mcp/token"
+# Optional: for unattended operation (scripts, cron)
+# username      = "mcp-operator"
+# password_file = "/home/kyle/.config/mcp/credentials"
+
+[[nodes]]
+name = "rift"
+address = "100.95.252.120:9444"
 ```

+`mcp node add` and `mcp node remove` edit the `[[nodes]]` section;
+`mcp node list` reads it. The CLI routes commands to agents based on the
+node addresses here and the `node` field in service definition files.
+
 Directory layout on the operator's workstation:

 ```
@@ -893,7 +937,7 @@ of truth for desired state. The agent's registry is the operational truth.

 ```toml
 [server]
-grpc_addr = ":9444"
+grpc_addr = "100.95.252.120:9444"   # bind to overlay interface only
 tls_cert  = "/srv/mcp/certs/cert.pem"
 tls_key   = "/srv/mcp/certs/key.pem"

@@ -911,7 +955,7 @@ container_runtime = "podman"

 [monitor]
 interval       = "60s"
-alert_command  = ""
+alert_command  = []
 cooldown       = "15m"
 flap_threshold = 3
 flap_window    = "10m"
@@ -921,6 +965,12 @@ retention      = "30d"
 level = "info"
 ```

+The agent binds to the overlay network interface, not to all interfaces.
+It does **not** sit behind MC-Proxy: MCP manages MC-Proxy's lifecycle, so
+proxying the agent through it would create a circular dependency and leave
+the agent unreachable whenever MC-Proxy is down. Like MC-Proxy itself, the
+agent is infrastructure that must be directly reachable on the overlay.
+
 The agent's data directory follows the platform convention:

 ```
@@ -941,12 +991,15 @@ The agent's data directory follows the platform convention:

 The agent is deployed like any other Metacircular service:

-1. Provision the `mcp` system user via NixOS config (with podman access).
-2. Create `/srv/mcp/` directory and config file.
-3. Provision TLS certificate from Metacrypt.
-4. Create an MCIAS system account for the agent (`mcp-agent`).
-5. Install the `mcp-agent` binary.
-6. Start via systemd unit.
+1. Provision the `mcp` system user via NixOS config (with podman access
+   and subuid/subgid ranges for rootless containers).
+2. Set `/srv/` ownership to the `mcp` user (the agent creates and manages
+   `/srv/<service>/` directories for all services).
+3. Create `/srv/mcp/` directory and config file.
+4. Provision TLS certificate from Metacrypt.
+5. Create an MCIAS system account for the agent (`mcp-agent`).
+6. Install the `mcp-agent` binary.
+7. Start via systemd unit.

 The agent runs as a systemd service. Container-first deployment is a v2
 concern -- MCP needs to be running before it can manage its own agent.
@@ -1007,8 +1060,8 @@ When bringing MCP up on a node that already has running containers:
 2. `mcp sync` with no service definition files -- the agent discovers all
   running containers and adds them to its registry with desired state
   `ignore`.
-3. `mcp adopt <container> <service>` for each container -- group them into
-   services and set desired state to `running`.
+3. `mcp adopt <service>` for each service -- groups matching containers
+   into the service and sets desired state to `running`.
 4. `mcp service export <service>` for each service -- generate service
   definition files from the adopted state.
 5. Review and edit the generated files as needed.
@@ -1018,7 +1071,18 @@ From this point, the service definition files are the source of truth and

 Existing containers on rift currently run under kyle's podman instance.
 As part of MCP bootstrap, they will need to be re-created under the `mcp`
-user's rootless podman. This is a one-time migration.
+user's rootless podman. This is a one-time migration. Containers should
+also be renamed to follow the `<service>-<component>` convention (e.g.,
+`metacrypt` → `metacrypt-api`) before adoption.
+
+#### Rootless Podman and UID Mapping
+
+The `mcp` user's subuid/subgid ranges (configured via NixOS) determine
+how container UIDs map to host UIDs. By default, rootless podman maps
+container UID 0 (`user = "0:0"`) to the `mcp` user itself and other UIDs
+into the subuid range; files in `/srv/<service>/` must be accessible to
+the mapped UIDs. The NixOS configuration should provision appropriate
+subuid/subgid ranges when creating the `mcp` user.

 ---

@@ -1036,20 +1100,28 @@ user's rootless podman. This is a one-time migration.
 | Cross-service file access | File ops require a service name; agent resolves to that service's directory only |
 | Container runtime escape | Rootless podman under `mcp` user; containers follow platform hardening |
 | Network eavesdropping | All C2 traffic is gRPC over TLS over encrypted overlay |
+| Agent exposure on LAN | Agent binds to overlay interface only, not all interfaces |
+| Alert command injection | Alert command is exec'd as argv array, no shell interpretation |
+| Unaudited operations | Every RPC is logged at info level with method, caller identity, and timestamp |

 ### Security Invariants

 1. Every agent RPC requires a valid MCIAS admin token. No anonymous or
   unprivileged access.
-2. File operations are scoped to `/srv/<service>/` for the named service.
+2. Every RPC is audit-logged at `info` level by the auth interceptor,
+   recording method name, caller identity (from the MCIAS token), and
+   timestamp. Logging uses `log/slog` per platform convention.
+3. File operations are scoped to `/srv/<service>/` for the named service.
   Path traversal attempts (`../`, symlinks outside the service directory)
   are rejected.
-3. The agent never executes arbitrary commands. It only runs container
+4. The agent never executes arbitrary commands. It only runs container
   runtime operations and file I/O through well-defined code paths.
-4. TLS 1.3 minimum on the agent's gRPC listener.
-5. The CLI's stored token is file-permission protected (0600).
-6. The agent runs as a dedicated `mcp` user with rootless podman. No root
-   access required.
+   Alert commands are exec'd as argv arrays with no shell interpretation.
+5. TLS 1.3 minimum on the agent's gRPC listener. The agent binds to the
+   overlay interface only.
+6. The CLI's stored token is file-permission protected (0600).
+7. The agent runs as a dedicated `mcp` user with rootless podman. `/srv/`
+   is owned by the `mcp` user. No root access required.

 ---

@@ -1070,7 +1142,8 @@ mcp/
 │   │   ├── transfer.go       push, pull
 │   │   └── node.go           node add/list/remove
 │   └── mcp-agent/            Agent daemon
-│       └── main.go
+│       ├── main.go
+│       └── snapshot.go       Database backup command
 ├── internal/
 │   ├── agent/                Agent core
 │   │   ├── agent.go          Agent struct, setup, gRPC server
@@ -1105,7 +1178,9 @@ mcp/
 │   └── v1/                   Generated Go code
 ├── deploy/
 │   ├── systemd/
-│   │   └── mcp-agent.service
+│   │   ├── mcp-agent.service
+│   │   ├── mcp-agent-backup.service
+│   │   └── mcp-agent-backup.timer
 │   ├── examples/
 │   │   ├── mcp.toml          CLI config example
 │   │   └── mcp-agent.toml    Agent config example