The agent reads [[boot.sequence]] stages from its config and starts services in dependency order before accepting gRPC connections. Each stage waits for its services to pass health checks before proceeding: - tcp: TCP connect to the container's mapped port - grpc: standard gRPC health check. Foundation stage (stage 0): blocks and retries indefinitely if health fails — all downstream services depend on it. Non-foundation stages: log a warning and proceed on failure. Uses the recovery logic to start containers from the registry, then runs health checks to verify readiness. Config example: [[boot.sequence]] name = "foundation" services = ["mcias", "mcns"] timeout = "120s" health = "tcp" Architecture v2 Phase 4 feature. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
169 lines
4.1 KiB
Go
169 lines
4.1 KiB
Go
package agent
|
|
|
|
import (
|
|
"context"
|
|
"crypto/tls"
|
|
"database/sql"
|
|
"fmt"
|
|
"log/slog"
|
|
"net"
|
|
"os"
|
|
"os/signal"
|
|
"syscall"
|
|
|
|
mcpv1 "git.wntrmute.dev/mc/mcp/gen/mcp/v1"
|
|
"git.wntrmute.dev/mc/mcp/internal/auth"
|
|
"git.wntrmute.dev/mc/mcp/internal/config"
|
|
"git.wntrmute.dev/mc/mcp/internal/monitor"
|
|
"git.wntrmute.dev/mc/mcp/internal/registry"
|
|
"git.wntrmute.dev/mc/mcp/internal/runtime"
|
|
"google.golang.org/grpc"
|
|
"google.golang.org/grpc/credentials"
|
|
)
|
|
|
|
// Agent is the MCP node agent. It manages containers, stores the registry,
// monitors for drift, and serves the gRPC API.
type Agent struct {
	// Embedded for forward compatibility with the generated gRPC service.
	mcpv1.UnimplementedMcpAgentServiceServer

	Config    *config.AgentConfig // parsed agent configuration
	DB        *sql.DB             // registry database handle (see registry.Open)
	Runtime   runtime.Runtime     // container runtime backend
	Monitor   *monitor.Monitor    // drift monitor started/stopped by Run
	Logger    *slog.Logger        // structured logger shared by all components
	PortAlloc *PortAllocator      // allocates host ports for container mappings
	Proxy     *ProxyRouter        // connection to mc-proxy for routing
	Certs     *CertProvisioner    // provisions TLS certs into the proxy cert dir
	DNS       *DNSRegistrar       // registers service names with MCNS
	Version   string              // agent build version, reported over the API
}
|
|
|
|
// Run starts the agent: opens the database, sets up the gRPC server with
|
|
// TLS and auth, and blocks until SIGINT/SIGTERM.
|
|
func Run(cfg *config.AgentConfig, version string) error {
|
|
logger := slog.New(slog.NewTextHandler(os.Stderr, &slog.HandlerOptions{
|
|
Level: parseLogLevel(cfg.Log.Level),
|
|
}))
|
|
|
|
db, err := registry.Open(cfg.Database.Path)
|
|
if err != nil {
|
|
return fmt.Errorf("open registry: %w", err)
|
|
}
|
|
defer func() { _ = db.Close() }()
|
|
|
|
rt := &runtime.Podman{}
|
|
|
|
mon := monitor.New(db, rt, cfg.Monitor, cfg.Agent.NodeName, logger)
|
|
|
|
proxy, err := NewProxyRouter(cfg.MCProxy.Socket, cfg.MCProxy.CertDir, logger)
|
|
if err != nil {
|
|
return fmt.Errorf("connect to mc-proxy: %w", err)
|
|
}
|
|
|
|
certs, err := NewCertProvisioner(cfg.Metacrypt, cfg.MCProxy.CertDir, logger)
|
|
if err != nil {
|
|
return fmt.Errorf("create cert provisioner: %w", err)
|
|
}
|
|
|
|
dns, err := NewDNSRegistrar(cfg.MCNS, logger)
|
|
if err != nil {
|
|
return fmt.Errorf("create DNS registrar: %w", err)
|
|
}
|
|
|
|
a := &Agent{
|
|
Config: cfg,
|
|
DB: db,
|
|
Runtime: rt,
|
|
Monitor: mon,
|
|
Logger: logger,
|
|
PortAlloc: NewPortAllocator(),
|
|
Proxy: proxy,
|
|
Certs: certs,
|
|
DNS: dns,
|
|
Version: version,
|
|
}
|
|
|
|
tlsCert, err := tls.LoadX509KeyPair(cfg.Server.TLSCert, cfg.Server.TLSKey)
|
|
if err != nil {
|
|
return fmt.Errorf("load TLS cert: %w", err)
|
|
}
|
|
tlsConfig := &tls.Config{
|
|
Certificates: []tls.Certificate{tlsCert},
|
|
MinVersion: tls.VersionTLS13,
|
|
}
|
|
|
|
validator, err := auth.NewMCIASValidator(cfg.MCIAS.ServerURL, cfg.MCIAS.CACert)
|
|
if err != nil {
|
|
return fmt.Errorf("create MCIAS validator: %w", err)
|
|
}
|
|
|
|
server := grpc.NewServer(
|
|
grpc.Creds(credentials.NewTLS(tlsConfig)),
|
|
grpc.ChainUnaryInterceptor(
|
|
auth.AuthInterceptor(validator),
|
|
),
|
|
grpc.ChainStreamInterceptor(
|
|
auth.StreamAuthInterceptor(validator),
|
|
),
|
|
)
|
|
mcpv1.RegisterMcpAgentServiceServer(server, a)
|
|
|
|
lis, err := net.Listen("tcp", cfg.Server.GRPCAddr)
|
|
if err != nil {
|
|
return fmt.Errorf("listen %q: %w", cfg.Server.GRPCAddr, err)
|
|
}
|
|
|
|
logger.Info("agent starting",
|
|
"addr", cfg.Server.GRPCAddr,
|
|
"node", cfg.Agent.NodeName,
|
|
"runtime", cfg.Agent.ContainerRuntime,
|
|
)
|
|
|
|
// Run boot sequence before starting the gRPC server.
|
|
// On the master node, this starts foundation services (MCIAS, MCNS)
|
|
// before core services, ensuring dependencies are met.
|
|
if len(cfg.Boot.Sequence) > 0 {
|
|
bootCtx, bootCancel := context.WithCancel(context.Background())
|
|
defer bootCancel()
|
|
if err := a.RunBootSequence(bootCtx); err != nil {
|
|
logger.Error("boot sequence failed", "err", err)
|
|
// Continue starting the gRPC server — partial boot is better than no agent.
|
|
}
|
|
}
|
|
|
|
mon.Start()
|
|
|
|
ctx, stop := signal.NotifyContext(context.Background(), syscall.SIGINT, syscall.SIGTERM)
|
|
defer stop()
|
|
|
|
errCh := make(chan error, 1)
|
|
go func() {
|
|
errCh <- server.Serve(lis)
|
|
}()
|
|
|
|
select {
|
|
case <-ctx.Done():
|
|
logger.Info("shutting down")
|
|
mon.Stop()
|
|
server.GracefulStop()
|
|
_ = proxy.Close()
|
|
return nil
|
|
case err := <-errCh:
|
|
mon.Stop()
|
|
return fmt.Errorf("serve: %w", err)
|
|
}
|
|
}
|
|
|
|
func parseLogLevel(level string) slog.Level {
|
|
switch level {
|
|
case "debug":
|
|
return slog.LevelDebug
|
|
case "warn":
|
|
return slog.LevelWarn
|
|
case "error":
|
|
return slog.LevelError
|
|
default:
|
|
return slog.LevelInfo
|
|
}
|
|
}
|