Files
mcp/internal/master/master.go
Kyle Isom f9f6f339f4 Add multi-address fallback for node connectivity
NodeConfig and MasterNodeConfig gain an optional addresses[] field
for fallback addresses tried in order after the primary address.
Provides resilience when Tailscale DNS is down or a node is only
reachable via LAN.

- dialAgentMulti: tries each address with a 3s health check, returns
  first success
- forEachNode: uses multi-address dialing
- AgentPool.AddNodeMulti: master tries all addresses when connecting
- AllAddresses(): deduplicates primary + fallback addresses

Config example:
  [[nodes]]
  name = "rift"
  address = "rift.scylla-hammerhead.ts.net:9444"
  addresses = ["100.95.252.120:9444", "192.168.88.181:9444"]

Existing configs without addresses[] work unchanged.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-04-03 09:45:50 -07:00

160 lines
3.8 KiB
Go

package master
import (
"context"
"crypto/tls"
"database/sql"
"fmt"
"log/slog"
"net"
"os"
"os/signal"
"syscall"
mcpv1 "git.wntrmute.dev/mc/mcp/gen/mcp/v1"
"git.wntrmute.dev/mc/mcp/internal/auth"
"git.wntrmute.dev/mc/mcp/internal/config"
"git.wntrmute.dev/mc/mcp/internal/masterdb"
"google.golang.org/grpc"
"google.golang.org/grpc/credentials"
)
// Master is the MCP cluster master. It coordinates multi-node deployments,
// manages edge routes, and stores cluster state.
type Master struct {
mcpv1.UnimplementedMcpMasterServiceServer
Config *config.MasterConfig
DB *sql.DB
Pool *AgentPool
DNS *DNSClient
Logger *slog.Logger
Version string
}
// Run starts the master: opens the database, bootstraps nodes, sets up the
// gRPC server with TLS and auth, and blocks until SIGINT/SIGTERM.
func Run(cfg *config.MasterConfig, version string) error {
logger := slog.New(slog.NewTextHandler(os.Stderr, &slog.HandlerOptions{
Level: parseLogLevel(cfg.Log.Level),
}))
// Open master database.
db, err := masterdb.Open(cfg.Database.Path)
if err != nil {
return fmt.Errorf("open master database: %w", err)
}
defer func() { _ = db.Close() }()
// Bootstrap nodes from config.
for _, n := range cfg.Nodes {
if err := masterdb.UpsertNode(db, n.Name, n.Address, n.Role, "amd64"); err != nil {
return fmt.Errorf("bootstrap node %s: %w", n.Name, err)
}
logger.Info("bootstrapped node", "name", n.Name, "address", n.Address, "role", n.Role)
}
// Load service token for dialing agents.
token, err := LoadServiceToken(cfg.Master.ServiceTokenPath)
if err != nil {
return fmt.Errorf("load service token: %w", err)
}
// Create agent connection pool.
pool := NewAgentPool(cfg.Master.CACert, token)
for _, n := range cfg.Nodes {
if addErr := pool.AddNodeMulti(n.Name, n.AllAddresses()); addErr != nil {
logger.Warn("failed to connect to agent", "node", n.Name, "err", addErr)
// Non-fatal: the node may come up later.
}
}
// Create DNS client.
dns, err := NewDNSClient(cfg.MCNS, logger)
if err != nil {
return fmt.Errorf("create DNS client: %w", err)
}
m := &Master{
Config: cfg,
DB: db,
Pool: pool,
DNS: dns,
Logger: logger,
Version: version,
}
// TLS.
tlsCert, err := tls.LoadX509KeyPair(cfg.Server.TLSCert, cfg.Server.TLSKey)
if err != nil {
return fmt.Errorf("load TLS cert: %w", err)
}
tlsConfig := &tls.Config{
Certificates: []tls.Certificate{tlsCert},
MinVersion: tls.VersionTLS13,
}
// Auth interceptor (same as agent — validates MCIAS tokens).
validator, err := auth.NewMCIASValidator(cfg.MCIAS.ServerURL, cfg.MCIAS.CACert)
if err != nil {
return fmt.Errorf("create MCIAS validator: %w", err)
}
// gRPC server.
server := grpc.NewServer(
grpc.Creds(credentials.NewTLS(tlsConfig)),
grpc.ChainUnaryInterceptor(
auth.AuthInterceptor(validator),
),
grpc.ChainStreamInterceptor(
auth.StreamAuthInterceptor(validator),
),
)
mcpv1.RegisterMcpMasterServiceServer(server, m)
// Listen.
lis, err := net.Listen("tcp", cfg.Server.GRPCAddr)
if err != nil {
return fmt.Errorf("listen %q: %w", cfg.Server.GRPCAddr, err)
}
logger.Info("master starting",
"addr", cfg.Server.GRPCAddr,
"version", version,
"nodes", len(cfg.Nodes),
)
// Signal handling.
ctx, stop := signal.NotifyContext(context.Background(), syscall.SIGINT, syscall.SIGTERM)
defer stop()
errCh := make(chan error, 1)
go func() {
errCh <- server.Serve(lis)
}()
select {
case <-ctx.Done():
logger.Info("shutting down")
server.GracefulStop()
pool.Close()
return nil
case err := <-errCh:
pool.Close()
return fmt.Errorf("serve: %w", err)
}
}
func parseLogLevel(level string) slog.Level {
switch level {
case "debug":
return slog.LevelDebug
case "warn":
return slog.LevelWarn
case "error":
return slog.LevelError
default:
return slog.LevelInfo
}
}