NodeConfig and MasterNodeConfig gain an optional addresses[] field for fallback addresses tried in order after the primary address. Provides resilience when Tailscale DNS is down or a node is only reachable via LAN. - dialAgentMulti: tries each address with a 3s health check, returns first success - forEachNode: uses multi-address dialing - AgentPool.AddNodeMulti: master tries all addresses when connecting - AllAddresses(): deduplicates primary + fallback addresses Config example: [[nodes]] name = "rift" address = "rift.scylla-hammerhead.ts.net:9444" addresses = ["100.95.252.120:9444", "192.168.88.181:9444"] Existing configs without addresses[] work unchanged. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
160 lines
3.8 KiB
Go
160 lines
3.8 KiB
Go
package master
|
|
|
|
import (
|
|
"context"
|
|
"crypto/tls"
|
|
"database/sql"
|
|
"fmt"
|
|
"log/slog"
|
|
"net"
|
|
"os"
|
|
"os/signal"
|
|
"syscall"
|
|
|
|
mcpv1 "git.wntrmute.dev/mc/mcp/gen/mcp/v1"
|
|
"git.wntrmute.dev/mc/mcp/internal/auth"
|
|
"git.wntrmute.dev/mc/mcp/internal/config"
|
|
"git.wntrmute.dev/mc/mcp/internal/masterdb"
|
|
"google.golang.org/grpc"
|
|
"google.golang.org/grpc/credentials"
|
|
)
|
|
|
|
// Master is the MCP cluster master. It coordinates multi-node deployments,
// manages edge routes, and stores cluster state.
//
// It embeds UnimplementedMcpMasterServiceServer for forward compatibility
// with additions to the McpMasterService gRPC interface.
type Master struct {
	mcpv1.UnimplementedMcpMasterServiceServer

	Config  *config.MasterConfig // parsed master configuration (nodes, server, TLS, MCIAS, MCNS sections)
	DB      *sql.DB              // cluster-state database opened via masterdb.Open
	Pool    *AgentPool           // pooled gRPC connections to node agents, keyed by node name
	DNS     *DNSClient           // DNS client built from cfg.MCNS — presumably for edge-route records; verify against handlers
	Logger  *slog.Logger         // structured logger shared by all RPC handlers
	Version string               // build version string, reported at startup
}
|
|
|
|
// Run starts the master: opens the database, bootstraps nodes, sets up the
// gRPC server with TLS and auth, and blocks until SIGINT/SIGTERM.
//
// Setup proceeds in order: database → node bootstrap → service token →
// agent pool → DNS client → TLS → auth interceptors → gRPC serve. Any
// setup failure aborts with a wrapped error; agent-connection failures are
// the one deliberate exception (logged, non-fatal). Returns nil on a clean
// signal-triggered shutdown, or a wrapped error if Serve fails first.
func Run(cfg *config.MasterConfig, version string) error {
	// Structured text logging to stderr at the configured level.
	logger := slog.New(slog.NewTextHandler(os.Stderr, &slog.HandlerOptions{
		Level: parseLogLevel(cfg.Log.Level),
	}))

	// Open master database.
	db, err := masterdb.Open(cfg.Database.Path)
	if err != nil {
		return fmt.Errorf("open master database: %w", err)
	}
	// Close error intentionally discarded: nothing actionable at shutdown.
	defer func() { _ = db.Close() }()

	// Bootstrap nodes from config: upsert so restarts are idempotent.
	// NOTE(review): the architecture argument is hard-coded to "amd64" for
	// every node — confirm whether per-node arch should come from config.
	// Only the primary address is persisted here; fallbacks are used for
	// dialing below.
	for _, n := range cfg.Nodes {
		if err := masterdb.UpsertNode(db, n.Name, n.Address, n.Role, "amd64"); err != nil {
			return fmt.Errorf("bootstrap node %s: %w", n.Name, err)
		}
		logger.Info("bootstrapped node", "name", n.Name, "address", n.Address, "role", n.Role)
	}

	// Load service token for dialing agents.
	token, err := LoadServiceToken(cfg.Master.ServiceTokenPath)
	if err != nil {
		return fmt.Errorf("load service token: %w", err)
	}

	// Create agent connection pool. AddNodeMulti tries the node's primary
	// address and then each fallback (AllAddresses) in order.
	pool := NewAgentPool(cfg.Master.CACert, token)
	for _, n := range cfg.Nodes {
		if addErr := pool.AddNodeMulti(n.Name, n.AllAddresses()); addErr != nil {
			logger.Warn("failed to connect to agent", "node", n.Name, "err", addErr)
			// Non-fatal: the node may come up later.
		}
	}

	// Create DNS client.
	dns, err := NewDNSClient(cfg.MCNS, logger)
	if err != nil {
		return fmt.Errorf("create DNS client: %w", err)
	}

	m := &Master{
		Config:  cfg,
		DB:      db,
		Pool:    pool,
		DNS:     dns,
		Logger:  logger,
		Version: version,
	}

	// TLS: server certificate, TLS 1.3 minimum.
	tlsCert, err := tls.LoadX509KeyPair(cfg.Server.TLSCert, cfg.Server.TLSKey)
	if err != nil {
		return fmt.Errorf("load TLS cert: %w", err)
	}
	tlsConfig := &tls.Config{
		Certificates: []tls.Certificate{tlsCert},
		MinVersion:   tls.VersionTLS13,
	}

	// Auth interceptor (same as agent — validates MCIAS tokens).
	validator, err := auth.NewMCIASValidator(cfg.MCIAS.ServerURL, cfg.MCIAS.CACert)
	if err != nil {
		return fmt.Errorf("create MCIAS validator: %w", err)
	}

	// gRPC server: TLS transport credentials plus auth on both unary and
	// streaming RPCs.
	server := grpc.NewServer(
		grpc.Creds(credentials.NewTLS(tlsConfig)),
		grpc.ChainUnaryInterceptor(
			auth.AuthInterceptor(validator),
		),
		grpc.ChainStreamInterceptor(
			auth.StreamAuthInterceptor(validator),
		),
	)
	mcpv1.RegisterMcpMasterServiceServer(server, m)

	// Listen.
	lis, err := net.Listen("tcp", cfg.Server.GRPCAddr)
	if err != nil {
		return fmt.Errorf("listen %q: %w", cfg.Server.GRPCAddr, err)
	}

	logger.Info("master starting",
		"addr", cfg.Server.GRPCAddr,
		"version", version,
		"nodes", len(cfg.Nodes),
	)

	// Signal handling: ctx is canceled on SIGINT/SIGTERM.
	ctx, stop := signal.NotifyContext(context.Background(), syscall.SIGINT, syscall.SIGTERM)
	defer stop()

	// Serve in a goroutine; buffered channel so the goroutine can exit
	// even if we return via the signal path and never read errCh.
	errCh := make(chan error, 1)
	go func() {
		errCh <- server.Serve(lis)
	}()

	// Block until either a shutdown signal arrives or Serve fails.
	select {
	case <-ctx.Done():
		logger.Info("shutting down")
		// Drain in-flight RPCs before closing agent connections.
		server.GracefulStop()
		pool.Close()
		return nil
	case err := <-errCh:
		pool.Close()
		return fmt.Errorf("serve: %w", err)
	}
}
|
|
|
|
func parseLogLevel(level string) slog.Level {
|
|
switch level {
|
|
case "debug":
|
|
return slog.LevelDebug
|
|
case "warn":
|
|
return slog.LevelWarn
|
|
case "error":
|
|
return slog.LevelError
|
|
default:
|
|
return slog.LevelInfo
|
|
}
|
|
}
|