From f9f6f339f41c29ca81560f12013243b1550e6637 Mon Sep 17 00:00:00 2001 From: Kyle Isom Date: Fri, 3 Apr 2026 09:45:50 -0700 Subject: [PATCH] Add multi-address fallback for node connectivity NodeConfig and MasterNodeConfig gain an optional addresses[] field for fallback addresses tried in order after the primary address. Provides resilience when Tailscale DNS is down or a node is only reachable via LAN. - dialAgentMulti: tries each address in order with a 3s NodeStatus health check and returns the first success (a single address is dialed directly, without the health check) - forEachNode: uses multi-address dialing - AgentPool.AddNodeMulti: master tries each address in order when connecting and keeps the first successful dial (no health check) - AllAddresses(): returns primary + fallback addresses, deduplicated, skipping empty entries - psCmd: adds a trailing column showing the service comment (prefixed with "# ") when one is set Config example: [[nodes]] name = "rift" address = "rift.scylla-hammerhead.ts.net:9444" addresses = ["100.95.252.120:9444", "192.168.88.181:9444"] Existing configs without addresses[] work unchanged. Co-Authored-By: Claude Opus 4.6 (1M context) --- cmd/mcp/dial.go | 33 +++++++++++++++++++++++++++++++++ cmd/mcp/helpers.go | 12 +++++++++++- cmd/mcp/status.go | 12 +++++++++--- internal/config/cli.go | 23 +++++++++++++++++++++-- internal/config/master.go | 21 ++++++++++++++++++--- internal/master/agentclient.go | 33 +++++++++++++++++++++------------ internal/master/master.go | 2 +- 7 files changed, 114 insertions(+), 22 deletions(-) diff --git a/cmd/mcp/dial.go b/cmd/mcp/dial.go index 8f23818..5b3d254 100644 --- a/cmd/mcp/dial.go +++ b/cmd/mcp/dial.go @@ -7,6 +7,7 @@ import ( "fmt" "os" "strings" + "time" mcpv1 "git.wntrmute.dev/mc/mcp/gen/mcp/v1" "git.wntrmute.dev/mc/mcp/internal/config" @@ -52,6 +53,38 @@ func dialAgent(address string, cfg *config.CLIConfig) (mcpv1.McpAgentServiceClie return mcpv1.NewMcpAgentServiceClient(conn), conn, nil } +// dialAgentMulti tries each address in order and returns the first successful +// connection. Provides resilience when Tailscale DNS is down or a node is +// reachable via LAN but not Tailnet. 
+func dialAgentMulti(addresses []string, cfg *config.CLIConfig) (mcpv1.McpAgentServiceClient, *grpc.ClientConn, error) { + if len(addresses) == 0 { + return nil, nil, fmt.Errorf("no addresses to dial") + } + if len(addresses) == 1 { + return dialAgent(addresses[0], cfg) + } + + var lastErr error + for _, addr := range addresses { + client, conn, err := dialAgent(addr, cfg) + if err != nil { + lastErr = fmt.Errorf("%s: %w", addr, err) + continue + } + // Quick health check to verify the connection actually works. + ctx, cancel := context.WithTimeout(context.Background(), 3*time.Second) + _, err = client.NodeStatus(ctx, &mcpv1.NodeStatusRequest{}) + cancel() + if err != nil { + _ = conn.Close() + lastErr = fmt.Errorf("%s: %w", addr, err) + continue + } + return client, conn, nil + } + return nil, nil, fmt.Errorf("all addresses failed, last error: %w", lastErr) +} + // dialMaster connects to the master at the given address and returns a gRPC // client for the McpMasterService. func dialMaster(address string, cfg *config.CLIConfig) (mcpv1.McpMasterServiceClient, *grpc.ClientConn, error) { diff --git a/cmd/mcp/helpers.go b/cmd/mcp/helpers.go index fde01b2..25e49eb 100644 --- a/cmd/mcp/helpers.go +++ b/cmd/mcp/helpers.go @@ -9,7 +9,7 @@ import ( ) // findNodeAddress looks up a node by name in the CLI config and returns -// its address. +// its primary address. func findNodeAddress(cfg *config.CLIConfig, nodeName string) (string, error) { for _, n := range cfg.Nodes { if n.Name == nodeName { @@ -19,6 +19,16 @@ func findNodeAddress(cfg *config.CLIConfig, nodeName string) (string, error) { return "", fmt.Errorf("node %q not found in config", nodeName) } +// findNode looks up a node by name in the CLI config. 
+func findNode(cfg *config.CLIConfig, nodeName string) (*config.NodeConfig, error) { + for i := range cfg.Nodes { + if cfg.Nodes[i].Name == nodeName { + return &cfg.Nodes[i], nil + } + } + return nil, fmt.Errorf("node %q not found in config", nodeName) +} + // printComponentResults prints the result of each component operation. func printComponentResults(results []*mcpv1.ComponentResult) { for _, r := range results { diff --git a/cmd/mcp/status.go b/cmd/mcp/status.go index 64f2909..a93ba32 100644 --- a/cmd/mcp/status.go +++ b/cmd/mcp/status.go @@ -27,7 +27,7 @@ func forEachNode(fn func(node config.NodeConfig, client mcpv1.McpAgentServiceCli } for _, node := range cfg.Nodes { - client, conn, err := dialAgent(node.Address, cfg) + client, conn, err := dialAgentMulti(node.AllAddresses(), cfg) if err != nil { _, _ = fmt.Fprintf(os.Stderr, "warning: %s: %v\n", node.Name, err) continue @@ -85,7 +85,7 @@ func psCmd() *cobra.Command { Short: "Live check: query runtime on all agents", RunE: func(cmd *cobra.Command, args []string) error { w := newTable() - _, _ = fmt.Fprintln(w, "SERVICE\tCOMPONENT\tNODE\tSTATE\tVERSION\tUPTIME") + _, _ = fmt.Fprintln(w, "SERVICE\tCOMPONENT\tNODE\tSTATE\tVERSION\tUPTIME\t") now := time.Now() if err := forEachNode(func(node config.NodeConfig, client mcpv1.McpAgentServiceClient) error { @@ -96,19 +96,25 @@ func psCmd() *cobra.Command { } for _, svc := range resp.GetServices() { + comment := svc.GetComment() for _, comp := range svc.GetComponents() { uptime := "-" if comp.GetStarted() != nil { d := now.Sub(comp.GetStarted().AsTime()) uptime = formatDuration(d) } - _, _ = fmt.Fprintf(w, "%s\t%s\t%s\t%s\t%s\t%s\n", + col7 := "" + if comment != "" { + col7 = "# " + comment + } + _, _ = fmt.Fprintf(w, "%s\t%s\t%s\t%s\t%s\t%s\t%s\n", svc.GetName(), comp.GetName(), node.Name, comp.GetObservedState(), comp.GetVersion(), uptime, + col7, ) } } diff --git a/internal/config/cli.go b/internal/config/cli.go index 021a2af..9956a50 100644 --- 
a/internal/config/cli.go +++ b/internal/config/cli.go @@ -50,9 +50,28 @@ type AuthConfig struct { } // NodeConfig defines a managed node that the CLI connects to. +// Address is the primary address. Addresses is an optional list of +// fallback addresses tried in order if the primary fails. This +// provides resilience when Tailscale DNS is down or a node is +// reachable via LAN but not Tailnet. type NodeConfig struct { - Name string `toml:"name"` - Address string `toml:"address"` + Name string `toml:"name"` + Address string `toml:"address"` + Addresses []string `toml:"addresses,omitempty"` +} + +// AllAddresses returns the node's primary address followed by any +// fallback addresses, deduplicated. +func (n NodeConfig) AllAddresses() []string { + seen := make(map[string]bool) + var addrs []string + for _, a := range append([]string{n.Address}, n.Addresses...) { + if a != "" && !seen[a] { + seen[a] = true + addrs = append(addrs, a) + } + } + return addrs } // LoadCLIConfig reads and validates a CLI configuration file. diff --git a/internal/config/master.go b/internal/config/master.go index 231ac12..71af3fe 100644 --- a/internal/config/master.go +++ b/internal/config/master.go @@ -62,9 +62,24 @@ type TimeoutsConfig struct { // MasterNodeConfig is a bootstrap node entry in the master config. type MasterNodeConfig struct { - Name string `toml:"name"` - Address string `toml:"address"` - Role string `toml:"role"` // "worker", "edge", or "master" + Name string `toml:"name"` + Address string `toml:"address"` + Addresses []string `toml:"addresses,omitempty"` + Role string `toml:"role"` // "worker", "edge", or "master" +} + +// AllAddresses returns the node's primary address followed by any +// fallback addresses, deduplicated. +func (n MasterNodeConfig) AllAddresses() []string { + seen := make(map[string]bool) + var addrs []string + for _, a := range append([]string{n.Address}, n.Addresses...) 
{ + if a != "" && !seen[a] { + seen[a] = true + addrs = append(addrs, a) + } + } + return addrs } // LoadMasterConfig reads and validates a master configuration file. diff --git a/internal/master/agentclient.go b/internal/master/agentclient.go index 1834aa0..f00e317 100644 --- a/internal/master/agentclient.go +++ b/internal/master/agentclient.go @@ -140,21 +140,30 @@ func NewAgentPool(caCertPath, token string) *AgentPool { // AddNode dials an agent and adds it to the pool. func (p *AgentPool) AddNode(name, address string) error { - client, err := DialAgent(address, p.caCert, p.token) - if err != nil { - return fmt.Errorf("add node %s: %w", name, err) - } - client.Node = name + return p.AddNodeMulti(name, []string{address}) +} - p.mu.Lock() - defer p.mu.Unlock() +// AddNodeMulti tries each address in order and adds the first successful +// connection to the pool. +func (p *AgentPool) AddNodeMulti(name string, addresses []string) error { + var lastErr error + for _, addr := range addresses { + client, err := DialAgent(addr, p.caCert, p.token) + if err != nil { + lastErr = fmt.Errorf("%s: %w", addr, err) + continue + } + client.Node = name - // Close existing connection if re-adding. - if old, ok := p.clients[name]; ok { - _ = old.Close() + p.mu.Lock() + if old, ok := p.clients[name]; ok { + _ = old.Close() + } + p.clients[name] = client + p.mu.Unlock() + return nil } - p.clients[name] = client - return nil + return fmt.Errorf("add node %s: all addresses failed: %w", name, lastErr) } // Get returns the agent client for a node. diff --git a/internal/master/master.go b/internal/master/master.go index e25dafd..247a65e 100644 --- a/internal/master/master.go +++ b/internal/master/master.go @@ -63,7 +63,7 @@ func Run(cfg *config.MasterConfig, version string) error { // Create agent connection pool. 
pool := NewAgentPool(cfg.Master.CACert, token) for _, n := range cfg.Nodes { - if addErr := pool.AddNode(n.Name, n.Address); addErr != nil { + if addErr := pool.AddNodeMulti(n.Name, n.AllAddresses()); addErr != nil { logger.Warn("failed to connect to agent", "node", n.Name, "err", addErr) // Non-fatal: the node may come up later. }