Add multi-address fallback for node connectivity
NodeConfig and MasterNodeConfig gain an optional addresses[] field holding fallback addresses that are tried in order after the primary address. This provides resilience when Tailscale DNS is down or a node is only reachable via LAN.

- dialAgentMulti: when more than one address is configured, tries each address with a 3s health check and returns the first success; a single address is dialed directly without the health check
- forEachNode: uses multi-address dialing
- AgentPool.AddNodeMulti: master tries all addresses when connecting
- AllAddresses(): returns the primary address followed by the fallback addresses, with empty and duplicate entries removed

Config example:

    [[nodes]]
    name = "rift"
    address = "rift.scylla-hammerhead.ts.net:9444"
    addresses = ["100.95.252.120:9444", "192.168.88.181:9444"]

Existing configs without addresses[] work unchanged.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -7,6 +7,7 @@ import (
|
||||
"fmt"
|
||||
"os"
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
mcpv1 "git.wntrmute.dev/mc/mcp/gen/mcp/v1"
|
||||
"git.wntrmute.dev/mc/mcp/internal/config"
|
||||
@@ -52,6 +53,38 @@ func dialAgent(address string, cfg *config.CLIConfig) (mcpv1.McpAgentServiceClie
|
||||
return mcpv1.NewMcpAgentServiceClient(conn), conn, nil
|
||||
}
|
||||
|
||||
// dialAgentMulti tries each address in order and returns the first successful
|
||||
// connection. Provides resilience when Tailscale DNS is down or a node is
|
||||
// reachable via LAN but not Tailnet.
|
||||
func dialAgentMulti(addresses []string, cfg *config.CLIConfig) (mcpv1.McpAgentServiceClient, *grpc.ClientConn, error) {
|
||||
if len(addresses) == 0 {
|
||||
return nil, nil, fmt.Errorf("no addresses to dial")
|
||||
}
|
||||
if len(addresses) == 1 {
|
||||
return dialAgent(addresses[0], cfg)
|
||||
}
|
||||
|
||||
var lastErr error
|
||||
for _, addr := range addresses {
|
||||
client, conn, err := dialAgent(addr, cfg)
|
||||
if err != nil {
|
||||
lastErr = fmt.Errorf("%s: %w", addr, err)
|
||||
continue
|
||||
}
|
||||
// Quick health check to verify the connection actually works.
|
||||
ctx, cancel := context.WithTimeout(context.Background(), 3*time.Second)
|
||||
_, err = client.NodeStatus(ctx, &mcpv1.NodeStatusRequest{})
|
||||
cancel()
|
||||
if err != nil {
|
||||
_ = conn.Close()
|
||||
lastErr = fmt.Errorf("%s: %w", addr, err)
|
||||
continue
|
||||
}
|
||||
return client, conn, nil
|
||||
}
|
||||
return nil, nil, fmt.Errorf("all addresses failed, last error: %w", lastErr)
|
||||
}
|
||||
|
||||
// dialMaster connects to the master at the given address and returns a gRPC
|
||||
// client for the McpMasterService.
|
||||
func dialMaster(address string, cfg *config.CLIConfig) (mcpv1.McpMasterServiceClient, *grpc.ClientConn, error) {
|
||||
|
||||
@@ -9,7 +9,7 @@ import (
|
||||
)
|
||||
|
||||
// findNodeAddress looks up a node by name in the CLI config and returns
|
||||
// its address.
|
||||
// its primary address.
|
||||
func findNodeAddress(cfg *config.CLIConfig, nodeName string) (string, error) {
|
||||
for _, n := range cfg.Nodes {
|
||||
if n.Name == nodeName {
|
||||
@@ -19,6 +19,16 @@ func findNodeAddress(cfg *config.CLIConfig, nodeName string) (string, error) {
|
||||
return "", fmt.Errorf("node %q not found in config", nodeName)
|
||||
}
|
||||
|
||||
// findNode looks up a node by name in the CLI config.
|
||||
func findNode(cfg *config.CLIConfig, nodeName string) (*config.NodeConfig, error) {
|
||||
for i := range cfg.Nodes {
|
||||
if cfg.Nodes[i].Name == nodeName {
|
||||
return &cfg.Nodes[i], nil
|
||||
}
|
||||
}
|
||||
return nil, fmt.Errorf("node %q not found in config", nodeName)
|
||||
}
|
||||
|
||||
// printComponentResults prints the result of each component operation.
|
||||
func printComponentResults(results []*mcpv1.ComponentResult) {
|
||||
for _, r := range results {
|
||||
|
||||
@@ -27,7 +27,7 @@ func forEachNode(fn func(node config.NodeConfig, client mcpv1.McpAgentServiceCli
|
||||
}
|
||||
|
||||
for _, node := range cfg.Nodes {
|
||||
client, conn, err := dialAgent(node.Address, cfg)
|
||||
client, conn, err := dialAgentMulti(node.AllAddresses(), cfg)
|
||||
if err != nil {
|
||||
_, _ = fmt.Fprintf(os.Stderr, "warning: %s: %v\n", node.Name, err)
|
||||
continue
|
||||
@@ -85,7 +85,7 @@ func psCmd() *cobra.Command {
|
||||
Short: "Live check: query runtime on all agents",
|
||||
RunE: func(cmd *cobra.Command, args []string) error {
|
||||
w := newTable()
|
||||
_, _ = fmt.Fprintln(w, "SERVICE\tCOMPONENT\tNODE\tSTATE\tVERSION\tUPTIME")
|
||||
_, _ = fmt.Fprintln(w, "SERVICE\tCOMPONENT\tNODE\tSTATE\tVERSION\tUPTIME\t")
|
||||
|
||||
now := time.Now()
|
||||
if err := forEachNode(func(node config.NodeConfig, client mcpv1.McpAgentServiceClient) error {
|
||||
@@ -96,19 +96,25 @@ func psCmd() *cobra.Command {
|
||||
}
|
||||
|
||||
for _, svc := range resp.GetServices() {
|
||||
comment := svc.GetComment()
|
||||
for _, comp := range svc.GetComponents() {
|
||||
uptime := "-"
|
||||
if comp.GetStarted() != nil {
|
||||
d := now.Sub(comp.GetStarted().AsTime())
|
||||
uptime = formatDuration(d)
|
||||
}
|
||||
_, _ = fmt.Fprintf(w, "%s\t%s\t%s\t%s\t%s\t%s\n",
|
||||
col7 := ""
|
||||
if comment != "" {
|
||||
col7 = "# " + comment
|
||||
}
|
||||
_, _ = fmt.Fprintf(w, "%s\t%s\t%s\t%s\t%s\t%s\t%s\n",
|
||||
svc.GetName(),
|
||||
comp.GetName(),
|
||||
node.Name,
|
||||
comp.GetObservedState(),
|
||||
comp.GetVersion(),
|
||||
uptime,
|
||||
col7,
|
||||
)
|
||||
}
|
||||
}
|
||||
|
||||
@@ -50,9 +50,28 @@ type AuthConfig struct {
|
||||
}
|
||||
|
||||
// NodeConfig defines a managed node that the CLI connects to.
|
||||
// Address is the primary address. Addresses is an optional list of
|
||||
// fallback addresses tried in order if the primary fails. This
|
||||
// provides resilience when Tailscale DNS is down or a node is
|
||||
// reachable via LAN but not Tailnet.
|
||||
type NodeConfig struct {
|
||||
Name string `toml:"name"`
|
||||
Address string `toml:"address"`
|
||||
Name string `toml:"name"`
|
||||
Address string `toml:"address"`
|
||||
Addresses []string `toml:"addresses,omitempty"`
|
||||
}
|
||||
|
||||
// AllAddresses returns the node's primary address followed by any
|
||||
// fallback addresses, deduplicated.
|
||||
func (n NodeConfig) AllAddresses() []string {
|
||||
seen := make(map[string]bool)
|
||||
var addrs []string
|
||||
for _, a := range append([]string{n.Address}, n.Addresses...) {
|
||||
if a != "" && !seen[a] {
|
||||
seen[a] = true
|
||||
addrs = append(addrs, a)
|
||||
}
|
||||
}
|
||||
return addrs
|
||||
}
|
||||
|
||||
// LoadCLIConfig reads and validates a CLI configuration file.
|
||||
|
||||
@@ -62,9 +62,24 @@ type TimeoutsConfig struct {
|
||||
|
||||
// MasterNodeConfig is a bootstrap node entry in the master config.
|
||||
type MasterNodeConfig struct {
|
||||
Name string `toml:"name"`
|
||||
Address string `toml:"address"`
|
||||
Role string `toml:"role"` // "worker", "edge", or "master"
|
||||
Name string `toml:"name"`
|
||||
Address string `toml:"address"`
|
||||
Addresses []string `toml:"addresses,omitempty"`
|
||||
Role string `toml:"role"` // "worker", "edge", or "master"
|
||||
}
|
||||
|
||||
// AllAddresses returns the node's primary address followed by any
|
||||
// fallback addresses, deduplicated.
|
||||
func (n MasterNodeConfig) AllAddresses() []string {
|
||||
seen := make(map[string]bool)
|
||||
var addrs []string
|
||||
for _, a := range append([]string{n.Address}, n.Addresses...) {
|
||||
if a != "" && !seen[a] {
|
||||
seen[a] = true
|
||||
addrs = append(addrs, a)
|
||||
}
|
||||
}
|
||||
return addrs
|
||||
}
|
||||
|
||||
// LoadMasterConfig reads and validates a master configuration file.
|
||||
|
||||
@@ -140,21 +140,30 @@ func NewAgentPool(caCertPath, token string) *AgentPool {
|
||||
|
||||
// AddNode dials an agent and adds it to the pool.
|
||||
func (p *AgentPool) AddNode(name, address string) error {
|
||||
client, err := DialAgent(address, p.caCert, p.token)
|
||||
if err != nil {
|
||||
return fmt.Errorf("add node %s: %w", name, err)
|
||||
}
|
||||
client.Node = name
|
||||
return p.AddNodeMulti(name, []string{address})
|
||||
}
|
||||
|
||||
p.mu.Lock()
|
||||
defer p.mu.Unlock()
|
||||
// AddNodeMulti tries each address in order and adds the first successful
|
||||
// connection to the pool.
|
||||
func (p *AgentPool) AddNodeMulti(name string, addresses []string) error {
|
||||
var lastErr error
|
||||
for _, addr := range addresses {
|
||||
client, err := DialAgent(addr, p.caCert, p.token)
|
||||
if err != nil {
|
||||
lastErr = fmt.Errorf("%s: %w", addr, err)
|
||||
continue
|
||||
}
|
||||
client.Node = name
|
||||
|
||||
// Close existing connection if re-adding.
|
||||
if old, ok := p.clients[name]; ok {
|
||||
_ = old.Close()
|
||||
p.mu.Lock()
|
||||
if old, ok := p.clients[name]; ok {
|
||||
_ = old.Close()
|
||||
}
|
||||
p.clients[name] = client
|
||||
p.mu.Unlock()
|
||||
return nil
|
||||
}
|
||||
p.clients[name] = client
|
||||
return nil
|
||||
return fmt.Errorf("add node %s: all addresses failed: %w", name, lastErr)
|
||||
}
|
||||
|
||||
// Get returns the agent client for a node.
|
||||
|
||||
@@ -63,7 +63,7 @@ func Run(cfg *config.MasterConfig, version string) error {
|
||||
// Create agent connection pool.
|
||||
pool := NewAgentPool(cfg.Master.CACert, token)
|
||||
for _, n := range cfg.Nodes {
|
||||
if addErr := pool.AddNode(n.Name, n.Address); addErr != nil {
|
||||
if addErr := pool.AddNodeMulti(n.Name, n.AllAddresses()); addErr != nil {
|
||||
logger.Warn("failed to connect to agent", "node", n.Name, "err", addErr)
|
||||
// Non-fatal: the node may come up later.
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user