Add multi-address fallback for node connectivity

NodeConfig and MasterNodeConfig gain an optional addresses[] field
for fallback addresses tried in order after the primary address.
Provides resilience when Tailscale DNS is down or a node is only
reachable via LAN.

- dialAgentMulti: tries each address in order with a 3s health check and
  returns the first success (a single address is dialed directly, with no
  health check)
- forEachNode: uses multi-address dialing
- AgentPool.AddNodeMulti: master tries all addresses when connecting
- AllAddresses(): deduplicates primary + fallback addresses

Config example:
  [[nodes]]
  name = "rift"
  address = "rift.scylla-hammerhead.ts.net:9444"
  addresses = ["100.95.252.120:9444", "192.168.88.181:9444"]

Existing configs without addresses[] work unchanged.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
2026-04-03 09:45:50 -07:00
parent 5da307cab5
commit f9f6f339f4
7 changed files with 114 additions and 22 deletions

View File

@@ -7,6 +7,7 @@ import (
"fmt"
"os"
"strings"
"time"
mcpv1 "git.wntrmute.dev/mc/mcp/gen/mcp/v1"
"git.wntrmute.dev/mc/mcp/internal/config"
@@ -52,6 +53,38 @@ func dialAgent(address string, cfg *config.CLIConfig) (mcpv1.McpAgentServiceClie
return mcpv1.NewMcpAgentServiceClient(conn), conn, nil
}
// dialAgentMulti walks the candidate addresses in order and hands back the
// client and connection for the first one that answers a NodeStatus probe.
// This keeps a node usable when Tailscale DNS is down or when the node is
// reachable over the LAN but not the Tailnet.
//
// An empty list is an error. A list with exactly one entry is dialed directly
// via dialAgent, without the probe. When every candidate fails, the returned
// error wraps only the failure from the last address attempted.
func dialAgentMulti(addresses []string, cfg *config.CLIConfig) (mcpv1.McpAgentServiceClient, *grpc.ClientConn, error) {
	switch len(addresses) {
	case 0:
		return nil, nil, fmt.Errorf("no addresses to dial")
	case 1:
		// Nothing to fall back to, so skip the probe and dial as-is.
		return dialAgent(addresses[0], cfg)
	}

	var lastErr error
	for i := range addresses {
		target := addresses[i]

		client, conn, dialErr := dialAgent(target, cfg)
		if dialErr != nil {
			lastErr = fmt.Errorf("%s: %w", target, dialErr)
			continue
		}

		// Probe with a short deadline so an unusable address is detected
		// here and the next candidate gets its turn.
		probeCtx, stop := context.WithTimeout(context.Background(), 3*time.Second)
		_, probeErr := client.NodeStatus(probeCtx, &mcpv1.NodeStatusRequest{})
		stop()
		if probeErr == nil {
			return client, conn, nil
		}

		// The probe failed: release this connection before moving on.
		_ = conn.Close()
		lastErr = fmt.Errorf("%s: %w", target, probeErr)
	}

	return nil, nil, fmt.Errorf("all addresses failed, last error: %w", lastErr)
}
// dialMaster connects to the master at the given address and returns a gRPC
// client for the McpMasterService.
func dialMaster(address string, cfg *config.CLIConfig) (mcpv1.McpMasterServiceClient, *grpc.ClientConn, error) {

View File

@@ -9,7 +9,7 @@ import (
)
// findNodeAddress looks up a node by name in the CLI config and returns
// its address.
// its primary address.
func findNodeAddress(cfg *config.CLIConfig, nodeName string) (string, error) {
for _, n := range cfg.Nodes {
if n.Name == nodeName {
@@ -19,6 +19,16 @@ func findNodeAddress(cfg *config.CLIConfig, nodeName string) (string, error) {
return "", fmt.Errorf("node %q not found in config", nodeName)
}
// findNode returns a pointer to the entry in cfg.Nodes whose Name equals
// nodeName. The pointer refers to the element stored in cfg.Nodes, not a
// copy. If no entry matches, it returns a "not found" error naming the node.
func findNode(cfg *config.CLIConfig, nodeName string) (*config.NodeConfig, error) {
	for idx := 0; idx < len(cfg.Nodes); idx++ {
		candidate := &cfg.Nodes[idx]
		if candidate.Name != nodeName {
			continue
		}
		return candidate, nil
	}
	return nil, fmt.Errorf("node %q not found in config", nodeName)
}
// printComponentResults prints the result of each component operation.
func printComponentResults(results []*mcpv1.ComponentResult) {
for _, r := range results {

View File

@@ -27,7 +27,7 @@ func forEachNode(fn func(node config.NodeConfig, client mcpv1.McpAgentServiceCli
}
for _, node := range cfg.Nodes {
client, conn, err := dialAgent(node.Address, cfg)
client, conn, err := dialAgentMulti(node.AllAddresses(), cfg)
if err != nil {
_, _ = fmt.Fprintf(os.Stderr, "warning: %s: %v\n", node.Name, err)
continue
@@ -85,7 +85,7 @@ func psCmd() *cobra.Command {
Short: "Live check: query runtime on all agents",
RunE: func(cmd *cobra.Command, args []string) error {
w := newTable()
_, _ = fmt.Fprintln(w, "SERVICE\tCOMPONENT\tNODE\tSTATE\tVERSION\tUPTIME")
_, _ = fmt.Fprintln(w, "SERVICE\tCOMPONENT\tNODE\tSTATE\tVERSION\tUPTIME\t")
now := time.Now()
if err := forEachNode(func(node config.NodeConfig, client mcpv1.McpAgentServiceClient) error {
@@ -96,19 +96,25 @@ func psCmd() *cobra.Command {
}
for _, svc := range resp.GetServices() {
comment := svc.GetComment()
for _, comp := range svc.GetComponents() {
uptime := "-"
if comp.GetStarted() != nil {
d := now.Sub(comp.GetStarted().AsTime())
uptime = formatDuration(d)
}
_, _ = fmt.Fprintf(w, "%s\t%s\t%s\t%s\t%s\t%s\n",
col7 := ""
if comment != "" {
col7 = "# " + comment
}
_, _ = fmt.Fprintf(w, "%s\t%s\t%s\t%s\t%s\t%s\t%s\n",
svc.GetName(),
comp.GetName(),
node.Name,
comp.GetObservedState(),
comp.GetVersion(),
uptime,
col7,
)
}
}