4 Commits

Author SHA1 Message Date
f9f6f339f4 Add multi-address fallback for node connectivity
NodeConfig and MasterNodeConfig gain an optional addresses[] field
for fallback addresses tried in order after the primary address.
Provides resilience when Tailscale DNS is down or a node is only
reachable via LAN.

- dialAgentMulti: tries each address with a 3s health check, returns
  first success
- forEachNode: uses multi-address dialing
- AgentPool.AddNodeMulti: master tries all addresses when connecting
- AllAddresses(): deduplicates primary + fallback addresses

Config example:
  [[nodes]]
  name = "rift"
  address = "rift.scylla-hammerhead.ts.net:9444"
  addresses = ["100.95.252.120:9444", "192.168.88.181:9444"]

Existing configs without addresses[] work unchanged.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-04-03 09:45:50 -07:00
5da307cab5 Add Dockerfile and docker-master build target
Two-stage build: golang:1.25-alpine builder, alpine:3.21 runtime.
Produces a minimal container image for mcp-master.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-04-02 22:52:10 -07:00
22a836812f Add public, tier, node fields to ServiceDef
RouteDef gains Public field (bool) for edge routing. ServiceDef gains
Tier field. Node validation relaxed: defaults to tier=worker when both
node and tier are empty (v2 compatibility).

ToProto/FromProto updated to round-trip all new fields. Without this,
public=true in TOML was silently dropped and edge routing never triggered.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-04-02 22:42:00 -07:00
9918859705 Resolve node hostname to IP for DNS registration
Node addresses may be Tailscale DNS names (e.g., rift.scylla-hammerhead.ts.net:9444)
but MCNS needs an IPv4 address for A records. The master now resolves
the hostname via net.LookupHost before passing it to the DNS client.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-04-02 20:58:21 -07:00
12 changed files with 174 additions and 39 deletions

22
Dockerfile.master Normal file
View File

@@ -0,0 +1,22 @@
# Build stage: compile the mcp-master binary inside the Go toolchain image.
FROM golang:1.25-alpine AS builder
# VERSION is injected into the binary through -ldflags below; defaults to "dev"
# when no --build-arg is supplied.
ARG VERSION=dev
WORKDIR /build
# Copy only the module files and download dependencies before copying the
# rest of the tree, so the download layer can be reused while go.mod/go.sum
# are unchanged.
COPY go.mod go.sum ./
RUN go mod download
COPY . .
# Build with cgo disabled. -trimpath removes local file system paths from the
# binary, -s -w omit the symbol table and DWARF debug info, and -X sets
# main.version to VERSION.
RUN CGO_ENABLED=0 go build -trimpath -ldflags="-s -w -X main.version=${VERSION}" \
-o /mcp-master ./cmd/mcp-master
# Runtime stage: Alpine base plus the packages below and the compiled binary.
FROM alpine:3.21
# Install CA certificates and time zone data in the runtime image.
RUN apk add --no-cache ca-certificates tzdata
COPY --from=builder /mcp-master /usr/local/bin/mcp-master
WORKDIR /srv/mcp-master
# EXPOSE only documents the port; it does not publish it.
# NOTE(review): presumably 9555 is the master's listen port; confirm against
# the master configuration.
EXPOSE 9555
# Default invocation is `mcp-master server --config /srv/mcp-master/mcp-master.toml`;
# the CMD arguments are replaced by any arguments given at container start.
ENTRYPOINT ["mcp-master"]
CMD ["server", "--config", "/srv/mcp-master/mcp-master.toml"]

View File

@@ -32,6 +32,11 @@ proto-lint:
buf lint buf lint
buf breaking --against '.git#branch=master,subdir=proto' buf breaking --against '.git#branch=master,subdir=proto'
# docker-master builds the mcp-master container image from Dockerfile.master
# with podman. The output of `git describe --tags --always --dirty` is used
# both as the VERSION build argument and as the image tag.
docker-master:
podman build -f Dockerfile.master \
--build-arg VERSION=$(shell git describe --tags --always --dirty) \
-t mcr.svc.mcp.metacircular.net:8443/mcp-master:$(shell git describe --tags --always --dirty) .
clean: clean:
rm -f mcp mcp-agent mcp-master rm -f mcp mcp-agent mcp-master

View File

@@ -7,6 +7,7 @@ import (
"fmt" "fmt"
"os" "os"
"strings" "strings"
"time"
mcpv1 "git.wntrmute.dev/mc/mcp/gen/mcp/v1" mcpv1 "git.wntrmute.dev/mc/mcp/gen/mcp/v1"
"git.wntrmute.dev/mc/mcp/internal/config" "git.wntrmute.dev/mc/mcp/internal/config"
@@ -52,6 +53,38 @@ func dialAgent(address string, cfg *config.CLIConfig) (mcpv1.McpAgentServiceClie
return mcpv1.NewMcpAgentServiceClient(conn), conn, nil return mcpv1.NewMcpAgentServiceClient(conn), conn, nil
} }
// dialAgentMulti connects to an agent that may be reachable under several
// addresses, e.g. when Tailscale DNS is down or the node is only reachable
// over the LAN.
//
// Behaviour by number of addresses:
//   - none: an error is returned.
//   - one: the result of dialAgent is returned as-is, with no health probe.
//   - several: each address is dialed in order and probed with a NodeStatus
//     RPC bounded by a 3-second timeout. The first address that answers is
//     returned. A connection whose probe fails is closed before moving on.
//
// When every address fails, the returned error wraps only the failure of the
// last address tried.
func dialAgentMulti(addresses []string, cfg *config.CLIConfig) (mcpv1.McpAgentServiceClient, *grpc.ClientConn, error) {
	switch len(addresses) {
	case 0:
		return nil, nil, fmt.Errorf("no addresses to dial")
	case 1:
		return dialAgent(addresses[0], cfg)
	}

	var failure error
	for i := range addresses {
		target := addresses[i]

		client, conn, attemptErr := dialAgent(target, cfg)
		if attemptErr == nil {
			// Dialing succeeded; confirm the agent actually answers an RPC
			// before committing to this address.
			probeCtx, stop := context.WithTimeout(context.Background(), 3*time.Second)
			_, attemptErr = client.NodeStatus(probeCtx, &mcpv1.NodeStatusRequest{})
			stop()
			if attemptErr == nil {
				return client, conn, nil
			}
			_ = conn.Close()
		}

		// Either the dial or the probe failed; remember why and try the next.
		failure = fmt.Errorf("%s: %w", target, attemptErr)
	}

	return nil, nil, fmt.Errorf("all addresses failed, last error: %w", failure)
}
// dialMaster connects to the master at the given address and returns a gRPC // dialMaster connects to the master at the given address and returns a gRPC
// client for the McpMasterService. // client for the McpMasterService.
func dialMaster(address string, cfg *config.CLIConfig) (mcpv1.McpMasterServiceClient, *grpc.ClientConn, error) { func dialMaster(address string, cfg *config.CLIConfig) (mcpv1.McpMasterServiceClient, *grpc.ClientConn, error) {

View File

@@ -9,7 +9,7 @@ import (
) )
// findNodeAddress looks up a node by name in the CLI config and returns // findNodeAddress looks up a node by name in the CLI config and returns
// its address. // its primary address.
func findNodeAddress(cfg *config.CLIConfig, nodeName string) (string, error) { func findNodeAddress(cfg *config.CLIConfig, nodeName string) (string, error) {
for _, n := range cfg.Nodes { for _, n := range cfg.Nodes {
if n.Name == nodeName { if n.Name == nodeName {
@@ -19,6 +19,16 @@ func findNodeAddress(cfg *config.CLIConfig, nodeName string) (string, error) {
return "", fmt.Errorf("node %q not found in config", nodeName) return "", fmt.Errorf("node %q not found in config", nodeName)
} }
// findNode returns a pointer to the entry in cfg.Nodes whose Name equals
// nodeName. The pointer refers to the element inside the config's slice, not
// a copy. An error naming the node is returned when no entry matches.
func findNode(cfg *config.CLIConfig, nodeName string) (*config.NodeConfig, error) {
	for idx := range cfg.Nodes {
		candidate := &cfg.Nodes[idx]
		if candidate.Name != nodeName {
			continue
		}
		return candidate, nil
	}
	return nil, fmt.Errorf("node %q not found in config", nodeName)
}
// printComponentResults prints the result of each component operation. // printComponentResults prints the result of each component operation.
func printComponentResults(results []*mcpv1.ComponentResult) { func printComponentResults(results []*mcpv1.ComponentResult) {
for _, r := range results { for _, r := range results {

View File

@@ -27,7 +27,7 @@ func forEachNode(fn func(node config.NodeConfig, client mcpv1.McpAgentServiceCli
} }
for _, node := range cfg.Nodes { for _, node := range cfg.Nodes {
client, conn, err := dialAgent(node.Address, cfg) client, conn, err := dialAgentMulti(node.AllAddresses(), cfg)
if err != nil { if err != nil {
_, _ = fmt.Fprintf(os.Stderr, "warning: %s: %v\n", node.Name, err) _, _ = fmt.Fprintf(os.Stderr, "warning: %s: %v\n", node.Name, err)
continue continue
@@ -85,7 +85,7 @@ func psCmd() *cobra.Command {
Short: "Live check: query runtime on all agents", Short: "Live check: query runtime on all agents",
RunE: func(cmd *cobra.Command, args []string) error { RunE: func(cmd *cobra.Command, args []string) error {
w := newTable() w := newTable()
_, _ = fmt.Fprintln(w, "SERVICE\tCOMPONENT\tNODE\tSTATE\tVERSION\tUPTIME") _, _ = fmt.Fprintln(w, "SERVICE\tCOMPONENT\tNODE\tSTATE\tVERSION\tUPTIME\t")
now := time.Now() now := time.Now()
if err := forEachNode(func(node config.NodeConfig, client mcpv1.McpAgentServiceClient) error { if err := forEachNode(func(node config.NodeConfig, client mcpv1.McpAgentServiceClient) error {
@@ -96,19 +96,25 @@ func psCmd() *cobra.Command {
} }
for _, svc := range resp.GetServices() { for _, svc := range resp.GetServices() {
comment := svc.GetComment()
for _, comp := range svc.GetComponents() { for _, comp := range svc.GetComponents() {
uptime := "-" uptime := "-"
if comp.GetStarted() != nil { if comp.GetStarted() != nil {
d := now.Sub(comp.GetStarted().AsTime()) d := now.Sub(comp.GetStarted().AsTime())
uptime = formatDuration(d) uptime = formatDuration(d)
} }
_, _ = fmt.Fprintf(w, "%s\t%s\t%s\t%s\t%s\t%s\n", col7 := ""
if comment != "" {
col7 = "# " + comment
}
_, _ = fmt.Fprintf(w, "%s\t%s\t%s\t%s\t%s\t%s\t%s\n",
svc.GetName(), svc.GetName(),
comp.GetName(), comp.GetName(),
node.Name, node.Name,
comp.GetObservedState(), comp.GetObservedState(),
comp.GetVersion(), comp.GetVersion(),
uptime, uptime,
col7,
) )
} }
} }

View File

@@ -50,9 +50,28 @@ type AuthConfig struct {
} }
// NodeConfig defines a managed node that the CLI connects to. // NodeConfig defines a managed node that the CLI connects to.
// Address is the primary address. Addresses is an optional list of
// fallback addresses tried in order if the primary fails. This
// provides resilience when Tailscale DNS is down or a node is
// reachable via LAN but not Tailnet.
type NodeConfig struct { type NodeConfig struct {
Name string `toml:"name"` Name string `toml:"name"`
Address string `toml:"address"` Address string `toml:"address"`
Addresses []string `toml:"addresses,omitempty"`
}
// AllAddresses returns every address at which the node may be dialed:
// the primary Address first, followed by the entries of Addresses in
// their configured order. Empty strings are skipped and an address that
// has already been emitted is not repeated, so the first occurrence
// determines its position. The result is nil when the node has no
// non-empty address at all.
func (n NodeConfig) AllAddresses() []string {
seen := make(map[string]bool)
var addrs []string
for _, a := range append([]string{n.Address}, n.Addresses...) {
if a != "" && !seen[a] {
seen[a] = true
addrs = append(addrs, a)
}
}
return addrs
} }
// LoadCLIConfig reads and validates a CLI configuration file. // LoadCLIConfig reads and validates a CLI configuration file.

View File

@@ -64,9 +64,24 @@ type TimeoutsConfig struct {
type MasterNodeConfig struct { type MasterNodeConfig struct {
Name string `toml:"name"` Name string `toml:"name"`
Address string `toml:"address"` Address string `toml:"address"`
Addresses []string `toml:"addresses,omitempty"`
Role string `toml:"role"` // "worker", "edge", or "master" Role string `toml:"role"` // "worker", "edge", or "master"
} }
// AllAddresses lists every address at which the node may be dialed: the
// primary Address first, then the entries of Addresses in configured order.
// Empty strings are skipped and repeated addresses are emitted only once, at
// the position of their first occurrence. The result is nil when the node
// has no non-empty address.
func (n MasterNodeConfig) AllAddresses() []string {
	var unique []string
	visited := make(map[string]bool)

	// consider appends addr unless it is empty or has been emitted already.
	consider := func(addr string) {
		if addr == "" || visited[addr] {
			return
		}
		visited[addr] = true
		unique = append(unique, addr)
	}

	consider(n.Address)
	for _, fallback := range n.Addresses {
		consider(fallback)
	}
	return unique
}
// LoadMasterConfig reads and validates a master configuration file. // LoadMasterConfig reads and validates a master configuration file.
func LoadMasterConfig(path string) (*MasterConfig, error) { func LoadMasterConfig(path string) (*MasterConfig, error) {
data, err := os.ReadFile(path) //nolint:gosec // config path from trusted CLI flag data, err := os.ReadFile(path) //nolint:gosec // config path from trusted CLI flag

View File

@@ -140,21 +140,30 @@ func NewAgentPool(caCertPath, token string) *AgentPool {
// AddNode dials an agent and adds it to the pool. // AddNode dials an agent and adds it to the pool.
func (p *AgentPool) AddNode(name, address string) error { func (p *AgentPool) AddNode(name, address string) error {
client, err := DialAgent(address, p.caCert, p.token) return p.AddNodeMulti(name, []string{address})
}
// AddNodeMulti tries each address in order and adds the first successful
// connection to the pool.
func (p *AgentPool) AddNodeMulti(name string, addresses []string) error {
var lastErr error
for _, addr := range addresses {
client, err := DialAgent(addr, p.caCert, p.token)
if err != nil { if err != nil {
return fmt.Errorf("add node %s: %w", name, err) lastErr = fmt.Errorf("%s: %w", addr, err)
continue
} }
client.Node = name client.Node = name
p.mu.Lock() p.mu.Lock()
defer p.mu.Unlock()
// Close existing connection if re-adding.
if old, ok := p.clients[name]; ok { if old, ok := p.clients[name]; ok {
_ = old.Close() _ = old.Close()
} }
p.clients[name] = client p.clients[name] = client
p.mu.Unlock()
return nil return nil
}
return fmt.Errorf("add node %s: all addresses failed: %w", name, lastErr)
} }
// Get returns the agent client for a node. // Get returns the agent client for a node.

View File

@@ -51,12 +51,23 @@ func (m *Master) Deploy(ctx context.Context, req *mcpv1.MasterDeployRequest) (*m
return resp, nil return resp, nil
} }
// Parse the node's Tailnet IP from its address (host:port). // Resolve the node's address to an IP for DNS registration.
// Node addresses may be Tailscale DNS names (e.g., rift.scylla-hammerhead.ts.net:9444)
// but MCNS needs an IP address for A records.
nodeHost, _, err := net.SplitHostPort(node.Address) nodeHost, _, err := net.SplitHostPort(node.Address)
if err != nil { if err != nil {
resp.Error = fmt.Sprintf("invalid node address %q: %v", node.Address, err) resp.Error = fmt.Sprintf("invalid node address %q: %v", node.Address, err)
return resp, nil return resp, nil
} }
// If nodeHost is not an IP, resolve it.
if net.ParseIP(nodeHost) == nil {
ips, lookupErr := net.LookupHost(nodeHost)
if lookupErr != nil || len(ips) == 0 {
m.Logger.Warn("cannot resolve node address", "host", nodeHost, "err", lookupErr)
} else {
nodeHost = ips[0]
}
}
// Step 2: Forward deploy to the agent. // Step 2: Forward deploy to the agent.
client, err := m.Pool.Get(nodeName) client, err := m.Pool.Get(nodeName)

View File

@@ -63,7 +63,7 @@ func Run(cfg *config.MasterConfig, version string) error {
// Create agent connection pool. // Create agent connection pool.
pool := NewAgentPool(cfg.Master.CACert, token) pool := NewAgentPool(cfg.Master.CACert, token)
for _, n := range cfg.Nodes { for _, n := range cfg.Nodes {
if addErr := pool.AddNode(n.Name, n.Address); addErr != nil { if addErr := pool.AddNodeMulti(n.Name, n.AllAddresses()); addErr != nil {
logger.Warn("failed to connect to agent", "node", n.Name, "err", addErr) logger.Warn("failed to connect to agent", "node", n.Name, "err", addErr)
// Non-fatal: the node may come up later. // Non-fatal: the node may come up later.
} }

View File

@@ -16,8 +16,10 @@ import (
// ServiceDef is the top-level TOML structure for a service definition file. // ServiceDef is the top-level TOML structure for a service definition file.
type ServiceDef struct { type ServiceDef struct {
Name string `toml:"name"` Name string `toml:"name"`
Node string `toml:"node"` Node string `toml:"node,omitempty"`
Tier string `toml:"tier,omitempty"`
Active *bool `toml:"active,omitempty"` Active *bool `toml:"active,omitempty"`
Comment string `toml:"comment,omitempty"`
Path string `toml:"path,omitempty"` Path string `toml:"path,omitempty"`
Build *BuildDef `toml:"build,omitempty"` Build *BuildDef `toml:"build,omitempty"`
Components []ComponentDef `toml:"components"` Components []ComponentDef `toml:"components"`
@@ -36,6 +38,7 @@ type RouteDef struct {
Port int `toml:"port"` Port int `toml:"port"`
Mode string `toml:"mode,omitempty"` Mode string `toml:"mode,omitempty"`
Hostname string `toml:"hostname,omitempty"` Hostname string `toml:"hostname,omitempty"`
Public bool `toml:"public,omitempty"`
} }
// ComponentDef describes a single container component within a service. // ComponentDef describes a single container component within a service.
@@ -129,8 +132,9 @@ func validate(def *ServiceDef) error {
if def.Name == "" { if def.Name == "" {
return fmt.Errorf("service name is required") return fmt.Errorf("service name is required")
} }
if def.Node == "" { // v2: either node or tier must be set. Tier defaults to "worker" if both empty.
return fmt.Errorf("service node is required") if def.Node == "" && def.Tier == "" {
def.Tier = "worker"
} }
if len(def.Components) == 0 { if len(def.Components) == 0 {
return fmt.Errorf("service %q must have at least one component", def.Name) return fmt.Errorf("service %q must have at least one component", def.Name)
@@ -193,6 +197,9 @@ func ToProto(def *ServiceDef) *mcpv1.ServiceSpec {
spec := &mcpv1.ServiceSpec{ spec := &mcpv1.ServiceSpec{
Name: def.Name, Name: def.Name,
Active: def.Active != nil && *def.Active, Active: def.Active != nil && *def.Active,
Comment: def.Comment,
Tier: def.Tier,
Node: def.Node,
} }
for _, c := range def.Components { for _, c := range def.Components {
@@ -213,6 +220,7 @@ func ToProto(def *ServiceDef) *mcpv1.ServiceSpec {
Port: int32(r.Port), //nolint:gosec // port range validated Port: int32(r.Port), //nolint:gosec // port range validated
Mode: r.Mode, Mode: r.Mode,
Hostname: r.Hostname, Hostname: r.Hostname,
Public: r.Public,
}) })
} }
spec.Components = append(spec.Components, cs) spec.Components = append(spec.Components, cs)
@@ -229,7 +237,9 @@ func FromProto(spec *mcpv1.ServiceSpec, node string) *ServiceDef {
def := &ServiceDef{ def := &ServiceDef{
Name: spec.GetName(), Name: spec.GetName(),
Node: node, Node: node,
Tier: spec.GetTier(),
Active: &active, Active: &active,
Comment: spec.GetComment(),
} }
for _, c := range spec.GetComponents() { for _, c := range spec.GetComponents() {
@@ -250,6 +260,7 @@ func FromProto(spec *mcpv1.ServiceSpec, node string) *ServiceDef {
Port: int(r.GetPort()), Port: int(r.GetPort()),
Mode: r.GetMode(), Mode: r.GetMode(),
Hostname: r.GetHostname(), Hostname: r.GetHostname(),
Public: r.GetPublic(),
}) })
} }
def.Components = append(def.Components, cd) def.Components = append(def.Components, cd)

View File

@@ -119,14 +119,8 @@ func TestValidation(t *testing.T) {
}, },
wantErr: "service name is required", wantErr: "service name is required",
}, },
{ // v2: missing node no longer errors — defaults to tier=worker.
name: "missing node", // Tested separately in TestValidationNodeTierDefault.
def: &ServiceDef{
Name: "svc",
Components: []ComponentDef{{Name: "api", Image: "img:v1"}},
},
wantErr: "service node is required",
},
{ {
name: "empty components", name: "empty components",
def: &ServiceDef{ def: &ServiceDef{