Add agent registration, heartbeats, and monitoring (Phase 4)
Master side:
- Register RPC: identity-bound (agent-rift → rift), allowlist check, max nodes limit, upserts node in registry, updates agent pool
- Heartbeat RPC: derives node name from MCIAS identity (not request), updates container count and last-heartbeat timestamp
- HeartbeatMonitor: background goroutine checks for missed heartbeats (90s threshold), probes agents via HealthCheck, marks them unhealthy

Agent side:
- HeartbeatClient: connects to master via env vars (MCP_MASTER_ADDRESS, MCP_MASTER_CA_CERT, MCP_MASTER_TOKEN_PATH), registers on startup with exponential backoff, sends heartbeats every 30s

Proto: added Register and Heartbeat RPCs + messages to master.proto.

Architecture v2 Phase 4.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
183
internal/agent/heartbeat.go
Normal file
183
internal/agent/heartbeat.go
Normal file
@@ -0,0 +1,183 @@
|
||||
package agent
|
||||
|
||||
import (
|
||||
"context"
|
||||
"crypto/tls"
|
||||
"crypto/x509"
|
||||
"fmt"
|
||||
"os"
|
||||
"runtime"
|
||||
"strings"
|
||||
"sync"
|
||||
"time"
|
||||
|
||||
mcpv1 "git.wntrmute.dev/mc/mcp/gen/mcp/v1"
|
||||
"git.wntrmute.dev/mc/mcp/internal/config"
|
||||
"google.golang.org/grpc"
|
||||
"google.golang.org/grpc/credentials"
|
||||
"google.golang.org/grpc/metadata"
|
||||
)
|
||||
|
||||
// MasterConfig describes the agent's optional connection to the master.
// When it is configured, the agent self-registers and sends periodic
// heartbeats.
type MasterConfig struct {
	// Address is the master's gRPC address.
	Address string `toml:"address"`

	// CACert is the CA certificate used to verify the master's TLS.
	CACert string `toml:"ca_cert"`

	// TokenPath is the MCIAS service token used for authentication.
	TokenPath string `toml:"token_path"`
}
|
||||
|
||||
// HeartbeatClient manages the agent's connection to the master for
|
||||
// registration and heartbeats.
|
||||
type HeartbeatClient struct {
|
||||
client mcpv1.McpMasterServiceClient
|
||||
conn *grpc.ClientConn
|
||||
nodeName string
|
||||
role string
|
||||
address string // agent's own gRPC address
|
||||
arch string
|
||||
interval time.Duration
|
||||
stop chan struct{}
|
||||
wg sync.WaitGroup
|
||||
logger interface{ Info(string, ...any); Warn(string, ...any); Error(string, ...any) }
|
||||
}
|
||||
|
||||
// NewHeartbeatClient creates a client that registers with the master and
|
||||
// sends periodic heartbeats. Returns nil if master address is not configured.
|
||||
func NewHeartbeatClient(cfg config.AgentConfig, logger interface{ Info(string, ...any); Warn(string, ...any); Error(string, ...any) }) (*HeartbeatClient, error) {
|
||||
masterAddr := os.Getenv("MCP_MASTER_ADDRESS")
|
||||
masterCACert := os.Getenv("MCP_MASTER_CA_CERT")
|
||||
masterToken := os.Getenv("MCP_MASTER_TOKEN_PATH")
|
||||
|
||||
if masterAddr == "" {
|
||||
return nil, nil // master not configured
|
||||
}
|
||||
|
||||
token := ""
|
||||
if masterToken != "" {
|
||||
data, err := os.ReadFile(masterToken) //nolint:gosec // trusted config
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("read master token: %w", err)
|
||||
}
|
||||
token = strings.TrimSpace(string(data))
|
||||
}
|
||||
|
||||
tlsConfig := &tls.Config{MinVersion: tls.VersionTLS13}
|
||||
if masterCACert != "" {
|
||||
caCert, err := os.ReadFile(masterCACert) //nolint:gosec // trusted config
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("read master CA cert: %w", err)
|
||||
}
|
||||
pool := x509.NewCertPool()
|
||||
if !pool.AppendCertsFromPEM(caCert) {
|
||||
return nil, fmt.Errorf("invalid master CA cert")
|
||||
}
|
||||
tlsConfig.RootCAs = pool
|
||||
}
|
||||
|
||||
conn, err := grpc.NewClient(
|
||||
masterAddr,
|
||||
grpc.WithTransportCredentials(credentials.NewTLS(tlsConfig)),
|
||||
grpc.WithUnaryInterceptor(func(ctx context.Context, method string, req, reply any, cc *grpc.ClientConn, invoker grpc.UnaryInvoker, opts ...grpc.CallOption) error {
|
||||
if token != "" {
|
||||
ctx = metadata.AppendToOutgoingContext(ctx, "authorization", "Bearer "+token)
|
||||
}
|
||||
return invoker(ctx, method, req, reply, cc, opts...)
|
||||
}),
|
||||
)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("dial master: %w", err)
|
||||
}
|
||||
|
||||
return &HeartbeatClient{
|
||||
client: mcpv1.NewMcpMasterServiceClient(conn),
|
||||
conn: conn,
|
||||
nodeName: cfg.Agent.NodeName,
|
||||
role: "worker", // default; master node sets this via config
|
||||
address: cfg.Server.GRPCAddr,
|
||||
arch: runtime.GOARCH,
|
||||
interval: 30 * time.Second,
|
||||
stop: make(chan struct{}),
|
||||
logger: logger,
|
||||
}, nil
|
||||
}
|
||||
|
||||
// Start registers with the master and begins the heartbeat loop.
|
||||
func (hc *HeartbeatClient) Start() {
|
||||
if hc == nil {
|
||||
return
|
||||
}
|
||||
|
||||
// Register with the master (retry with backoff).
|
||||
hc.wg.Add(1)
|
||||
go func() {
|
||||
defer hc.wg.Done()
|
||||
|
||||
backoff := time.Second
|
||||
for {
|
||||
ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
|
||||
resp, err := hc.client.Register(ctx, &mcpv1.RegisterRequest{
|
||||
Name: hc.nodeName,
|
||||
Role: hc.role,
|
||||
Address: hc.address,
|
||||
Arch: hc.arch,
|
||||
})
|
||||
cancel()
|
||||
|
||||
if err == nil && resp.GetAccepted() {
|
||||
hc.logger.Info("registered with master",
|
||||
"node", hc.nodeName, "master_accepted", true)
|
||||
break
|
||||
}
|
||||
|
||||
hc.logger.Warn("registration failed, retrying",
|
||||
"node", hc.nodeName, "err", err, "backoff", backoff)
|
||||
|
||||
select {
|
||||
case <-hc.stop:
|
||||
return
|
||||
case <-time.After(backoff):
|
||||
}
|
||||
|
||||
backoff *= 2
|
||||
if backoff > 60*time.Second {
|
||||
backoff = 60 * time.Second
|
||||
}
|
||||
}
|
||||
|
||||
// Heartbeat loop.
|
||||
ticker := time.NewTicker(hc.interval)
|
||||
defer ticker.Stop()
|
||||
|
||||
for {
|
||||
select {
|
||||
case <-hc.stop:
|
||||
return
|
||||
case <-ticker.C:
|
||||
hc.sendHeartbeat()
|
||||
}
|
||||
}
|
||||
}()
|
||||
}
|
||||
|
||||
func (hc *HeartbeatClient) sendHeartbeat() {
|
||||
ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
|
||||
defer cancel()
|
||||
|
||||
_, err := hc.client.Heartbeat(ctx, &mcpv1.HeartbeatRequest{
|
||||
Name: hc.nodeName,
|
||||
Containers: 0, // TODO: count from runtime
|
||||
})
|
||||
if err != nil {
|
||||
hc.logger.Warn("heartbeat failed", "node", hc.nodeName, "err", err)
|
||||
}
|
||||
}
|
||||
|
||||
// Stop stops the heartbeat loop and closes the master connection.
|
||||
func (hc *HeartbeatClient) Stop() {
|
||||
if hc == nil {
|
||||
return
|
||||
}
|
||||
close(hc.stop)
|
||||
hc.wg.Wait()
|
||||
_ = hc.conn.Close()
|
||||
}
|
||||
Reference in New Issue
Block a user