Add agent registration, heartbeats, and monitoring (Phase 4)

Master side:
- Register RPC: identity-bound (agent-rift → rift), allowlist check,
  max nodes limit, upserts node in registry, updates agent pool
- Heartbeat RPC: derives node name from MCIAS identity (not request) for
  agent-* accounts; legacy non-agent-* accounts fall back to the request
  name. Updates container count and last-heartbeat timestamp
- HeartbeatMonitor: background goroutine checks for missed heartbeats
  (90s threshold), probes stale agents via HealthCheck, and marks a node
  unhealthy only when the probe fails or no connection is available

Agent side:
- HeartbeatClient: connects to master via env vars (MCP_MASTER_ADDRESS,
  MCP_MASTER_CA_CERT, MCP_MASTER_TOKEN_PATH), registers on startup
  with exponential backoff, sends heartbeats every 30s

Proto: added Register and Heartbeat RPCs + messages to master.proto.

Architecture v2 Phase 4.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
2026-04-04 12:00:31 -07:00
parent fa4d022bc1
commit 6351b68ef6
6 changed files with 805 additions and 12 deletions

183
internal/agent/heartbeat.go Normal file
View File

@@ -0,0 +1,183 @@
package agent
import (
"context"
"crypto/tls"
"crypto/x509"
"fmt"
"os"
"runtime"
"strings"
"sync"
"time"
mcpv1 "git.wntrmute.dev/mc/mcp/gen/mcp/v1"
"git.wntrmute.dev/mc/mcp/internal/config"
"google.golang.org/grpc"
"google.golang.org/grpc/credentials"
"google.golang.org/grpc/metadata"
)
// MasterConfig holds the optional master connection settings for the agent,
// decoded from TOML keys address, ca_cert and token_path.
// When configured, the agent self-registers and sends periodic heartbeats.
//
// NOTE(review): NewHeartbeatClient in this file reads the MCP_MASTER_ADDRESS,
// MCP_MASTER_CA_CERT and MCP_MASTER_TOKEN_PATH environment variables and does
// not consult this struct — confirm where (or whether) it is wired in.
type MasterConfig struct {
Address string `toml:"address"` // master gRPC address
CACert string `toml:"ca_cert"` // CA cert to verify master's TLS (presumably a file path, like MCP_MASTER_CA_CERT — confirm)
TokenPath string `toml:"token_path"` // MCIAS service token for auth (presumably a path to the token file — confirm)
}
// HeartbeatClient manages the agent's connection to the master for
// registration and heartbeats. Build one with NewHeartbeatClient, call Start
// to launch the background goroutine, and Stop to shut it down and close the
// connection.
type HeartbeatClient struct {
client mcpv1.McpMasterServiceClient // master service stub created on top of conn
conn *grpc.ClientConn // connection to the master; closed by Stop
nodeName string // node name sent in Register and Heartbeat requests
role string // role sent in Register requests
address string // agent's own gRPC address
arch string // architecture sent in Register requests (set from runtime.GOARCH)
interval time.Duration // delay between heartbeats
stop chan struct{} // closed by Stop to end the background goroutine
wg sync.WaitGroup // tracks the goroutine launched by Start
logger interface{ Info(string, ...any); Warn(string, ...any); Error(string, ...any) }
}
// NewHeartbeatClient builds the client the agent uses to register with the
// master and to send periodic heartbeats.
//
// Connection settings come from the environment:
//   - MCP_MASTER_ADDRESS: master gRPC address; when empty the function
//     returns (nil, nil), meaning "master not configured".
//   - MCP_MASTER_CA_CERT: optional path to a PEM CA bundle installed as the
//     TLS root pool.
//   - MCP_MASTER_TOKEN_PATH: optional path to a token file; its trimmed
//     contents are attached to every unary call as a Bearer token.
//
// The node name and the agent's own gRPC address are taken from cfg. An
// error is returned when a configured file cannot be read, the CA bundle
// holds no usable certificate, or the gRPC client cannot be created.
func NewHeartbeatClient(cfg config.AgentConfig, logger interface{ Info(string, ...any); Warn(string, ...any); Error(string, ...any) }) (*HeartbeatClient, error) {
	addr := os.Getenv("MCP_MASTER_ADDRESS")
	caPath := os.Getenv("MCP_MASTER_CA_CERT")
	tokenPath := os.Getenv("MCP_MASTER_TOKEN_PATH")

	if addr == "" {
		// No master configured: callers get a nil client, which Start and
		// Stop both tolerate.
		return nil, nil
	}

	var bearer string
	if tokenPath != "" {
		raw, err := os.ReadFile(tokenPath) //nolint:gosec // trusted config
		if err != nil {
			return nil, fmt.Errorf("read master token: %w", err)
		}
		bearer = strings.TrimSpace(string(raw))
	}

	tlsCfg := &tls.Config{MinVersion: tls.VersionTLS13}
	if caPath != "" {
		pemBytes, err := os.ReadFile(caPath) //nolint:gosec // trusted config
		if err != nil {
			return nil, fmt.Errorf("read master CA cert: %w", err)
		}
		roots := x509.NewCertPool()
		if ok := roots.AppendCertsFromPEM(pemBytes); !ok {
			return nil, fmt.Errorf("invalid master CA cert")
		}
		tlsCfg.RootCAs = roots
	}

	// attachToken adds the bearer token (when one was loaded) to the
	// outgoing metadata of every unary call.
	attachToken := func(ctx context.Context, method string, req, reply any, cc *grpc.ClientConn, invoker grpc.UnaryInvoker, opts ...grpc.CallOption) error {
		if bearer != "" {
			ctx = metadata.AppendToOutgoingContext(ctx, "authorization", "Bearer "+bearer)
		}
		return invoker(ctx, method, req, reply, cc, opts...)
	}

	conn, err := grpc.NewClient(
		addr,
		grpc.WithTransportCredentials(credentials.NewTLS(tlsCfg)),
		grpc.WithUnaryInterceptor(attachToken),
	)
	if err != nil {
		return nil, fmt.Errorf("dial master: %w", err)
	}

	hc := &HeartbeatClient{
		client:   mcpv1.NewMcpMasterServiceClient(conn),
		conn:     conn,
		nodeName: cfg.Agent.NodeName,
		role:     "worker", // default; master node sets this via config
		address:  cfg.Server.GRPCAddr,
		arch:     runtime.GOARCH,
		interval: 30 * time.Second,
		stop:     make(chan struct{}),
		logger:   logger,
	}
	return hc, nil
}
// Start launches a single background goroutine that first registers this
// node with the master and then sends a heartbeat every hc.interval.
//
// Registration is retried until the master accepts it, sleeping between
// attempts with a delay that starts at one second, doubles after every
// failure and is capped at 60 seconds. Closing hc.stop (see Stop) ends the
// goroutine, both while waiting between registration attempts and while in
// the heartbeat loop. Calling Start on a nil client is a no-op.
func (hc *HeartbeatClient) Start() {
	if hc == nil {
		return
	}

	const (
		rpcTimeout = 5 * time.Second
		maxBackoff = 60 * time.Second
	)

	// tryRegister performs one Register RPC and reports whether the master
	// accepted it, together with the RPC error (if any).
	tryRegister := func() (bool, error) {
		ctx, cancel := context.WithTimeout(context.Background(), rpcTimeout)
		defer cancel()
		resp, err := hc.client.Register(ctx, &mcpv1.RegisterRequest{
			Name:    hc.nodeName,
			Role:    hc.role,
			Address: hc.address,
			Arch:    hc.arch,
		})
		return err == nil && resp.GetAccepted(), err
	}

	hc.wg.Add(1)
	go func() {
		defer hc.wg.Done()

		// Phase 1: register, backing off between failed attempts.
		wait := time.Second
		for registered := false; !registered; {
			accepted, err := tryRegister()
			if accepted {
				hc.logger.Info("registered with master",
					"node", hc.nodeName, "master_accepted", true)
				registered = true
				continue
			}

			hc.logger.Warn("registration failed, retrying",
				"node", hc.nodeName, "err", err, "backoff", wait)

			select {
			case <-time.After(wait):
			case <-hc.stop:
				return
			}

			if wait *= 2; wait > maxBackoff {
				wait = maxBackoff
			}
		}

		// Phase 2: periodic heartbeats until stopped.
		beat := time.NewTicker(hc.interval)
		defer beat.Stop()
		for {
			select {
			case <-beat.C:
				hc.sendHeartbeat()
			case <-hc.stop:
				return
			}
		}
	}()
}
// sendHeartbeat issues one Heartbeat RPC to the master with a five second
// deadline. A failure is logged as a warning and otherwise ignored; the next
// tick of the heartbeat loop simply tries again.
func (hc *HeartbeatClient) sendHeartbeat() {
	rpcCtx, release := context.WithTimeout(context.Background(), 5*time.Second)
	defer release()

	beat := &mcpv1.HeartbeatRequest{
		Name:       hc.nodeName,
		Containers: 0, // TODO: count from runtime
	}
	if _, err := hc.client.Heartbeat(rpcCtx, beat); err != nil {
		hc.logger.Warn("heartbeat failed", "node", hc.nodeName, "err", err)
	}
}
// Stop stops the heartbeat loop, waits for the background goroutine to exit
// and closes the master connection.
//
// Stop is a no-op on a nil client and is safe to call more than once from
// the same goroutine (for example a deferred Stop plus an explicit one on
// shutdown): only the first call closes the stop channel and the
// connection. It is not designed to be called concurrently from several
// goroutines.
func (hc *HeartbeatClient) Stop() {
	if hc == nil {
		return
	}

	select {
	case <-hc.stop:
		// Already stopped; closing the channel again would panic.
		return
	default:
		close(hc.stop)
	}

	hc.wg.Wait()

	// The connection is being torn down on shutdown; there is nothing useful
	// to do with a Close error at this point, so it is deliberately dropped.
	_ = hc.conn.Close()
}

View File

@@ -124,6 +124,10 @@ func Run(cfg *config.MasterConfig, version string) error {
"nodes", len(cfg.Nodes),
)
// Start heartbeat monitor.
hbMonitor := NewHeartbeatMonitor(m)
hbMonitor.Start()
// Signal handling.
ctx, stop := signal.NotifyContext(context.Background(), syscall.SIGINT, syscall.SIGTERM)
defer stop()
@@ -136,10 +140,12 @@ func Run(cfg *config.MasterConfig, version string) error {
select {
case <-ctx.Done():
logger.Info("shutting down")
hbMonitor.Stop()
server.GracefulStop()
pool.Close()
return nil
case err := <-errCh:
hbMonitor.Stop()
pool.Close()
return fmt.Errorf("serve: %w", err)
}

View File

@@ -0,0 +1,235 @@
package master
import (
"context"
"strings"
"sync"
"time"
mcpv1 "git.wntrmute.dev/mc/mcp/gen/mcp/v1"
"git.wntrmute.dev/mc/mcp/internal/auth"
"git.wntrmute.dev/mc/mcp/internal/masterdb"
"google.golang.org/grpc/codes"
"google.golang.org/grpc/status"
)
// Register handles agent self-registration.
//
// Checks are applied in this order:
//  1. The call must carry an auth context (Unauthenticated otherwise).
//  2. The claimed node name must be non-empty (InvalidArgument otherwise).
//  3. Identity binding: an "agent-<name>" identity may only register
//     <name> (agent-rift → rift); a mismatch yields PermissionDenied.
//  4. If a registration allowlist is configured, the identity must be on it
//     (PermissionDenied otherwise).
//  5. When the registry already holds Registration.MaxNodes nodes, only
//     re-registration of an existing node is allowed (ResourceExhausted
//     otherwise). A failure to list nodes rejects the request with Internal
//     instead of silently skipping the limit.
//
// On success the node is upserted with its address, role (default "worker")
// and arch (default "amd64"), marked "healthy", and added to the agent pool.
// Failures to update the status or the pool are logged but do not fail the
// registration.
func (m *Master) Register(ctx context.Context, req *mcpv1.RegisterRequest) (*mcpv1.RegisterResponse, error) {
	// Extract caller identity from the auth context.
	tokenInfo := auth.TokenInfoFromContext(ctx)
	if tokenInfo == nil {
		return nil, status.Error(codes.Unauthenticated, "no auth context")
	}

	// An empty name would otherwise slip through identity binding (for a
	// legacy account, or for an identity that is exactly "agent-") and
	// create a nameless registry row.
	name := req.GetName()
	if name == "" {
		return nil, status.Error(codes.InvalidArgument, "node name is required")
	}

	// Identity binding: agent-rift can only register name="rift".
	expectedName := strings.TrimPrefix(tokenInfo.Username, "agent-")
	if expectedName == tokenInfo.Username {
		// Not an agent-* account — also allow mcp-agent (legacy).
		// NOTE(review): this branch accepts ANY identity without the
		// "agent-" prefix, not only mcp-agent, and lets it claim any node
		// name. Only the allowlist below constrains such callers; consider
		// restricting this to the legacy account explicitly.
		expectedName = name
	}
	if name != expectedName {
		m.Logger.Warn("registration rejected: name mismatch",
			"claimed", name, "identity", tokenInfo.Username)
		return nil, status.Errorf(codes.PermissionDenied,
			"identity %q cannot register as %q", tokenInfo.Username, name)
	}

	// Check allowlist (an empty allowlist means "allow everyone").
	if allowlist := m.Config.Registration.AllowedAgents; len(allowlist) > 0 {
		allowed := false
		for _, a := range allowlist {
			if a == tokenInfo.Username {
				allowed = true
				break
			}
		}
		if !allowed {
			m.Logger.Warn("registration rejected: not in allowlist",
				"identity", tokenInfo.Username)
			return nil, status.Errorf(codes.PermissionDenied,
				"identity %q not in registration allowlist", tokenInfo.Username)
		}
	}

	// Check max nodes. Fail closed: if the registry cannot be read, the
	// limit cannot be enforced, so the registration is refused.
	nodes, err := masterdb.ListNodes(m.DB)
	if err != nil {
		m.Logger.Error("registration rejected: list nodes failed",
			"node", name, "err", err)
		return nil, status.Error(codes.Internal, "registration failed")
	}
	if len(nodes) >= m.Config.Registration.MaxNodes {
		// At the limit, only a re-registration of an existing node passes.
		found := false
		for _, n := range nodes {
			if n.Name == name {
				found = true
				break
			}
		}
		if !found {
			return nil, status.Error(codes.ResourceExhausted, "max nodes reached")
		}
	}

	// Upsert node in registry, filling in defaults for optional fields.
	role := req.GetRole()
	if role == "" {
		role = "worker"
	}
	arch := req.GetArch()
	if arch == "" {
		arch = "amd64"
	}
	if err := masterdb.UpsertNode(m.DB, name, req.GetAddress(), role, arch); err != nil {
		m.Logger.Error("registration upsert failed", "node", name, "err", err)
		return nil, status.Error(codes.Internal, "registration failed")
	}
	if err := masterdb.UpdateNodeStatus(m.DB, name, "healthy"); err != nil {
		m.Logger.Warn("update node status", "node", name, "err", err)
	}

	// Update the agent pool connection. Best effort: the node is already
	// recorded in the registry at this point.
	if addErr := m.Pool.AddNode(name, req.GetAddress()); addErr != nil {
		m.Logger.Warn("pool update failed", "node", name, "err", addErr)
	}

	m.Logger.Info("agent registered",
		"node", name, "address", req.GetAddress(),
		"role", role, "arch", arch, "identity", tokenInfo.Username)
	return &mcpv1.RegisterResponse{Accepted: true}, nil
}
// Heartbeat handles agent heartbeats. It updates the node's container count,
// sets its status to 'healthy' and records the last-heartbeat timestamp.
//
// The node name is derived from the MCIAS identity ("agent-<name>" → <name>),
// not from the request (security: don't trust the self-reported name).
// Identities without the "agent-" prefix fall back to the name in the
// request.
//
// Errors:
//   - Unauthenticated: the call carries no auth context.
//   - NotFound: the node lookup failed or returned no node.
//   - Internal: the heartbeat could not be written to the database. The
//     heartbeat is not acknowledged in that case, so the agent can see (and
//     log) that it was not recorded.
func (m *Master) Heartbeat(ctx context.Context, req *mcpv1.HeartbeatRequest) (*mcpv1.HeartbeatResponse, error) {
	// Derive node name from identity.
	tokenInfo := auth.TokenInfoFromContext(ctx)
	if tokenInfo == nil {
		return nil, status.Error(codes.Unauthenticated, "no auth context")
	}
	nodeName := strings.TrimPrefix(tokenInfo.Username, "agent-")
	if nodeName == tokenInfo.Username {
		// Legacy mcp-agent account — use the request name.
		nodeName = req.GetName()
	}

	// Verify the node is registered.
	node, err := masterdb.GetNode(m.DB, nodeName)
	if err != nil || node == nil {
		return nil, status.Errorf(codes.NotFound, "node %q not registered", nodeName)
	}

	// Update heartbeat data.
	now := time.Now().UTC().Format(time.RFC3339)
	if _, err := m.DB.Exec(`
		UPDATE nodes SET
			containers = ?,
			status = 'healthy',
			last_heartbeat = ?,
			updated_at = datetime('now')
		WHERE name = ?
	`, req.GetContainers(), now, nodeName); err != nil {
		// Recording the heartbeat is the whole point of this RPC; do not
		// acknowledge one that was not persisted.
		m.Logger.Warn("heartbeat update failed", "node", nodeName, "err", err)
		return nil, status.Error(codes.Internal, "heartbeat not recorded")
	}

	return &mcpv1.HeartbeatResponse{Acknowledged: true}, nil
}
// HeartbeatMonitor runs in the background, checking for agents that have
// missed heartbeats and probing them via HealthCheck. Create one with
// NewHeartbeatMonitor, call Start to launch the monitoring goroutine and
// Stop to end it.
type HeartbeatMonitor struct {
master *Master // owning master; provides the DB, logger, agent pool and config used by check
interval time.Duration // heartbeat check interval (default: 30s)
timeout time.Duration // missed heartbeat threshold (default: 90s); also used by Start as the initial warm-up delay
stop chan struct{} // closed by Stop to end the monitoring goroutine
wg sync.WaitGroup // tracks the goroutine launched by Start
}
// NewHeartbeatMonitor returns a monitor bound to m that checks every 30
// seconds and treats a heartbeat older than 90 seconds as missed. The
// monitor does nothing until Start is called.
func NewHeartbeatMonitor(m *Master) *HeartbeatMonitor {
	const (
		checkEvery  = 30 * time.Second
		missedAfter = 90 * time.Second
	)

	hm := new(HeartbeatMonitor)
	hm.master = m
	hm.interval = checkEvery
	hm.timeout = missedAfter
	hm.stop = make(chan struct{})
	return hm
}
// Start begins the heartbeat monitoring loop in a background goroutine.
//
// The goroutine first waits one full timeout period as a warm-up, so agents
// have a chance to report in before anything is flagged, and then runs
// check every interval. Both the warm-up and the loop end as soon as the
// stop channel is closed, so Stop never has to wait out the warm-up.
func (hm *HeartbeatMonitor) Start() {
	hm.wg.Add(1)
	go func() {
		defer hm.wg.Done()

		// Initial warm-up: don't alert for the first cycle. A timer is used
		// instead of time.Sleep so that shutdown can interrupt the wait.
		warmup := time.NewTimer(hm.timeout)
		select {
		case <-hm.stop:
			warmup.Stop()
			return
		case <-warmup.C:
		}

		ticker := time.NewTicker(hm.interval)
		defer ticker.Stop()
		for {
			select {
			case <-hm.stop:
				return
			case <-ticker.C:
				hm.check()
			}
		}
	}()
}
// Stop stops the heartbeat monitor and waits for its goroutine to exit.
//
// Stop may be called more than once from the same goroutine; only the first
// call closes the stop channel (closing it twice would panic). It is not
// designed to be called concurrently from several goroutines.
func (hm *HeartbeatMonitor) Stop() {
	select {
	case <-hm.stop:
		// Already stopped.
		return
	default:
		close(hm.stop)
	}
	hm.wg.Wait()
}
// check performs one monitoring pass over all registered nodes.
//
// A node is probed via HealthCheck when its last heartbeat is older than
// hm.timeout. The node is marked "unhealthy" when no pool connection is
// available or the probe fails; a successful probe only logs that the
// heartbeats are stale and leaves the status untouched. Nodes already marked
// unhealthy are skipped so they are not probed on every pass.
//
// NOTE(review): nodes that have never sent a heartbeat (LastHeartbeat is
// nil) are skipped on every pass, so a node that registers and then goes
// silent is not detected here.
//
// The pass stops early once the monitor's stop channel is closed, so a long
// series of probes cannot delay shutdown by more than one probe.
func (hm *HeartbeatMonitor) check() {
	m := hm.master

	nodes, err := masterdb.ListNodes(m.DB)
	if err != nil {
		m.Logger.Warn("heartbeat check: list nodes", "err", err)
		return
	}

	// markUnhealthy records the unhealthy status and reports a failure to do
	// so instead of dropping it silently.
	markUnhealthy := func(name string) {
		if err := masterdb.UpdateNodeStatus(m.DB, name, "unhealthy"); err != nil {
			m.Logger.Warn("heartbeat check: mark unhealthy failed",
				"node", name, "err", err)
		}
	}

	now := time.Now()
	for _, node := range nodes {
		// Bail out promptly when the monitor is being stopped.
		select {
		case <-hm.stop:
			return
		default:
		}

		if node.Status == "unhealthy" {
			continue // already marked, don't spam probes
		}
		if node.LastHeartbeat == nil {
			continue // never sent a heartbeat, skip
		}
		if now.Sub(*node.LastHeartbeat) <= hm.timeout {
			continue // heartbeat is recent enough
		}

		m.Logger.Warn("missed heartbeats, probing",
			"node", node.Name,
			"last_heartbeat", node.LastHeartbeat.Format(time.RFC3339))

		// Probe the agent.
		client, err := m.Pool.Get(node.Name)
		if err != nil {
			m.Logger.Warn("probe failed: no connection",
				"node", node.Name, "err", err)
			markUnhealthy(node.Name)
			continue
		}

		ctx, cancel := context.WithTimeout(context.Background(),
			m.Config.Timeouts.HealthCheck.Duration)
		_, probeErr := client.HealthCheck(ctx, &mcpv1.HealthCheckRequest{})
		cancel()

		if probeErr != nil {
			m.Logger.Warn("probe failed",
				"node", node.Name, "err", probeErr)
			markUnhealthy(node.Name)
			continue
		}

		// Probe succeeded — node is alive, just not sending heartbeats.
		m.Logger.Info("probe succeeded (heartbeats stale)",
			"node", node.Name)
	}
}