Add agent registration, heartbeats, and monitoring (Phase 4)
Master side:
- Register RPC: identity-bound (agent-rift → rift), allowlist check, max nodes limit, upserts node in registry, updates agent pool
- Heartbeat RPC: derives node name from MCIAS identity (not request), updates container count and last-heartbeat timestamp
- HeartbeatMonitor: background goroutine checks for missed heartbeats (90s threshold), probes agents via HealthCheck, marks unhealthy

Agent side:
- HeartbeatClient: connects to master via env vars (MCP_MASTER_ADDRESS, MCP_MASTER_CA_CERT, MCP_MASTER_TOKEN_PATH), registers on startup with exponential backoff, sends heartbeats every 30s

Proto: added Register and Heartbeat RPCs + messages to master.proto.

Architecture v2 Phase 4.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -124,6 +124,10 @@ func Run(cfg *config.MasterConfig, version string) error {
|
||||
"nodes", len(cfg.Nodes),
|
||||
)
|
||||
|
||||
// Start heartbeat monitor.
|
||||
hbMonitor := NewHeartbeatMonitor(m)
|
||||
hbMonitor.Start()
|
||||
|
||||
// Signal handling.
|
||||
ctx, stop := signal.NotifyContext(context.Background(), syscall.SIGINT, syscall.SIGTERM)
|
||||
defer stop()
|
||||
@@ -136,10 +140,12 @@ func Run(cfg *config.MasterConfig, version string) error {
|
||||
select {
|
||||
case <-ctx.Done():
|
||||
logger.Info("shutting down")
|
||||
hbMonitor.Stop()
|
||||
server.GracefulStop()
|
||||
pool.Close()
|
||||
return nil
|
||||
case err := <-errCh:
|
||||
hbMonitor.Stop()
|
||||
pool.Close()
|
||||
return fmt.Errorf("serve: %w", err)
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user