Add agent registration, heartbeats, and monitoring (Phase 4)

Master side:
- Register RPC: identity-bound (agent-rift → rift), allowlist check,
  max nodes limit, upserts node in registry, updates agent pool
- Heartbeat RPC: derives node name from MCIAS identity (not request),
  updates container count and last-heartbeat timestamp
- HeartbeatMonitor: background goroutine checks for missed heartbeats
  (90s threshold), probes agents via HealthCheck, marks unhealthy

Agent side:
- HeartbeatClient: connects to master via env vars (MCP_MASTER_ADDRESS,
  MCP_MASTER_CA_CERT, MCP_MASTER_TOKEN_PATH), registers on startup
  with exponential backoff, sends heartbeats every 30s

Proto: added Register and Heartbeat RPCs + messages to master.proto.

Architecture v2 Phase 4.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
2026-04-04 12:00:31 -07:00
parent fa4d022bc1
commit 6351b68ef6
6 changed files with 805 additions and 12 deletions

View File

@@ -712,6 +712,238 @@ func (x *NodeInfo) GetServices() int32 {
return 0
}
type RegisterRequest struct {
state protoimpl.MessageState `protogen:"open.v1"`
Name string `protobuf:"bytes,1,opt,name=name,proto3" json:"name,omitempty"`
Role string `protobuf:"bytes,2,opt,name=role,proto3" json:"role,omitempty"` // "worker", "edge", or "master"
Address string `protobuf:"bytes,3,opt,name=address,proto3" json:"address,omitempty"` // agent gRPC address
Arch string `protobuf:"bytes,4,opt,name=arch,proto3" json:"arch,omitempty"` // "amd64" or "arm64"
unknownFields protoimpl.UnknownFields
sizeCache protoimpl.SizeCache
}
func (x *RegisterRequest) Reset() {
*x = RegisterRequest{}
mi := &file_proto_mcp_v1_master_proto_msgTypes[12]
ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x))
ms.StoreMessageInfo(mi)
}
func (x *RegisterRequest) String() string {
return protoimpl.X.MessageStringOf(x)
}
func (*RegisterRequest) ProtoMessage() {}
func (x *RegisterRequest) ProtoReflect() protoreflect.Message {
mi := &file_proto_mcp_v1_master_proto_msgTypes[12]
if x != nil {
ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x))
if ms.LoadMessageInfo() == nil {
ms.StoreMessageInfo(mi)
}
return ms
}
return mi.MessageOf(x)
}
// Deprecated: Use RegisterRequest.ProtoReflect.Descriptor instead.
func (*RegisterRequest) Descriptor() ([]byte, []int) {
return file_proto_mcp_v1_master_proto_rawDescGZIP(), []int{12}
}
func (x *RegisterRequest) GetName() string {
if x != nil {
return x.Name
}
return ""
}
func (x *RegisterRequest) GetRole() string {
if x != nil {
return x.Role
}
return ""
}
func (x *RegisterRequest) GetAddress() string {
if x != nil {
return x.Address
}
return ""
}
func (x *RegisterRequest) GetArch() string {
if x != nil {
return x.Arch
}
return ""
}
type RegisterResponse struct {
state protoimpl.MessageState `protogen:"open.v1"`
Accepted bool `protobuf:"varint,1,opt,name=accepted,proto3" json:"accepted,omitempty"`
unknownFields protoimpl.UnknownFields
sizeCache protoimpl.SizeCache
}
func (x *RegisterResponse) Reset() {
*x = RegisterResponse{}
mi := &file_proto_mcp_v1_master_proto_msgTypes[13]
ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x))
ms.StoreMessageInfo(mi)
}
func (x *RegisterResponse) String() string {
return protoimpl.X.MessageStringOf(x)
}
func (*RegisterResponse) ProtoMessage() {}
func (x *RegisterResponse) ProtoReflect() protoreflect.Message {
mi := &file_proto_mcp_v1_master_proto_msgTypes[13]
if x != nil {
ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x))
if ms.LoadMessageInfo() == nil {
ms.StoreMessageInfo(mi)
}
return ms
}
return mi.MessageOf(x)
}
// Deprecated: Use RegisterResponse.ProtoReflect.Descriptor instead.
func (*RegisterResponse) Descriptor() ([]byte, []int) {
return file_proto_mcp_v1_master_proto_rawDescGZIP(), []int{13}
}
func (x *RegisterResponse) GetAccepted() bool {
if x != nil {
return x.Accepted
}
return false
}
type HeartbeatRequest struct {
state protoimpl.MessageState `protogen:"open.v1"`
Name string `protobuf:"bytes,1,opt,name=name,proto3" json:"name,omitempty"`
CpuMillicores int64 `protobuf:"varint,2,opt,name=cpu_millicores,json=cpuMillicores,proto3" json:"cpu_millicores,omitempty"`
MemoryBytes int64 `protobuf:"varint,3,opt,name=memory_bytes,json=memoryBytes,proto3" json:"memory_bytes,omitempty"`
DiskBytes int64 `protobuf:"varint,4,opt,name=disk_bytes,json=diskBytes,proto3" json:"disk_bytes,omitempty"`
Containers int32 `protobuf:"varint,5,opt,name=containers,proto3" json:"containers,omitempty"`
unknownFields protoimpl.UnknownFields
sizeCache protoimpl.SizeCache
}
func (x *HeartbeatRequest) Reset() {
*x = HeartbeatRequest{}
mi := &file_proto_mcp_v1_master_proto_msgTypes[14]
ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x))
ms.StoreMessageInfo(mi)
}
func (x *HeartbeatRequest) String() string {
return protoimpl.X.MessageStringOf(x)
}
func (*HeartbeatRequest) ProtoMessage() {}
func (x *HeartbeatRequest) ProtoReflect() protoreflect.Message {
mi := &file_proto_mcp_v1_master_proto_msgTypes[14]
if x != nil {
ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x))
if ms.LoadMessageInfo() == nil {
ms.StoreMessageInfo(mi)
}
return ms
}
return mi.MessageOf(x)
}
// Deprecated: Use HeartbeatRequest.ProtoReflect.Descriptor instead.
func (*HeartbeatRequest) Descriptor() ([]byte, []int) {
return file_proto_mcp_v1_master_proto_rawDescGZIP(), []int{14}
}
func (x *HeartbeatRequest) GetName() string {
if x != nil {
return x.Name
}
return ""
}
func (x *HeartbeatRequest) GetCpuMillicores() int64 {
if x != nil {
return x.CpuMillicores
}
return 0
}
func (x *HeartbeatRequest) GetMemoryBytes() int64 {
if x != nil {
return x.MemoryBytes
}
return 0
}
func (x *HeartbeatRequest) GetDiskBytes() int64 {
if x != nil {
return x.DiskBytes
}
return 0
}
func (x *HeartbeatRequest) GetContainers() int32 {
if x != nil {
return x.Containers
}
return 0
}
type HeartbeatResponse struct {
state protoimpl.MessageState `protogen:"open.v1"`
Acknowledged bool `protobuf:"varint,1,opt,name=acknowledged,proto3" json:"acknowledged,omitempty"`
unknownFields protoimpl.UnknownFields
sizeCache protoimpl.SizeCache
}
func (x *HeartbeatResponse) Reset() {
*x = HeartbeatResponse{}
mi := &file_proto_mcp_v1_master_proto_msgTypes[15]
ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x))
ms.StoreMessageInfo(mi)
}
func (x *HeartbeatResponse) String() string {
return protoimpl.X.MessageStringOf(x)
}
func (*HeartbeatResponse) ProtoMessage() {}
func (x *HeartbeatResponse) ProtoReflect() protoreflect.Message {
mi := &file_proto_mcp_v1_master_proto_msgTypes[15]
if x != nil {
ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x))
if ms.LoadMessageInfo() == nil {
ms.StoreMessageInfo(mi)
}
return ms
}
return mi.MessageOf(x)
}
// Deprecated: Use HeartbeatResponse.ProtoReflect.Descriptor instead.
func (*HeartbeatResponse) Descriptor() ([]byte, []int) {
return file_proto_mcp_v1_master_proto_rawDescGZIP(), []int{15}
}
func (x *HeartbeatResponse) GetAcknowledged() bool {
if x != nil {
return x.Acknowledged
}
return false
}
var File_proto_mcp_v1_master_proto protoreflect.FileDescriptor
const file_proto_mcp_v1_master_proto_rawDesc = "" +
@@ -765,12 +997,32 @@ const file_proto_mcp_v1_master_proto_rawDesc = "" +
"containers\x18\x06 \x01(\x05R\n" +
"containers\x12%\n" +
"\x0elast_heartbeat\x18\a \x01(\tR\rlastHeartbeat\x12\x1a\n" +
"\bservices\x18\b \x01(\x05R\bservices2\xa9\x02\n" +
"\bservices\x18\b \x01(\x05R\bservices\"g\n" +
"\x0fRegisterRequest\x12\x12\n" +
"\x04name\x18\x01 \x01(\tR\x04name\x12\x12\n" +
"\x04role\x18\x02 \x01(\tR\x04role\x12\x18\n" +
"\aaddress\x18\x03 \x01(\tR\aaddress\x12\x12\n" +
"\x04arch\x18\x04 \x01(\tR\x04arch\".\n" +
"\x10RegisterResponse\x12\x1a\n" +
"\baccepted\x18\x01 \x01(\bR\baccepted\"\xaf\x01\n" +
"\x10HeartbeatRequest\x12\x12\n" +
"\x04name\x18\x01 \x01(\tR\x04name\x12%\n" +
"\x0ecpu_millicores\x18\x02 \x01(\x03R\rcpuMillicores\x12!\n" +
"\fmemory_bytes\x18\x03 \x01(\x03R\vmemoryBytes\x12\x1d\n" +
"\n" +
"disk_bytes\x18\x04 \x01(\x03R\tdiskBytes\x12\x1e\n" +
"\n" +
"containers\x18\x05 \x01(\x05R\n" +
"containers\"7\n" +
"\x11HeartbeatResponse\x12\"\n" +
"\facknowledged\x18\x01 \x01(\bR\facknowledged2\xaa\x03\n" +
"\x10McpMasterService\x12C\n" +
"\x06Deploy\x12\x1b.mcp.v1.MasterDeployRequest\x1a\x1c.mcp.v1.MasterDeployResponse\x12I\n" +
"\bUndeploy\x12\x1d.mcp.v1.MasterUndeployRequest\x1a\x1e.mcp.v1.MasterUndeployResponse\x12C\n" +
"\x06Status\x12\x1b.mcp.v1.MasterStatusRequest\x1a\x1c.mcp.v1.MasterStatusResponse\x12@\n" +
"\tListNodes\x12\x18.mcp.v1.ListNodesRequest\x1a\x19.mcp.v1.ListNodesResponseB*Z(git.wntrmute.dev/mc/mcp/gen/mcp/v1;mcpv1b\x06proto3"
"\tListNodes\x12\x18.mcp.v1.ListNodesRequest\x1a\x19.mcp.v1.ListNodesResponse\x12=\n" +
"\bRegister\x12\x17.mcp.v1.RegisterRequest\x1a\x18.mcp.v1.RegisterResponse\x12@\n" +
"\tHeartbeat\x12\x18.mcp.v1.HeartbeatRequest\x1a\x19.mcp.v1.HeartbeatResponseB*Z(git.wntrmute.dev/mc/mcp/gen/mcp/v1;mcpv1b\x06proto3"
var (
file_proto_mcp_v1_master_proto_rawDescOnce sync.Once
@@ -784,7 +1036,7 @@ func file_proto_mcp_v1_master_proto_rawDescGZIP() []byte {
return file_proto_mcp_v1_master_proto_rawDescData
}
var file_proto_mcp_v1_master_proto_msgTypes = make([]protoimpl.MessageInfo, 12)
var file_proto_mcp_v1_master_proto_msgTypes = make([]protoimpl.MessageInfo, 16)
var file_proto_mcp_v1_master_proto_goTypes = []any{
(*MasterDeployRequest)(nil), // 0: mcp.v1.MasterDeployRequest
(*MasterDeployResponse)(nil), // 1: mcp.v1.MasterDeployResponse
@@ -798,10 +1050,14 @@ var file_proto_mcp_v1_master_proto_goTypes = []any{
(*ListNodesRequest)(nil), // 9: mcp.v1.ListNodesRequest
(*ListNodesResponse)(nil), // 10: mcp.v1.ListNodesResponse
(*NodeInfo)(nil), // 11: mcp.v1.NodeInfo
(*ServiceSpec)(nil), // 12: mcp.v1.ServiceSpec
(*RegisterRequest)(nil), // 12: mcp.v1.RegisterRequest
(*RegisterResponse)(nil), // 13: mcp.v1.RegisterResponse
(*HeartbeatRequest)(nil), // 14: mcp.v1.HeartbeatRequest
(*HeartbeatResponse)(nil), // 15: mcp.v1.HeartbeatResponse
(*ServiceSpec)(nil), // 16: mcp.v1.ServiceSpec
}
var file_proto_mcp_v1_master_proto_depIdxs = []int32{
12, // 0: mcp.v1.MasterDeployRequest.service:type_name -> mcp.v1.ServiceSpec
16, // 0: mcp.v1.MasterDeployRequest.service:type_name -> mcp.v1.ServiceSpec
2, // 1: mcp.v1.MasterDeployResponse.deploy_result:type_name -> mcp.v1.StepResult
2, // 2: mcp.v1.MasterDeployResponse.edge_route_result:type_name -> mcp.v1.StepResult
2, // 3: mcp.v1.MasterDeployResponse.dns_result:type_name -> mcp.v1.StepResult
@@ -812,12 +1068,16 @@ var file_proto_mcp_v1_master_proto_depIdxs = []int32{
3, // 8: mcp.v1.McpMasterService.Undeploy:input_type -> mcp.v1.MasterUndeployRequest
5, // 9: mcp.v1.McpMasterService.Status:input_type -> mcp.v1.MasterStatusRequest
9, // 10: mcp.v1.McpMasterService.ListNodes:input_type -> mcp.v1.ListNodesRequest
1, // 11: mcp.v1.McpMasterService.Deploy:output_type -> mcp.v1.MasterDeployResponse
4, // 12: mcp.v1.McpMasterService.Undeploy:output_type -> mcp.v1.MasterUndeployResponse
6, // 13: mcp.v1.McpMasterService.Status:output_type -> mcp.v1.MasterStatusResponse
10, // 14: mcp.v1.McpMasterService.ListNodes:output_type -> mcp.v1.ListNodesResponse
11, // [11:15] is the sub-list for method output_type
7, // [7:11] is the sub-list for method input_type
12, // 11: mcp.v1.McpMasterService.Register:input_type -> mcp.v1.RegisterRequest
14, // 12: mcp.v1.McpMasterService.Heartbeat:input_type -> mcp.v1.HeartbeatRequest
1, // 13: mcp.v1.McpMasterService.Deploy:output_type -> mcp.v1.MasterDeployResponse
4, // 14: mcp.v1.McpMasterService.Undeploy:output_type -> mcp.v1.MasterUndeployResponse
6, // 15: mcp.v1.McpMasterService.Status:output_type -> mcp.v1.MasterStatusResponse
10, // 16: mcp.v1.McpMasterService.ListNodes:output_type -> mcp.v1.ListNodesResponse
13, // 17: mcp.v1.McpMasterService.Register:output_type -> mcp.v1.RegisterResponse
15, // 18: mcp.v1.McpMasterService.Heartbeat:output_type -> mcp.v1.HeartbeatResponse
13, // [13:19] is the sub-list for method output_type
7, // [7:13] is the sub-list for method input_type
7, // [7:7] is the sub-list for extension type_name
7, // [7:7] is the sub-list for extension extendee
0, // [0:7] is the sub-list for field type_name
@@ -835,7 +1095,7 @@ func file_proto_mcp_v1_master_proto_init() {
GoPackagePath: reflect.TypeOf(x{}).PkgPath(),
RawDescriptor: unsafe.Slice(unsafe.StringData(file_proto_mcp_v1_master_proto_rawDesc), len(file_proto_mcp_v1_master_proto_rawDesc)),
NumEnums: 0,
NumMessages: 12,
NumMessages: 16,
NumExtensions: 0,
NumServices: 1,
},

View File

@@ -25,6 +25,8 @@ const (
McpMasterService_Undeploy_FullMethodName = "/mcp.v1.McpMasterService/Undeploy"
McpMasterService_Status_FullMethodName = "/mcp.v1.McpMasterService/Status"
McpMasterService_ListNodes_FullMethodName = "/mcp.v1.McpMasterService/ListNodes"
McpMasterService_Register_FullMethodName = "/mcp.v1.McpMasterService/Register"
McpMasterService_Heartbeat_FullMethodName = "/mcp.v1.McpMasterService/Heartbeat"
)
// McpMasterServiceClient is the client API for McpMasterService service.
@@ -40,6 +42,9 @@ type McpMasterServiceClient interface {
Undeploy(ctx context.Context, in *MasterUndeployRequest, opts ...grpc.CallOption) (*MasterUndeployResponse, error)
Status(ctx context.Context, in *MasterStatusRequest, opts ...grpc.CallOption) (*MasterStatusResponse, error)
ListNodes(ctx context.Context, in *ListNodesRequest, opts ...grpc.CallOption) (*ListNodesResponse, error)
// Agent registration and health (called by agents).
Register(ctx context.Context, in *RegisterRequest, opts ...grpc.CallOption) (*RegisterResponse, error)
Heartbeat(ctx context.Context, in *HeartbeatRequest, opts ...grpc.CallOption) (*HeartbeatResponse, error)
}
type mcpMasterServiceClient struct {
@@ -90,6 +95,26 @@ func (c *mcpMasterServiceClient) ListNodes(ctx context.Context, in *ListNodesReq
return out, nil
}
func (c *mcpMasterServiceClient) Register(ctx context.Context, in *RegisterRequest, opts ...grpc.CallOption) (*RegisterResponse, error) {
cOpts := append([]grpc.CallOption{grpc.StaticMethod()}, opts...)
out := new(RegisterResponse)
err := c.cc.Invoke(ctx, McpMasterService_Register_FullMethodName, in, out, cOpts...)
if err != nil {
return nil, err
}
return out, nil
}
func (c *mcpMasterServiceClient) Heartbeat(ctx context.Context, in *HeartbeatRequest, opts ...grpc.CallOption) (*HeartbeatResponse, error) {
cOpts := append([]grpc.CallOption{grpc.StaticMethod()}, opts...)
out := new(HeartbeatResponse)
err := c.cc.Invoke(ctx, McpMasterService_Heartbeat_FullMethodName, in, out, cOpts...)
if err != nil {
return nil, err
}
return out, nil
}
// McpMasterServiceServer is the server API for McpMasterService service.
// All implementations must embed UnimplementedMcpMasterServiceServer
// for forward compatibility.
@@ -103,6 +128,9 @@ type McpMasterServiceServer interface {
Undeploy(context.Context, *MasterUndeployRequest) (*MasterUndeployResponse, error)
Status(context.Context, *MasterStatusRequest) (*MasterStatusResponse, error)
ListNodes(context.Context, *ListNodesRequest) (*ListNodesResponse, error)
// Agent registration and health (called by agents).
Register(context.Context, *RegisterRequest) (*RegisterResponse, error)
Heartbeat(context.Context, *HeartbeatRequest) (*HeartbeatResponse, error)
mustEmbedUnimplementedMcpMasterServiceServer()
}
@@ -125,6 +153,12 @@ func (UnimplementedMcpMasterServiceServer) Status(context.Context, *MasterStatus
func (UnimplementedMcpMasterServiceServer) ListNodes(context.Context, *ListNodesRequest) (*ListNodesResponse, error) {
return nil, status.Error(codes.Unimplemented, "method ListNodes not implemented")
}
func (UnimplementedMcpMasterServiceServer) Register(context.Context, *RegisterRequest) (*RegisterResponse, error) {
return nil, status.Error(codes.Unimplemented, "method Register not implemented")
}
func (UnimplementedMcpMasterServiceServer) Heartbeat(context.Context, *HeartbeatRequest) (*HeartbeatResponse, error) {
return nil, status.Error(codes.Unimplemented, "method Heartbeat not implemented")
}
func (UnimplementedMcpMasterServiceServer) mustEmbedUnimplementedMcpMasterServiceServer() {}
func (UnimplementedMcpMasterServiceServer) testEmbeddedByValue() {}
@@ -218,6 +252,42 @@ func _McpMasterService_ListNodes_Handler(srv interface{}, ctx context.Context, d
return interceptor(ctx, in, info, handler)
}
func _McpMasterService_Register_Handler(srv interface{}, ctx context.Context, dec func(interface{}) error, interceptor grpc.UnaryServerInterceptor) (interface{}, error) {
in := new(RegisterRequest)
if err := dec(in); err != nil {
return nil, err
}
if interceptor == nil {
return srv.(McpMasterServiceServer).Register(ctx, in)
}
info := &grpc.UnaryServerInfo{
Server: srv,
FullMethod: McpMasterService_Register_FullMethodName,
}
handler := func(ctx context.Context, req interface{}) (interface{}, error) {
return srv.(McpMasterServiceServer).Register(ctx, req.(*RegisterRequest))
}
return interceptor(ctx, in, info, handler)
}
func _McpMasterService_Heartbeat_Handler(srv interface{}, ctx context.Context, dec func(interface{}) error, interceptor grpc.UnaryServerInterceptor) (interface{}, error) {
in := new(HeartbeatRequest)
if err := dec(in); err != nil {
return nil, err
}
if interceptor == nil {
return srv.(McpMasterServiceServer).Heartbeat(ctx, in)
}
info := &grpc.UnaryServerInfo{
Server: srv,
FullMethod: McpMasterService_Heartbeat_FullMethodName,
}
handler := func(ctx context.Context, req interface{}) (interface{}, error) {
return srv.(McpMasterServiceServer).Heartbeat(ctx, req.(*HeartbeatRequest))
}
return interceptor(ctx, in, info, handler)
}
// McpMasterService_ServiceDesc is the grpc.ServiceDesc for McpMasterService service.
// It's only intended for direct use with grpc.RegisterService,
// and not to be introspected or modified (even as a copy)
@@ -241,6 +311,14 @@ var McpMasterService_ServiceDesc = grpc.ServiceDesc{
MethodName: "ListNodes",
Handler: _McpMasterService_ListNodes_Handler,
},
{
MethodName: "Register",
Handler: _McpMasterService_Register_Handler,
},
{
MethodName: "Heartbeat",
Handler: _McpMasterService_Heartbeat_Handler,
},
},
Streams: []grpc.StreamDesc{},
Metadata: "proto/mcp/v1/master.proto",