From 941dd7003aa1aa4692a2749513104d4bdc6a3baf Mon Sep 17 00:00:00 2001 From: Kyle Isom Date: Thu, 26 Mar 2026 12:29:04 -0700 Subject: [PATCH] Fix design-vs-implementation gaps found in verification Critical fixes: - Wire monitor subsystem to agent startup (was dead code) - Implement NodeStatus RPC (disk, memory, CPU, runtime version, uptime) - Deploy respects active=false (sets desired_state=stopped, not always running) Medium fixes: - Add Started field to runtime.ContainerInfo, populate from podman inspect - Populate ComponentInfo.started in status handlers for uptime display - Add Monitor field to Agent struct for graceful shutdown Co-Authored-By: Claude Opus 4.6 (1M context) --- go.mod | 2 +- internal/agent/agent.go | 10 +++++- internal/agent/deploy.go | 14 +++++--- internal/agent/nodestatus.go | 67 ++++++++++++++++++++++++++++++++++++ internal/agent/status.go | 3 ++ internal/runtime/podman.go | 7 +++- internal/runtime/runtime.go | 4 ++- 7 files changed, 99 insertions(+), 8 deletions(-) create mode 100644 internal/agent/nodestatus.go diff --git a/go.mod b/go.mod index 093d789..d66b92e 100644 --- a/go.mod +++ b/go.mod @@ -5,6 +5,7 @@ go 1.25.7 require ( github.com/pelletier/go-toml/v2 v2.3.0 github.com/spf13/cobra v1.10.2 + golang.org/x/sys v0.42.0 google.golang.org/grpc v1.79.3 google.golang.org/protobuf v1.36.11 modernc.org/sqlite v1.47.0 @@ -19,7 +20,6 @@ require ( github.com/remyoudompheng/bigfft v0.0.0-20230129092748-24d4a6f8daec // indirect github.com/spf13/pflag v1.0.9 // indirect golang.org/x/net v0.48.0 // indirect - golang.org/x/sys v0.42.0 // indirect golang.org/x/text v0.32.0 // indirect google.golang.org/genproto/googleapis/rpc v0.0.0-20251202230838-ff82c1b0f217 // indirect modernc.org/libc v1.70.0 // indirect diff --git a/internal/agent/agent.go b/internal/agent/agent.go index 565b64f..370c6a4 100644 --- a/internal/agent/agent.go +++ b/internal/agent/agent.go @@ -14,6 +14,7 @@ import ( mcpv1 "git.wntrmute.dev/kyle/mcp/gen/mcp/v1" "git.wntrmute.dev/kyle/mcp/internal/auth" "git.wntrmute.dev/kyle/mcp/internal/config" + "git.wntrmute.dev/kyle/mcp/internal/monitor" "git.wntrmute.dev/kyle/mcp/internal/registry" "git.wntrmute.dev/kyle/mcp/internal/runtime" "google.golang.org/grpc" @@ -28,6 +29,7 @@ type Agent struct { Config *config.AgentConfig DB *sql.DB Runtime runtime.Runtime + Monitor *monitor.Monitor Logger *slog.Logger } @@ -46,10 +48,13 @@ func Run(cfg *config.AgentConfig) error { rt := &runtime.Podman{} + mon := monitor.New(db, rt, cfg.Monitor, cfg.Agent.NodeName, logger) + a := &Agent{ Config: cfg, DB: db, Runtime: rt, + Monitor: mon, Logger: logger, } @@ -86,7 +91,8 @@ func Run(cfg *config.AgentConfig) error { "runtime", cfg.Agent.ContainerRuntime, ) - // Graceful shutdown on signal. + mon.Start() + ctx, stop := signal.NotifyContext(context.Background(), syscall.SIGINT, syscall.SIGTERM) defer stop() @@ -98,9 +104,11 @@ func Run(cfg *config.AgentConfig) error { select { case <-ctx.Done(): logger.Info("shutting down") + mon.Stop() server.GracefulStop() return nil case err := <-errCh: + mon.Stop() return fmt.Errorf("serve: %w", err) } } diff --git a/internal/agent/deploy.go b/internal/agent/deploy.go index 649597c..9591527 100644 --- a/internal/agent/deploy.go +++ b/internal/agent/deploy.go @@ -37,8 +37,9 @@ func (a *Agent) Deploy(ctx context.Context, req *mcpv1.DeployRequest) (*mcpv1.De } var results []*mcpv1.ComponentResult + active := spec.GetActive() for _, cs := range components { - result := a.deployComponent(ctx, serviceName, cs) + result := a.deployComponent(ctx, serviceName, cs, active) results = append(results, result) } @@ -46,11 +47,16 @@ func (a *Agent) Deploy(ctx context.Context, req *mcpv1.DeployRequest) (*mcpv1.De } // deployComponent handles the full deploy lifecycle for a single component. -func (a *Agent) deployComponent(ctx context.Context, serviceName string, cs *mcpv1.ComponentSpec) *mcpv1.ComponentResult { +func (a *Agent) deployComponent(ctx context.Context, serviceName string, cs *mcpv1.ComponentSpec, active bool) *mcpv1.ComponentResult { compName := cs.GetName() containerName := serviceName + "-" + compName - a.Logger.Info("deploying component", "service", serviceName, "component", compName) + desiredState := "running" + if !active { + desiredState = "stopped" + } + + a.Logger.Info("deploying component", "service", serviceName, "component", compName, "desired", desiredState) regComp := ®istry.Component{ Name: compName, @@ -59,7 +65,7 @@ func (a *Agent) deployComponent(ctx context.Context, serviceName string, cs *mcp Network: cs.GetNetwork(), UserSpec: cs.GetUser(), Restart: cs.GetRestart(), - DesiredState: "running", + DesiredState: desiredState, Version: runtime.ExtractVersion(cs.GetImage()), Ports: cs.GetPorts(), Volumes: cs.GetVolumes(), diff --git a/internal/agent/nodestatus.go b/internal/agent/nodestatus.go new file mode 100644 index 0000000..26a34d7 --- /dev/null +++ b/internal/agent/nodestatus.go @@ -0,0 +1,67 @@ +package agent + +import ( + "context" + "os/exec" + "runtime" + "strings" + "time" + + mcpv1 "git.wntrmute.dev/kyle/mcp/gen/mcp/v1" + "git.wntrmute.dev/kyle/mcp/internal/registry" + "golang.org/x/sys/unix" + "google.golang.org/protobuf/types/known/timestamppb" +) + +// NodeStatus returns information about this agent's node. +func (a *Agent) NodeStatus(ctx context.Context, _ *mcpv1.NodeStatusRequest) (*mcpv1.NodeStatusResponse, error) { + services, err := registry.ListServices(a.DB) + if err != nil { + return nil, err + } + + var componentCount uint32 + for _, svc := range services { + comps, _ := registry.ListComponents(a.DB, svc.Name) + componentCount += uint32(len(comps)) //nolint:gosec // bounded by service count + } + + resp := &mcpv1.NodeStatusResponse{ + NodeName: a.Config.Agent.NodeName, + Runtime: a.Config.Agent.ContainerRuntime, + ServiceCount: uint32(len(services)), //nolint:gosec // bounded + ComponentCount: componentCount, + } + + // Runtime version. + cmd := exec.CommandContext(ctx, a.Config.Agent.ContainerRuntime, "--version") //nolint:gosec // trusted config + if out, err := cmd.Output(); err == nil { + resp.RuntimeVersion = strings.TrimSpace(string(out)) + } + + // Disk usage for /srv. + var stat unix.Statfs_t + if err := unix.Statfs("/srv", &stat); err == nil { + resp.DiskTotalBytes = stat.Blocks * uint64(stat.Bsize) //nolint:gosec // kernel values + resp.DiskFreeBytes = stat.Bavail * uint64(stat.Bsize) //nolint:gosec // kernel values + } + + // Memory. + var sysinfo unix.Sysinfo_t + if err := unix.Sysinfo(&sysinfo); err == nil { + resp.MemoryTotalBytes = sysinfo.Totalram + resp.MemoryFreeBytes = sysinfo.Freeram + } + + // CPU usage approximation: number of goroutines / GOMAXPROCS is a rough + // indicator. Real CPU monitoring would use /proc/stat, which is a v2 concern. + resp.CpuUsagePercent = float64(runtime.NumGoroutine()) / float64(runtime.GOMAXPROCS(0)) * 100 + + // Uptime: use sysinfo. + if err := unix.Sysinfo(&sysinfo); err == nil { + bootTime := time.Now().Add(-time.Duration(sysinfo.Uptime) * time.Second) + resp.UptimeSince = timestamppb.New(bootTime) + } + + return resp, nil +} diff --git a/internal/agent/status.go b/internal/agent/status.go index 1f7a0c9..4be4691 100644 --- a/internal/agent/status.go +++ b/internal/agent/status.go @@ -97,6 +97,9 @@ func (a *Agent) liveCheckServices(ctx context.Context) ([]*mcpv1.ServiceInfo, er if rc, ok := runtimeByName[containerName]; ok { ci.ObservedState = rc.State + if !rc.Started.IsZero() { + ci.Started = timestamppb.New(rc.Started) + } matched[containerName] = true } else { ci.ObservedState = "removed" diff --git a/internal/runtime/podman.go b/internal/runtime/podman.go index defc6d7..230649d 100644 --- a/internal/runtime/podman.go +++ b/internal/runtime/podman.go @@ -6,6 +6,7 @@ import ( "fmt" "os/exec" "strings" + "time" ) // Podman implements the Runtime interface using the podman CLI. @@ -98,7 +99,8 @@ type podmanInspectResult struct { User string `json:"User"` } `json:"Config"` State struct { - Status string `json:"Status"` + Status string `json:"Status"` + StartedAt string `json:"StartedAt"` } `json:"State"` HostConfig struct { RestartPolicy struct { @@ -142,6 +144,9 @@ func (p *Podman) Inspect(ctx context.Context, name string) (ContainerInfo, error Cmd: r.Config.Cmd, Version: ExtractVersion(r.Config.Image), } + if t, err := time.Parse(time.RFC3339Nano, r.State.StartedAt); err == nil { + info.Started = t + } info.Network = r.HostConfig.NetworkMode if len(r.NetworkSettings.Networks) > 0 { diff --git a/internal/runtime/runtime.go b/internal/runtime/runtime.go index 42b639a..0a65f6a 100644 --- a/internal/runtime/runtime.go +++ b/internal/runtime/runtime.go @@ -3,6 +3,7 @@ package runtime import ( "context" "strings" + "time" ) // ContainerSpec describes a container to create and run. @@ -28,7 +29,8 @@ type ContainerInfo struct { Ports []string Volumes []string Cmd []string - Version string // extracted from image tag + Version string // extracted from image tag + Started time.Time // when the container started (zero if not running) } // Runtime is the container runtime abstraction.