P2.2-P2.9, P3.2-P3.10, P4.1-P4.3: Complete Phases 2, 3, and 4
11 work units built in parallel and merged: Agent handlers (Phase 2): - P2.2 Deploy: pull images, stop/remove/run containers, update registry - P2.3 Lifecycle: stop/start/restart with desired_state tracking - P2.4 Status: list (registry), live check (runtime), get status (drift+events) - P2.5 Sync: receive desired state, reconcile unmanaged containers - P2.6 File transfer: push/pull scoped to /srv/<service>/, path validation - P2.7 Adopt: match <service>-* containers, derive component names - P2.8 Monitor: continuous watch loop, drift/flap alerting, event pruning - P2.9 Snapshot: VACUUM INTO database backup command CLI commands (Phase 3): - P3.2 Login, P3.3 Deploy, P3.4 Stop/Start/Restart - P3.5 List/Ps/Status, P3.6 Sync, P3.7 Adopt - P3.8 Service show/edit/export, P3.9 Push/Pull, P3.10 Node list/add/remove Deployment artifacts (Phase 4): - Systemd units (agent service + backup timer) - Example configs (CLI + agent) - Install script (idempotent) All packages: build, vet, lint (0 issues), test (all pass). Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
113
internal/monitor/alerting.go
Normal file
113
internal/monitor/alerting.go
Normal file
@@ -0,0 +1,113 @@
|
||||
package monitor
|
||||
|
||||
import (
|
||||
"database/sql"
|
||||
"fmt"
|
||||
"log/slog"
|
||||
"os"
|
||||
"os/exec"
|
||||
"time"
|
||||
|
||||
"git.wntrmute.dev/kyle/mcp/internal/config"
|
||||
"git.wntrmute.dev/kyle/mcp/internal/registry"
|
||||
)
|
||||
|
||||
// Alerter evaluates state transitions and fires alerts for drift or flapping.
type Alerter struct {
	command       []string      // external alert command + args; empty means log-only alerts
	cooldown      time.Duration // minimum gap between alerts for the same component
	flapThreshold int           // event count at/above which a component counts as flapping
	flapWindow    time.Duration // look-back window used when counting transition events
	nodeName      string        // exported to the alert command as MCP_NODE
	db            *sql.DB       // registry database used for event counting
	logger        *slog.Logger
	lastAlert     map[string]time.Time // key: "service/component"
}
|
||||
|
||||
// NewAlerter creates an Alerter from monitoring configuration.
|
||||
func NewAlerter(cfg config.MonitorConfig, nodeName string, db *sql.DB, logger *slog.Logger) *Alerter {
|
||||
return &Alerter{
|
||||
command: cfg.AlertCommand,
|
||||
cooldown: cfg.Cooldown.Duration,
|
||||
flapThreshold: cfg.FlapThreshold,
|
||||
flapWindow: cfg.FlapWindow.Duration,
|
||||
nodeName: nodeName,
|
||||
db: db,
|
||||
logger: logger,
|
||||
lastAlert: make(map[string]time.Time),
|
||||
}
|
||||
}
|
||||
|
||||
// Evaluate checks a component's state transition and fires alerts as needed.
|
||||
// It is called for every component on each monitor tick.
|
||||
func (al *Alerter) Evaluate(service, component, desiredState, observedState, prevState string) {
|
||||
if desiredState == "ignore" {
|
||||
return
|
||||
}
|
||||
|
||||
key := service + "/" + component
|
||||
|
||||
// Drift check: desired state does not match observed state.
|
||||
if desiredState != observedState {
|
||||
if al.cooledDown(key) {
|
||||
al.fire("drift", service, component, desiredState, observedState, prevState, 0)
|
||||
}
|
||||
}
|
||||
|
||||
// Flap check: too many transitions in the flap window.
|
||||
if observedState != prevState {
|
||||
count, err := registry.CountEvents(al.db, service, component, time.Now().Add(-al.flapWindow))
|
||||
if err != nil {
|
||||
al.logger.Error("alerter: count events", "error", err, "key", key)
|
||||
return
|
||||
}
|
||||
|
||||
if count >= al.flapThreshold {
|
||||
if al.cooledDown(key) {
|
||||
al.fire("flapping", service, component, desiredState, observedState, prevState, count)
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// cooledDown returns true and records the alert time if enough time has
|
||||
// elapsed since the last alert for this key. Returns false if suppressed.
|
||||
func (al *Alerter) cooledDown(key string) bool {
|
||||
if last, ok := al.lastAlert[key]; ok {
|
||||
if time.Since(last) < al.cooldown {
|
||||
return false
|
||||
}
|
||||
}
|
||||
al.lastAlert[key] = time.Now()
|
||||
return true
|
||||
}
|
||||
|
||||
func (al *Alerter) fire(alertType, service, component, desired, observed, prev string, transitions int) {
|
||||
al.logger.Warn("alert",
|
||||
"type", alertType,
|
||||
"service", service,
|
||||
"component", component,
|
||||
"desired", desired,
|
||||
"observed", observed,
|
||||
)
|
||||
|
||||
if len(al.command) == 0 {
|
||||
return
|
||||
}
|
||||
|
||||
cmd := exec.Command(al.command[0], al.command[1:]...) //nolint:gosec // alert command from trusted config
|
||||
cmd.Env = append(os.Environ(),
|
||||
"MCP_COMPONENT="+component,
|
||||
"MCP_SERVICE="+service,
|
||||
"MCP_NODE="+al.nodeName,
|
||||
"MCP_DESIRED="+desired,
|
||||
"MCP_OBSERVED="+observed,
|
||||
"MCP_PREV_STATE="+prev,
|
||||
"MCP_ALERT_TYPE="+alertType,
|
||||
"MCP_TRANSITIONS="+fmt.Sprintf("%d", transitions),
|
||||
)
|
||||
|
||||
if out, err := cmd.CombinedOutput(); err != nil {
|
||||
al.logger.Error("alert command failed", "error", err, "output", string(out))
|
||||
}
|
||||
}
|
||||
157
internal/monitor/monitor.go
Normal file
157
internal/monitor/monitor.go
Normal file
@@ -0,0 +1,157 @@
|
||||
package monitor
|
||||
|
||||
import (
|
||||
"context"
|
||||
"database/sql"
|
||||
"fmt"
|
||||
"log/slog"
|
||||
"time"
|
||||
|
||||
"git.wntrmute.dev/kyle/mcp/internal/config"
|
||||
"git.wntrmute.dev/kyle/mcp/internal/registry"
|
||||
"git.wntrmute.dev/kyle/mcp/internal/runtime"
|
||||
)
|
||||
|
||||
// Monitor watches container states and compares them to the registry,
// recording events and firing alerts on drift or flapping.
type Monitor struct {
	db      *sql.DB
	runtime runtime.Runtime
	cfg     config.MonitorConfig
	logger  *slog.Logger
	alerter *Alerter
	stopCh  chan struct{} // closed by Stop to ask the run loop to exit
	done    chan struct{} // closed by run when the loop has exited

	prevState map[string]string // key: "service/component", value: observed state
}
|
||||
|
||||
// New creates a Monitor with the given dependencies.
|
||||
func New(db *sql.DB, rt runtime.Runtime, cfg config.MonitorConfig, nodeName string, logger *slog.Logger) *Monitor {
|
||||
return &Monitor{
|
||||
db: db,
|
||||
runtime: rt,
|
||||
cfg: cfg,
|
||||
logger: logger,
|
||||
alerter: NewAlerter(cfg, nodeName, db, logger),
|
||||
stopCh: make(chan struct{}),
|
||||
done: make(chan struct{}),
|
||||
prevState: make(map[string]string),
|
||||
}
|
||||
}
|
||||
|
||||
// Start launches the monitoring goroutine; the loop runs until Stop is
// called. NOTE(review): stopCh/done are single-use channels, so Start is
// presumably meant to be called at most once per Monitor -- confirm callers.
func (m *Monitor) Start() {
	go m.run()
}
|
||||
|
||||
// Stop signals the monitoring goroutine to stop and waits for it to exit.
// NOTE(review): not idempotent -- a second Stop panics on the double close
// of stopCh, and Stop without a prior Start blocks forever on done. Confirm
// callers pair exactly one Start with exactly one Stop.
func (m *Monitor) Stop() {
	close(m.stopCh)
	<-m.done
}
|
||||
|
||||
// run is the monitor loop body, executed on its own goroutine by Start.
func (m *Monitor) run() {
	// Deferred funcs run LIFO: the recover below executes first, then done
	// is closed, so Stop always unblocks even if the loop panicked.
	defer close(m.done)
	defer func() {
		if r := recover(); r != nil {
			m.logger.Error("monitor panic recovered", "panic", fmt.Sprintf("%v", r))
		}
	}()

	// NOTE(review): time.NewTicker panics for a non-positive interval --
	// confirm config validation guarantees Interval > 0.
	ticker := time.NewTicker(m.cfg.Interval.Duration)
	defer ticker.Stop()

	for {
		select {
		case <-m.stopCh:
			return
		case <-ticker.C:
			m.tick()
		}
	}
}
|
||||
|
||||
// tick performs one reconciliation pass: it snapshots container states from
// the runtime, compares each registered component's observed state with the
// previous pass, records transition events, updates the registry, and lets
// the alerter evaluate drift and flapping. Errors are logged, never fatal.
func (m *Monitor) tick() {
	// A panicking tick must not kill the monitor goroutine's loop in run.
	defer func() {
		if r := recover(); r != nil {
			m.logger.Error("monitor tick panic recovered", "panic", fmt.Sprintf("%v", r))
		}
	}()

	ctx := context.Background()

	// Get the current runtime state of all containers.
	containers, err := m.runtime.List(ctx)
	if err != nil {
		m.logger.Error("monitor: list containers", "error", err)
		return
	}

	// Index runtime containers by name for fast lookup.
	runtimeState := make(map[string]string, len(containers))
	for _, c := range containers {
		runtimeState[c.Name] = c.State
	}

	// Walk all registered services and their components.
	services, err := registry.ListServices(m.db)
	if err != nil {
		m.logger.Error("monitor: list services", "error", err)
		return
	}

	// Tracks every component key seen this pass so stale prevState entries
	// can be evicted afterwards.
	seen := make(map[string]struct{})

	for _, svc := range services {
		components, err := registry.ListComponents(m.db, svc.Name)
		if err != nil {
			m.logger.Error("monitor: list components", "error", err, "service", svc.Name)
			continue
		}

		for _, comp := range components {
			key := comp.Service + "/" + comp.Name
			seen[key] = struct{}{}
			// Container naming convention: "<service>-<component>".
			containerName := comp.Service + "-" + comp.Name

			// Components with no matching container are "unknown".
			observed := "unknown"
			if state, ok := runtimeState[containerName]; ok {
				observed = state
			}

			// On the first pass for a key (agent restart, or after eviction)
			// seed prev from the registry so no spurious event is emitted.
			prev, hasPrev := m.prevState[key]
			if !hasPrev {
				prev = comp.ObservedState
			}

			if observed != prev {
				if err := registry.InsertEvent(m.db, comp.Service, comp.Name, prev, observed); err != nil {
					m.logger.Error("monitor: insert event", "error", err, "key", key)
				}

				// NOTE(review): the empty string presumably means "leave
				// desired_state unchanged" -- confirm against
				// registry.UpdateComponentState.
				if err := registry.UpdateComponentState(m.db, comp.Service, comp.Name, "", observed); err != nil {
					m.logger.Error("monitor: update observed state", "error", err, "key", key)
				}

				m.logger.Info("state change", "service", comp.Service, "component", comp.Name, "prev", prev, "observed", observed)
			}

			// The alerter runs every tick, not just on transitions, so that
			// persistent drift keeps re-alerting after each cooldown.
			m.alerter.Evaluate(comp.Service, comp.Name, comp.DesiredState, observed, prev)

			m.prevState[key] = observed
		}
	}

	// Evict entries for components that no longer exist in the registry.
	for key := range m.prevState {
		if _, ok := seen[key]; !ok {
			delete(m.prevState, key)
		}
	}

	// Prune old events.
	if _, err := registry.PruneEvents(m.db, time.Now().Add(-m.cfg.Retention.Duration)); err != nil {
		m.logger.Error("monitor: prune events", "error", err)
	}
}
|
||||
280
internal/monitor/monitor_test.go
Normal file
280
internal/monitor/monitor_test.go
Normal file
@@ -0,0 +1,280 @@
|
||||
package monitor
|
||||
|
||||
import (
|
||||
"context"
|
||||
"database/sql"
|
||||
"log/slog"
|
||||
"os"
|
||||
"path/filepath"
|
||||
"testing"
|
||||
"time"
|
||||
|
||||
"git.wntrmute.dev/kyle/mcp/internal/config"
|
||||
"git.wntrmute.dev/kyle/mcp/internal/registry"
|
||||
"git.wntrmute.dev/kyle/mcp/internal/runtime"
|
||||
)
|
||||
|
||||
func openTestDB(t *testing.T) *sql.DB {
|
||||
t.Helper()
|
||||
db, err := registry.Open(filepath.Join(t.TempDir(), "test.db"))
|
||||
if err != nil {
|
||||
t.Fatalf("open db: %v", err)
|
||||
}
|
||||
t.Cleanup(func() { _ = db.Close() })
|
||||
return db
|
||||
}
|
||||
|
||||
// testLogger returns a logger that writes to stderr but suppresses
// everything below the error level, keeping test output quiet.
func testLogger() *slog.Logger {
	opts := &slog.HandlerOptions{Level: slog.LevelError}
	return slog.New(slog.NewTextHandler(os.Stderr, opts))
}
|
||||
|
||||
func testMonitorConfig() config.MonitorConfig {
|
||||
return config.MonitorConfig{
|
||||
Interval: config.Duration{Duration: 1 * time.Second},
|
||||
Cooldown: config.Duration{Duration: 1 * time.Minute},
|
||||
FlapThreshold: 3,
|
||||
FlapWindow: config.Duration{Duration: 10 * time.Minute},
|
||||
Retention: config.Duration{Duration: 24 * time.Hour},
|
||||
}
|
||||
}
|
||||
|
||||
// fakeRuntime implements runtime.Runtime for testing. It serves a canned
// container list from List and treats all mutating operations as
// successful no-ops.
type fakeRuntime struct {
	containers []runtime.ContainerInfo // returned verbatim by List
}

func (f *fakeRuntime) Pull(_ context.Context, _ string) error               { return nil }
func (f *fakeRuntime) Run(_ context.Context, _ runtime.ContainerSpec) error { return nil }
func (f *fakeRuntime) Stop(_ context.Context, _ string) error               { return nil }
func (f *fakeRuntime) Remove(_ context.Context, _ string) error             { return nil }

// Inspect returns an empty ContainerInfo; the monitor tests exercised here
// only rely on List.
func (f *fakeRuntime) Inspect(_ context.Context, _ string) (runtime.ContainerInfo, error) {
	return runtime.ContainerInfo{}, nil
}

// List returns the configured container snapshot.
func (f *fakeRuntime) List(_ context.Context) ([]runtime.ContainerInfo, error) {
	return f.containers, nil
}
|
||||
|
||||
func TestAlerterDriftDetection(t *testing.T) {
|
||||
db := openTestDB(t)
|
||||
logger := testLogger()
|
||||
cfg := testMonitorConfig()
|
||||
|
||||
al := NewAlerter(cfg, "test-node", db, logger)
|
||||
|
||||
// Set up a service and component so CountEvents works.
|
||||
if err := registry.CreateService(db, "metacrypt", true); err != nil {
|
||||
t.Fatalf("create service: %v", err)
|
||||
}
|
||||
if err := registry.CreateComponent(db, ®istry.Component{
|
||||
Name: "api", Service: "metacrypt", Image: "img:v1",
|
||||
Restart: "unless-stopped", DesiredState: "running", ObservedState: "running",
|
||||
}); err != nil {
|
||||
t.Fatalf("create component: %v", err)
|
||||
}
|
||||
|
||||
// Desired is "running" but observed is "exited" -- drift should fire.
|
||||
al.Evaluate("metacrypt", "api", "running", "exited", "running")
|
||||
|
||||
// Verify alert was recorded (lastAlert should be set).
|
||||
key := "metacrypt/api"
|
||||
if _, ok := al.lastAlert[key]; !ok {
|
||||
t.Fatal("expected drift alert to be recorded in lastAlert")
|
||||
}
|
||||
}
|
||||
|
||||
func TestAlerterIgnoreState(t *testing.T) {
|
||||
db := openTestDB(t)
|
||||
logger := testLogger()
|
||||
cfg := testMonitorConfig()
|
||||
|
||||
al := NewAlerter(cfg, "test-node", db, logger)
|
||||
|
||||
// Components with desired_state "ignore" should not trigger alerts.
|
||||
al.Evaluate("metacrypt", "api", "ignore", "exited", "running")
|
||||
|
||||
key := "metacrypt/api"
|
||||
if _, ok := al.lastAlert[key]; ok {
|
||||
t.Fatal("expected no alert for ignored component")
|
||||
}
|
||||
}
|
||||
|
||||
func TestAlerterCooldownSuppression(t *testing.T) {
|
||||
db := openTestDB(t)
|
||||
logger := testLogger()
|
||||
cfg := testMonitorConfig()
|
||||
cfg.Cooldown.Duration = 1 * time.Hour // long cooldown
|
||||
|
||||
al := NewAlerter(cfg, "test-node", db, logger)
|
||||
|
||||
if err := registry.CreateService(db, "metacrypt", true); err != nil {
|
||||
t.Fatalf("create service: %v", err)
|
||||
}
|
||||
if err := registry.CreateComponent(db, ®istry.Component{
|
||||
Name: "api", Service: "metacrypt", Image: "img:v1",
|
||||
Restart: "unless-stopped", DesiredState: "running", ObservedState: "running",
|
||||
}); err != nil {
|
||||
t.Fatalf("create component: %v", err)
|
||||
}
|
||||
|
||||
// First call should fire.
|
||||
al.Evaluate("metacrypt", "api", "running", "exited", "running")
|
||||
key := "metacrypt/api"
|
||||
first, ok := al.lastAlert[key]
|
||||
if !ok {
|
||||
t.Fatal("expected first alert to fire")
|
||||
}
|
||||
|
||||
// Second call should be suppressed (within cooldown).
|
||||
al.Evaluate("metacrypt", "api", "running", "exited", "exited")
|
||||
second := al.lastAlert[key]
|
||||
if !second.Equal(first) {
|
||||
t.Fatal("expected second alert to be suppressed by cooldown")
|
||||
}
|
||||
}
|
||||
|
||||
func TestAlerterFlapDetection(t *testing.T) {
|
||||
db := openTestDB(t)
|
||||
logger := testLogger()
|
||||
cfg := testMonitorConfig()
|
||||
cfg.FlapThreshold = 2
|
||||
cfg.FlapWindow.Duration = 10 * time.Minute
|
||||
cfg.Cooldown.Duration = 0 // disable cooldown for this test
|
||||
|
||||
al := NewAlerter(cfg, "test-node", db, logger)
|
||||
|
||||
if err := registry.CreateService(db, "metacrypt", true); err != nil {
|
||||
t.Fatalf("create service: %v", err)
|
||||
}
|
||||
if err := registry.CreateComponent(db, ®istry.Component{
|
||||
Name: "api", Service: "metacrypt", Image: "img:v1",
|
||||
Restart: "unless-stopped", DesiredState: "running", ObservedState: "unknown",
|
||||
}); err != nil {
|
||||
t.Fatalf("create component: %v", err)
|
||||
}
|
||||
|
||||
// Insert enough events to exceed the flap threshold.
|
||||
for i := 0; i < 3; i++ {
|
||||
if err := registry.InsertEvent(db, "metacrypt", "api", "running", "exited"); err != nil {
|
||||
t.Fatalf("insert event %d: %v", i, err)
|
||||
}
|
||||
}
|
||||
|
||||
// Evaluate with a state transition -- should detect flapping.
|
||||
al.Evaluate("metacrypt", "api", "running", "exited", "running")
|
||||
|
||||
key := "metacrypt/api"
|
||||
if _, ok := al.lastAlert[key]; !ok {
|
||||
t.Fatal("expected flap alert to fire")
|
||||
}
|
||||
}
|
||||
|
||||
func TestMonitorTickStateChange(t *testing.T) {
|
||||
db := openTestDB(t)
|
||||
logger := testLogger()
|
||||
cfg := testMonitorConfig()
|
||||
|
||||
if err := registry.CreateService(db, "metacrypt", true); err != nil {
|
||||
t.Fatalf("create service: %v", err)
|
||||
}
|
||||
if err := registry.CreateComponent(db, ®istry.Component{
|
||||
Name: "api", Service: "metacrypt", Image: "img:v1",
|
||||
Restart: "unless-stopped", DesiredState: "running", ObservedState: "unknown",
|
||||
}); err != nil {
|
||||
t.Fatalf("create component: %v", err)
|
||||
}
|
||||
|
||||
rt := &fakeRuntime{
|
||||
containers: []runtime.ContainerInfo{
|
||||
{Name: "metacrypt-api", State: "running"},
|
||||
},
|
||||
}
|
||||
|
||||
m := New(db, rt, cfg, "test-node", logger)
|
||||
|
||||
// Run a single tick.
|
||||
m.tick()
|
||||
|
||||
// Verify observed state was updated in the registry.
|
||||
comp, err := registry.GetComponent(db, "metacrypt", "api")
|
||||
if err != nil {
|
||||
t.Fatalf("get component: %v", err)
|
||||
}
|
||||
if comp.ObservedState != "running" {
|
||||
t.Fatalf("observed state: got %q, want %q", comp.ObservedState, "running")
|
||||
}
|
||||
|
||||
// Verify an event was recorded (unknown -> running).
|
||||
events, err := registry.QueryEvents(db, "metacrypt", "api", time.Now().Add(-1*time.Hour), 0)
|
||||
if err != nil {
|
||||
t.Fatalf("query events: %v", err)
|
||||
}
|
||||
if len(events) != 1 {
|
||||
t.Fatalf("events: got %d, want 1", len(events))
|
||||
}
|
||||
if events[0].PrevState != "unknown" || events[0].NewState != "running" {
|
||||
t.Fatalf("event: got %q->%q, want unknown->running", events[0].PrevState, events[0].NewState)
|
||||
}
|
||||
|
||||
// Verify prevState map was updated.
|
||||
if m.prevState["metacrypt/api"] != "running" {
|
||||
t.Fatalf("prevState: got %q, want %q", m.prevState["metacrypt/api"], "running")
|
||||
}
|
||||
}
|
||||
|
||||
func TestMonitorStartStop(t *testing.T) {
|
||||
db := openTestDB(t)
|
||||
logger := testLogger()
|
||||
cfg := testMonitorConfig()
|
||||
cfg.Interval.Duration = 50 * time.Millisecond
|
||||
|
||||
rt := &fakeRuntime{}
|
||||
m := New(db, rt, cfg, "test-node", logger)
|
||||
|
||||
m.Start()
|
||||
|
||||
// Give it a moment to tick at least once.
|
||||
time.Sleep(150 * time.Millisecond)
|
||||
|
||||
m.Stop()
|
||||
|
||||
// If Stop returns, the goroutine exited cleanly.
|
||||
}
|
||||
|
||||
func TestMonitorNoChangeNoEvent(t *testing.T) {
|
||||
db := openTestDB(t)
|
||||
logger := testLogger()
|
||||
cfg := testMonitorConfig()
|
||||
|
||||
if err := registry.CreateService(db, "metacrypt", true); err != nil {
|
||||
t.Fatalf("create service: %v", err)
|
||||
}
|
||||
if err := registry.CreateComponent(db, ®istry.Component{
|
||||
Name: "api", Service: "metacrypt", Image: "img:v1",
|
||||
Restart: "unless-stopped", DesiredState: "running", ObservedState: "running",
|
||||
}); err != nil {
|
||||
t.Fatalf("create component: %v", err)
|
||||
}
|
||||
|
||||
rt := &fakeRuntime{
|
||||
containers: []runtime.ContainerInfo{
|
||||
{Name: "metacrypt-api", State: "running"},
|
||||
},
|
||||
}
|
||||
|
||||
m := New(db, rt, cfg, "test-node", logger)
|
||||
// Seed prevState so that observed == prev (no change).
|
||||
m.prevState["metacrypt/api"] = "running"
|
||||
|
||||
m.tick()
|
||||
|
||||
// No events should be recorded when state is unchanged.
|
||||
events, err := registry.QueryEvents(db, "metacrypt", "api", time.Now().Add(-1*time.Hour), 0)
|
||||
if err != nil {
|
||||
t.Fatalf("query events: %v", err)
|
||||
}
|
||||
if len(events) != 0 {
|
||||
t.Fatalf("events: got %d, want 0 (no state change)", len(events))
|
||||
}
|
||||
}
|
||||
Reference in New Issue
Block a user