11 work units built in parallel and merged: Agent handlers (Phase 2): - P2.2 Deploy: pull images, stop/remove/run containers, update registry - P2.3 Lifecycle: stop/start/restart with desired_state tracking - P2.4 Status: list (registry), live check (runtime), get status (drift+events) - P2.5 Sync: receive desired state, reconcile unmanaged containers - P2.6 File transfer: push/pull scoped to /srv/<service>/, path validation - P2.7 Adopt: match <service>-* containers, derive component names - P2.8 Monitor: continuous watch loop, drift/flap alerting, event pruning - P2.9 Snapshot: VACUUM INTO database backup command CLI commands (Phase 3): - P3.2 Login, P3.3 Deploy, P3.4 Stop/Start/Restart - P3.5 List/Ps/Status, P3.6 Sync, P3.7 Adopt - P3.8 Service show/edit/export, P3.9 Push/Pull, P3.10 Node list/add/remove Deployment artifacts (Phase 4): - Systemd units (agent service + backup timer) - Example configs (CLI + agent) - Install script (idempotent) All packages: build, vet, lint (0 issues), test (all pass). Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
114 lines
3.1 KiB
Go
114 lines
3.1 KiB
Go
package monitor
|
|
|
|
import (
|
|
"database/sql"
|
|
"fmt"
|
|
"log/slog"
|
|
"os"
|
|
"os/exec"
|
|
"time"
|
|
|
|
"git.wntrmute.dev/kyle/mcp/internal/config"
|
|
"git.wntrmute.dev/kyle/mcp/internal/registry"
|
|
)
|
|
|
|
// Alerter decides, per component, whether a drift or flapping alert should be
// raised, rate-limits alerts with a per-component cooldown, and optionally
// runs an external alert command.
type Alerter struct {
	// command is the argv of the external alert command: element 0 is the
	// program, the rest are its arguments. When empty, alerts are only logged.
	command []string
	// cooldown is the minimum time between two alerts for the same key.
	cooldown time.Duration
	// flapThreshold is the event count at or above which a state transition
	// is reported as flapping.
	flapThreshold int
	// flapWindow is how far back from now events are counted for the flap check.
	flapWindow time.Duration
	// nodeName is exported to the alert command as MCP_NODE.
	nodeName string

	// db is the handle passed to registry.CountEvents for the flap check.
	db *sql.DB
	// logger receives alert records and alerter errors.
	logger *slog.Logger

	// lastAlert maps "service/component" to the time the most recent alert
	// for that component was allowed through the cooldown.
	lastAlert map[string]time.Time
}
|
|
|
|
// NewAlerter creates an Alerter from monitoring configuration.
|
|
func NewAlerter(cfg config.MonitorConfig, nodeName string, db *sql.DB, logger *slog.Logger) *Alerter {
|
|
return &Alerter{
|
|
command: cfg.AlertCommand,
|
|
cooldown: cfg.Cooldown.Duration,
|
|
flapThreshold: cfg.FlapThreshold,
|
|
flapWindow: cfg.FlapWindow.Duration,
|
|
nodeName: nodeName,
|
|
db: db,
|
|
logger: logger,
|
|
lastAlert: make(map[string]time.Time),
|
|
}
|
|
}
|
|
|
|
// Evaluate checks a component's state transition and fires alerts as needed.
|
|
// It is called for every component on each monitor tick.
|
|
func (al *Alerter) Evaluate(service, component, desiredState, observedState, prevState string) {
|
|
if desiredState == "ignore" {
|
|
return
|
|
}
|
|
|
|
key := service + "/" + component
|
|
|
|
// Drift check: desired state does not match observed state.
|
|
if desiredState != observedState {
|
|
if al.cooledDown(key) {
|
|
al.fire("drift", service, component, desiredState, observedState, prevState, 0)
|
|
}
|
|
}
|
|
|
|
// Flap check: too many transitions in the flap window.
|
|
if observedState != prevState {
|
|
count, err := registry.CountEvents(al.db, service, component, time.Now().Add(-al.flapWindow))
|
|
if err != nil {
|
|
al.logger.Error("alerter: count events", "error", err, "key", key)
|
|
return
|
|
}
|
|
|
|
if count >= al.flapThreshold {
|
|
if al.cooledDown(key) {
|
|
al.fire("flapping", service, component, desiredState, observedState, prevState, count)
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
// cooledDown returns true and records the alert time if enough time has
|
|
// elapsed since the last alert for this key. Returns false if suppressed.
|
|
func (al *Alerter) cooledDown(key string) bool {
|
|
if last, ok := al.lastAlert[key]; ok {
|
|
if time.Since(last) < al.cooldown {
|
|
return false
|
|
}
|
|
}
|
|
al.lastAlert[key] = time.Now()
|
|
return true
|
|
}
|
|
|
|
func (al *Alerter) fire(alertType, service, component, desired, observed, prev string, transitions int) {
|
|
al.logger.Warn("alert",
|
|
"type", alertType,
|
|
"service", service,
|
|
"component", component,
|
|
"desired", desired,
|
|
"observed", observed,
|
|
)
|
|
|
|
if len(al.command) == 0 {
|
|
return
|
|
}
|
|
|
|
cmd := exec.Command(al.command[0], al.command[1:]...) //nolint:gosec // alert command from trusted config
|
|
cmd.Env = append(os.Environ(),
|
|
"MCP_COMPONENT="+component,
|
|
"MCP_SERVICE="+service,
|
|
"MCP_NODE="+al.nodeName,
|
|
"MCP_DESIRED="+desired,
|
|
"MCP_OBSERVED="+observed,
|
|
"MCP_PREV_STATE="+prev,
|
|
"MCP_ALERT_TYPE="+alertType,
|
|
"MCP_TRANSITIONS="+fmt.Sprintf("%d", transitions),
|
|
)
|
|
|
|
if out, err := cmd.CombinedOutput(); err != nil {
|
|
al.logger.Error("alert command failed", "error", err, "output", string(out))
|
|
}
|
|
}
|