P2.2-P2.9, P3.2-P3.10, P4.1-P4.3: Complete Phases 2, 3, and 4
11 work units built in parallel and merged:

Agent handlers (Phase 2):
- P2.2 Deploy: pull images, stop/remove/run containers, update registry
- P2.3 Lifecycle: stop/start/restart with desired_state tracking
- P2.4 Status: list (registry), live check (runtime), get status (drift+events)
- P2.5 Sync: receive desired state, reconcile unmanaged containers
- P2.6 File transfer: push/pull scoped to /srv/<service>/, path validation
- P2.7 Adopt: match <service>-* containers, derive component names
- P2.8 Monitor: continuous watch loop, drift/flap alerting, event pruning
- P2.9 Snapshot: VACUUM INTO database backup command

CLI commands (Phase 3):
- P3.2 Login, P3.3 Deploy, P3.4 Stop/Start/Restart
- P3.5 List/Ps/Status, P3.6 Sync, P3.7 Adopt
- P3.8 Service show/edit/export, P3.9 Push/Pull, P3.10 Node list/add/remove

Deployment artifacts (Phase 4):
- Systemd units (agent service + backup timer)
- Example configs (CLI + agent)
- Install script (idempotent)

All packages: build, vet, lint (0 issues), test (all pass).

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
113
internal/monitor/alerting.go
Normal file
113
internal/monitor/alerting.go
Normal file
@@ -0,0 +1,113 @@
|
||||
package monitor
|
||||
|
||||
import (
	"context"
	"database/sql"
	"fmt"
	"log/slog"
	"os"
	"os/exec"
	"time"

	"git.wntrmute.dev/kyle/mcp/internal/config"
	"git.wntrmute.dev/kyle/mcp/internal/registry"
)
|
||||
|
||||
// Alerter watches component state transitions and raises "drift" and
// "flapping" alerts, rate-limited per component by a cooldown.
//
// The type does no locking around lastAlert.
// NOTE(review): presumably used from a single goroutine — confirm against the caller.
type Alerter struct {
	// Collaborators.

	// db is the registry database handed to registry.CountEvents.
	db *sql.DB
	// logger receives alert records and error reports.
	logger *slog.Logger

	// Identity.

	// nodeName is exported to the alert command as MCP_NODE.
	nodeName string

	// Alerting policy.

	// command is the argv of the external alert command; when empty,
	// alerts are only logged.
	command []string
	// cooldown is the minimum time between two alerts for the same key.
	cooldown time.Duration
	// flapThreshold is the event count, within flapWindow, at or above
	// which a state change is reported as flapping.
	flapThreshold int
	// flapWindow is how far back events are counted for the flap check.
	flapWindow time.Duration

	// Mutable state.

	// lastAlert records when the most recent alert was allowed for each
	// "service/component" key.
	lastAlert map[string]time.Time
}
|
||||
|
||||
// NewAlerter creates an Alerter from monitoring configuration.
|
||||
func NewAlerter(cfg config.MonitorConfig, nodeName string, db *sql.DB, logger *slog.Logger) *Alerter {
|
||||
return &Alerter{
|
||||
command: cfg.AlertCommand,
|
||||
cooldown: cfg.Cooldown.Duration,
|
||||
flapThreshold: cfg.FlapThreshold,
|
||||
flapWindow: cfg.FlapWindow.Duration,
|
||||
nodeName: nodeName,
|
||||
db: db,
|
||||
logger: logger,
|
||||
lastAlert: make(map[string]time.Time),
|
||||
}
|
||||
}
|
||||
|
||||
// Evaluate checks a component's state transition and fires alerts as needed.
|
||||
// It is called for every component on each monitor tick.
|
||||
func (al *Alerter) Evaluate(service, component, desiredState, observedState, prevState string) {
|
||||
if desiredState == "ignore" {
|
||||
return
|
||||
}
|
||||
|
||||
key := service + "/" + component
|
||||
|
||||
// Drift check: desired state does not match observed state.
|
||||
if desiredState != observedState {
|
||||
if al.cooledDown(key) {
|
||||
al.fire("drift", service, component, desiredState, observedState, prevState, 0)
|
||||
}
|
||||
}
|
||||
|
||||
// Flap check: too many transitions in the flap window.
|
||||
if observedState != prevState {
|
||||
count, err := registry.CountEvents(al.db, service, component, time.Now().Add(-al.flapWindow))
|
||||
if err != nil {
|
||||
al.logger.Error("alerter: count events", "error", err, "key", key)
|
||||
return
|
||||
}
|
||||
|
||||
if count >= al.flapThreshold {
|
||||
if al.cooledDown(key) {
|
||||
al.fire("flapping", service, component, desiredState, observedState, prevState, count)
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// cooledDown returns true and records the alert time if enough time has
|
||||
// elapsed since the last alert for this key. Returns false if suppressed.
|
||||
func (al *Alerter) cooledDown(key string) bool {
|
||||
if last, ok := al.lastAlert[key]; ok {
|
||||
if time.Since(last) < al.cooldown {
|
||||
return false
|
||||
}
|
||||
}
|
||||
al.lastAlert[key] = time.Now()
|
||||
return true
|
||||
}
|
||||
|
||||
func (al *Alerter) fire(alertType, service, component, desired, observed, prev string, transitions int) {
|
||||
al.logger.Warn("alert",
|
||||
"type", alertType,
|
||||
"service", service,
|
||||
"component", component,
|
||||
"desired", desired,
|
||||
"observed", observed,
|
||||
)
|
||||
|
||||
if len(al.command) == 0 {
|
||||
return
|
||||
}
|
||||
|
||||
cmd := exec.Command(al.command[0], al.command[1:]...) //nolint:gosec // alert command from trusted config
|
||||
cmd.Env = append(os.Environ(),
|
||||
"MCP_COMPONENT="+component,
|
||||
"MCP_SERVICE="+service,
|
||||
"MCP_NODE="+al.nodeName,
|
||||
"MCP_DESIRED="+desired,
|
||||
"MCP_OBSERVED="+observed,
|
||||
"MCP_PREV_STATE="+prev,
|
||||
"MCP_ALERT_TYPE="+alertType,
|
||||
"MCP_TRANSITIONS="+fmt.Sprintf("%d", transitions),
|
||||
)
|
||||
|
||||
if out, err := cmd.CombinedOutput(); err != nil {
|
||||
al.logger.Error("alert command failed", "error", err, "output", string(out))
|
||||
}
|
||||
}
|
||||
Reference in New Issue
Block a user