Files
mcp/internal/monitor/alerting.go
Kyle Isom 08b3e2a472 Migrate module path from kyle/ to mc/ org
All import paths updated to git.wntrmute.dev/mc/. Bumps mcdsl to v1.2.0,
mc-proxy to v1.1.0.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-03-27 02:07:42 -07:00

114 lines
3.1 KiB
Go

package monitor
import (
"database/sql"
"fmt"
"log/slog"
"os"
"os/exec"
"time"
"git.wntrmute.dev/mc/mcp/internal/config"
"git.wntrmute.dev/mc/mcp/internal/registry"
)
// Alerter inspects per-component state transitions and raises "drift" and
// "flapping" alerts, rate-limited per component by a cooldown.
type Alerter struct {
	// Alert delivery and thresholds, copied from the monitor configuration.
	command       []string      // argv of the alert command; when empty, alerts are only logged
	cooldown      time.Duration // minimum spacing between alerts for one key
	flapThreshold int           // event count within flapWindow at which a component is flapping
	flapWindow    time.Duration // look-back period used when counting events

	// Identity and collaborators.
	nodeName string       // passed to the alert command as MCP_NODE
	db       *sql.DB      // handed to registry.CountEvents for the flap check
	logger   *slog.Logger // receives alert and error records

	// Rate-limiting state.
	lastAlert map[string]time.Time // key: "service/component"
}
// NewAlerter builds an Alerter for the given node. The alert command,
// cooldown, flap threshold and flap window are taken from cfg; db and logger
// are stored as-is. The returned Alerter starts with no alert history.
func NewAlerter(cfg config.MonitorConfig, nodeName string, db *sql.DB, logger *slog.Logger) *Alerter {
	al := new(Alerter)

	// Settings taken from the monitoring configuration.
	al.command = cfg.AlertCommand
	al.cooldown = cfg.Cooldown.Duration
	al.flapThreshold = cfg.FlapThreshold
	al.flapWindow = cfg.FlapWindow.Duration

	// Identity and collaborators supplied by the caller.
	al.nodeName = nodeName
	al.db = db
	al.logger = logger

	// The map must be non-nil because cooledDown writes to it.
	al.lastAlert = map[string]time.Time{}

	return al
}
// Evaluate inspects one component's desired, observed and previous state and
// fires a "drift" and/or "flapping" alert when warranted. It is called for
// every component on each monitor tick.
//
// Components whose desired state is "ignore" are skipped entirely.
//
// NOTE(review): both alert types share the same cooldown key, so with a
// positive cooldown a drift alert fired here suppresses a flapping alert for
// the same component on the same tick. Confirm this is the intended
// rate-limiting behavior.
func (al *Alerter) Evaluate(service, component, desiredState, observedState, prevState string) {
	if desiredState == "ignore" {
		return
	}

	key := service + "/" + component

	// Drift: the component is not in the state it is supposed to be in.
	// cooledDown is only consulted (and its timestamp only updated) when
	// drift is actually present.
	if drifted := desiredState != observedState; drifted && al.cooledDown(key) {
		al.fire("drift", service, component, desiredState, observedState, prevState, 0)
	}

	// Flapping is only considered on ticks where the state actually changed.
	if observedState == prevState {
		return
	}

	windowStart := time.Now().Add(-al.flapWindow)
	count, err := registry.CountEvents(al.db, service, component, windowStart)
	if err != nil {
		al.logger.Error("alerter: count events", "error", err, "key", key)
		return
	}

	// Too few events in the window to count as flapping.
	if count < al.flapThreshold {
		return
	}

	if al.cooledDown(key) {
		al.fire("flapping", service, component, desiredState, observedState, prevState, count)
	}
}
// cooledDown reports whether an alert for key may be sent now. It returns
// false when a previous alert for key was recorded less than the cooldown
// ago. Otherwise it stamps key with the current time and returns true, so a
// true result also starts a new cooldown period.
func (al *Alerter) cooledDown(key string) bool {
	last, seen := al.lastAlert[key]
	suppressed := seen && time.Since(last) < al.cooldown
	if suppressed {
		return false
	}

	al.lastAlert[key] = time.Now()
	return true
}
// fire logs an alert at warn level and, if an alert command is configured,
// runs it with the alert details exported as MCP_* environment variables on
// top of the current process environment. A failing command is logged
// together with its combined output; the error is not returned.
//
// NOTE(review): the command runs synchronously with no timeout, so a hung
// alert command blocks the caller until it exits.
func (al *Alerter) fire(alertType, service, component, desired, observed, prev string, transitions int) {
	al.logger.Warn("alert",
		"type", alertType,
		"service", service,
		"component", component,
		"desired", desired,
		"observed", observed,
	)

	argv := al.command
	if len(argv) == 0 {
		// No command configured: the log record above is the whole alert.
		return
	}

	// os.Environ returns a copy, so appending does not touch the process
	// environment.
	env := os.Environ()
	env = append(env,
		"MCP_COMPONENT="+component,
		"MCP_SERVICE="+service,
		"MCP_NODE="+al.nodeName,
		"MCP_DESIRED="+desired,
		"MCP_OBSERVED="+observed,
		"MCP_PREV_STATE="+prev,
		"MCP_ALERT_TYPE="+alertType,
		"MCP_TRANSITIONS="+fmt.Sprintf("%d", transitions),
	)

	cmd := exec.Command(argv[0], argv[1:]...) //nolint:gosec // alert command from trusted config
	cmd.Env = env

	out, err := cmd.CombinedOutput()
	if err == nil {
		return
	}
	al.logger.Error("alert command failed", "error", err, "output", string(out))
}