package monitor import ( "database/sql" "fmt" "log/slog" "os" "os/exec" "time" "git.wntrmute.dev/mc/mcp/internal/config" "git.wntrmute.dev/mc/mcp/internal/registry" ) // Alerter evaluates state transitions and fires alerts for drift or flapping. type Alerter struct { command []string cooldown time.Duration flapThreshold int flapWindow time.Duration nodeName string db *sql.DB logger *slog.Logger lastAlert map[string]time.Time // key: "service/component" } // NewAlerter creates an Alerter from monitoring configuration. func NewAlerter(cfg config.MonitorConfig, nodeName string, db *sql.DB, logger *slog.Logger) *Alerter { return &Alerter{ command: cfg.AlertCommand, cooldown: cfg.Cooldown.Duration, flapThreshold: cfg.FlapThreshold, flapWindow: cfg.FlapWindow.Duration, nodeName: nodeName, db: db, logger: logger, lastAlert: make(map[string]time.Time), } } // Evaluate checks a component's state transition and fires alerts as needed. // It is called for every component on each monitor tick. func (al *Alerter) Evaluate(service, component, desiredState, observedState, prevState string) { if desiredState == "ignore" { return } key := service + "/" + component // Drift check: desired state does not match observed state. if desiredState != observedState { if al.cooledDown(key) { al.fire("drift", service, component, desiredState, observedState, prevState, 0) } } // Flap check: too many transitions in the flap window. if observedState != prevState { count, err := registry.CountEvents(al.db, service, component, time.Now().Add(-al.flapWindow)) if err != nil { al.logger.Error("alerter: count events", "error", err, "key", key) return } if count >= al.flapThreshold { if al.cooledDown(key) { al.fire("flapping", service, component, desiredState, observedState, prevState, count) } } } } // cooledDown returns true and records the alert time if enough time has // elapsed since the last alert for this key. Returns false if suppressed. func (al *Alerter) cooledDown(key string) bool { if last, ok := al.lastAlert[key]; ok { if time.Since(last) < al.cooldown { return false } } al.lastAlert[key] = time.Now() return true } func (al *Alerter) fire(alertType, service, component, desired, observed, prev string, transitions int) { al.logger.Warn("alert", "type", alertType, "service", service, "component", component, "desired", desired, "observed", observed, ) if len(al.command) == 0 { return } cmd := exec.Command(al.command[0], al.command[1:]...) //nolint:gosec // alert command from trusted config cmd.Env = append(os.Environ(), "MCP_COMPONENT="+component, "MCP_SERVICE="+service, "MCP_NODE="+al.nodeName, "MCP_DESIRED="+desired, "MCP_OBSERVED="+observed, "MCP_PREV_STATE="+prev, "MCP_ALERT_TYPE="+alertType, "MCP_TRANSITIONS="+fmt.Sprintf("%d", transitions), ) if out, err := cmd.CombinedOutput(); err != nil { al.logger.Error("alert command failed", "error", err, "output", string(out)) } }