All import paths updated to git.wntrmute.dev/mc/. Bumps mcdsl to v1.2.0, mc-proxy to v1.1.0. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
114 lines
3.1 KiB
Go
114 lines
3.1 KiB
Go
package monitor
|
|
|
|
import (
|
|
"database/sql"
|
|
"fmt"
|
|
"log/slog"
|
|
"os"
|
|
"os/exec"
|
|
"time"
|
|
|
|
"git.wntrmute.dev/mc/mcp/internal/config"
|
|
"git.wntrmute.dev/mc/mcp/internal/registry"
|
|
)
|
|
|
|
// Alerter watches per-component state transitions and raises an alert when a
// component drifts from its desired state or changes state too often.
type Alerter struct {
	// Collaborators: the database queried for event counts and the logger
	// every alert is written to.
	db     *sql.DB
	logger *slog.Logger

	// nodeName is exported to the alert command as MCP_NODE.
	nodeName string

	// command is the alert program in argv form; when it is empty, alerts
	// are only logged.
	command []string

	// cooldown is the minimum spacing between two alerts for the same key.
	cooldown time.Duration

	// Flap detection: the number of events counted since now-flapWindow is
	// compared against flapThreshold.
	flapThreshold int
	flapWindow    time.Duration

	// lastAlert holds the time of the latest alert per "service/component".
	lastAlert map[string]time.Time
}
|
|
|
|
// NewAlerter creates an Alerter from monitoring configuration.
|
|
func NewAlerter(cfg config.MonitorConfig, nodeName string, db *sql.DB, logger *slog.Logger) *Alerter {
|
|
return &Alerter{
|
|
command: cfg.AlertCommand,
|
|
cooldown: cfg.Cooldown.Duration,
|
|
flapThreshold: cfg.FlapThreshold,
|
|
flapWindow: cfg.FlapWindow.Duration,
|
|
nodeName: nodeName,
|
|
db: db,
|
|
logger: logger,
|
|
lastAlert: make(map[string]time.Time),
|
|
}
|
|
}
|
|
|
|
// Evaluate checks a component's state transition and fires alerts as needed.
|
|
// It is called for every component on each monitor tick.
|
|
func (al *Alerter) Evaluate(service, component, desiredState, observedState, prevState string) {
|
|
if desiredState == "ignore" {
|
|
return
|
|
}
|
|
|
|
key := service + "/" + component
|
|
|
|
// Drift check: desired state does not match observed state.
|
|
if desiredState != observedState {
|
|
if al.cooledDown(key) {
|
|
al.fire("drift", service, component, desiredState, observedState, prevState, 0)
|
|
}
|
|
}
|
|
|
|
// Flap check: too many transitions in the flap window.
|
|
if observedState != prevState {
|
|
count, err := registry.CountEvents(al.db, service, component, time.Now().Add(-al.flapWindow))
|
|
if err != nil {
|
|
al.logger.Error("alerter: count events", "error", err, "key", key)
|
|
return
|
|
}
|
|
|
|
if count >= al.flapThreshold {
|
|
if al.cooledDown(key) {
|
|
al.fire("flapping", service, component, desiredState, observedState, prevState, count)
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
// cooledDown returns true and records the alert time if enough time has
|
|
// elapsed since the last alert for this key. Returns false if suppressed.
|
|
func (al *Alerter) cooledDown(key string) bool {
|
|
if last, ok := al.lastAlert[key]; ok {
|
|
if time.Since(last) < al.cooldown {
|
|
return false
|
|
}
|
|
}
|
|
al.lastAlert[key] = time.Now()
|
|
return true
|
|
}
|
|
|
|
func (al *Alerter) fire(alertType, service, component, desired, observed, prev string, transitions int) {
|
|
al.logger.Warn("alert",
|
|
"type", alertType,
|
|
"service", service,
|
|
"component", component,
|
|
"desired", desired,
|
|
"observed", observed,
|
|
)
|
|
|
|
if len(al.command) == 0 {
|
|
return
|
|
}
|
|
|
|
cmd := exec.Command(al.command[0], al.command[1:]...) //nolint:gosec // alert command from trusted config
|
|
cmd.Env = append(os.Environ(),
|
|
"MCP_COMPONENT="+component,
|
|
"MCP_SERVICE="+service,
|
|
"MCP_NODE="+al.nodeName,
|
|
"MCP_DESIRED="+desired,
|
|
"MCP_OBSERVED="+observed,
|
|
"MCP_PREV_STATE="+prev,
|
|
"MCP_ALERT_TYPE="+alertType,
|
|
"MCP_TRANSITIONS="+fmt.Sprintf("%d", transitions),
|
|
)
|
|
|
|
if out, err := cmd.CombinedOutput(); err != nil {
|
|
al.logger.Error("alert command failed", "error", err, "output", string(out))
|
|
}
|
|
}
|