package monitor

import (
	"context"
	"database/sql"
	"log/slog"
	"os"
	"path/filepath"
	"testing"
	"time"

	"git.wntrmute.dev/mc/mcp/internal/config"
	"git.wntrmute.dev/mc/mcp/internal/registry"
	"git.wntrmute.dev/mc/mcp/internal/runtime"
)

// openTestDB opens a registry database at "test.db" inside a per-test
// temporary directory and registers a cleanup that closes it when the test
// finishes. Any open failure aborts the test.
func openTestDB(t *testing.T) *sql.DB {
	t.Helper()
	db, err := registry.Open(filepath.Join(t.TempDir(), "test.db"))
	if err != nil {
		t.Fatalf("open db: %v", err)
	}
	// The close error is deliberately ignored: the database lives in a
	// temporary directory that is discarded with the test.
	t.Cleanup(func() { _ = db.Close() })
	return db
}

// testLogger returns a text logger writing to stderr that only emits records
// at error level or above, keeping test output quiet.
func testLogger() *slog.Logger {
	return slog.New(slog.NewTextHandler(os.Stderr, &slog.HandlerOptions{Level: slog.LevelError}))
}

// testMonitorConfig returns the baseline monitor configuration shared by the
// tests: 1s interval, 1m cooldown, flap threshold of 3 within a 10m window,
// and 24h retention. Individual tests override fields as needed.
func testMonitorConfig() config.MonitorConfig {
	return config.MonitorConfig{
		Interval:      config.Duration{Duration: 1 * time.Second},
		Cooldown:      config.Duration{Duration: 1 * time.Minute},
		FlapThreshold: 3,
		FlapWindow:    config.Duration{Duration: 10 * time.Minute},
		Retention:     config.Duration{Duration: 24 * time.Hour},
	}
}

// fakeRuntime implements runtime.Runtime for testing. Every operation
// succeeds without doing anything; List returns the canned containers slice.
type fakeRuntime struct {
	containers []runtime.ContainerInfo
}

// Pull is a no-op that always succeeds.
func (f *fakeRuntime) Pull(_ context.Context, _ string) error { return nil }

// Run is a no-op that always succeeds.
func (f *fakeRuntime) Run(_ context.Context, _ runtime.ContainerSpec) error { return nil }

// Stop is a no-op that always succeeds.
func (f *fakeRuntime) Stop(_ context.Context, _ string) error { return nil }

// Remove is a no-op that always succeeds.
func (f *fakeRuntime) Remove(_ context.Context, _ string) error { return nil }

// Build is a no-op that always succeeds.
func (f *fakeRuntime) Build(_ context.Context, _, _, _ string) error { return nil }

// Push is a no-op that always succeeds.
func (f *fakeRuntime) Push(_ context.Context, _ string) error { return nil }

// ImageExists always reports that the image exists.
func (f *fakeRuntime) ImageExists(_ context.Context, _ string) (bool, error) { return true, nil }

// Inspect always returns a zero-valued ContainerInfo and no error.
func (f *fakeRuntime) Inspect(_ context.Context, _ string) (runtime.ContainerInfo, error) {
	return runtime.ContainerInfo{}, nil
}

// List returns the containers configured on the fake.
func (f *fakeRuntime) List(_ context.Context) ([]runtime.ContainerInfo, error) {
	return f.containers, nil
}

// TestAlerterDriftDetection checks that Evaluate records an entry in
// lastAlert when the desired state is "running" but the observed state is
// "exited".
func TestAlerterDriftDetection(t *testing.T) {
	db := openTestDB(t)
	logger := testLogger()
	cfg := testMonitorConfig()

	al := NewAlerter(cfg, "test-node", db, logger)

	// Set up a service and component so CountEvents works.
	if err := registry.CreateService(db, "metacrypt", true); err != nil {
		t.Fatalf("create service: %v", err)
	}
	if err := registry.CreateComponent(db, &registry.Component{
		Name:          "api",
		Service:       "metacrypt",
		Image:         "img:v1",
		Restart:       "unless-stopped",
		DesiredState:  "running",
		ObservedState: "running",
	}); err != nil {
		t.Fatalf("create component: %v", err)
	}

	// Desired is "running" but observed is "exited" -- drift should fire.
	al.Evaluate("metacrypt", "api", "running", "exited", "running")

	// Verify alert was recorded (lastAlert should be set).
	key := "metacrypt/api"
	if _, ok := al.lastAlert[key]; !ok {
		t.Fatal("expected drift alert to be recorded in lastAlert")
	}
}

// TestAlerterIgnoreState checks that Evaluate records nothing in lastAlert
// for a component whose desired state is "ignore".
func TestAlerterIgnoreState(t *testing.T) {
	db := openTestDB(t)
	logger := testLogger()
	cfg := testMonitorConfig()

	al := NewAlerter(cfg, "test-node", db, logger)

	// Components with desired_state "ignore" should not trigger alerts.
	al.Evaluate("metacrypt", "api", "ignore", "exited", "running")

	key := "metacrypt/api"
	if _, ok := al.lastAlert[key]; ok {
		t.Fatal("expected no alert for ignored component")
	}
}

// TestAlerterCooldownSuppression checks that a second Evaluate call made
// within the cooldown period leaves the lastAlert timestamp unchanged.
func TestAlerterCooldownSuppression(t *testing.T) {
	db := openTestDB(t)
	logger := testLogger()
	cfg := testMonitorConfig()
	cfg.Cooldown.Duration = 1 * time.Hour // long cooldown

	al := NewAlerter(cfg, "test-node", db, logger)

	if err := registry.CreateService(db, "metacrypt", true); err != nil {
		t.Fatalf("create service: %v", err)
	}
	if err := registry.CreateComponent(db, &registry.Component{
		Name:          "api",
		Service:       "metacrypt",
		Image:         "img:v1",
		Restart:       "unless-stopped",
		DesiredState:  "running",
		ObservedState: "running",
	}); err != nil {
		t.Fatalf("create component: %v", err)
	}

	// First call should fire.
	al.Evaluate("metacrypt", "api", "running", "exited", "running")
	key := "metacrypt/api"
	first, ok := al.lastAlert[key]
	if !ok {
		t.Fatal("expected first alert to fire")
	}

	// Second call should be suppressed (within cooldown).
	al.Evaluate("metacrypt", "api", "running", "exited", "exited")
	second := al.lastAlert[key]
	if !second.Equal(first) {
		t.Fatal("expected second alert to be suppressed by cooldown")
	}
}

// TestAlerterFlapDetection seeds three transition events against a flap
// threshold of two, with cooldown disabled, and checks that Evaluate records
// an alert for the component.
func TestAlerterFlapDetection(t *testing.T) {
	db := openTestDB(t)
	logger := testLogger()
	cfg := testMonitorConfig()
	cfg.FlapThreshold = 2
	cfg.FlapWindow.Duration = 10 * time.Minute
	cfg.Cooldown.Duration = 0 // disable cooldown for this test

	al := NewAlerter(cfg, "test-node", db, logger)

	if err := registry.CreateService(db, "metacrypt", true); err != nil {
		t.Fatalf("create service: %v", err)
	}
	if err := registry.CreateComponent(db, &registry.Component{
		Name:          "api",
		Service:       "metacrypt",
		Image:         "img:v1",
		Restart:       "unless-stopped",
		DesiredState:  "running",
		ObservedState: "unknown",
	}); err != nil {
		t.Fatalf("create component: %v", err)
	}

	// Insert enough events to exceed the flap threshold.
	for i := 0; i < 3; i++ {
		if err := registry.InsertEvent(db, "metacrypt", "api", "running", "exited"); err != nil {
			t.Fatalf("insert event %d: %v", i, err)
		}
	}

	// Evaluate with a state transition -- should detect flapping.
	al.Evaluate("metacrypt", "api", "running", "exited", "running")

	key := "metacrypt/api"
	if _, ok := al.lastAlert[key]; !ok {
		t.Fatal("expected flap alert to fire")
	}
}

// TestMonitorTickStateChange runs a single tick against a component stored
// as "unknown" while the fake runtime reports it "running", then checks the
// registry's observed state, the recorded event, and the prevState map.
func TestMonitorTickStateChange(t *testing.T) {
	db := openTestDB(t)
	logger := testLogger()
	cfg := testMonitorConfig()

	if err := registry.CreateService(db, "metacrypt", true); err != nil {
		t.Fatalf("create service: %v", err)
	}
	if err := registry.CreateComponent(db, &registry.Component{
		Name:          "api",
		Service:       "metacrypt",
		Image:         "img:v1",
		Restart:       "unless-stopped",
		DesiredState:  "running",
		ObservedState: "unknown",
	}); err != nil {
		t.Fatalf("create component: %v", err)
	}

	rt := &fakeRuntime{
		containers: []runtime.ContainerInfo{
			{Name: "metacrypt-api", State: "running"},
		},
	}

	m := New(db, rt, cfg, "test-node", logger)

	// Run a single tick.
	m.tick()

	// Verify observed state was updated in the registry.
	comp, err := registry.GetComponent(db, "metacrypt", "api")
	if err != nil {
		t.Fatalf("get component: %v", err)
	}
	if comp.ObservedState != "running" {
		t.Fatalf("observed state: got %q, want %q", comp.ObservedState, "running")
	}

	// Verify an event was recorded (unknown -> running).
	events, err := registry.QueryEvents(db, "metacrypt", "api", time.Now().Add(-1*time.Hour), 0)
	if err != nil {
		t.Fatalf("query events: %v", err)
	}
	if len(events) != 1 {
		t.Fatalf("events: got %d, want 1", len(events))
	}
	if events[0].PrevState != "unknown" || events[0].NewState != "running" {
		t.Fatalf("event: got %q->%q, want unknown->running", events[0].PrevState, events[0].NewState)
	}

	// Verify prevState map was updated.
	if m.prevState["metacrypt/api"] != "running" {
		t.Fatalf("prevState: got %q, want %q", m.prevState["metacrypt/api"], "running")
	}
}

// TestMonitorStartStop starts a monitor with a 50ms interval, lets it run
// for 150ms, and stops it. The test passes as long as Stop returns.
func TestMonitorStartStop(t *testing.T) {
	db := openTestDB(t)
	logger := testLogger()
	cfg := testMonitorConfig()
	cfg.Interval.Duration = 50 * time.Millisecond

	rt := &fakeRuntime{}
	m := New(db, rt, cfg, "test-node", logger)

	m.Start()

	// Give it a moment to tick at least once.
	time.Sleep(150 * time.Millisecond)

	m.Stop()
	// If Stop returns, the goroutine exited cleanly.
}

// TestMonitorNoChangeNoEvent checks that a tick records no events when the
// runtime-reported state matches the previously seen state.
func TestMonitorNoChangeNoEvent(t *testing.T) {
	db := openTestDB(t)
	logger := testLogger()
	cfg := testMonitorConfig()

	if err := registry.CreateService(db, "metacrypt", true); err != nil {
		t.Fatalf("create service: %v", err)
	}
	if err := registry.CreateComponent(db, &registry.Component{
		Name:          "api",
		Service:       "metacrypt",
		Image:         "img:v1",
		Restart:       "unless-stopped",
		DesiredState:  "running",
		ObservedState: "running",
	}); err != nil {
		t.Fatalf("create component: %v", err)
	}

	rt := &fakeRuntime{
		containers: []runtime.ContainerInfo{
			{Name: "metacrypt-api", State: "running"},
		},
	}

	m := New(db, rt, cfg, "test-node", logger)

	// Seed prevState so that observed == prev (no change).
	m.prevState["metacrypt/api"] = "running"

	m.tick()

	// No events should be recorded when state is unchanged.
	events, err := registry.QueryEvents(db, "metacrypt", "api", time.Now().Add(-1*time.Hour), 0)
	if err != nil {
		t.Fatalf("query events: %v", err)
	}
	if len(events) != 0 {
		t.Fatalf("events: got %d, want 0 (no state change)", len(events))
	}
}