P2.2-P2.9, P3.2-P3.10, P4.1-P4.3: Complete Phases 2, 3, and 4

11 work units built in parallel and merged:

Agent handlers (Phase 2):
- P2.2 Deploy: pull images, stop/remove/run containers, update registry
- P2.3 Lifecycle: stop/start/restart with desired_state tracking
- P2.4 Status: list (registry), live check (runtime), get status (drift+events)
- P2.5 Sync: receive desired state, reconcile unmanaged containers
- P2.6 File transfer: push/pull scoped to /srv/<service>/, path validation
- P2.7 Adopt: match <service>-* containers, derive component names
- P2.8 Monitor: continuous watch loop, drift/flap alerting, event pruning
- P2.9 Snapshot: VACUUM INTO database backup command

CLI commands (Phase 3):
- P3.2 Login, P3.3 Deploy, P3.4 Stop/Start/Restart
- P3.5 List/Ps/Status, P3.6 Sync, P3.7 Adopt
- P3.8 Service show/edit/export, P3.9 Push/Pull, P3.10 Node list/add/remove

Deployment artifacts (Phase 4):
- Systemd units (agent service + backup timer)
- Example configs (CLI + agent)
- Install script (idempotent)

All packages: build, vet, lint (0 issues), test (all pass).

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
2026-03-26 12:21:18 -07:00
parent d7cc970133
commit 8f913ddf9b
33 changed files with 3593 additions and 62 deletions

View File

@@ -0,0 +1,280 @@
package monitor
import (
"context"
"database/sql"
"log/slog"
"os"
"path/filepath"
"testing"
"time"
"git.wntrmute.dev/kyle/mcp/internal/config"
"git.wntrmute.dev/kyle/mcp/internal/registry"
"git.wntrmute.dev/kyle/mcp/internal/runtime"
)
// openTestDB opens a registry database stored as "test.db" inside a fresh
// per-test temporary directory and returns the handle. The calling test is
// aborted if the database cannot be opened. The handle is closed by a cleanup
// function registered on t.
func openTestDB(t *testing.T) *sql.DB {
	t.Helper()

	path := filepath.Join(t.TempDir(), "test.db")
	handle, err := registry.Open(path)
	if err != nil {
		t.Fatalf("open db: %v", err)
	}

	// The Close error is deliberately discarded: this is best-effort teardown
	// of a throwaway database living in a directory created by t.TempDir.
	t.Cleanup(func() {
		_ = handle.Close()
	})

	return handle
}
// testLogger returns a text-format logger that writes to standard error and
// only emits records at error level or above, keeping test output quiet.
func testLogger() *slog.Logger {
	opts := &slog.HandlerOptions{Level: slog.LevelError}
	handler := slog.NewTextHandler(os.Stderr, opts)
	return slog.New(handler)
}
// testMonitorConfig returns the baseline monitor configuration shared by the
// tests in this file: a 1s interval, a 1m cooldown, a flap threshold of 3
// within a 10m window, and 24h retention. Tests that need different values
// modify the returned copy.
func testMonitorConfig() config.MonitorConfig {
	var cfg config.MonitorConfig

	cfg.Interval = config.Duration{Duration: time.Second}
	cfg.Cooldown = config.Duration{Duration: time.Minute}
	cfg.FlapThreshold = 3
	cfg.FlapWindow = config.Duration{Duration: 10 * time.Minute}
	cfg.Retention = config.Duration{Duration: 24 * time.Hour}

	return cfg
}
// fakeRuntime is a test double implementing runtime.Runtime. Every operation
// succeeds without doing any work; List reports the canned containers slice.
type fakeRuntime struct {
	// containers is the fixed listing returned by List.
	containers []runtime.ContainerInfo
}

// Pull ignores its arguments and always succeeds.
func (rt *fakeRuntime) Pull(context.Context, string) error {
	return nil
}

// Run ignores its arguments and always succeeds.
func (rt *fakeRuntime) Run(context.Context, runtime.ContainerSpec) error {
	return nil
}

// Stop ignores its arguments and always succeeds.
func (rt *fakeRuntime) Stop(context.Context, string) error {
	return nil
}

// Remove ignores its arguments and always succeeds.
func (rt *fakeRuntime) Remove(context.Context, string) error {
	return nil
}

// Inspect ignores its arguments and returns a zero-value ContainerInfo with
// a nil error.
func (rt *fakeRuntime) Inspect(context.Context, string) (runtime.ContainerInfo, error) {
	var info runtime.ContainerInfo
	return info, nil
}

// List returns the containers configured on the fake, with a nil error.
func (rt *fakeRuntime) List(context.Context) ([]runtime.ContainerInfo, error) {
	return rt.containers, nil
}
// TestAlerterDriftDetection checks that Evaluate records an entry in
// lastAlert for a component whose observed state ("exited") differs from its
// desired state ("running").
func TestAlerterDriftDetection(t *testing.T) {
	const alertKey = "metacrypt/api"

	db := openTestDB(t)
	alerter := NewAlerter(testMonitorConfig(), "test-node", db, testLogger())

	// Register the service and component so CountEvents works.
	if err := registry.CreateService(db, "metacrypt", true); err != nil {
		t.Fatalf("create service: %v", err)
	}
	component := &registry.Component{
		Name:          "api",
		Service:       "metacrypt",
		Image:         "img:v1",
		Restart:       "unless-stopped",
		DesiredState:  "running",
		ObservedState: "running",
	}
	if err := registry.CreateComponent(db, component); err != nil {
		t.Fatalf("create component: %v", err)
	}

	// Desired "running" versus observed "exited" is drift and should alert.
	alerter.Evaluate("metacrypt", "api", "running", "exited", "running")

	// A recorded alert shows up as an entry in lastAlert.
	_, recorded := alerter.lastAlert[alertKey]
	if !recorded {
		t.Fatal("expected drift alert to be recorded in lastAlert")
	}
}
func TestAlerterIgnoreState(t *testing.T) {
db := openTestDB(t)
logger := testLogger()
cfg := testMonitorConfig()
al := NewAlerter(cfg, "test-node", db, logger)
// Components with desired_state "ignore" should not trigger alerts.
al.Evaluate("metacrypt", "api", "ignore", "exited", "running")
key := "metacrypt/api"
if _, ok := al.lastAlert[key]; ok {
t.Fatal("expected no alert for ignored component")
}
}
// TestAlerterCooldownSuppression checks that, with a one-hour cooldown, a
// second Evaluate call for the same drifting component does not change the
// timestamp stored in lastAlert by the first call.
func TestAlerterCooldownSuppression(t *testing.T) {
	const alertKey = "metacrypt/api"

	db := openTestDB(t)

	cfg := testMonitorConfig()
	cfg.Cooldown.Duration = time.Hour // long enough that it cannot expire mid-test
	alerter := NewAlerter(cfg, "test-node", db, testLogger())

	if err := registry.CreateService(db, "metacrypt", true); err != nil {
		t.Fatalf("create service: %v", err)
	}
	component := &registry.Component{
		Name:          "api",
		Service:       "metacrypt",
		Image:         "img:v1",
		Restart:       "unless-stopped",
		DesiredState:  "running",
		ObservedState: "running",
	}
	if err := registry.CreateComponent(db, component); err != nil {
		t.Fatalf("create component: %v", err)
	}

	// The first evaluation is outside any cooldown and should fire.
	alerter.Evaluate("metacrypt", "api", "running", "exited", "running")
	before, fired := alerter.lastAlert[alertKey]
	if !fired {
		t.Fatal("expected first alert to fire")
	}

	// The second evaluation falls inside the cooldown, so the stored
	// timestamp must be unchanged.
	alerter.Evaluate("metacrypt", "api", "running", "exited", "exited")
	if after := alerter.lastAlert[alertKey]; !after.Equal(before) {
		t.Fatal("expected second alert to be suppressed by cooldown")
	}
}
// TestAlerterFlapDetection seeds three transition events for a component
// (above the configured flap threshold of 2 within a 10-minute window) and
// checks that a subsequent Evaluate call records an alert in lastAlert.
//
// NOTE(review): the same Evaluate arguments also produce a lastAlert entry in
// TestAlerterDriftDetection, so this assertion alone may not distinguish a
// flap alert from a plain drift alert -- confirm against the Alerter
// implementation.
func TestAlerterFlapDetection(t *testing.T) {
	const alertKey = "metacrypt/api"

	db := openTestDB(t)

	cfg := testMonitorConfig()
	cfg.FlapThreshold = 2
	cfg.FlapWindow.Duration = 10 * time.Minute
	cfg.Cooldown.Duration = 0 // no cooldown, so nothing is suppressed here
	alerter := NewAlerter(cfg, "test-node", db, testLogger())

	if err := registry.CreateService(db, "metacrypt", true); err != nil {
		t.Fatalf("create service: %v", err)
	}
	component := &registry.Component{
		Name:          "api",
		Service:       "metacrypt",
		Image:         "img:v1",
		Restart:       "unless-stopped",
		DesiredState:  "running",
		ObservedState: "unknown",
	}
	if err := registry.CreateComponent(db, component); err != nil {
		t.Fatalf("create component: %v", err)
	}

	// Record more events than the flap threshold allows.
	for n := 0; n < 3; n++ {
		err := registry.InsertEvent(db, "metacrypt", "api", "running", "exited")
		if err != nil {
			t.Fatalf("insert event %d: %v", n, err)
		}
	}

	// Evaluating a state transition should now be reported as flapping.
	alerter.Evaluate("metacrypt", "api", "running", "exited", "running")

	_, fired := alerter.lastAlert[alertKey]
	if !fired {
		t.Fatal("expected flap alert to fire")
	}
}
// TestMonitorTickStateChange runs a single monitor tick against a fake
// runtime reporting "metacrypt-api" as running, for a component whose stored
// observed state is "unknown". It then checks three effects: the registry's
// observed state becomes "running", exactly one unknown->running event is
// stored, and the monitor's prevState map holds "running" for the component.
func TestMonitorTickStateChange(t *testing.T) {
	const stateKey = "metacrypt/api"

	db := openTestDB(t)

	if err := registry.CreateService(db, "metacrypt", true); err != nil {
		t.Fatalf("create service: %v", err)
	}
	component := &registry.Component{
		Name:          "api",
		Service:       "metacrypt",
		Image:         "img:v1",
		Restart:       "unless-stopped",
		DesiredState:  "running",
		ObservedState: "unknown",
	}
	if err := registry.CreateComponent(db, component); err != nil {
		t.Fatalf("create component: %v", err)
	}

	running := runtime.ContainerInfo{Name: "metacrypt-api", State: "running"}
	rt := &fakeRuntime{containers: []runtime.ContainerInfo{running}}

	m := New(db, rt, testMonitorConfig(), "test-node", testLogger())

	// Exactly one tick.
	m.tick()

	// The registry should now carry the state reported by the runtime.
	comp, err := registry.GetComponent(db, "metacrypt", "api")
	if err != nil {
		t.Fatalf("get component: %v", err)
	}
	if comp.ObservedState != "running" {
		t.Fatalf("observed state: got %q, want %q", comp.ObservedState, "running")
	}

	// The unknown -> running transition should be stored as a single event.
	since := time.Now().Add(-time.Hour)
	events, err := registry.QueryEvents(db, "metacrypt", "api", since, 0)
	if err != nil {
		t.Fatalf("query events: %v", err)
	}
	if count := len(events); count != 1 {
		t.Fatalf("events: got %d, want 1", count)
	}
	if !(events[0].PrevState == "unknown" && events[0].NewState == "running") {
		t.Fatalf("event: got %q->%q, want unknown->running", events[0].PrevState, events[0].NewState)
	}

	// The monitor's in-memory view should match as well.
	if got := m.prevState[stateKey]; got != "running" {
		t.Fatalf("prevState: got %q, want %q", got, "running")
	}
}
func TestMonitorStartStop(t *testing.T) {
db := openTestDB(t)
logger := testLogger()
cfg := testMonitorConfig()
cfg.Interval.Duration = 50 * time.Millisecond
rt := &fakeRuntime{}
m := New(db, rt, cfg, "test-node", logger)
m.Start()
// Give it a moment to tick at least once.
time.Sleep(150 * time.Millisecond)
m.Stop()
// If Stop returns, the goroutine exited cleanly.
}
// TestMonitorNoChangeNoEvent runs a single monitor tick where the runtime
// reports the same state ("running") that is already stored in the registry
// and pre-seeded in the monitor's prevState map. It checks that no event is
// written.
func TestMonitorNoChangeNoEvent(t *testing.T) {
	const stateKey = "metacrypt/api"

	db := openTestDB(t)

	if err := registry.CreateService(db, "metacrypt", true); err != nil {
		t.Fatalf("create service: %v", err)
	}
	component := &registry.Component{
		Name:          "api",
		Service:       "metacrypt",
		Image:         "img:v1",
		Restart:       "unless-stopped",
		DesiredState:  "running",
		ObservedState: "running",
	}
	if err := registry.CreateComponent(db, component); err != nil {
		t.Fatalf("create component: %v", err)
	}

	running := runtime.ContainerInfo{Name: "metacrypt-api", State: "running"}
	rt := &fakeRuntime{containers: []runtime.ContainerInfo{running}}

	m := New(db, rt, testMonitorConfig(), "test-node", testLogger())

	// Pre-load the previous state so the tick observes no transition.
	m.prevState[stateKey] = "running"

	m.tick()

	// An unchanged state must not generate any event rows.
	since := time.Now().Add(-time.Hour)
	events, err := registry.QueryEvents(db, "metacrypt", "api", since, 0)
	if err != nil {
		t.Fatalf("query events: %v", err)
	}
	if count := len(events); count != 0 {
		t.Fatalf("events: got %d, want 0 (no state change)", count)
	}
}