monitor: see unikernel VMs + use canonical container naming
Two drift-reporting bugs: 1. The monitor listed only the podman runtime, so unikernel VMs always showed observed=unknown (false drift). It now takes a ContainerLister and the agent passes a merged lister (containers + VMs), mirroring listAllContainers. 2. The monitor computed the lookup name as service+"-"+component, which is wrong when component==service (the name collapses to just the service, e.g. "uktest"/"mc-proxy"). It now uses the canonical naming.ContainerNameFor — extracted to a shared package so the agent and monitor can't disagree. Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
@@ -78,7 +78,7 @@ func Run(cfg *config.AgentConfig, version string) error {
|
||||
uk = qemu
|
||||
}
|
||||
|
||||
mon := monitor.New(db, rt, cfg.Monitor, cfg.Agent.NodeName, logger)
|
||||
mon := monitor.New(db, mergedLister{primary: rt, extra: uk, logger: logger}, cfg.Monitor, cfg.Agent.NodeName, logger)
|
||||
|
||||
proxy, err := NewProxyRouter(cfg.MCProxy.Socket, cfg.MCProxy.CertDir, logger)
|
||||
if err != nil {
|
||||
|
||||
@@ -1,34 +1,14 @@
|
||||
package agent
|
||||
|
||||
import "strings"
|
||||
import "git.wntrmute.dev/mc/mcp/internal/naming"
|
||||
|
||||
// ContainerNameFor returns the expected container name for a service and
|
||||
// component. For single-component services where the component name equals
|
||||
// the service name, the container name is just the service name (e.g.,
|
||||
// "mc-proxy" not "mc-proxy-mc-proxy").
|
||||
// ContainerNameFor is a convenience alias for naming.ContainerNameFor (the
|
||||
// canonical convention, shared with the monitor).
|
||||
func ContainerNameFor(service, component string) string {
|
||||
if service == component {
|
||||
return service
|
||||
}
|
||||
return service + "-" + component
|
||||
return naming.ContainerNameFor(service, component)
|
||||
}
|
||||
|
||||
// SplitContainerName splits a container name into service and component parts.
|
||||
// It checks known service names first to handle names like "mc-proxy" where a
|
||||
// naive split on "-" would produce the wrong result. If no known service
|
||||
// matches, it falls back to splitting on the first "-".
|
||||
// SplitContainerName is a convenience alias for naming.SplitContainerName.
|
||||
func SplitContainerName(name string, knownServices map[string]bool) (service, component string) {
|
||||
if knownServices[name] {
|
||||
return name, name
|
||||
}
|
||||
for svc := range knownServices {
|
||||
prefix := svc + "-"
|
||||
if strings.HasPrefix(name, prefix) && len(name) > len(prefix) {
|
||||
return svc, name[len(prefix):]
|
||||
}
|
||||
}
|
||||
if i := strings.Index(name, "-"); i >= 0 {
|
||||
return name[:i], name[i+1:]
|
||||
}
|
||||
return name, name
|
||||
return naming.SplitContainerName(name, knownServices)
|
||||
}
|
||||
|
||||
@@ -2,6 +2,7 @@ package agent
|
||||
|
||||
import (
|
||||
"context"
|
||||
"log/slog"
|
||||
"os"
|
||||
"os/exec"
|
||||
"path/filepath"
|
||||
@@ -54,21 +55,38 @@ func (a *Agent) runtimeFor(rt string) runtime.Runtime {
|
||||
return a.Runtime
|
||||
}
|
||||
|
||||
// listAllContainers returns the observed state across every configured
|
||||
// runtime (containers + unikernel VMs) so reconciliation, status, and drift
|
||||
// detection see the whole picture.
|
||||
func (a *Agent) listAllContainers(ctx context.Context) ([]runtime.ContainerInfo, error) {
|
||||
infos, err := a.Runtime.List(ctx)
|
||||
// mergedLister lists observed containers across the container runtime and
|
||||
// (optionally) the unikernel runtime, so reconciliation, status, drift
|
||||
// detection, and the monitor see VMs and containers uniformly. It satisfies
|
||||
// the small lister interface the monitor consumes.
|
||||
type mergedLister struct {
|
||||
primary runtime.Runtime
|
||||
extra runtime.Runtime // unikernel runtime; may be nil
|
||||
logger *slog.Logger
|
||||
}
|
||||
|
||||
// List returns containers from the primary runtime plus, when configured,
|
||||
// unikernel VMs. A failure to list VMs is logged but not fatal — the
|
||||
// container view still reconciles.
|
||||
func (m mergedLister) List(ctx context.Context) ([]runtime.ContainerInfo, error) {
|
||||
infos, err := m.primary.List(ctx)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
if a.Unikernel != nil {
|
||||
vms, vmErr := a.Unikernel.List(ctx)
|
||||
if m.extra != nil {
|
||||
vms, vmErr := m.extra.List(ctx)
|
||||
if vmErr == nil {
|
||||
infos = append(infos, vms...)
|
||||
} else if a.Logger != nil {
|
||||
a.Logger.Warn("list unikernel VMs failed", "err", vmErr)
|
||||
} else if m.logger != nil {
|
||||
m.logger.Warn("list unikernel VMs failed", "err", vmErr)
|
||||
}
|
||||
}
|
||||
return infos, nil
|
||||
}
|
||||
|
||||
// listAllContainers returns the observed state across every configured
|
||||
// runtime (containers + unikernel VMs) so reconciliation, status, and drift
|
||||
// detection see the whole picture.
|
||||
func (a *Agent) listAllContainers(ctx context.Context) ([]runtime.ContainerInfo, error) {
|
||||
return mergedLister{primary: a.Runtime, extra: a.Unikernel, logger: a.Logger}.List(ctx)
|
||||
}
|
||||
|
||||
@@ -8,15 +8,23 @@ import (
|
||||
"time"
|
||||
|
||||
"git.wntrmute.dev/mc/mcp/internal/config"
|
||||
"git.wntrmute.dev/mc/mcp/internal/naming"
|
||||
"git.wntrmute.dev/mc/mcp/internal/registry"
|
||||
"git.wntrmute.dev/mc/mcp/internal/runtime"
|
||||
)
|
||||
|
||||
// ContainerLister reports the observed containers on the node. A full
|
||||
// runtime.Runtime satisfies it; the agent passes a lister that merges the
|
||||
// container and unikernel runtimes so the monitor sees VMs too.
|
||||
type ContainerLister interface {
|
||||
List(ctx context.Context) ([]runtime.ContainerInfo, error)
|
||||
}
|
||||
|
||||
// Monitor watches container states and compares them to the registry,
|
||||
// recording events and firing alerts on drift or flapping.
|
||||
type Monitor struct {
|
||||
db *sql.DB
|
||||
runtime runtime.Runtime
|
||||
lister ContainerLister
|
||||
cfg config.MonitorConfig
|
||||
logger *slog.Logger
|
||||
alerter *Alerter
|
||||
@@ -26,11 +34,12 @@ type Monitor struct {
|
||||
prevState map[string]string // key: "service/component", value: observed state
|
||||
}
|
||||
|
||||
// New creates a Monitor with the given dependencies.
|
||||
func New(db *sql.DB, rt runtime.Runtime, cfg config.MonitorConfig, nodeName string, logger *slog.Logger) *Monitor {
|
||||
// New creates a Monitor with the given dependencies. lister reports observed
|
||||
// containers (and unikernel VMs, via a merged lister).
|
||||
func New(db *sql.DB, lister ContainerLister, cfg config.MonitorConfig, nodeName string, logger *slog.Logger) *Monitor {
|
||||
return &Monitor{
|
||||
db: db,
|
||||
runtime: rt,
|
||||
lister: lister,
|
||||
cfg: cfg,
|
||||
logger: logger,
|
||||
alerter: NewAlerter(cfg, nodeName, db, logger),
|
||||
@@ -82,7 +91,7 @@ func (m *Monitor) tick() {
|
||||
ctx := context.Background()
|
||||
|
||||
// Get the current runtime state of all containers.
|
||||
containers, err := m.runtime.List(ctx)
|
||||
containers, err := m.lister.List(ctx)
|
||||
if err != nil {
|
||||
m.logger.Error("monitor: list containers", "error", err)
|
||||
return
|
||||
@@ -113,7 +122,7 @@ func (m *Monitor) tick() {
|
||||
for _, comp := range components {
|
||||
key := comp.Service + "/" + comp.Name
|
||||
seen[key] = struct{}{}
|
||||
containerName := comp.Service + "-" + comp.Name
|
||||
containerName := naming.ContainerNameFor(comp.Service, comp.Name)
|
||||
|
||||
observed := "unknown"
|
||||
if state, ok := runtimeState[containerName]; ok {
|
||||
|
||||
@@ -227,6 +227,45 @@ func TestMonitorTickStateChange(t *testing.T) {
|
||||
}
|
||||
}
|
||||
|
||||
// TestMonitorTickCollapsedName guards the naming convention: when a
|
||||
// component's name equals its service, the container/VM name collapses to just
|
||||
// the service (e.g. "uktest", not "uktest-uktest"). The monitor must use
|
||||
// naming.ContainerNameFor to look it up, or it would report a running
|
||||
// component as "unknown" and raise false drift.
|
||||
func TestMonitorTickCollapsedName(t *testing.T) {
|
||||
db := openTestDB(t)
|
||||
logger := testLogger()
|
||||
cfg := testMonitorConfig()
|
||||
|
||||
if err := registry.CreateService(db, "uktest", true, ""); err != nil {
|
||||
t.Fatalf("create service: %v", err)
|
||||
}
|
||||
if err := registry.CreateComponent(db, ®istry.Component{
|
||||
Name: "uktest", Service: "uktest", Image: "img:v1",
|
||||
Restart: "unless-stopped", DesiredState: "running", ObservedState: "unknown",
|
||||
}); err != nil {
|
||||
t.Fatalf("create component: %v", err)
|
||||
}
|
||||
|
||||
// The runtime reports the collapsed name, as podman/QEMU actually do.
|
||||
rt := &fakeRuntime{
|
||||
containers: []runtime.ContainerInfo{
|
||||
{Name: "uktest", State: "running"},
|
||||
},
|
||||
}
|
||||
|
||||
m := New(db, rt, cfg, "test-node", logger)
|
||||
m.tick()
|
||||
|
||||
comp, err := registry.GetComponent(db, "uktest", "uktest")
|
||||
if err != nil {
|
||||
t.Fatalf("get component: %v", err)
|
||||
}
|
||||
if comp.ObservedState != "running" {
|
||||
t.Fatalf("observed state: got %q, want %q (collapsed name not resolved)", comp.ObservedState, "running")
|
||||
}
|
||||
}
|
||||
|
||||
func TestMonitorStartStop(t *testing.T) {
|
||||
db := openTestDB(t)
|
||||
logger := testLogger()
|
||||
|
||||
37
internal/naming/naming.go
Normal file
37
internal/naming/naming.go
Normal file
@@ -0,0 +1,37 @@
|
||||
// Package naming centralizes the container/VM naming convention shared by the
|
||||
// agent (which creates and reconciles containers) and the monitor (which looks
|
||||
// them up by name). Keeping it in one place prevents the two from disagreeing.
|
||||
package naming
|
||||
|
||||
import "strings"
|
||||
|
||||
// ContainerNameFor returns the expected container (or unikernel VM) name for a
|
||||
// service and component. For single-component services where the component
|
||||
// name equals the service name, the name is just the service name (e.g.,
|
||||
// "mc-proxy" not "mc-proxy-mc-proxy").
|
||||
func ContainerNameFor(service, component string) string {
|
||||
if service == component {
|
||||
return service
|
||||
}
|
||||
return service + "-" + component
|
||||
}
|
||||
|
||||
// SplitContainerName splits a container name into service and component parts.
|
||||
// It checks known service names first to handle names like "mc-proxy" where a
|
||||
// naive split on "-" would produce the wrong result. If no known service
|
||||
// matches, it falls back to splitting on the first "-".
|
||||
func SplitContainerName(name string, knownServices map[string]bool) (service, component string) {
|
||||
if knownServices[name] {
|
||||
return name, name
|
||||
}
|
||||
for svc := range knownServices {
|
||||
prefix := svc + "-"
|
||||
if strings.HasPrefix(name, prefix) && len(name) > len(prefix) {
|
||||
return svc, name[len(prefix):]
|
||||
}
|
||||
}
|
||||
if i := strings.Index(name, "-"); i >= 0 {
|
||||
return name[:i], name[i+1:]
|
||||
}
|
||||
return name, name
|
||||
}
|
||||
Reference in New Issue
Block a user