From da148a577da822cd0f3fe09815d9370ed25d2ca1 Mon Sep 17 00:00:00 2001 From: Kyle Isom Date: Tue, 24 Mar 2026 21:36:27 -0700 Subject: [PATCH] Add docker-compose, RUNBOOK.md, and docker Makefile target - docker-compose.yml: single service with data volume, ports 8443/9443/8080 - RUNBOOK.md: health checks, common operations (start/stop/backup/init), FIDO2 key registration, incident procedures (won't start, DB corruption, cert expiry, disk full, sync failures), escalation path - Makefile: added docker target Co-Authored-By: Claude Opus 4.6 (1M context) --- Makefile | 5 +- RUNBOOK.md | 198 +++++++++++++++++++++++++++++++ deploy/docker/docker-compose.yml | 18 +++ 3 files changed, 220 insertions(+), 1 deletion(-) create mode 100644 RUNBOOK.md create mode 100644 deploy/docker/docker-compose.yml diff --git a/Makefile b/Makefile index a013b95..9ca63bb 100644 --- a/Makefile +++ b/Makefile @@ -1,4 +1,4 @@ -.PHONY: build test test-race vet lint proto proto-lint clean all +.PHONY: build test test-race vet lint proto proto-lint clean docker all LDFLAGS := -trimpath -ldflags="-s -w -X main.version=$(shell git describe --tags --always --dirty 2>/dev/null || echo dev)" @@ -32,4 +32,7 @@ proto-lint: clean: rm -f eng-pad-server +docker: + docker build -t eng-pad-server -f Dockerfile . + all: vet lint test eng-pad-server diff --git a/RUNBOOK.md b/RUNBOOK.md new file mode 100644 index 0000000..12a6f43 --- /dev/null +++ b/RUNBOOK.md @@ -0,0 +1,198 @@ +# RUNBOOK.md — eng-pad-server + +## 1. Service Overview + +eng-pad-server receives engineering notebook data from the Engineering +Pad Android app via gRPC, stores it in SQLite, and serves read-only +views through a web UI. Single authenticated user. + +**Ports**: 8443 (REST/HTTPS), 9443 (gRPC/TLS), 8080 (Web UI) +**Data**: `/srv/eng-pad-server/` +**Config**: `/srv/eng-pad-server/eng-pad-server.toml` +**Binary**: `/usr/local/bin/eng-pad-server` + +## 2. Health Checks + +1. 
Check service is running: + ``` + systemctl status eng-pad-server + ``` + +2. Check database health: + ``` + eng-pad-server status -c /srv/eng-pad-server/eng-pad-server.toml + ``` + +3. Check web UI responds: + ``` + curl -k https://localhost:8443/login + ``` + +4. Check gRPC responds: + ``` + grpcurl -insecure localhost:9443 list + ``` + +## 3. Common Operations + +### Start / Stop / Restart + +``` +systemctl start eng-pad-server +systemctl stop eng-pad-server +systemctl restart eng-pad-server +``` + +### View Logs + +``` +journalctl -u eng-pad-server -f +``` + +### Manual Backup + +``` +eng-pad-server snapshot -c /srv/eng-pad-server/eng-pad-server.toml +``` + +Backup saved to `/srv/eng-pad-server/backups/`. + +### Check Backup Timer + +``` +systemctl list-timers eng-pad-server-backup.timer +``` + +### Initialize (First Time) + +1. Install the binary and config: + ``` + sudo deploy/scripts/install.sh + ``` + +2. Edit the config file: + ``` + sudo -u engpad vi /srv/eng-pad-server/eng-pad-server.toml + ``` + +3. Generate TLS certificates (or copy existing ones): + ``` + # Self-signed for development: + openssl req -x509 -newkey ec -pkeyopt ec_paramgen_curve:prime256v1 \ + -keyout /srv/eng-pad-server/certs/key.pem \ + -out /srv/eng-pad-server/certs/cert.pem \ + -days 3650 -nodes -subj '/CN=pad.metacircular.net' + chown engpad:engpad /srv/eng-pad-server/certs/*.pem + chmod 600 /srv/eng-pad-server/certs/key.pem + ``` + +4. Create the admin user: + ``` + eng-pad-server init -c /srv/eng-pad-server/eng-pad-server.toml + ``` + +5. Start the service: + ``` + systemctl enable --now eng-pad-server + systemctl enable --now eng-pad-server-backup.timer + ``` + +### Register a FIDO2/U2F Security Key + +1. Log in to the web UI with password. +2. Navigate to `/keys`. +3. Enter a name for the key (e.g., "YubiKey 5"). +4. Click "Register" and touch the key when prompted. 
+ +### Docker Deployment + +``` +cd deploy/docker +docker compose up -d +``` + +First-time setup inside the container: +``` +docker compose exec eng-pad-server eng-pad-server init -c /srv/eng-pad-server/eng-pad-server.toml +``` + +## 4. Alerting + +No automated alerting is configured. Monitor via: +- `systemctl status eng-pad-server` — process health +- `journalctl -u eng-pad-server --since "1 hour ago" | grep ERROR` — errors +- Backup age: `ls -lt /srv/eng-pad-server/backups/ | head` + +## 5. Incident Procedures + +### Service Won't Start + +1. Check logs: + ``` + journalctl -u eng-pad-server -n 50 --no-pager + ``` +2. Common causes: + - Config file missing or invalid → fix config + - TLS cert/key missing → regenerate or copy + - Port already in use → `ss -tlnp | grep 8443` + - Database locked → check for stale processes still holding the file: `fuser /srv/eng-pad-server/eng-pad-server.db` + +### Database Corruption + +1. Stop the service: + ``` + systemctl stop eng-pad-server + ``` +2. Check integrity: + ``` + sqlite3 /srv/eng-pad-server/eng-pad-server.db "PRAGMA integrity_check" + ``` +3. If corrupted, restore from backup: + ``` + cp /srv/eng-pad-server/backups/eng-pad-server-LATEST.db /srv/eng-pad-server/eng-pad-server.db + chown engpad:engpad /srv/eng-pad-server/eng-pad-server.db + ``` +4. Restart: + ``` + systemctl start eng-pad-server + ``` + +### Certificate Expiry + +1. Check expiry: + ``` + openssl x509 -in /srv/eng-pad-server/certs/cert.pem -noout -dates + ``` +2. Regenerate or renew the certificate. +3. Restart the service (picks up new certs on start). + +### Disk Full + +1. Check disk usage: + ``` + df -h /srv/eng-pad-server/ + du -sh /srv/eng-pad-server/* + ``` +2. Prune old backups: + ``` + ls -t /srv/eng-pad-server/backups/ | tail -n +8 | xargs -I{} rm /srv/eng-pad-server/backups/{} + ``` +3. Compact the database (`VACUUM` temporarily needs free space of up to twice the database size, so prune backups first): + ``` + sqlite3 /srv/eng-pad-server/eng-pad-server.db "VACUUM" + ``` + +### Sync Fails from Android App + +1. 
Verify the server is reachable from the device's network. +2. Check the gRPC port is open: `ss -tlnp | grep 9443` +3. Check the TLS cert is valid and trusted by the device. +4. Check credentials: verify the user exists via `eng-pad-server status -c /srv/eng-pad-server/eng-pad-server.toml`. +5. Check server logs for auth failures: `journalctl -u eng-pad-server | grep UNAUTHENTICATED` + +## 6. Escalation + +If the runbook doesn't resolve the issue: +1. Check ARCHITECTURE.md for system design context. +2. Check AUDIT.md for known security considerations. +3. Review recent commits for changes that may have introduced the issue. diff --git a/deploy/docker/docker-compose.yml b/deploy/docker/docker-compose.yml new file mode 100644 index 0000000..0f8aeb2 --- /dev/null +++ b/deploy/docker/docker-compose.yml @@ -0,0 +1,18 @@ +version: "3.8" + +services: + eng-pad-server: + build: + context: ../.. + dockerfile: Dockerfile + ports: + - "8443:8443" # REST API (HTTPS) + - "9443:9443" # gRPC (TLS) + - "8080:8080" # Web UI + volumes: + - data:/srv/eng-pad-server + restart: unless-stopped + +volumes: + data: + driver: local