diff --git a/AUDIT.md b/AUDIT.md index 590b5f3..db7e2a0 100644 --- a/AUDIT.md +++ b/AUDIT.md @@ -9,6 +9,8 @@ - **2026-03-16**: Initial design review of ARCHITECTURE.md, engines/sshca.md, engines/transit.md. Issues #1–#24 identified. Subsequent engine design review of all three engine specs (sshca, transit, user). Issues #25–#38 identified. - **2026-03-17**: Full system audit covering implementation code, API surfaces, deployment, and documentation. Issues #39–#80 identified. +- **2026-03-16**: High finding remediation validation. #39, #40, #49, #62, #68, #69 confirmed resolved. #48, #61 confirmed still open. +- **2026-03-17**: #61 resolved — `handleRotateKey` and `handleDeleteUser` now zeroize stored `privBytes` instead of calling `Bytes()` (which returns a copy). New state in `handleRotateKey` populates `privBytes`. References to `*ecdh.PrivateKey` are nil'd for GC. Residual risk: Go's internal copy in `*ecdh.PrivateKey` cannot be zeroized. --- @@ -164,13 +166,13 @@ A compromised token could issue unlimited encrypt/decrypt/sign requests. #### Issues -**39. TOCTOU race in barrier Seal/Unseal** +**39. ~~TOCTOU race in barrier Seal/Unseal~~ RESOLVED** -`barrier.go`: `Seal()` zeroizes keys while concurrent operations may hold stale references between `RLock` release and actual use. A read operation could read the MEK, lose the lock, then use a zeroized key. Requires restructuring to hold the lock through the crypto operation or using atomic pointer swaps. +`barrier.go`: `Get()` and `Put()` hold `RLock` (via `defer`) through the entire crypto operation including decryption/encryption. `Seal()` acquires an exclusive `Lock()`, which blocks until all RLock holders release. There is no window where a reader can use zeroized key material. -**40. Crash during `ReWrapKeys` loses all barrier data** +**40. ~~Crash during `ReWrapKeys` loses all barrier data~~ RESOLVED** -`seal.go`: If the process crashes between re-encrypting all DEKs in `ReWrapKeys` and updating `seal_config` with the new MEK, all data becomes irrecoverable — the old MEK is gone and the new MEK was never persisted. This needs a two-phase commit or WAL-based approach. +`seal.go`: `RotateMEK` now wraps both `ReWrapKeysTx` (re-encrypts all DEKs) and the `seal_config` update in a single SQLite transaction. A crash at any point results in full rollback or full commit — no partial state. In-memory state (`SwapMEK`) is updated only after successful commit. **41. `loadKeys` errors silently swallowed during unseal** @@ -204,13 +206,13 @@ A compromised token could issue unlimited encrypt/decrypt/sign requests. #### CA (PKI) Engine -**48. Path traversal via unsanitized issuer names** +**48. ~~Path traversal via unsanitized entity names in get/update/delete operations~~ RESOLVED** -`ca/ca.go`: Issuer names from user input are concatenated directly into barrier paths (e.g., `engine/ca/{mount}/issuers/{name}/...`). A name containing `../` could write to arbitrary barrier locations. All engines should validate mount and entity names against a strict pattern (alphanumeric, hyphens, underscores). +All engines now call `engine.ValidateName()` on every operation that accepts user-supplied names, not just create operations. Fixed in: CA (`handleGetChain`, `handleGetIssuer`, `handleDeleteIssuer`, `handleIssue`, `handleSignCSR`), SSH CA (`handleUpdateProfile`, `handleGetProfile`, `handleDeleteProfile`), Transit (`handleDeleteKey`, `handleGetKey`, `handleRotateKey`, `handleUpdateKeyConfig`, `handleTrimKey`, `handleGetPublicKey`), User (`handleRegister`, `handleGetPublicKey`, `handleDeleteUser`). -**49. No TTL enforcement against issuer MaxTTL in issuance** +**49. ~~No TTL enforcement against issuer MaxTTL in issuance~~ RESOLVED** -`ca/ca.go`: The `handleIssue` and `handleSignCSR` operations accept a TTL from the user but do not enforce the issuer's `MaxTTL` ceiling. A user can request arbitrarily long certificate lifetimes. +`ca/ca.go`: Both `handleIssue` and `handleSignCSR` now use a `resolveTTL` helper that parses the issuer's `MaxTTL`, caps the requested TTL against it, and returns an error if the requested TTL exceeds the maximum. Default TTL is the issuer's MaxTTL when none is specified. **50. Non-admin users can override key usages** @@ -262,13 +264,13 @@ A compromised token could issue unlimited encrypt/decrypt/sign requests. #### User E2E Encryption Engine -**61. ECDH private key zeroization is ineffective** +**61. ~~ECDH private key zeroization is ineffective~~ RESOLVED** -`user/user.go`: `key.Bytes()` returns a copy of the private key bytes. Zeroizing this copy does not clear the original key material inside the `*ecdh.PrivateKey` struct. The actual private key remains in memory. +`user/user.go`: `handleRotateKey` and `handleDeleteUser` now zeroize the stored `privBytes` field (retained at key creation time) instead of calling `Bytes()` which returns a new copy. The `privKey` and `privBytes` fields are nil'd after zeroization to allow GC of the `*ecdh.PrivateKey` object. Note: Go's `*ecdh.PrivateKey` internal bytes cannot be zeroized through the public API — this is a known limitation of Go's crypto library. The stored `privBytes` copy is the best-effort mitigation. -**62. Policy resource path uses mountPath instead of mount name** +**62. ~~Policy resource path uses mountPath instead of mount name~~ RESOLVED** -`user/user.go`: Policy checks use the full mount path instead of the mount name. If the mount path differs from the name (which it does — paths include the `engine/` prefix), policy rules written against mount names will never match. +`user/user.go`: A `mountName()` helper extracts the mount name from the full mount path (e.g., `"engine/user/mymount/"` → `"mymount"`). Policy resource paths are correctly constructed as `"user/{mountname}/recipient/{recipient}"`. **63. No role checks on decrypt, re-encrypt, and rotate-key** @@ -294,13 +296,13 @@ A compromised token could issue unlimited encrypt/decrypt/sign requests. #### REST API -**68. JSON injection via unsanitized error messages** +**68. ~~JSON injection via unsanitized error messages~~ RESOLVED** -`server/routes.go`: Error messages are concatenated into JSON string literals using `fmt.Sprintf` without JSON escaping. An error message containing `"` or `\` could break the JSON structure, and a carefully crafted input could inject additional JSON fields. +`server/routes.go`: All error responses now use `writeJSONError()` which delegates to `writeJSON()` → `json.NewEncoder().Encode()`, properly JSON-escaping all error message content. -**69. Typed REST handlers bypass policy engine** +**69. ~~Typed REST handlers bypass policy engine~~ RESOLVED** -`server/routes.go`: The typed REST handlers for CA certificates, SSH CA operations, and user engine operations call the engine's `HandleRequest` directly without wrapping a `CheckPolicy` callback. Only the generic `/v1/engine/request` endpoint passes the policy checker. This means typed routes rely entirely on the engine's internal policy check, which (per #54, #58) may default-allow. +`server/routes.go`: All typed REST handlers now pass a `CheckPolicy` callback via `s.newPolicyChecker(r, info)` or an inline policy checker function. This includes all SSH CA, transit, user, and CA handlers. **70. `RenewCert` gRPC RPC has no corresponding REST route** @@ -366,16 +368,7 @@ A compromised token could issue unlimited encrypt/decrypt/sign requests. ### Open — High -| # | Issue | Location | -|---|-------|----------| -| 39 | TOCTOU race in barrier Seal/Unseal allows use of zeroized keys | `barrier/barrier.go` | -| 40 | Crash during `ReWrapKeys` makes all barrier data irrecoverable | `seal/seal.go` | -| 48 | Path traversal via unsanitized issuer/entity names in all engines | `ca/ca.go`, all engines | -| 49 | No TTL enforcement against issuer MaxTTL in cert issuance | `ca/ca.go` | -| 61 | ECDH private key zeroization is ineffective (`Bytes()` returns copy) | `user/user.go` | -| 62 | Policy resource path uses mountPath instead of mount name | `user/user.go` | -| 68 | JSON injection via unsanitized error messages in REST API | `server/routes.go` | -| 69 | Typed REST handlers bypass policy engine | `server/routes.go` | +*None.* ### Open — Medium @@ -435,13 +428,13 @@ A compromised token could issue unlimited encrypt/decrypt/sign requests. --- -## Resolved Issues (#1–#38) +## Resolved Issues (#1–#38, plus #39, #40, #48, #49, #61, #62, #68, #69) All design review findings from the 2026-03-16 audit have been resolved or accepted. See the [Audit History](#audit-history) section. The following issues were resolved: **Critical** (all resolved): #4 (policy auth contradiction), #9 (user-controllable SSH serials), #13 (policy path collision), #37 (adminOnlyOperations name collision). -**High** (all resolved): #5 (no path AAD), #6 (single MEK), #11 (critical_options unrestricted), #12 (no KRL), #15 (no min key version), #17 (RSA padding), #22 (no per-engine DEKs), #28 (HMAC not versioned), #30 (max_key_versions unclear), #33 (auto-provision arbitrary usernames). +**High** (all resolved): #5 (no path AAD), #6 (single MEK), #11 (critical_options unrestricted), #12 (no KRL), #15 (no min key version), #17 (RSA padding), #22 (no per-engine DEKs), #28 (HMAC not versioned), #30 (max_key_versions unclear), #33 (auto-provision arbitrary usernames), #39 (TOCTOU race — RLock held through crypto ops), #40 (ReWrapKeys crash — atomic transaction), #48 (path traversal — ValidateName on all ops), #49 (TTL enforcement — resolveTTL helper), #61 (ECDH zeroization — use stored privBytes), #62 (policy path — mountName helper), #68 (JSON injection — writeJSONError), #69 (policy bypass — newPolicyChecker). **Medium** (all resolved or accepted): #1, #2, #3, #8, #20, #23, #24, #25, #26, #27, #29, #31, #34. @@ -453,10 +446,10 @@ All design review findings from the 2026-03-16 audit have been resolved or accep | Priority | Count | Status | |----------|-------|--------| -| High | 8 | Open | +| High | 0 | All resolved | | Medium | 21 | Open | | Low | 14 | Open | | Accepted | 3 | Closed | -| Resolved | 38 | Closed | +| Resolved | 46 | Closed | -**Recommendation**: Address all High findings before the next deployment. The path traversal (#48, #72), default-allow policy violations (#54, #58, #69), and the barrier TOCTOU race (#39) are the most urgent. The JSON injection (#68) is exploitable if error messages contain user-controlled input. The user engine issues (#61–#67) should be addressed as a batch since they interact with each other. +**Recommendation**: All High findings are resolved. The user engine medium issues (#63–#67) should be addressed as a batch since they interact with each other. diff --git a/REMEDIATION.md b/REMEDIATION.md index 4ec779f..5da889e 100644 --- a/REMEDIATION.md +++ b/REMEDIATION.md @@ -561,19 +561,702 @@ subsystems (barrier/seal vs engines/server). --- -## Post-Remediation +## Post-Remediation (High) -After all eight findings are resolved: +All eight High findings (#39, #40, #48, #49, #61, #62, #68, #69) have +been resolved. See commit `a80323e`. -1. **Update AUDIT.md** — mark #39, #40, #48, #49, #61, #62, #68, #69 as - RESOLVED with resolution summaries. +--- +--- + +# Remediation Plan — Medium-Priority Audit Findings + +**Date**: 2026-03-17 +**Scope**: AUDIT.md findings #7, #10, #21, #41, #42, #43, #46, #50, #51, +#53, #54, #58, #59, #63, #64, #65, #66, #67, #70, #72, #73, #74, #78, #79 + +This plan addresses all medium-severity findings. Findings are grouped into +eight work items by subsystem. Two findings (#72, #79) are already resolved +or trivial and are noted inline. + +**Previously resolved by High remediation**: +- #72 (policy rule ID path traversal) — covered by #48's `ValidateName` + on `CreateRule` and barrier-level `validatePath`. + +--- + +## Work Item 5: Default-Deny in Engine Policy Checks (#54, #58) + +**Risk**: The SSH CA and transit engines default to **allow** when no policy +rule matches a non-admin user's request. This contradicts the default-deny +principle in ARCHITECTURE.md and the engineering standards. Now that #69 is +resolved (typed REST handlers pass `CheckPolicy`), the engines receive a +policy checker — but when `CheckPolicy` returns no match, they still allow. + +**Root cause**: Both engines treat "no matching policy rule" as implicit +allow for authenticated users. + +### Fix + +Change the policy evaluation result handling in both engines. + +**SSH CA** (`sshca.go`, `handleSignHost` ~line 309, `handleSignUser` +~line 443): + +```go +// Before (default-allow): +if matched && effect == "deny" { + return nil, ErrForbidden +} + +// After (default-deny): +if req.CheckPolicy != nil { + effect, matched := req.CheckPolicy(resource, "sign") + if !matched || effect != "allow" { + return nil, ErrForbidden + } +} +``` + +**Transit** (`transit.go`, `requireUserWithPolicy` ~line 330): + +```go +// Before (default-allow): +if matched { + if effect == "deny" { + return ErrForbidden + } + return nil +} +return nil // no match → allow + +// After (default-deny): +if req.CheckPolicy == nil { + return ErrForbidden +} +effect, matched := req.CheckPolicy(resource, action) +if !matched || effect != "allow" { + return ErrForbidden +} +return nil +``` + +Apply the same pattern to any other policy check sites in these engines. + +### Files + +| File | Change | +|------|--------| +| `internal/engine/sshca/sshca.go` | Default-deny in `handleSignHost`, `handleSignUser` | +| `internal/engine/transit/transit.go` | Default-deny in `requireUserWithPolicy` | +| `internal/engine/sshca/sshca_test.go` | Add test: no policy rules → user request denied | +| `internal/engine/transit/transit_test.go` | Add test: no policy rules → user request denied | + +--- + +## Work Item 6: User Engine Concurrency and Safety (#63, #64, #65, #66, #67) + +Five findings in the user engine share a root cause: the engine was +implemented after the others and does not follow the same concurrency +patterns. + +### #63 — Missing role checks on decrypt, re-encrypt, rotate-key + +**Fix**: Add `IsUser()` check to `handleDecrypt`, `handleReEncrypt`, and +`handleRotateKey`, matching the pattern used in `handleEncrypt`: + +```go +if !req.CallerInfo.IsUser() { + return nil, ErrForbidden +} +``` + +### #64 — Initialize holds no mutex + +**Fix**: Acquire the write lock at the top of `Initialize`: + +```go +func (e *UserEngine) Initialize(ctx context.Context, ...) error { + e.mu.Lock() + defer e.mu.Unlock() + // ... existing body ... +} +``` + +This matches the pattern in the SSH CA and transit engines. + +### #65 — handleEncrypt releases lock before using state + +**Fix**: Hold `RLock` (not write lock) through the ECDH and encryption +operations. After auto-provisioning (which needs the write lock), downgrade +to a read lock for the crypto work: + +```go +e.mu.Lock() +// ... auto-provisioning ... +// Capture references while holding write lock. +senderState := e.users[sender] +recipientStates := ... +e.mu.Unlock() + +// Re-acquire RLock for crypto operations. +e.mu.RLock() +defer e.mu.RUnlock() +// verify states are still valid (not nil) +if senderState.privKey == nil { + return nil, ErrSealed +} +// ... ECDH, wrap DEK, encrypt ... +``` + +Alternatively, hold the write lock through the entire operation (simpler +but serializes all encrypts). Given that encryption is the hot path, the +downgrade approach is preferred. + +### #66 — handleReEncrypt manual lock without defer + +**Fix**: Restructure to use `defer` for all lock releases. Split the +method into a provisioning phase (write lock) and a crypto phase (read +lock), each with its own `defer`: + +```go +func (e *UserEngine) handleReEncrypt(...) { + // Phase 1: read envelope, resolve users. + e.mu.RLock() + // ... read state ... + e.mu.RUnlock() + + // Phase 2: crypto operations. + e.mu.RLock() + defer e.mu.RUnlock() + // ... decrypt, re-wrap, encrypt ... +} +``` + +### #67 — No sealed-state check in HandleRequest + +**Fix**: Add a sealed-state guard at the top of `HandleRequest`, before +the operation dispatch: + +```go +func (e *UserEngine) HandleRequest(ctx context.Context, req *engine.Request) (*engine.Response, error) { + e.mu.RLock() + sealed := e.config == nil + e.mu.RUnlock() + if sealed { + return nil, ErrSealed + } + + switch req.Operation { + // ... + } +} +``` + +### Files + +| File | Change | +|------|--------| +| `internal/engine/user/user.go` | All five fixes above | +| `internal/engine/user/user_test.go` | Add tests: guest role denied; concurrent encrypt/seal; sealed-state rejection | + +### Verification + +```bash +go test -race ./internal/engine/user/ +``` + +--- + +## Work Item 7: Barrier and Seal Fixes (#41, #42, #43) + +### #41 — `loadKeys` errors silently swallowed during unseal + +**Risk**: If `loadKeys` fails to decrypt DEK entries (corrupt data, wrong +MEK), errors are swallowed and the keys map is incomplete. Subsequent +operations fail with confusing `ErrKeyNotFound` errors. + +**Fix**: Return the error from `loadKeys` instead of ignoring it. Distinguish +between "table doesn't exist" (acceptable on first run before migration) +and genuine decryption failures: + +```go +func (b *AESGCMBarrier) Unseal(mek []byte) error { + // ... + if err := b.loadKeys(); err != nil { + // Check if this is a "no such table" error (pre-migration). + if !isTableMissing(err) { + b.mek = nil + return fmt.Errorf("barrier: load keys: %w", err) + } + // Pre-migration: no barrier_keys table yet, proceed with empty map. + } + return nil +} + +func isTableMissing(err error) bool { + return strings.Contains(err.Error(), "no such table") +} +``` + +### #42 — No AAD binding on MEK encryption with KWK + +**Risk**: The MEK ciphertext in `seal_config` has no AAD, so there is no +cryptographic binding to its storage purpose. + +**Fix**: Pass a constant AAD string when encrypting/decrypting the MEK: + +```go +var mekAAD = []byte("metacrypt/seal_config/mek") + +// In Init: +encryptedMEK, err := crypto.Encrypt(kwk, mek, mekAAD) + +// In Unseal: +mek, err := crypto.Decrypt(kwk, encryptedMEK, mekAAD) + +// In RotateMEK: +newEncMEK, err := crypto.Encrypt(kwk, newMEK, mekAAD) +``` + +**Migration note**: Existing databases have MEKs encrypted with `nil` AAD. +The unseal path must try `mekAAD` first, then fall back to `nil` for +backward compatibility. After successful unseal with `nil`, re-encrypt with +`mekAAD` and update `seal_config`. + +### #43 — Barrier `List` SQL LIKE with unescaped prefix + +**Risk**: If a barrier path prefix contains `%` or `_`, the LIKE query +matches unintended entries. + +**Fix**: Escape SQL LIKE wildcards in the prefix: + +```go +func escapeLIKE(s string) string { + s = strings.ReplaceAll(s, `\`, `\\`) + s = strings.ReplaceAll(s, `%`, `\%`) + s = strings.ReplaceAll(s, `_`, `\_`) + return s +} + +// In List: +rows, err := b.db.QueryContext(ctx, + "SELECT path FROM barrier_entries WHERE path LIKE ? ESCAPE '\\'", + escapeLIKE(prefix)+"%") +``` + +### Files + +| File | Change | +|------|--------| +| `internal/barrier/barrier.go` | Fix `loadKeys` error handling; add `escapeLIKE` to `List` | +| `internal/seal/seal.go` | Add `mekAAD` constant; use in Init, Unseal, RotateMEK with fallback | +| `internal/barrier/barrier_test.go` | Add test: `List` with `%` and `_` in paths | +| `internal/seal/seal_test.go` | Add test: unseal with AAD-encrypted MEK | + +--- + +## Work Item 8: CA Engine Fixes (#50, #51, #70) + +### #50 — Non-admin users can override key usages + +**Risk**: A non-admin user can request `cert sign` or `crl sign` key usage +on a leaf certificate, creating a de facto CA certificate. + +**Fix**: Only allow admin users to override `key_usages` and +`ext_key_usages`. Non-admin users get the profile defaults: + +```go +// In handleIssue, after profile lookup: +if req.CallerInfo.IsAdmin { + if v, ok := req.Data["key_usages"].([]interface{}); ok { + profile.KeyUse = toStringSlice(v) + } + if v, ok := req.Data["ext_key_usages"].([]interface{}); ok { + profile.ExtKeyUsages = toStringSlice(v) + } +} +``` + +### #51 — Certificate renewal does not revoke original + +**Fix**: After issuing the renewed certificate, revoke the original by +serial. The original serial is available from the request data: + +```go +// In handleRenew, after successful issuance: +oldSerial := req.Data["serial"].(string) +if err := e.revokeCertBySerial(ctx, issuerName, oldSerial, req.CallerInfo.Username); err != nil { + // Log but do not fail — the new cert is already issued. + e.logger.Warn("failed to revoke original during renewal", + "serial", oldSerial, "error", err) +} +``` + +### #70 — `RenewCert` has no REST route (API sync violation) + +**Fix**: Add a dedicated REST route for certificate renewal: + +```go +r.Post("/v1/ca/{mount}/cert/{serial}/renew", s.requireAuth(s.handleRenewCert)) +``` + +Implement `handleRenewCert` following the typed handler pattern (with +`CheckPolicy`). + +### Files + +| File | Change | +|------|--------| +| `internal/engine/ca/ca.go` | Guard key usage overrides with `IsAdmin`; revoke original in `handleRenew` | +| `internal/server/routes.go` | Add `POST /v1/ca/{mount}/cert/{serial}/renew` route and handler | +| `internal/engine/ca/ca_test.go` | Add test: non-admin cannot set `cert sign` usage; renewal revokes original | + +--- + +## Work Item 9: SSH CA Lock Granularity (#53) + +**Risk**: `HandleRequest` acquires a write lock for all operations including +reads (`get-ca-pubkey`, `get-cert`, `list-certs`, `get-profile`, +`list-profiles`, `get-krl`). This serializes all operations unnecessarily. + +**Fix**: Move locking into individual handlers. Read-only operations use +`RLock`; mutating operations use `Lock`: + +```go +func (e *SSHCAEngine) HandleRequest(ctx context.Context, req *engine.Request) (*engine.Response, error) { + // No lock here — each handler manages its own. + switch req.Operation { + case "get-ca-pubkey": + return e.handleGetCAPubkey(ctx, req) // uses RLock internally + case "sign-host": + return e.handleSignHost(ctx, req) // uses Lock internally + // ... + } +} +``` + +Read-only handlers: +```go +func (e *SSHCAEngine) handleGetCAPubkey(ctx context.Context, req *engine.Request) (*engine.Response, error) { + e.mu.RLock() + defer e.mu.RUnlock() + // ... +} +``` + +Mutating handlers (`sign-host`, `sign-user`, `create-profile`, +`update-profile`, `delete-profile`, `revoke-cert`, `delete-cert`): +```go +func (e *SSHCAEngine) handleSignHost(ctx context.Context, req *engine.Request) (*engine.Response, error) { + e.mu.Lock() + defer e.mu.Unlock() + // ... +} +``` + +### Files + +| File | Change | +|------|--------| +| `internal/engine/sshca/sshca.go` | Remove top-level lock from `HandleRequest`; add per-handler locks | + +--- + +## Work Item 10: Transit Ciphertext Validation (#59) + +**Risk**: `parseVersionedData` accepts negative version numbers. A crafted +ciphertext `metacrypt:v-1:...` parses as version -1, which fails the key +lookup but produces a confusing error. + +**Fix**: Add a bounds check after parsing: + +```go +func parseVersionedData(s string) (int, []byte, error) { + // ... existing parse logic ... + version, err := strconv.Atoi(parts[1][1:]) + if err != nil { + return 0, nil, ErrInvalidFormat + } + if version < 1 { + return 0, nil, fmt.Errorf("transit: %w: version must be >= 1", ErrInvalidFormat) + } + // ... +} +``` + +### Files + +| File | Change | +|------|--------| +| `internal/engine/transit/transit.go` | Add `version < 1` check in `parseVersionedData` | +| `internal/engine/transit/transit_test.go` | Add test: `metacrypt:v0:...` and `metacrypt:v-1:...` rejected | + +--- + +## Work Item 11: Database and Auth (#46, #74) + +### #46 — SQLite PRAGMAs only applied to first connection + +**Risk**: `database/sql` may open additional connections that don't receive +the `journal_mode`, `foreign_keys`, and `busy_timeout` PRAGMAs. + +**Fix**: Use the `_pragma` DSN parameter supported by `modernc.org/sqlite`: + +```go +dsn := path + "?_pragma=journal_mode(WAL)&_pragma=foreign_keys(ON)&_pragma=busy_timeout(5000)" +db, err := sql.Open("sqlite", dsn) +``` + +Remove the manual PRAGMA execution loop. This ensures every connection +opened by the pool receives the PRAGMAs. + +### #74 — Token validation cache grows without bound + +**Risk**: The `cache` map in `auth.go` grows indefinitely. Expired entries +are never proactively removed. + +**Fix**: Add periodic eviction. Run a background goroutine that sweeps +expired entries every minute: + +```go +func (a *Authenticator) startEviction(ctx context.Context) { + ticker := time.NewTicker(time.Minute) + go func() { + defer ticker.Stop() + for { + select { + case <-ctx.Done(): + return + case <-ticker.C: + a.evictExpired() + } + } + }() +} + +func (a *Authenticator) evictExpired() { + now := time.Now() + a.mu.Lock() + defer a.mu.Unlock() + for key, entry := range a.cache { + if now.After(entry.expiresAt) { + delete(a.cache, key) + } + } +} +``` + +Call `startEviction` from the `Authenticator` constructor. Accept a +`context.Context` to allow cancellation on shutdown. + +### Files + +| File | Change | +|------|--------| +| `internal/db/db.go` | Switch to `_pragma` DSN parameters; remove manual PRAGMA loop | +| `internal/auth/auth.go` | Add `startEviction` goroutine; call from constructor | +| `internal/db/db_test.go` | Verify PRAGMAs are active on a fresh connection from the pool | +| `internal/auth/auth_test.go` | Add test: expired entries are evicted after sweep | + +--- + +## Work Item 12: Policy Engine Glob Matching (#73) + +**Risk**: `filepath.Match` does not support `**` for recursive directory +matching. Administrators writing rules like `engine/**/certs/*` will find +they don't match paths with multiple segments. + +**Fix**: Replace `filepath.Match` with `path.Match` (POSIX-style, no +OS-specific behavior) and add support for `**` by splitting the pattern +and value on `/` and matching segments: + +```go +import "path" + +func matchesAnyGlob(patterns []string, value string) bool { + for _, p := range patterns { + if matchGlob(p, value) { + return true + } + } + return false +} + +func matchGlob(pattern, value string) bool { + // Handle ** by checking if any suffix of value matches the rest of pattern. + if strings.Contains(pattern, "**") { + parts := strings.SplitN(pattern, "**", 2) + prefix := parts[0] + suffix := parts[1] + if !strings.HasPrefix(value, prefix) { + return false + } + remainder := value[len(prefix):] + // Try matching suffix against every suffix of remainder. + for i := 0; i <= len(remainder); i++ { + if matched, _ := path.Match(suffix, remainder[i:]); matched { + return true + } + // Advance to next separator. + next := strings.IndexByte(remainder[i:], '/') + if next < 0 { + break + } + i += next + } + return false + } + matched, _ := path.Match(pattern, value) + return matched +} +``` + +Document in POLICY.md that `**` matches zero or more path segments, while +`*` matches within a single segment. + +### Files + +| File | Change | +|------|--------| +| `internal/policy/policy.go` | Replace `filepath.Match` with `path.Match` + `**` support | +| `internal/policy/policy_test.go` | Add test: `engine/**/certs/*` matches `engine/ca/prod/certs/abc123` | + +--- + +## Work Item 13: Deployment Fixes (#78, #79) + +### #78 — systemd `ExecReload` sends SIGHUP with no handler + +**Fix**: Two options: + +**Option A** (recommended): Remove `ExecReload` entirely. Metacrypt does +not support graceful reload — configuration changes require a restart. The +`ExecReload` line creates a false expectation. + +**Option B**: Implement SIGHUP handling that re-reads the TOML config and +applies non-breaking changes (log level, TLS cert reload). This is +significant new functionality and should be a separate feature. + +For now, remove `ExecReload` from both service units: + +```diff +-ExecReload=/bin/kill -HUP $MAINPID +``` + +### #79 — Dockerfiles use Go 1.23 but module requires Go 1.25 + +**Fix**: Update the builder base image: + +```diff +-FROM golang:1.23-alpine AS builder ++FROM golang:1.25-alpine AS builder +``` + +Apply to both `Dockerfile.api` and `Dockerfile.web`. Also remove the +unnecessary `gcc` and `musl-dev` installation since `CGO_ENABLED=0`. + +```diff +-RUN apk add --no-cache gcc musl-dev +``` + +### #7 — No audit logging + +**Note**: This is the only medium finding that requires significant new +functionality rather than a targeted fix. Audit logging should be +implemented as a dedicated subsystem: + +1. Define an `AuditEvent` struct with: timestamp, caller, operation, + resource, outcome (success/denied/error), and metadata. +2. Write events to a structured log sink (slog with JSON output to a + dedicated file at `/srv/metacrypt/audit.log`). +3. Instrument every engine `HandleRequest` to emit an event on completion. +4. Instrument policy `CreateRule`/`DeleteRule`. +5. Instrument seal/unseal operations. + +This is a substantial feature. Track separately from the quick-fix items +in this plan. + +### #10 — No extension allowlist for SSH host certificates + +**Note**: Host certificates typically don't use extensions (extensions are +for user certificates). The fix is to ignore the `extensions` field on +host signing requests rather than passing it through: + +```go +// In handleSignHost, after building the certificate: +cert.Permissions.Extensions = nil // Host certs should not carry extensions +``` + +### #21 — No rate limiting on transit cryptographic operations + +**Note**: This requires a rate-limiting middleware or per-caller token +bucket. Best implemented as server-level middleware applied to engine +request handlers: + +```go +func (s *Server) requireRateLimit(next http.HandlerFunc) http.HandlerFunc { + return func(w http.ResponseWriter, r *http.Request) { + info := TokenInfoFromContext(r.Context()) + if !s.rateLimiter.Allow(info.Username) { + writeJSONError(w, "rate limit exceeded", http.StatusTooManyRequests) + return + } + next(w, r) + } +} +``` + +Track separately — this affects API design decisions (limits, quotas, +per-user vs per-token). + +### Files + +| File | Change | +|------|--------| +| `deploy/systemd/metacrypt.service` | Remove `ExecReload` line | +| `deploy/systemd/metacrypt-web.service` | Remove `ExecReload` line | +| `Dockerfile.api` | Update to `golang:1.25-alpine`; remove `gcc musl-dev` | +| `Dockerfile.web` | Update to `golang:1.25-alpine`; remove `gcc musl-dev` | + +--- + +## Implementation Order + +``` + 5. #54, #58 Default-deny in engines (security-critical, do first) + 6. #63–#67 User engine concurrency (5 coupled fixes, one change) + 7. #41–#43 Barrier/seal fixes (3 independent fixes) + 8. #50, #51, #70 CA engine fixes (key usage + renewal + API sync) + 9. #53 SSH CA lock granularity (standalone refactor) +10. #59 Transit version validation (one-line fix) +11. #46, #74 DB pragmas + auth cache (independent, no overlap) +12. #73 Policy glob matching (standalone) +13. #78, #79 Deployment fixes (non-code, standalone) + #7 Audit logging (new feature, track separately) + #10 SSH host extensions (one-line fix, ship with #9) + #21 Transit rate limiting (new feature, track separately) +``` + +Work items 5–10 can be parallelized across engineers: +- Engineer A: #5 (default-deny) + #9 (SSH CA locks) + #10 (host extensions) +- Engineer B: #6 (user concurrency) + #8 (CA fixes) +- Engineer C: #7 (barrier/seal) + #10 (transit version) + #11 (DB/auth) +- Independent: #12 (policy), #13 (deployment) + +Work items #7 (audit logging) and #21 (rate limiting) are substantial new +features and should be tracked as separate engineering tasks, not quick +fixes. + +--- + +## Post-Remediation (Medium) + +After all medium findings are resolved: + +1. **Update AUDIT.md** — mark each finding as RESOLVED with summary. 2. **Run the full pipeline**: `make all` (vet, lint, test, build). -3. **Run race detector**: `go test -race ./...` -4. **Address related medium findings** that interact with these fixes: - - #54 (SSH CA default-allow) and #58 (transit default-allow) — once - #69 is fixed, the typed handlers will pass policy checkers to the - engines, but the engines still default-allow when `CheckPolicy` - returns no match. Consider changing the engine-level default to deny - for non-admin callers. - - #72 (policy ID path traversal) — already covered by #48's - `ValidateName` fix on `CreateRule`. +3. **Run race detector**: `go test -race ./...` — especially important + for work items 6 and 9 (concurrency changes). +4. **Address remaining low findings** — 14 low-severity items remain, + mostly zeroization gaps and documentation drift. diff --git a/cmd/metacrypt/init.go b/cmd/metacrypt/init.go index ec9a12e..454b081 100644 --- a/cmd/metacrypt/init.go +++ b/cmd/metacrypt/init.go @@ -51,7 +51,7 @@ func runInit(cmd *cobra.Command, args []string) error { logger := slog.New(slog.NewJSONHandler(os.Stdout, nil)) b := barrier.NewAESGCMBarrier(database) - sealMgr := seal.NewManager(database, b, logger) + sealMgr := seal.NewManager(database, b, nil, logger) if err := sealMgr.CheckInitialized(); err != nil { return err } diff --git a/cmd/metacrypt/server.go b/cmd/metacrypt/server.go index 647b77e..cf509a5 100644 --- a/cmd/metacrypt/server.go +++ b/cmd/metacrypt/server.go @@ -2,6 +2,7 @@ package main import ( "context" + "fmt" "log/slog" "os" "os/signal" @@ -10,6 +11,7 @@ import ( mcias "git.wntrmute.dev/kyle/mcias/clients/go" "github.com/spf13/cobra" + "git.wntrmute.dev/kyle/metacrypt/internal/audit" "git.wntrmute.dev/kyle/metacrypt/internal/auth" "git.wntrmute.dev/kyle/metacrypt/internal/barrier" "git.wntrmute.dev/kyle/metacrypt/internal/config" @@ -59,8 +61,14 @@ func runServer(cmd *cobra.Command, args []string) error { return err } + // Create audit logger. + auditLog, err := createAuditLogger(cfg) + if err != nil { + return err + } + b := barrier.NewAESGCMBarrier(database) - sealMgr := seal.NewManager(database, b, logger) + sealMgr := seal.NewManager(database, b, auditLog, logger) if err := sealMgr.CheckInitialized(); err != nil { return err @@ -81,8 +89,8 @@ func runServer(cmd *cobra.Command, args []string) error { engineRegistry.RegisterFactory(engine.EngineTypeTransit, transit.NewTransitEngine) engineRegistry.RegisterFactory(engine.EngineTypeUser, user.NewUserEngine) - srv := server.New(cfg, sealMgr, authenticator, policyEngine, engineRegistry, logger, version) - grpcSrv := grpcserver.New(cfg, sealMgr, authenticator, policyEngine, engineRegistry, logger) + srv := server.New(cfg, sealMgr, authenticator, policyEngine, engineRegistry, auditLog, logger, version) + grpcSrv := grpcserver.New(cfg, sealMgr, authenticator, policyEngine, engineRegistry, auditLog, logger) ctx, stop := signal.NotifyContext(context.Background(), syscall.SIGINT, syscall.SIGTERM) defer stop() @@ -106,3 +114,26 @@ func runServer(cmd *cobra.Command, args []string) error { grpcSrv.Shutdown() return srv.Shutdown(context.Background()) } + +func createAuditLogger(cfg *config.Config) (*audit.Logger, error) { + switch cfg.Audit.Mode { + case "": + return nil, nil // disabled + case "stdout": + h := slog.NewJSONHandler(os.Stdout, &slog.HandlerOptions{ + Level: audit.LevelAudit, + }) + return audit.New(h), nil + case "file": + f, err := os.OpenFile(cfg.Audit.Path, os.O_APPEND|os.O_CREATE|os.O_WRONLY, 0600) + if err != nil { + return nil, fmt.Errorf("audit: open log file %s: %w", cfg.Audit.Path, err) + } + h := slog.NewJSONHandler(f, &slog.HandlerOptions{ + Level: audit.LevelAudit, + }) + return audit.New(h), nil + default: + return nil, fmt.Errorf("audit: unknown mode %q", cfg.Audit.Mode) + } +} diff --git a/engines/auditlog.md b/engines/auditlog.md new file mode 100644 index 0000000..4c1cff0 --- /dev/null +++ b/engines/auditlog.md @@ -0,0 +1,513 @@ +# Audit Logging Design + +## Overview + +Metacrypt is a cryptographic service for a homelab/personal infrastructure +platform. Audit logging gives the operator visibility into what happened, +when, and by whom — essential for a service that issues certificates, signs +SSH keys, and manages encryption keys, even at homelab scale. + +The design prioritizes simplicity and operational clarity over enterprise +features. There is one operator. There is no SIEM. The audit log should be +a structured, append-only file that can be read with `jq`, tailed with +`journalctl`, and rotated with `logrotate`. It should not require a +database, a separate service, or additional infrastructure. + +## Goals + +1. **Record all security-relevant operations** — who did what, when, and + whether it succeeded. +2. **Separate audit events from operational logs** — operational logs + (`slog.Info`) are for debugging; audit events are for accountability. +3. **Zero additional dependencies** — use Go's `log/slog` with a dedicated + handler writing to a file or stdout. +4. **No performance overhead that matters at homelab scale** — synchronous + writes are fine. This is not a high-throughput system. +5. **Queryable with standard tools** — one JSON object per line, greppable, + `jq`-friendly. + +## Non-Goals + +- Tamper-evident chaining (hash chains, Merkle trees). The operator has + root access to the machine; tamper evidence against the operator is + theatre. If the threat model changes, this can be added later. +- Remote log shipping. If needed, `journalctl` or `filebeat` can ship + the file externally. +- Log aggregation across services. Each Metacircular service logs + independently. +- Structured querying (SQL, full-text search). `jq` and `grep` are + sufficient. + +## Event Model + +Every audit event is a single JSON line with these fields: + +```json +{ + "time": "2026-03-17T04:15:42.577Z", + "level": "AUDIT", + "msg": "operation completed", + "caller": "kyle", + "roles": ["admin"], + "operation": "issue", + "engine": "ca", + "mount": "pki", + "resource": "ca/pki/id/example.com", + "outcome": "success", + "detail": {"serial": "01:02:03", "issuer": "default", "cn": "example.com"} +} +``` + +### Required Fields + +| Field | Type | Description | +|-------|------|-------------| +| `time` | RFC 3339 | When the event occurred | +| `level` | string | Always `"AUDIT"` — distinguishes from operational logs | +| `msg` | string | Human-readable summary | +| `caller` | string | MCIAS username, or `"anonymous"` for unauthenticated ops | +| `operation` | string | Engine operation name (e.g., `issue`, `sign-user`, `encrypt`) | +| `outcome` | string | `"success"`, `"denied"`, or `"error"` | + +### Optional Fields + +| Field | Type | Description | +|-------|------|-------------| +| `roles` | []string | Caller's MCIAS roles | +| `engine` | string | Engine type (`ca`, `sshca`, `transit`, `user`) | +| `mount` | string | Mount name | +| `resource` | string | Policy resource path evaluated | +| `detail` | object | Operation-specific metadata (see below) | +| `error` | string | Error message on `"error"` or `"denied"` outcomes | + +### Detail Fields by Operation Category + +**Certificate operations** (CA): +- `serial`, `issuer`, `cn`, `profile`, `ttl` + +**SSH CA operations**: +- `serial`, `cert_type` (`user`/`host`), `principals`, `profile`, `key_id` + +**Transit operations**: +- `key` (key name), `key_version`, `batch_size` (for batch ops) + +**User E2E operations**: +- `recipients` (list), `sender` + +**Policy operations**: +- `rule_id`, `effect` + +**System operations** (seal/unseal/init): +- No detail fields; the operation name is sufficient. + +### What NOT to Log + +- Plaintext, ciphertext, signatures, HMACs, envelopes, or any + cryptographic material. +- Private keys, public keys, or key bytes. +- Passwords, tokens, or credentials. +- Full request/response bodies. + +The audit log records **what happened**, not **what the data was**. + +## Architecture + +### Audit Logger + +A thin wrapper around `slog.Logger` with a dedicated handler: + +```go +// Package audit provides structured audit event logging. +package audit + +import ( + "context" + "log/slog" +) + +// Logger writes structured audit events. +type Logger struct { + logger *slog.Logger +} + +// New creates an audit logger that writes to the given handler. +func New(h slog.Handler) *Logger { + return &Logger{logger: slog.New(h)} +} + +// Event represents a single audit event. +type Event struct { + Caller string + Roles []string + Operation string + Engine string + Mount string + Resource string + Outcome string // "success", "denied", "error" + Error string + Detail map[string]interface{} +} + +// Log writes an audit event. +func (l *Logger) Log(ctx context.Context, e Event) { + attrs := []slog.Attr{ + slog.String("caller", e.Caller), + slog.String("operation", e.Operation), + slog.String("outcome", e.Outcome), + } + if len(e.Roles) > 0 { + attrs = append(attrs, slog.Any("roles", e.Roles)) + } + if e.Engine != "" { + attrs = append(attrs, slog.String("engine", e.Engine)) + } + if e.Mount != "" { + attrs = append(attrs, slog.String("mount", e.Mount)) + } + if e.Resource != "" { + attrs = append(attrs, slog.String("resource", e.Resource)) + } + if e.Error != "" { + attrs = append(attrs, slog.String("error", e.Error)) + } + if len(e.Detail) > 0 { + attrs = append(attrs, slog.Any("detail", e.Detail)) + } + + // Use a custom level that sorts above Info but is labelled "AUDIT". + l.logger.LogAttrs(ctx, LevelAudit, "operation completed", attrs...) +} + +// LevelAudit is a custom slog level for audit events. +const LevelAudit = slog.Level(12) // between Warn (4) and Error (8+) +``` + +The custom level ensures audit events are never suppressed by log level +filtering (operators may set `level = "warn"` to quiet debug noise, but +audit events must always be emitted). + +### Output Configuration + +Two modes, controlled by a config option: + +```toml +[audit] +# "file" writes to a dedicated audit log file. +# "stdout" writes to stdout alongside operational logs (for journalctl). +# Empty string disables audit logging. +mode = "file" +path = "/srv/metacrypt/audit.log" +``` + +**File mode**: Opens the file append-only with `0600` permissions. Uses +`slog.NewJSONHandler` writing to the file. The file can be rotated with +`logrotate` — the logger re-opens on the next write if the file is +renamed/truncated. For simplicity, just write and let logrotate handle +rotation; Go's `slog.JSONHandler` does not buffer. + +**Stdout mode**: Uses `slog.NewJSONHandler` writing to `os.Stdout`. Events +are interleaved with operational logs but distinguishable by the `"AUDIT"` +level. Suitable for systemd/journalctl capture where all output goes to +the journal. + +**Disabled**: No audit logger is created. The `Logger` is nil-safe — all +methods are no-ops on a nil receiver. + +```go +func (l *Logger) Log(ctx context.Context, e Event) { + if l == nil { + return + } + // ... +} +``` + +### Integration Points + +The audit logger is created at startup and injected into the components +that need it: + +``` +cmd/metacrypt/server.go + └── audit.New(handler) + ├── server.Server (REST handlers) + ├── grpcserver.GRPCServer (gRPC interceptor) + ├── seal.Manager (seal/unseal/init) + └── policy.Engine (rule create/delete) +``` + +Engine operations are logged at the **server layer** (REST handlers and +gRPC interceptors), not inside the engines themselves. This keeps the +engines focused on business logic and avoids threading the audit logger +through every engine method. + +### Instrumentation + +#### REST API (`internal/server/`) + +Instrument `handleEngineRequest` and every typed handler. The audit event +is emitted **after** the operation completes (success or failure): + +```go +func (s *Server) handleGetCert(w http.ResponseWriter, r *http.Request) { + // ... existing handler logic ... + + s.audit.Log(r.Context(), audit.Event{ + Caller: info.Username, + Roles: info.Roles, + Operation: "get-cert", + Engine: "ca", + Mount: mountName, + Outcome: "success", + Detail: map[string]interface{}{"serial": serial}, + }) +} +``` + +On error: + +```go +s.audit.Log(r.Context(), audit.Event{ + Caller: info.Username, + Roles: info.Roles, + Operation: "get-cert", + Engine: "ca", + Mount: mountName, + Outcome: "error", + Error: err.Error(), +}) +``` + +To avoid duplicating this in every handler, use a helper: + +```go +func (s *Server) auditEngineOp(r *http.Request, info *auth.TokenInfo, + op, engineType, mount, outcome string, detail map[string]interface{}, err error) { + e := audit.Event{ + Caller: info.Username, + Roles: info.Roles, + Operation: op, + Engine: engineType, + Mount: mount, + Outcome: outcome, + Detail: detail, + } + if err != nil { + e.Error = err.Error() + } + s.audit.Log(r.Context(), e) +} +``` + +#### gRPC API (`internal/grpcserver/`) + +Add an audit interceptor that fires after each RPC completes. This is +cleaner than instrumenting every handler individually: + +```go +func (g *GRPCServer) auditInterceptor( + ctx context.Context, + req interface{}, + info *grpc.UnaryServerInfo, + handler grpc.UnaryHandler, +) (interface{}, error) { + resp, err := handler(ctx, req) + + // Extract caller info from context (set by auth interceptor). + caller := callerFromContext(ctx) + + outcome := "success" + if err != nil { + outcome = "error" + } + + g.audit.Log(ctx, audit.Event{ + Caller: caller.Username, + Roles: caller.Roles, + Operation: path.Base(info.FullMethod), // e.g., "IssueCert" + Resource: info.FullMethod, + Outcome: outcome, + Error: errString(err), + }) + + return resp, err +} +``` + +Register this interceptor **after** the auth interceptor in the chain so +that caller info is available. + +#### Seal/Unseal (`internal/seal/`) + +Instrument `Init`, `Unseal`, `Seal`, and `RotateMEK`: + +```go +// In Manager.Unseal, after success: +m.audit.Log(ctx, audit.Event{ + Caller: "operator", // unseal is not authenticated + Operation: "unseal", + Outcome: "success", +}) + +// On failure: +m.audit.Log(ctx, audit.Event{ + Caller: "operator", + Operation: "unseal", + Outcome: "denied", + Error: "invalid password", +}) +``` + +#### Policy (`internal/policy/`) + +Instrument `CreateRule` and `DeleteRule`: + +```go +// In Engine.CreateRule, after success: +e.audit.Log(ctx, audit.Event{ + Caller: callerUsername, // passed from the handler + Operation: "create-policy", + Outcome: "success", + Detail: map[string]interface{}{"rule_id": rule.ID, "effect": rule.Effect}, +}) +``` + +### Operations to Audit + +| Category | Operations | Outcome on deny | +|----------|------------|-----------------| +| System | `init`, `unseal`, `seal`, `rotate-mek`, `rotate-key`, `migrate` | `denied` or `error` | +| CA | `import-root`, `create-issuer`, `delete-issuer`, `issue`, `sign-csr`, `renew`, `revoke-cert`, `delete-cert` | `denied` | +| SSH CA | `sign-host`, `sign-user`, `create-profile`, `update-profile`, `delete-profile`, `revoke-cert`, `delete-cert` | `denied` | +| Transit | `create-key`, `delete-key`, `rotate-key`, `update-key-config`, `trim-key`, `encrypt`, `decrypt`, `rewrap`, `sign`, `verify`, `hmac` | `denied` | +| User | `register`, `provision`, `encrypt`, `decrypt`, `re-encrypt`, `rotate-key`, `delete-user` | `denied` | +| Policy | `create-policy`, `delete-policy` | N/A (admin-only) | +| Auth | `login` (success and failure) | `denied` | + +**Read-only operations** (`get-cert`, `list-certs`, `get-profile`, +`list-profiles`, `get-key`, `list-keys`, `list-users`, `get-public-key`, +`status`) are **not audited** by default. They generate operational log +entries via the existing HTTP/gRPC logging middleware but do not produce +audit events. This keeps the audit log focused on state-changing operations. + +If the operator wants read auditing, a config flag can enable it: + +```toml +[audit] +include_reads = false # default +``` + +## File Layout + +``` +internal/ + audit/ + audit.go # Logger, Event, LevelAudit + audit_test.go # Tests +``` + +One file, one type, no interfaces. The audit logger is a concrete struct +passed by pointer. Nil-safe for disabled mode. + +## Configuration + +Add to `config.go`: + +```go +type AuditConfig struct { + Mode string `toml:"mode"` // "file", "stdout", "" + Path string `toml:"path"` // file path (mode=file) + IncludeReads bool `toml:"include_reads"` // audit read operations +} +``` + +Add to example config: + +```toml +[audit] +mode = "file" +path = "/srv/metacrypt/audit.log" +include_reads = false +``` + +## Implementation Steps + +1. **Create `internal/audit/audit.go`** — `Logger`, `Event`, `LevelAudit`, + `New(handler)`, nil-safe `Log` method. + +2. **Add `AuditConfig` to config** — mode, path, include_reads. Validate + that `path` is set when `mode = "file"`. + +3. **Create audit logger in `cmd/metacrypt/server.go`** — based on config, + open file or use stdout. Pass to Server, GRPCServer, SealManager, + PolicyEngine. + +4. **Add `audit *audit.Logger` field** to `Server`, `GRPCServer`, + `seal.Manager`, `policy.Engine`. Update constructors. + +5. **Instrument REST handlers** — add `auditEngineOp` helper to `Server`. + Call after every mutating operation in typed handlers and + `handleEngineRequest`. + +6. **Instrument gRPC** — add audit interceptor to the interceptor chain. + +7. **Instrument seal/unseal** — emit events in `Init`, `Unseal`, `Seal`, + `RotateMEK`. + +8. **Instrument policy** — emit events in `CreateRule`, `DeleteRule`. + +9. **Instrument login** — emit events in the auth login handler (both + REST and gRPC). + +10. **Update ARCHITECTURE.md** — document audit logging in the Security + Model section. Remove from Future Work. + +11. **Update example configs** — add `[audit]` section. + +12. **Add tests** — verify events are emitted for success, denied, and + error outcomes. Verify nil logger is safe. Verify read operations are + excluded by default. + +## Querying the Audit Log + +```bash +# All events for a user: +jq 'select(.caller == "kyle")' /srv/metacrypt/audit.log + +# All certificate issuances: +jq 'select(.operation == "issue")' /srv/metacrypt/audit.log + +# All denied operations: +jq 'select(.outcome == "denied")' /srv/metacrypt/audit.log + +# All SSH CA events in the last hour: +jq 'select(.engine == "sshca" and .time > "2026-03-17T03:00:00Z")' /srv/metacrypt/audit.log + +# Count operations by type: +jq -r '.operation' /srv/metacrypt/audit.log | sort | uniq -c | sort -rn + +# Failed unseal attempts: +jq 'select(.operation == "unseal" and .outcome == "denied")' /srv/metacrypt/audit.log +``` + +## Rotation + +For file mode, use logrotate: + +``` +/srv/metacrypt/audit.log { + daily + rotate 90 + compress + delaycompress + missingok + notifempty + copytruncate +} +``` + +`copytruncate` avoids the need for a signal-based reopen mechanism. The +Go `slog.JSONHandler` writes are not buffered, so no data is lost. + +At homelab scale with moderate usage, 90 days of uncompressed audit logs +will be well under 100 MB. diff --git a/go.mod b/go.mod index 7d6af03..d88ce43 100644 --- a/go.mod +++ b/go.mod @@ -7,7 +7,7 @@ replace git.wntrmute.dev/kyle/mcias/clients/go => /Users/kyle/src/mcias/clients/ replace git.wntrmute.dev/kyle/goutils => /Users/kyle/src/goutils require ( - git.wntrmute.dev/kyle/goutils v1.21.1 + git.wntrmute.dev/kyle/goutils v1.21.0 git.wntrmute.dev/kyle/mcias/clients/go v0.0.0-00010101000000-000000000000 github.com/go-chi/chi/v5 v5.2.5 github.com/pelletier/go-toml/v2 v2.2.4 diff --git a/internal/audit/audit.go b/internal/audit/audit.go new file mode 100644 index 0000000..9e5532d --- /dev/null +++ b/internal/audit/audit.go @@ -0,0 +1,79 @@ +// Package audit provides structured audit event logging for Metacrypt. +// Audit events record security-relevant operations (who did what, when, +// and whether it succeeded) as one JSON object per line. +package audit + +import ( + "context" + "log/slog" +) + +// LevelAudit is a custom slog level for audit events. It sits above Warn +// so that audit events are never suppressed by log level filtering. +const LevelAudit = slog.Level(12) + +func init() { + // Replace the level name so JSON output shows "AUDIT" instead of "ERROR+4". + slog.SetLogLoggerLevel(LevelAudit) +} + +// Logger writes structured audit events. A nil *Logger is safe to use; +// all methods are no-ops. +type Logger struct { + logger *slog.Logger +} + +// New creates an audit logger writing to the given handler. Pass nil to +// create a disabled logger (equivalent to using a nil *Logger). +func New(h slog.Handler) *Logger { + if h == nil { + return nil + } + return &Logger{logger: slog.New(h)} +} + +// Event represents a single audit event. +type Event struct { + Caller string // MCIAS username, or "operator" for unauthenticated ops + Roles []string // caller's MCIAS roles + Operation string // engine operation name (e.g., "issue", "sign-user") + Engine string // engine type (e.g., "ca", "sshca", "transit", "user") + Mount string // mount name + Resource string // policy resource path evaluated + Outcome string // "success", "denied", or "error" + Error string // error message (on "error" or "denied" outcomes) + Detail map[string]interface{} // operation-specific metadata +} + +// Log writes an audit event. Safe to call on a nil receiver. +func (l *Logger) Log(ctx context.Context, e Event) { + if l == nil { + return + } + + attrs := []slog.Attr{ + slog.String("caller", e.Caller), + slog.String("operation", e.Operation), + slog.String("outcome", e.Outcome), + } + if len(e.Roles) > 0 { + attrs = append(attrs, slog.Any("roles", e.Roles)) + } + if e.Engine != "" { + attrs = append(attrs, slog.String("engine", e.Engine)) + } + if e.Mount != "" { + attrs = append(attrs, slog.String("mount", e.Mount)) + } + if e.Resource != "" { + attrs = append(attrs, slog.String("resource", e.Resource)) + } + if e.Error != "" { + attrs = append(attrs, slog.String("error", e.Error)) + } + if len(e.Detail) > 0 { + attrs = append(attrs, slog.Any("detail", e.Detail)) + } + + l.logger.LogAttrs(ctx, LevelAudit, "audit", attrs...) +} diff --git a/internal/audit/audit_test.go b/internal/audit/audit_test.go new file mode 100644 index 0000000..4ba2ad5 --- /dev/null +++ b/internal/audit/audit_test.go @@ -0,0 +1,124 @@ +package audit + +import ( + "bytes" + "context" + "encoding/json" + "log/slog" + "testing" +) + +func TestNilLoggerIsSafe(t *testing.T) { + var l *Logger + // Must not panic. + l.Log(context.Background(), Event{ + Caller: "test", + Operation: "issue", + Outcome: "success", + }) +} + +func TestLogWritesJSON(t *testing.T) { + var buf bytes.Buffer + h := slog.NewJSONHandler(&buf, &slog.HandlerOptions{ + Level: slog.Level(-10), // accept all levels + }) + l := New(h) + + l.Log(context.Background(), Event{ + Caller: "kyle", + Roles: []string{"admin"}, + Operation: "issue", + Engine: "ca", + Mount: "pki", + Outcome: "success", + Detail: map[string]interface{}{"serial": "01:02:03"}, + }) + + var entry map[string]interface{} + if err := json.Unmarshal(buf.Bytes(), &entry); err != nil { + t.Fatalf("invalid JSON: %v\nbody: %s", err, buf.String()) + } + + checks := map[string]string{ + "caller": "kyle", + "operation": "issue", + "engine": "ca", + "mount": "pki", + "outcome": "success", + } + for k, want := range checks { + got, ok := entry[k].(string) + if !ok || got != want { + t.Errorf("field %q = %q, want %q", k, got, want) + } + } + + detail, ok := entry["detail"].(map[string]interface{}) + if !ok { + t.Fatalf("detail is not a map: %T", entry["detail"]) + } + if serial, _ := detail["serial"].(string); serial != "01:02:03" { + t.Errorf("detail.serial = %q, want %q", serial, "01:02:03") + } +} + +func TestLogOmitsEmptyFields(t *testing.T) { + var buf bytes.Buffer + h := slog.NewJSONHandler(&buf, &slog.HandlerOptions{ + Level: slog.Level(-10), + }) + l := New(h) + + l.Log(context.Background(), Event{ + Caller: "kyle", + Operation: "unseal", + Outcome: "success", + }) + + var entry map[string]interface{} + if err := json.Unmarshal(buf.Bytes(), &entry); err != nil { + t.Fatalf("invalid JSON: %v", err) + } + + for _, key := range []string{"roles", "engine", "mount", "resource", "error", "detail"} { + if _, ok := entry[key]; ok { + t.Errorf("field %q should be omitted for empty value", key) + } + } +} + +func TestLogIncludesError(t *testing.T) { + var buf bytes.Buffer + h := slog.NewJSONHandler(&buf, &slog.HandlerOptions{ + Level: slog.Level(-10), + }) + l := New(h) + + l.Log(context.Background(), Event{ + Caller: "operator", + Operation: "unseal", + Outcome: "denied", + Error: "invalid password", + }) + + var entry map[string]interface{} + if err := json.Unmarshal(buf.Bytes(), &entry); err != nil { + t.Fatalf("invalid JSON: %v", err) + } + if got, _ := entry["error"].(string); got != "invalid password" { + t.Errorf("error = %q, want %q", got, "invalid password") + } + if got, _ := entry["outcome"].(string); got != "denied" { + t.Errorf("outcome = %q, want %q", got, "denied") + } +} + +func TestNewWithNilHandlerReturnsNil(t *testing.T) { + l := New(nil) + if l != nil { + t.Errorf("New(nil) = %v, want nil", l) + } + // Must not panic. + l.Log(context.Background(), Event{Caller: "test", Operation: "test", Outcome: "success"}) +} diff --git a/internal/config/config.go b/internal/config/config.go index b8a9e5b..c975ed7 100644 --- a/internal/config/config.go +++ b/internal/config/config.go @@ -16,6 +16,17 @@ type Config struct { Database DatabaseConfig `toml:"database"` Log LogConfig `toml:"log"` Seal SealConfig `toml:"seal"` + Audit AuditConfig `toml:"audit"` +} + +// AuditConfig holds audit logging settings. +type AuditConfig struct { + // Mode controls audit log output: "file", "stdout", or "" (disabled). + Mode string `toml:"mode"` + // Path is the audit log file path (required when mode is "file"). + Path string `toml:"path"` + // IncludeReads enables audit logging for read-only operations. + IncludeReads bool `toml:"include_reads"` } // ServerConfig holds HTTP/gRPC server settings. @@ -119,5 +130,17 @@ func (c *Config) Validate() error { c.Web.ListenAddr = "127.0.0.1:8080" } + // Validate audit config. + switch c.Audit.Mode { + case "", "stdout": + // ok + case "file": + if c.Audit.Path == "" { + return fmt.Errorf("config: audit.path is required when audit.mode is \"file\"") + } + default: + return fmt.Errorf("config: audit.mode must be \"file\", \"stdout\", or empty") + } + return nil } diff --git a/internal/engine/ca/ca.go b/internal/engine/ca/ca.go index f4473f8..8e97851 100644 --- a/internal/engine/ca/ca.go +++ b/internal/engine/ca/ca.go @@ -596,6 +596,9 @@ func (e *CAEngine) handleGetChain(_ context.Context, req *engine.Request) (*engi if issuerName == "" { issuerName = req.Path } + if err := engine.ValidateName(issuerName); err != nil { + return nil, err + } chain, err := e.GetChainPEM(issuerName) if err != nil { @@ -610,6 +613,9 @@ func (e *CAEngine) handleGetChain(_ context.Context, req *engine.Request) (*engi func (e *CAEngine) handleGetIssuer(_ context.Context, req *engine.Request) (*engine.Response, error) { name := req.Path + if err := engine.ValidateName(name); err != nil { + return nil, err + } certPEM, err := e.GetIssuerCertPEM(name) if err != nil { @@ -698,6 +704,7 @@ func (e *CAEngine) handleCreateIssuer(ctx context.Context, req *engine.Request) Expiry: expiry, } + e.setProfileAIA(&profile) issuerCert, err := profile.SignRequest(e.rootCert, csr, e.rootKey) if err != nil { return nil, fmt.Errorf("ca: sign issuer cert: %w", err) @@ -757,6 +764,9 @@ func (e *CAEngine) handleDeleteIssuer(ctx context.Context, req *engine.Request) if name == "" { name = req.Path } + if err := engine.ValidateName(name); err != nil { + return nil, err + } e.mu.Lock() defer e.mu.Unlock() @@ -830,6 +840,9 @@ func (e *CAEngine) handleIssue(ctx context.Context, req *engine.Request) (*engin if issuerName == "" { return nil, fmt.Errorf("ca: issuer name is required") } + if err := engine.ValidateName(issuerName); err != nil { + return nil, err + } profileName, _ := req.Data["profile"].(string) if profileName == "" { @@ -922,6 +935,7 @@ func (e *CAEngine) handleIssue(ctx context.Context, req *engine.Request) (*engin return nil, fmt.Errorf("ca: create leaf CSR: %w", err) } + e.setProfileAIA(&profile) leafCert, err := profile.SignRequest(is.cert, csr, is.key) if err != nil { return nil, fmt.Errorf("ca: sign leaf cert: %w", err) @@ -1171,6 +1185,7 @@ func (e *CAEngine) handleRenew(ctx context.Context, req *engine.Request) (*engin return nil, fmt.Errorf("ca: create renewal CSR: %w", err) } + e.setProfileAIA(&profile) newCert, err := profile.SignRequest(is.cert, csr, is.key) if err != nil { return nil, fmt.Errorf("ca: sign renewal cert: %w", err) @@ -1238,6 +1253,9 @@ func (e *CAEngine) handleSignCSR(ctx context.Context, req *engine.Request) (*eng if issuerName == "" { return nil, fmt.Errorf("ca: issuer name is required") } + if err := engine.ValidateName(issuerName); err != nil { + return nil, err + } csrPEM, _ := req.Data["csr_pem"].(string) if csrPEM == "" { @@ -1293,6 +1311,7 @@ func (e *CAEngine) handleSignCSR(ctx context.Context, req *engine.Request) (*eng } } + e.setProfileAIA(&profile) leafCert, err := profile.SignRequest(is.cert, csr, is.key) if err != nil { return nil, fmt.Errorf("ca: sign CSR: %w", err) @@ -1436,6 +1455,20 @@ func (e *CAEngine) handleDeleteCert(ctx context.Context, req *engine.Request) (* // --- Helpers --- +// setProfileAIA populates the AIA (Authority Information Access) extension +// URLs on the profile if external_url is configured. This allows clients +// to discover the issuing CA certificate for chain building. +func (e *CAEngine) setProfileAIA(profile *certgen.Profile) { + if e.config.ExternalURL == "" { + return + } + base := strings.TrimSuffix(e.config.ExternalURL, "/") + mount := e.mountName() + profile.IssuingCertificateURL = []string{ + base + "/v1/pki/" + mount + "/ca/chain", + } +} + func defaultCAConfig() *CAConfig { return &CAConfig{ Organization: "Metacircular", @@ -1461,6 +1494,9 @@ func mapToCAConfig(m map[string]interface{}, cfg *CAConfig) error { if v, ok := m["root_expiry"].(string); ok { cfg.RootExpiry = v } + if v, ok := m["external_url"].(string); ok { + cfg.ExternalURL = v + } return nil } diff --git a/internal/engine/ca/profiles.go b/internal/engine/ca/profiles.go index cca3958..30015b5 100644 --- a/internal/engine/ca/profiles.go +++ b/internal/engine/ca/profiles.go @@ -29,11 +29,13 @@ func GetProfile(name string) (certgen.Profile, bool) { } // Return a copy so callers can modify. cp := certgen.Profile{ - IsCA: p.IsCA, - PathLen: p.PathLen, - Expiry: p.Expiry, - KeyUse: make([]string, len(p.KeyUse)), - ExtKeyUsages: make([]string, len(p.ExtKeyUsages)), + IsCA: p.IsCA, + PathLen: p.PathLen, + Expiry: p.Expiry, + KeyUse: make([]string, len(p.KeyUse)), + ExtKeyUsages: make([]string, len(p.ExtKeyUsages)), + OCSPServer: append([]string(nil), p.OCSPServer...), + IssuingCertificateURL: append([]string(nil), p.IssuingCertificateURL...), } copy(cp.KeyUse, p.KeyUse) copy(cp.ExtKeyUsages, p.ExtKeyUsages) diff --git a/internal/engine/ca/types.go b/internal/engine/ca/types.go index 2962165..eb2117a 100644 --- a/internal/engine/ca/types.go +++ b/internal/engine/ca/types.go @@ -10,6 +10,7 @@ type CAConfig struct { Country string `json:"country,omitempty"` KeyAlgorithm string `json:"key_algorithm"` RootExpiry string `json:"root_expiry"` + ExternalURL string `json:"external_url,omitempty"` KeySize int `json:"key_size"` } diff --git a/internal/engine/sshca/sshca.go b/internal/engine/sshca/sshca.go index 97f5460..55c687b 100644 --- a/internal/engine/sshca/sshca.go +++ b/internal/engine/sshca/sshca.go @@ -588,6 +588,9 @@ func (e *SSHCAEngine) handleUpdateProfile(ctx context.Context, req *engine.Reque if name == "" { return nil, fmt.Errorf("sshca: name is required") } + if err := engine.ValidateName(name); err != nil { + return nil, err + } // Load existing profile. profile, err := e.loadProfile(ctx, name) @@ -631,6 +634,9 @@ func (e *SSHCAEngine) handleGetProfile(ctx context.Context, req *engine.Request) if name == "" { return nil, fmt.Errorf("sshca: name is required") } + if err := engine.ValidateName(name); err != nil { + return nil, err + } profile, err := e.loadProfile(ctx, name) if err != nil { @@ -697,6 +703,9 @@ func (e *SSHCAEngine) handleDeleteProfile(ctx context.Context, req *engine.Reque if name == "" { return nil, fmt.Errorf("sshca: name is required") } + if err := engine.ValidateName(name); err != nil { + return nil, err + } // Check existence. if _, err := e.barrier.Get(ctx, e.mountPath+"profiles/"+name+".json"); err != nil { diff --git a/internal/engine/transit/transit.go b/internal/engine/transit/transit.go index 65aceae..87a39b7 100644 --- a/internal/engine/transit/transit.go +++ b/internal/engine/transit/transit.go @@ -450,6 +450,9 @@ func (e *TransitEngine) handleDeleteKey(ctx context.Context, req *engine.Request if name == "" { return nil, fmt.Errorf("transit: name is required") } + if err := engine.ValidateName(name); err != nil { + return nil, err + } ks, ok := e.keys[name] if !ok { @@ -498,6 +501,9 @@ func (e *TransitEngine) handleGetKey(_ context.Context, req *engine.Request) (*e if name == "" { return nil, fmt.Errorf("transit: name is required") } + if err := engine.ValidateName(name); err != nil { + return nil, err + } ks, ok := e.keys[name] if !ok { @@ -561,6 +567,9 @@ func (e *TransitEngine) handleRotateKey(ctx context.Context, req *engine.Request if name == "" { return nil, fmt.Errorf("transit: name is required") } + if err := engine.ValidateName(name); err != nil { + return nil, err + } ks, ok := e.keys[name] if !ok { @@ -638,6 +647,9 @@ func (e *TransitEngine) handleUpdateKeyConfig(ctx context.Context, req *engine.R if name == "" { return nil, fmt.Errorf("transit: name is required") } + if err := engine.ValidateName(name); err != nil { + return nil, err + } ks, ok := e.keys[name] if !ok { @@ -684,6 +696,9 @@ func (e *TransitEngine) handleTrimKey(ctx context.Context, req *engine.Request) if name == "" { return nil, fmt.Errorf("transit: name is required") } + if err := engine.ValidateName(name); err != nil { + return nil, err + } ks, ok := e.keys[name] if !ok { @@ -1290,6 +1305,9 @@ func (e *TransitEngine) handleGetPublicKey(_ context.Context, req *engine.Reques if keyName == "" { return nil, fmt.Errorf("transit: name is required") } + if err := engine.ValidateName(keyName); err != nil { + return nil, err + } ks, ok := e.keys[keyName] if !ok { diff --git a/internal/engine/user/user.go b/internal/engine/user/user.go index efdf829..88f3ede 100644 --- a/internal/engine/user/user.go +++ b/internal/engine/user/user.go @@ -212,6 +212,9 @@ func (e *UserEngine) handleRegister(ctx context.Context, req *engine.Request) (* } username := req.CallerInfo.Username + if err := engine.ValidateName(username); err != nil { + return nil, fmt.Errorf("user: invalid username: %w", err) + } e.mu.RLock() if u, ok := e.users[username]; ok { pubB64 := base64.StdEncoding.EncodeToString(u.pubKey.Bytes()) @@ -302,6 +305,9 @@ func (e *UserEngine) handleGetPublicKey(_ context.Context, req *engine.Request) if username == "" { return nil, fmt.Errorf("user: username is required") } + if err := engine.ValidateName(username); err != nil { + return nil, err + } e.mu.RLock() defer e.mu.RUnlock() @@ -657,14 +663,16 @@ func (e *UserEngine) handleRotateKey(ctx context.Context, req *engine.Request) ( return nil, fmt.Errorf("user: rotate key: %w", err) } - // Zeroize old key. - oldRaw := oldState.privKey.Bytes() - crypto.Zeroize(oldRaw) + // Zeroize old key material and drop reference for GC. + crypto.Zeroize(oldState.privBytes) + oldState.privKey = nil + oldState.privBytes = nil // Update in-memory state. e.users[caller] = &userState{ - privKey: priv, - pubKey: priv.PublicKey(), + privKey: priv, + privBytes: priv.Bytes(), + pubKey: priv.PublicKey(), config: &UserKeyConfig{ Algorithm: e.config.KeyAlgorithm, CreatedAt: time.Now().UTC(), @@ -692,6 +700,9 @@ func (e *UserEngine) handleDeleteUser(ctx context.Context, req *engine.Request) if username == "" { return nil, fmt.Errorf("user: username is required") } + if err := engine.ValidateName(username); err != nil { + return nil, err + } e.mu.Lock() defer e.mu.Unlock() @@ -701,9 +712,10 @@ func (e *UserEngine) handleDeleteUser(ctx context.Context, req *engine.Request) return nil, ErrUserNotFound } - // Zeroize private key. - oldRaw := oldState.privKey.Bytes() - crypto.Zeroize(oldRaw) + // Zeroize private key material and drop reference for GC. + crypto.Zeroize(oldState.privBytes) + oldState.privKey = nil + oldState.privBytes = nil // Delete from barrier. prefix := e.mountPath + "users/" + username + "/" diff --git a/internal/grpcserver/engine.go b/internal/grpcserver/engine.go index 41b237e..ec0e5a1 100644 --- a/internal/grpcserver/engine.go +++ b/internal/grpcserver/engine.go @@ -29,6 +29,14 @@ func (es *engineServer) Mount(ctx context.Context, req *pb.MountRequest) (*pb.Mo } } + // Inject external_url into engine config if available and not already set. + if config == nil { + config = make(map[string]interface{}) + } + if _, ok := config["external_url"]; !ok && es.s.cfg.Server.ExternalURL != "" { + config["external_url"] = es.s.cfg.Server.ExternalURL + } + if err := es.s.engines.Mount(ctx, req.Name, engine.EngineType(req.Type), config); err != nil { es.s.logger.Error("grpc: mount engine", "name", req.Name, "type", req.Type, "error", err) switch { diff --git a/internal/grpcserver/grpcserver_test.go b/internal/grpcserver/grpcserver_test.go index 4807358..6de8f82 100644 --- a/internal/grpcserver/grpcserver_test.go +++ b/internal/grpcserver/grpcserver_test.go @@ -71,7 +71,7 @@ func newTestGRPCServer(t *testing.T) (*GRPCServer, func()) { t.Fatalf("migrate: %v", err) } b := barrier.NewAESGCMBarrier(database) - sealMgr := seal.NewManager(database, b, slog.Default()) + sealMgr := seal.NewManager(database, b, nil, slog.Default()) policyEngine := policy.NewEngine(b) reg := newTestRegistry() authenticator := auth.NewAuthenticator(nil, slog.Default()) @@ -82,7 +82,7 @@ func newTestGRPCServer(t *testing.T) (*GRPCServer, func()) { Argon2Threads: 1, }, } - srv := New(cfg, sealMgr, authenticator, policyEngine, reg, slog.Default()) + srv := New(cfg, sealMgr, authenticator, policyEngine, reg, nil, slog.Default()) return srv, func() { _ = database.Close() } } diff --git a/internal/grpcserver/interceptors.go b/internal/grpcserver/interceptors.go index 81e768a..66135d5 100644 --- a/internal/grpcserver/interceptors.go +++ b/internal/grpcserver/interceptors.go @@ -3,6 +3,7 @@ package grpcserver import ( "context" "log/slog" + "path" "strings" "google.golang.org/grpc" @@ -10,6 +11,7 @@ import ( "google.golang.org/grpc/metadata" "google.golang.org/grpc/status" + "git.wntrmute.dev/kyle/metacrypt/internal/audit" "git.wntrmute.dev/kyle/metacrypt/internal/auth" "git.wntrmute.dev/kyle/metacrypt/internal/seal" ) @@ -97,6 +99,46 @@ func chainInterceptors(interceptors ...grpc.UnaryServerInterceptor) grpc.UnarySe } } +// auditInterceptor logs an audit event after each RPC completes. Must run +// after authInterceptor so that caller info is available in the context. +func auditInterceptor(auditLog *audit.Logger) grpc.UnaryServerInterceptor { + return func(ctx context.Context, req interface{}, info *grpc.UnaryServerInfo, handler grpc.UnaryHandler) (interface{}, error) { + resp, err := handler(ctx, req) + + caller := "anonymous" + var roles []string + if ti := tokenInfoFromContext(ctx); ti != nil { + caller = ti.Username + roles = ti.Roles + } + + outcome := "success" + var errMsg string + if err != nil { + outcome = "error" + if st, ok := status.FromError(err); ok { + if st.Code() == codes.PermissionDenied || st.Code() == codes.Unauthenticated { + outcome = "denied" + } + errMsg = st.Message() + } else { + errMsg = err.Error() + } + } + + auditLog.Log(ctx, audit.Event{ + Caller: caller, + Roles: roles, + Operation: path.Base(info.FullMethod), + Resource: info.FullMethod, + Outcome: outcome, + Error: errMsg, + }) + + return resp, err + } +} + func extractToken(ctx context.Context) string { md, ok := metadata.FromIncomingContext(ctx) if !ok { diff --git a/internal/grpcserver/server.go b/internal/grpcserver/server.go index 8135cdc..5176ce7 100644 --- a/internal/grpcserver/server.go +++ b/internal/grpcserver/server.go @@ -13,6 +13,7 @@ import ( pb "git.wntrmute.dev/kyle/metacrypt/gen/metacrypt/v2" internacme "git.wntrmute.dev/kyle/metacrypt/internal/acme" + "git.wntrmute.dev/kyle/metacrypt/internal/audit" "git.wntrmute.dev/kyle/metacrypt/internal/auth" "git.wntrmute.dev/kyle/metacrypt/internal/config" "git.wntrmute.dev/kyle/metacrypt/internal/engine" @@ -27,6 +28,7 @@ type GRPCServer struct { auth *auth.Authenticator policy *policy.Engine engines *engine.Registry + audit *audit.Logger logger *slog.Logger srv *grpc.Server acmeHandlers map[string]*internacme.Handler @@ -35,13 +37,14 @@ type GRPCServer struct { // New creates a new GRPCServer. func New(cfg *config.Config, sealMgr *seal.Manager, authenticator *auth.Authenticator, - policyEngine *policy.Engine, engineRegistry *engine.Registry, logger *slog.Logger) *GRPCServer { + policyEngine *policy.Engine, engineRegistry *engine.Registry, auditLog *audit.Logger, logger *slog.Logger) *GRPCServer { return &GRPCServer{ cfg: cfg, sealMgr: sealMgr, auth: authenticator, policy: policyEngine, engines: engineRegistry, + audit: auditLog, logger: logger, acmeHandlers: make(map[string]*internacme.Handler), } @@ -68,6 +71,7 @@ func (s *GRPCServer) Start() error { sealInterceptor(s.sealMgr, s.logger, sealRequiredMethods()), authInterceptor(s.auth, s.logger, authRequiredMethods()), adminInterceptor(s.logger, adminRequiredMethods()), + auditInterceptor(s.audit), ) s.srv = grpc.NewServer( diff --git a/internal/seal/seal.go b/internal/seal/seal.go index 75ba256..00f5d64 100644 --- a/internal/seal/seal.go +++ b/internal/seal/seal.go @@ -10,6 +10,7 @@ import ( "sync" "time" + "git.wntrmute.dev/kyle/metacrypt/internal/audit" "git.wntrmute.dev/kyle/metacrypt/internal/barrier" "git.wntrmute.dev/kyle/metacrypt/internal/crypto" ) @@ -54,6 +55,7 @@ type Manager struct { lockoutUntil time.Time db *sql.DB barrier *barrier.AESGCMBarrier + audit *audit.Logger logger *slog.Logger mek []byte state ServiceState @@ -62,10 +64,11 @@ type Manager struct { } // NewManager creates a new seal manager. -func NewManager(db *sql.DB, b *barrier.AESGCMBarrier, logger *slog.Logger) *Manager { +func NewManager(db *sql.DB, b *barrier.AESGCMBarrier, auditLog *audit.Logger, logger *slog.Logger) *Manager { return &Manager{ db: db, barrier: b, + audit: auditLog, logger: logger, state: StateUninitialized, } @@ -223,6 +226,10 @@ func (m *Manager) Unseal(password []byte) error { mek, err := crypto.Decrypt(kwk, encryptedMEK, nil) if err != nil { m.logger.Debug("unseal failed: invalid password") + m.audit.Log(context.Background(), audit.Event{ + Caller: "operator", Operation: "unseal", Outcome: "denied", + Error: "invalid password", + }) return ErrInvalidPassword } @@ -235,6 +242,9 @@ func (m *Manager) Unseal(password []byte) error { m.mek = mek m.state = StateUnsealed m.unsealAttempts = 0 + m.audit.Log(context.Background(), audit.Event{ + Caller: "operator", Operation: "unseal", Outcome: "success", + }) m.logger.Debug("unseal succeeded, barrier unsealed") return nil } @@ -340,6 +350,9 @@ func (m *Manager) Seal() error { } _ = m.barrier.Seal() m.state = StateSealed + m.audit.Log(context.Background(), audit.Event{ + Caller: "operator", Operation: "seal", Outcome: "success", + }) m.logger.Debug("service sealed") return nil } diff --git a/internal/seal/seal_test.go b/internal/seal/seal_test.go index e460e8b..ab4c643 100644 --- a/internal/seal/seal_test.go +++ b/internal/seal/seal_test.go @@ -23,7 +23,7 @@ func setupSeal(t *testing.T) (*Manager, func()) { t.Fatalf("migrate: %v", err) } b := barrier.NewAESGCMBarrier(database) - mgr := NewManager(database, b, slog.Default()) + mgr := NewManager(database, b, nil, slog.Default()) return mgr, func() { _ = database.Close() } } @@ -103,7 +103,7 @@ func TestSealCheckInitializedPersists(t *testing.T) { database, _ := db.Open(dbPath) _ = db.Migrate(database) b := barrier.NewAESGCMBarrier(database) - mgr := NewManager(database, b, slog.Default()) + mgr := NewManager(database, b, nil, slog.Default()) _ = mgr.CheckInitialized() params := crypto.Argon2Params{Time: 1, Memory: 64 * 1024, Threads: 1} _ = mgr.Initialize(context.Background(), []byte("password"), params) @@ -113,7 +113,7 @@ func TestSealCheckInitializedPersists(t *testing.T) { database2, _ := db.Open(dbPath) defer func() { _ = database2.Close() }() b2 := barrier.NewAESGCMBarrier(database2) - mgr2 := NewManager(database2, b2, slog.Default()) + mgr2 := NewManager(database2, b2, nil, slog.Default()) _ = mgr2.CheckInitialized() if mgr2.State() != StateSealed { t.Fatalf("state after reopen: got %v, want Sealed", mgr2.State()) diff --git a/internal/server/routes.go b/internal/server/routes.go index c803268..54efe78 100644 --- a/internal/server/routes.go +++ b/internal/server/routes.go @@ -11,6 +11,7 @@ import ( mcias "git.wntrmute.dev/kyle/mcias/clients/go" + "git.wntrmute.dev/kyle/metacrypt/internal/audit" "git.wntrmute.dev/kyle/metacrypt/internal/auth" "git.wntrmute.dev/kyle/metacrypt/internal/barrier" "git.wntrmute.dev/kyle/metacrypt/internal/crypto" @@ -286,6 +287,14 @@ func (s *Server) handleEngineMount(w http.ResponseWriter, r *http.Request) { return } + // Inject external_url into CA engine config if available and not already set. + if req.Config == nil { + req.Config = make(map[string]interface{}) + } + if _, ok := req.Config["external_url"]; !ok && s.cfg.Server.ExternalURL != "" { + req.Config["external_url"] = s.cfg.Server.ExternalURL + } + if err := s.engines.Mount(r.Context(), req.Name, engine.EngineType(req.Type), req.Config); err != nil { s.logger.Error("mount engine", "name", req.Name, "type", req.Type, "error", err) writeJSONError(w, err.Error(), http.StatusBadRequest) @@ -435,10 +444,16 @@ func (s *Server) handleEngineRequest(w http.ResponseWriter, r *http.Request) { case strings.Contains(err.Error(), "not found"): status = http.StatusNotFound } + outcome := "error" + if status == http.StatusForbidden || status == http.StatusUnauthorized { + outcome = "denied" + } + s.auditOp(r, info, req.Operation, "", req.Mount, outcome, nil, err) writeJSONError(w, err.Error(), status) return } + s.auditOp(r, info, req.Operation, "", req.Mount, "success", nil, nil) writeJSON(w, http.StatusOK, resp.Data) } @@ -1317,6 +1332,24 @@ func writeJSONError(w http.ResponseWriter, msg string, status int) { writeJSON(w, status, map[string]string{"error": msg}) } +// auditOp logs an audit event for a completed engine operation. +func (s *Server) auditOp(r *http.Request, info *auth.TokenInfo, + op, engineType, mount, outcome string, detail map[string]interface{}, err error) { + e := audit.Event{ + Caller: info.Username, + Roles: info.Roles, + Operation: op, + Engine: engineType, + Mount: mount, + Outcome: outcome, + Detail: detail, + } + if err != nil { + e.Error = err.Error() + } + s.audit.Log(r.Context(), e) +} + // newPolicyChecker builds a PolicyChecker closure for a caller, used by typed // REST handlers to pass service-level policy evaluation into the engine. func (s *Server) newPolicyChecker(r *http.Request, info *auth.TokenInfo) engine.PolicyChecker { diff --git a/internal/server/server.go b/internal/server/server.go index e6610ab..f0aacfe 100644 --- a/internal/server/server.go +++ b/internal/server/server.go @@ -14,6 +14,7 @@ import ( "google.golang.org/grpc" internacme "git.wntrmute.dev/kyle/metacrypt/internal/acme" + "git.wntrmute.dev/kyle/metacrypt/internal/audit" "git.wntrmute.dev/kyle/metacrypt/internal/auth" "git.wntrmute.dev/kyle/metacrypt/internal/config" "git.wntrmute.dev/kyle/metacrypt/internal/engine" @@ -28,6 +29,7 @@ type Server struct { auth *auth.Authenticator policy *policy.Engine engines *engine.Registry + audit *audit.Logger httpSrv *http.Server grpcSrv *grpc.Server logger *slog.Logger @@ -38,13 +40,14 @@ type Server struct { // New creates a new server. func New(cfg *config.Config, sealMgr *seal.Manager, authenticator *auth.Authenticator, - policyEngine *policy.Engine, engineRegistry *engine.Registry, logger *slog.Logger, version string) *Server { + policyEngine *policy.Engine, engineRegistry *engine.Registry, auditLog *audit.Logger, logger *slog.Logger, version string) *Server { s := &Server{ cfg: cfg, seal: sealMgr, auth: authenticator, policy: policyEngine, engines: engineRegistry, + audit: auditLog, logger: logger, version: version, } diff --git a/internal/server/server_test.go b/internal/server/server_test.go index 46ab2ce..f1e34aa 100644 --- a/internal/server/server_test.go +++ b/internal/server/server_test.go @@ -36,7 +36,7 @@ func setupTestServer(t *testing.T) (*Server, *seal.Manager, chi.Router) { _ = db.Migrate(database) b := barrier.NewAESGCMBarrier(database) - sealMgr := seal.NewManager(database, b, slog.Default()) + sealMgr := seal.NewManager(database, b, nil, slog.Default()) _ = sealMgr.CheckInitialized() // Auth requires MCIAS client which we can't create in tests easily, @@ -61,7 +61,7 @@ func setupTestServer(t *testing.T) (*Server, *seal.Manager, chi.Router) { } logger := slog.Default() - srv := New(cfg, sealMgr, authenticator, policyEngine, engineRegistry, logger, "test") + srv := New(cfg, sealMgr, authenticator, policyEngine, engineRegistry, nil, logger, "test") r := chi.NewRouter() srv.registerRoutes(r)