Phase 9: two-phase garbage collection engine

GC engine (internal/gc/): Collector.Run() implements the two-phase
algorithm — Phase 1 finds unreferenced blobs and deletes DB rows in
a single transaction, Phase 2 deletes blob files from storage.
Registry-wide mutex blocks concurrent GC runs. Collector.Reconcile()
scans filesystem for orphaned files with no DB row (crash recovery).

Wired into admin_gc.go: POST /v1/gc now launches the real collector
in a goroutine with gc_started/gc_completed audit events.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
2026-03-19 20:27:17 -07:00
parent c01e7ffa30
commit 562b69e875
9 changed files with 727 additions and 9 deletions

6
internal/gc/errors.go Normal file
View File

@@ -0,0 +1,6 @@
package gc
import "errors"
// ErrGCRunning indicates that a GC run is already in progress.
var ErrGCRunning = errors.New("gc: already running")

160
internal/gc/gc.go Normal file
View File

@@ -0,0 +1,160 @@
package gc
import (
"context"
"fmt"
"sync"
"time"
)
// DB provides the database operations needed by GC.
type DB interface {
// FindUnreferencedBlobs returns digests and sizes of blobs with no
// manifest_blobs entries, deletes those blob rows in a transaction,
// and returns the results.
FindAndDeleteUnreferencedBlobs() ([]UnreferencedBlob, error)
// BlobExistsByDigest checks whether a blob row exists for the given digest.
BlobExistsByDigest(digest string) (bool, error)
}
// Storage provides filesystem operations for blob cleanup.
type Storage interface {
Delete(digest string) error
ListBlobDigests() ([]string, error)
}
// UnreferencedBlob is a blob that has no manifest references.
type UnreferencedBlob struct {
Digest string
Size int64
}
// Result records the outcome of a GC run.
type Result struct {
BlobsRemoved int
BytesFreed int64
Duration time.Duration
}
// Collector performs garbage collection of unreferenced blobs.
type Collector struct {
db DB
storage Storage
mu sync.Mutex // registry-wide GC lock
}
// New creates a new garbage collector.
func New(db DB, storage Storage) *Collector {
return &Collector{db: db, storage: storage}
}
// Run executes the two-phase GC algorithm per ARCHITECTURE.md §9.
// Phase 1 (DB): find unreferenced blobs, delete rows in a transaction.
// Phase 2 (filesystem): delete blob files, clean up empty dirs.
// Returns ErrGCRunning if another GC run is already in progress.
func (c *Collector) Run(ctx context.Context) (*Result, error) {
if !c.mu.TryLock() {
return nil, ErrGCRunning
}
defer c.mu.Unlock()
start := time.Now()
// Check for cancellation.
if err := ctx.Err(); err != nil {
return nil, fmt.Errorf("gc: %w", err)
}
// Phase 1: Mark and sweep in DB.
unreferenced, err := c.db.FindAndDeleteUnreferencedBlobs()
if err != nil {
return nil, fmt.Errorf("gc: phase 1: %w", err)
}
// Phase 2: Delete files from storage.
var bytesFreed int64
for _, blob := range unreferenced {
if err := ctx.Err(); err != nil {
// Return partial result on cancellation.
return &Result{
BlobsRemoved: len(unreferenced),
BytesFreed: bytesFreed,
Duration: time.Since(start),
}, fmt.Errorf("gc: phase 2 interrupted: %w", err)
}
// Best-effort file deletion. If the file is already gone (e.g.,
// crash recovery), that's fine.
if err := c.storage.Delete(blob.Digest); err != nil {
// Log but continue — orphaned files are harmless and will
// be caught by reconcile.
continue
}
bytesFreed += blob.Size
}
return &Result{
BlobsRemoved: len(unreferenced),
BytesFreed: bytesFreed,
Duration: time.Since(start),
}, nil
}
// Reconcile scans the filesystem for blob files with no matching DB row
// and deletes them. This handles crash recovery — files left behind when
// the process crashed after Phase 1 (DB cleanup) but before Phase 2
// (file cleanup) completed.
func (c *Collector) Reconcile(ctx context.Context) (*Result, error) {
if !c.mu.TryLock() {
return nil, ErrGCRunning
}
defer c.mu.Unlock()
start := time.Now()
digests, err := c.storage.ListBlobDigests()
if err != nil {
return nil, fmt.Errorf("gc: list blob files: %w", err)
}
var removed int
var bytesFreed int64
for _, digest := range digests {
if err := ctx.Err(); err != nil {
return &Result{
BlobsRemoved: removed,
BytesFreed: bytesFreed,
Duration: time.Since(start),
}, fmt.Errorf("gc: reconcile interrupted: %w", err)
}
exists, err := c.db.BlobExistsByDigest(digest)
if err != nil {
continue
}
if !exists {
if err := c.storage.Delete(digest); err != nil {
continue
}
removed++
}
}
return &Result{
BlobsRemoved: removed,
BytesFreed: bytesFreed,
Duration: time.Since(start),
}, nil
}
// Lock acquires the GC lock, blocking new blob uploads.
// Returns a function to release the lock.
func (c *Collector) Lock() func() {
c.mu.Lock()
return c.mu.Unlock
}
// TryLock attempts to acquire the GC lock without blocking.
// Returns true if the lock was acquired.
func (c *Collector) TryLock() bool {
return c.mu.TryLock()
}

230
internal/gc/gc_test.go Normal file
View File

@@ -0,0 +1,230 @@
package gc
import (
"context"
"errors"
"sync"
"testing"
)
// fakeDB implements gc.DB for tests.
type fakeDB struct {
mu sync.Mutex
unreferenced []UnreferencedBlob
blobsExist map[string]bool
}
func newFakeDB() *fakeDB {
return &fakeDB{
blobsExist: make(map[string]bool),
}
}
func (f *fakeDB) FindAndDeleteUnreferencedBlobs() ([]UnreferencedBlob, error) {
f.mu.Lock()
defer f.mu.Unlock()
result := make([]UnreferencedBlob, len(f.unreferenced))
copy(result, f.unreferenced)
// Simulate deletion by removing from blobsExist.
for _, b := range f.unreferenced {
delete(f.blobsExist, b.Digest)
}
f.unreferenced = nil
return result, nil
}
func (f *fakeDB) BlobExistsByDigest(digest string) (bool, error) {
f.mu.Lock()
defer f.mu.Unlock()
return f.blobsExist[digest], nil
}
// fakeStorage implements gc.Storage for tests.
type fakeStorage struct {
mu sync.Mutex
blobs map[string]int64 // digest -> size
deleted []string
}
func newFakeStorage() *fakeStorage {
return &fakeStorage{
blobs: make(map[string]int64),
}
}
func (f *fakeStorage) Delete(digest string) error {
f.mu.Lock()
defer f.mu.Unlock()
if _, ok := f.blobs[digest]; !ok {
return errors.New("not found")
}
delete(f.blobs, digest)
f.deleted = append(f.deleted, digest)
return nil
}
func (f *fakeStorage) ListBlobDigests() ([]string, error) {
f.mu.Lock()
defer f.mu.Unlock()
var digests []string
for d := range f.blobs {
digests = append(digests, d)
}
return digests, nil
}
func TestGCRemovesUnreferencedBlobs(t *testing.T) {
db := newFakeDB()
db.unreferenced = []UnreferencedBlob{
{Digest: "sha256:dead1", Size: 100},
{Digest: "sha256:dead2", Size: 200},
}
db.blobsExist["sha256:dead1"] = true
db.blobsExist["sha256:dead2"] = true
db.blobsExist["sha256:alive"] = true // referenced, not in unreferenced list
store := newFakeStorage()
store.blobs["sha256:dead1"] = 100
store.blobs["sha256:dead2"] = 200
store.blobs["sha256:alive"] = 300
c := New(db, store)
result, err := c.Run(context.Background())
if err != nil {
t.Fatalf("Run: %v", err)
}
if result.BlobsRemoved != 2 {
t.Fatalf("BlobsRemoved: got %d, want 2", result.BlobsRemoved)
}
if result.BytesFreed != 300 {
t.Fatalf("BytesFreed: got %d, want 300", result.BytesFreed)
}
// Dead blobs should be deleted from storage.
if _, ok := store.blobs["sha256:dead1"]; ok {
t.Fatal("sha256:dead1 should have been deleted from storage")
}
if _, ok := store.blobs["sha256:dead2"]; ok {
t.Fatal("sha256:dead2 should have been deleted from storage")
}
// Alive blob should still exist.
if _, ok := store.blobs["sha256:alive"]; !ok {
t.Fatal("sha256:alive should still exist in storage")
}
}
func TestGCDoesNotRemoveReferencedBlobs(t *testing.T) {
db := newFakeDB()
// No unreferenced blobs.
db.blobsExist["sha256:alive"] = true
store := newFakeStorage()
store.blobs["sha256:alive"] = 500
c := New(db, store)
result, err := c.Run(context.Background())
if err != nil {
t.Fatalf("Run: %v", err)
}
if result.BlobsRemoved != 0 {
t.Fatalf("BlobsRemoved: got %d, want 0", result.BlobsRemoved)
}
if _, ok := store.blobs["sha256:alive"]; !ok {
t.Fatal("referenced blob should not be deleted")
}
}
func TestGCConcurrentRejected(t *testing.T) {
db := newFakeDB()
store := newFakeStorage()
c := New(db, store)
// Acquire the lock manually.
c.mu.Lock()
// Try to run GC — should fail.
_, err := c.Run(context.Background())
if !errors.Is(err, ErrGCRunning) {
t.Fatalf("expected ErrGCRunning, got %v", err)
}
c.mu.Unlock()
// Now it should work.
result, err := c.Run(context.Background())
if err != nil {
t.Fatalf("Run after unlock: %v", err)
}
if result.BlobsRemoved != 0 {
t.Fatalf("BlobsRemoved: got %d, want 0", result.BlobsRemoved)
}
}
func TestGCEmptyRegistry(t *testing.T) {
db := newFakeDB()
store := newFakeStorage()
c := New(db, store)
result, err := c.Run(context.Background())
if err != nil {
t.Fatalf("Run: %v", err)
}
if result.BlobsRemoved != 0 {
t.Fatalf("BlobsRemoved: got %d, want 0", result.BlobsRemoved)
}
if result.Duration <= 0 {
t.Fatal("Duration should be positive")
}
}
func TestReconcileCleansOrphanedFiles(t *testing.T) {
db := newFakeDB()
// Only sha256:alive has a DB row.
db.blobsExist["sha256:alive"] = true
store := newFakeStorage()
store.blobs["sha256:alive"] = 100
store.blobs["sha256:orphan1"] = 200
store.blobs["sha256:orphan2"] = 300
c := New(db, store)
result, err := c.Reconcile(context.Background())
if err != nil {
t.Fatalf("Reconcile: %v", err)
}
if result.BlobsRemoved != 2 {
t.Fatalf("BlobsRemoved: got %d, want 2", result.BlobsRemoved)
}
// Alive blob should still exist.
if _, ok := store.blobs["sha256:alive"]; !ok {
t.Fatal("sha256:alive should still exist")
}
// Orphans should be gone.
if _, ok := store.blobs["sha256:orphan1"]; ok {
t.Fatal("sha256:orphan1 should have been deleted")
}
if _, ok := store.blobs["sha256:orphan2"]; ok {
t.Fatal("sha256:orphan2 should have been deleted")
}
}
func TestReconcileEmptyStorage(t *testing.T) {
db := newFakeDB()
store := newFakeStorage()
c := New(db, store)
result, err := c.Reconcile(context.Background())
if err != nil {
t.Fatalf("Reconcile: %v", err)
}
if result.BlobsRemoved != 0 {
t.Fatalf("BlobsRemoved: got %d, want 0", result.BlobsRemoved)
}
}