Introduce error recovery mechanisms with retry logic and circuit breaker integration.
- Added `ErrorRecovery.cc` and `ErrorRecovery.h` for retry and circuit breaker implementations.
- Enhanced swap file handling with transient-error retries and exponential backoff (e.g., ENOSPC, EDQUOT).
- Integrated a circuit breaker into SwapManager to gracefully handle repeated failures, prevent system overload, and enable automatic recovery.
- Updated `DEVELOPER_GUIDE.md` with comprehensive documentation on error recovery patterns and graceful degradation strategies.
- Refined fsync, temp file creation, and swap file logic with retry-on-failure mechanisms for improved resilience.
This commit is contained in:
27
Buffer.cc
27
Buffer.cc
@@ -20,6 +20,7 @@
|
||||
#include "UndoTree.h"
|
||||
#include "ErrorHandler.h"
|
||||
#include "SyscallWrappers.h"
|
||||
#include "ErrorRecovery.h"
|
||||
// For reconstructing highlighter state on copies
|
||||
#include "syntax/HighlighterRegistry.h"
|
||||
#include "syntax/NullHighlighter.h"
|
||||
@@ -148,9 +149,21 @@ atomic_write_file(const std::string &path, const char *data, std::size_t len, st
|
||||
// mkstemp requires a mutable buffer.
|
||||
std::vector<char> buf(tmpl_s.begin(), tmpl_s.end());
|
||||
buf.push_back('\0');
|
||||
int fd = kte::syscall::Mkstemp(buf.data());
|
||||
|
||||
// Retry on transient errors for temp file creation
|
||||
int fd = -1;
|
||||
auto mkstemp_fn = [&]() -> bool {
|
||||
// Reset buffer for each retry attempt
|
||||
buf.assign(tmpl_s.begin(), tmpl_s.end());
|
||||
buf.push_back('\0');
|
||||
fd = kte::syscall::Mkstemp(buf.data());
|
||||
return fd >= 0;
|
||||
};
|
||||
|
||||
if (!kte::RetryOnTransientError(mkstemp_fn, kte::RetryPolicy::Aggressive(), err)) {
|
||||
if (fd < 0) {
|
||||
err = std::string("Failed to create temp file for save: ") + std::strerror(errno);
|
||||
err = std::string("Failed to create temp file for save: ") + std::strerror(errno) + err;
|
||||
}
|
||||
return false;
|
||||
}
|
||||
std::string tmp_path(buf.data());
|
||||
@@ -163,8 +176,14 @@ atomic_write_file(const std::string &path, const char *data, std::size_t len, st
|
||||
|
||||
bool ok = write_all_fd(fd, data, len, err);
|
||||
if (ok) {
|
||||
if (kte::syscall::Fsync(fd) != 0) {
|
||||
err = std::string("fsync failed: ") + std::strerror(errno);
|
||||
// Retry fsync on transient errors
|
||||
auto fsync_fn = [&]() -> bool {
|
||||
return kte::syscall::Fsync(fd) == 0;
|
||||
};
|
||||
|
||||
std::string fsync_err;
|
||||
if (!kte::RetryOnTransientError(fsync_fn, kte::RetryPolicy::Aggressive(), fsync_err)) {
|
||||
err = std::string("fsync failed: ") + std::strerror(errno) + fsync_err;
|
||||
ok = false;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -144,6 +144,7 @@ set(COMMON_SOURCES
|
||||
Swap.cc
|
||||
ErrorHandler.cc
|
||||
SyscallWrappers.cc
|
||||
ErrorRecovery.cc
|
||||
TerminalInputHandler.cc
|
||||
TerminalRenderer.cc
|
||||
TerminalFrontend.cc
|
||||
@@ -341,6 +342,7 @@ if (BUILD_TESTS)
|
||||
Swap.cc
|
||||
ErrorHandler.cc
|
||||
SyscallWrappers.cc
|
||||
ErrorRecovery.cc
|
||||
KKeymap.cc
|
||||
SwapRecorder.h
|
||||
OptimizedSearch.cc
|
||||
|
||||
157
ErrorRecovery.cc
Normal file
157
ErrorRecovery.cc
Normal file
@@ -0,0 +1,157 @@
|
||||
// ErrorRecovery.cc - Error recovery mechanisms implementation
|
||||
#include "ErrorRecovery.h"
|
||||
#include <mutex>
|
||||
|
||||
namespace kte {
|
||||
// Construct a breaker in the Closed (normal) state with the supplied
// configuration. Counters start at zero; last_failure_time_ is the
// sentinel "min" so no failure window is considered active yet.
CircuitBreaker::CircuitBreaker(const Config &cfg)
    : config_(cfg),
      state_(State::Closed),
      failure_count_(0),
      success_count_(0),
      last_failure_time_(std::chrono::steady_clock::time_point::min()),
      state_change_time_(std::chrono::steady_clock::now())
{
}
|
||||
|
||||
|
||||
bool
|
||||
CircuitBreaker::AllowRequest()
|
||||
{
|
||||
std::lock_guard<std::mutex> lg(mtx_);
|
||||
|
||||
const auto now = std::chrono::steady_clock::now();
|
||||
|
||||
switch (state_) {
|
||||
case State::Closed:
|
||||
// Normal operation, allow all requests
|
||||
return true;
|
||||
|
||||
case State::Open: {
|
||||
// Check if timeout has elapsed to transition to HalfOpen
|
||||
const auto elapsed = std::chrono::duration_cast<std::chrono::seconds>(
|
||||
now - state_change_time_
|
||||
);
|
||||
if (elapsed >= config_.open_timeout) {
|
||||
TransitionTo(State::HalfOpen);
|
||||
return true; // Allow one request to test recovery
|
||||
}
|
||||
return false; // Circuit is open, reject request
|
||||
}
|
||||
|
||||
case State::HalfOpen:
|
||||
// Allow limited requests to test recovery
|
||||
return true;
|
||||
}
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
|
||||
void
|
||||
CircuitBreaker::RecordSuccess()
|
||||
{
|
||||
std::lock_guard<std::mutex> lg(mtx_);
|
||||
|
||||
switch (state_) {
|
||||
case State::Closed:
|
||||
// Reset failure count on success in normal operation
|
||||
failure_count_ = 0;
|
||||
break;
|
||||
|
||||
case State::HalfOpen:
|
||||
++success_count_;
|
||||
if (success_count_ >= config_.success_threshold) {
|
||||
// Enough successes, close the circuit
|
||||
TransitionTo(State::Closed);
|
||||
}
|
||||
break;
|
||||
|
||||
case State::Open:
|
||||
// Shouldn't happen (requests rejected), but handle gracefully
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
void
|
||||
CircuitBreaker::RecordFailure()
|
||||
{
|
||||
std::lock_guard<std::mutex> lg(mtx_);
|
||||
|
||||
const auto now = std::chrono::steady_clock::now();
|
||||
last_failure_time_ = now;
|
||||
|
||||
switch (state_) {
|
||||
case State::Closed:
|
||||
// Check if we need to reset the failure count (window expired)
|
||||
if (IsWindowExpired()) {
|
||||
failure_count_ = 0;
|
||||
}
|
||||
|
||||
++failure_count_;
|
||||
if (failure_count_ >= config_.failure_threshold) {
|
||||
// Too many failures, open the circuit
|
||||
TransitionTo(State::Open);
|
||||
}
|
||||
break;
|
||||
|
||||
case State::HalfOpen:
|
||||
// Failure during recovery test, reopen the circuit
|
||||
TransitionTo(State::Open);
|
||||
break;
|
||||
|
||||
case State::Open:
|
||||
// Already open, just track the failure
|
||||
++failure_count_;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
void
|
||||
CircuitBreaker::Reset()
|
||||
{
|
||||
std::lock_guard<std::mutex> lg(mtx_);
|
||||
TransitionTo(State::Closed);
|
||||
}
|
||||
|
||||
|
||||
void
|
||||
CircuitBreaker::TransitionTo(State new_state)
|
||||
{
|
||||
if (state_ == new_state) {
|
||||
return;
|
||||
}
|
||||
|
||||
state_ = new_state;
|
||||
state_change_time_ = std::chrono::steady_clock::now();
|
||||
|
||||
switch (new_state) {
|
||||
case State::Closed:
|
||||
failure_count_ = 0;
|
||||
success_count_ = 0;
|
||||
break;
|
||||
|
||||
case State::Open:
|
||||
success_count_ = 0;
|
||||
// Keep failure_count_ for diagnostics
|
||||
break;
|
||||
|
||||
case State::HalfOpen:
|
||||
success_count_ = 0;
|
||||
// Keep failure_count_ for diagnostics
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
bool
|
||||
CircuitBreaker::IsWindowExpired() const
|
||||
{
|
||||
if (failure_count_ == 0) {
|
||||
return false;
|
||||
}
|
||||
|
||||
const auto now = std::chrono::steady_clock::now();
|
||||
const auto elapsed = std::chrono::duration_cast<std::chrono::seconds>(
|
||||
now - last_failure_time_
|
||||
);
|
||||
|
||||
return elapsed >= config_.window;
|
||||
}
|
||||
} // namespace kte
|
||||
170
ErrorRecovery.h
Normal file
170
ErrorRecovery.h
Normal file
@@ -0,0 +1,170 @@
|
||||
// ErrorRecovery.h - Error recovery mechanisms for kte
|
||||
#pragma once
|
||||
|
||||
#include <chrono>
|
||||
#include <cstddef>
|
||||
#include <functional>
|
||||
#include <string>
|
||||
#include <thread>
|
||||
#include <mutex>
|
||||
#include <cerrno>
|
||||
|
||||
namespace kte {
|
||||
// Classify an errno value as transient (worth retrying) or permanent.
//
// Transient errors are temporary conditions that may clear on their own:
// resource contention (EAGAIN / EWOULDBLOCK, EBUSY), flaky I/O (EIO,
// ETIMEDOUT) and exhausted storage that may be freed (ENOSPC, EDQUOT).
// Everything else (ENOENT, EACCES, EINVAL, ...) is treated as permanent.
//
// FIX: EDQUOT is an XSI extension and is not defined on every platform,
// so it is now guarded with #ifdef. The old `#if EAGAIN != EWOULDBLOCK`
// also misfired when EWOULDBLOCK was not a macro (the preprocessor
// evaluates an undefined identifier as 0, making the condition true and
// emitting a `case EWOULDBLOCK:` that cannot compile); the guard now
// checks that the macro is defined first.
inline bool
IsTransientError(int err)
{
	switch (err) {
	case EAGAIN: // Resource temporarily unavailable
#if defined(EWOULDBLOCK) && EWOULDBLOCK != EAGAIN
	case EWOULDBLOCK: // Distinct from EAGAIN on some platforms
#endif
	case EBUSY:     // Device or resource busy
	case EIO:       // I/O error (may be transient on network filesystems)
	case ETIMEDOUT: // Operation timed out
	case ENOSPC:    // Disk full (space may become available)
#ifdef EDQUOT
	case EDQUOT:    // Quota exceeded (not defined on every platform)
#endif
		return true;
	default:
		return false;
	}
}
|
||||
|
||||
|
||||
// RetryPolicy describes how transient failures are retried: how many
// attempts to make and how the delay between them grows.
struct RetryPolicy {
	std::size_t max_attempts{3};                  // Total attempts (first try included)
	std::chrono::milliseconds initial_delay{100}; // Delay before the first retry
	double backoff_multiplier{2.0};               // Growth factor applied after each retry
	std::chrono::milliseconds max_delay{5000};    // Upper bound on any single delay

	// Balanced policy: 3 attempts, 100ms initial, 2x backoff, 5s cap.
	static RetryPolicy Default()
	{
		return {};
	}


	// For critical operations (swap files, saves): more attempts with
	// shorter, gentler-growing delays. 5 attempts, 50ms, 1.5x, 2s cap.
	static RetryPolicy Aggressive()
	{
		return {5, std::chrono::milliseconds(50), 1.5,
		        std::chrono::milliseconds(2000)};
	}


	// For non-critical operations: give up sooner but wait longer
	// between tries. 2 attempts, 200ms, 2.5x, 10s cap.
	static RetryPolicy Conservative()
	{
		return {2, std::chrono::milliseconds(200), 2.5,
		        std::chrono::milliseconds(10000)};
	}
};
|
||||
|
||||
// Retry a function with exponential backoff for transient errors
|
||||
// Returns true on success, false on permanent failure or exhausted retries
|
||||
// The function `fn` should return true on success, false on failure, and set errno on failure
|
||||
template<typename Func>
|
||||
bool
|
||||
RetryOnTransientError(Func fn, const RetryPolicy &policy, std::string &err)
|
||||
{
|
||||
std::size_t attempt = 0;
|
||||
std::chrono::milliseconds delay = policy.initial_delay;
|
||||
|
||||
while (attempt < policy.max_attempts) {
|
||||
++attempt;
|
||||
errno = 0;
|
||||
if (fn()) {
|
||||
return true; // Success
|
||||
}
|
||||
|
||||
int saved_errno = errno;
|
||||
if (!IsTransientError(saved_errno)) {
|
||||
// Permanent error, don't retry
|
||||
return false;
|
||||
}
|
||||
|
||||
if (attempt >= policy.max_attempts) {
|
||||
// Exhausted retries
|
||||
err += " (exhausted " + std::to_string(policy.max_attempts) + " retry attempts)";
|
||||
return false;
|
||||
}
|
||||
|
||||
// Sleep before retry
|
||||
std::this_thread::sleep_for(delay);
|
||||
|
||||
// Exponential backoff
|
||||
delay = std::chrono::milliseconds(
|
||||
static_cast<long long>(delay.count() * policy.backoff_multiplier)
|
||||
);
|
||||
if (delay > policy.max_delay) {
|
||||
delay = policy.max_delay;
|
||||
}
|
||||
}
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
|
||||
// CircuitBreaker guards an operation that may start failing repeatedly,
// enabling graceful degradation instead of hammering a broken resource.
//
// It is a small state machine:
//   Closed   - normal operation; every request is allowed.
//   Open     - too many recent failures; requests are rejected
//              immediately until open_timeout elapses.
//   HalfOpen - probing recovery; requests are allowed and the circuit
//              closes again after success_threshold successes.
//
// All public member functions are thread-safe (guarded by mtx_).
// FIX: GetState() and GetFailureCount() previously read state_ /
// failure_count_ without taking mtx_, a data race when other threads
// mutate the breaker concurrently; both now lock the (mutable) mutex.
class CircuitBreaker {
public:
	enum class State {
		Closed,  // Normal operation, allow all requests
		Open,    // Failing, reject requests immediately
		HalfOpen // Testing recovery, allow limited requests
	};

	struct Config {
		std::size_t failure_threshold;     // Failures before opening circuit
		std::chrono::seconds open_timeout; // Time before attempting recovery (Open -> HalfOpen)
		std::size_t success_threshold;     // Successes in HalfOpen before closing
		std::chrono::seconds window;       // Time window for counting failures

		// Defaults: open after 5 failures in 60s, probe after 30s,
		// close after 2 consecutive successes.
		Config()
		    : failure_threshold(5), open_timeout(30), success_threshold(2), window(60) {}
	};


	explicit CircuitBreaker(const Config &cfg = Config());


	// Check if operation is allowed (returns false while the circuit is
	// Open and the open_timeout has not yet elapsed).
	bool AllowRequest();

	// Record successful operation.
	void RecordSuccess();

	// Record failed operation.
	void RecordFailure();

	// Get current state. Locks mtx_ so the read is coherent with
	// concurrent RecordSuccess/RecordFailure/AllowRequest calls.
	State GetState() const
	{
		std::lock_guard<std::mutex> lg(mtx_);
		return state_;
	}


	// Get failure count in current window (locked for the same reason
	// as GetState).
	std::size_t GetFailureCount() const
	{
		std::lock_guard<std::mutex> lg(mtx_);
		return failure_count_;
	}


	// Reset circuit to Closed state (for testing or manual intervention).
	void Reset();

private:
	// Unconditionally move to new_state, stamping the transition time
	// and resetting per-state counters. Caller must hold mtx_.
	void TransitionTo(State new_state);

	// True when the last failure is older than config_.window. Caller
	// must hold mtx_.
	bool IsWindowExpired() const;

	Config config_;             // Immutable after construction
	State state_;               // Current circuit state (guarded by mtx_)
	std::size_t failure_count_; // Failures in current window (guarded by mtx_)
	std::size_t success_count_; // Successes while HalfOpen (guarded by mtx_)
	std::chrono::steady_clock::time_point last_failure_time_; // Guarded by mtx_
	std::chrono::steady_clock::time_point state_change_time_; // Guarded by mtx_
	mutable std::mutex mtx_;    // Protects all mutable state above
};
|
||||
} // namespace kte
|
||||
94
Swap.cc
94
Swap.cc
@@ -2,6 +2,7 @@
|
||||
#include "Buffer.h"
|
||||
#include "ErrorHandler.h"
|
||||
#include "SyscallWrappers.h"
|
||||
#include "ErrorRecovery.h"
|
||||
|
||||
#include <algorithm>
|
||||
#include <chrono>
|
||||
@@ -613,10 +614,19 @@ SwapManager::open_ctx(JournalCtx &ctx, const std::string &path, std::string &err
|
||||
#ifdef O_CLOEXEC
|
||||
flags |= O_CLOEXEC;
|
||||
#endif
|
||||
int fd = kte::syscall::Open(path.c_str(), flags, 0600);
|
||||
|
||||
// Retry on transient errors (ENOSPC, EDQUOT, EBUSY, etc.)
|
||||
int fd = -1;
|
||||
auto open_fn = [&]() -> bool {
|
||||
fd = kte::syscall::Open(path.c_str(), flags, 0600);
|
||||
return fd >= 0;
|
||||
};
|
||||
|
||||
if (!RetryOnTransientError(open_fn, RetryPolicy::Aggressive(), err)) {
|
||||
if (fd < 0) {
|
||||
int saved_errno = errno;
|
||||
err = "Failed to open swap file '" + path + "': " + std::strerror(saved_errno);
|
||||
err = "Failed to open swap file '" + path + "': " + std::strerror(saved_errno) + err;
|
||||
}
|
||||
return false;
|
||||
}
|
||||
// Ensure permissions even if file already existed.
|
||||
@@ -636,10 +646,20 @@ SwapManager::open_ctx(JournalCtx &ctx, const std::string &path, std::string &err
|
||||
#ifdef O_CLOEXEC
|
||||
tflags |= O_CLOEXEC;
|
||||
#endif
|
||||
|
||||
// Retry on transient errors for truncation open
|
||||
fd = -1;
|
||||
auto reopen_fn = [&]() -> bool {
|
||||
fd = kte::syscall::Open(path.c_str(), tflags, 0600);
|
||||
return fd >= 0;
|
||||
};
|
||||
|
||||
if (!RetryOnTransientError(reopen_fn, RetryPolicy::Aggressive(), err)) {
|
||||
if (fd < 0) {
|
||||
int saved_errno = errno;
|
||||
err = "Failed to reopen swap file for truncation '" + path + "': " + std::strerror(saved_errno);
|
||||
err = "Failed to reopen swap file for truncation '" + path + "': " + std::strerror(
|
||||
saved_errno) + err;
|
||||
}
|
||||
return false;
|
||||
}
|
||||
(void) kte::syscall::Fchmod(fd, 0600);
|
||||
@@ -705,10 +725,19 @@ SwapManager::compact_to_checkpoint(JournalCtx &ctx, const std::vector<std::uint8
|
||||
#ifdef O_CLOEXEC
|
||||
flags |= O_CLOEXEC;
|
||||
#endif
|
||||
int tfd = kte::syscall::Open(tmp_path.c_str(), flags, 0600);
|
||||
|
||||
// Retry on transient errors for temp file creation
|
||||
int tfd = -1;
|
||||
auto open_tmp_fn = [&]() -> bool {
|
||||
tfd = kte::syscall::Open(tmp_path.c_str(), flags, 0600);
|
||||
return tfd >= 0;
|
||||
};
|
||||
|
||||
if (!RetryOnTransientError(open_tmp_fn, RetryPolicy::Aggressive(), err)) {
|
||||
if (tfd < 0) {
|
||||
int saved_errno = errno;
|
||||
err = "Failed to open temp swap file '" + tmp_path + "': " + std::strerror(saved_errno);
|
||||
err = "Failed to open temp swap file '" + tmp_path + "': " + std::strerror(saved_errno) + err;
|
||||
}
|
||||
return false;
|
||||
}
|
||||
(void) kte::syscall::Fchmod(tfd, 0600);
|
||||
@@ -1062,6 +1091,34 @@ SwapManager::process_one(const Pending &p)
|
||||
if (!p.buf)
|
||||
return;
|
||||
|
||||
// Check circuit breaker before processing
|
||||
bool circuit_open = false;
|
||||
{
|
||||
std::lock_guard<std::mutex> lg(mtx_);
|
||||
if (!circuit_breaker_.AllowRequest()) {
|
||||
circuit_open = true;
|
||||
}
|
||||
}
|
||||
|
||||
if (circuit_open) {
|
||||
// Circuit is open - graceful degradation: skip swap write
|
||||
// This prevents repeated failures from overwhelming the system
|
||||
// Swap recording will resume when circuit closes
|
||||
static std::atomic<std::uint64_t> last_warning_ns{0};
|
||||
const std::uint64_t now = now_ns();
|
||||
const std::uint64_t last = last_warning_ns.load();
|
||||
// Log warning at most once per 60 seconds to avoid spam
|
||||
if (now - last > 60000000000ULL) {
|
||||
last_warning_ns.store(now);
|
||||
ErrorHandler::Instance().Warning("SwapManager",
|
||||
"Swap operations temporarily disabled due to repeated failures (circuit breaker open)",
|
||||
p.buf && !p.buf->Filename().empty()
|
||||
? p.buf->Filename()
|
||||
: "<unnamed>");
|
||||
}
|
||||
return;
|
||||
}
|
||||
|
||||
try {
|
||||
Buffer &buf = *p.buf;
|
||||
|
||||
@@ -1084,10 +1141,18 @@ SwapManager::process_one(const Pending &p)
|
||||
std::string open_err;
|
||||
if (!open_ctx(*ctxp, path, open_err)) {
|
||||
report_error(open_err, p.buf);
|
||||
{
|
||||
std::lock_guard<std::mutex> lg(mtx_);
|
||||
circuit_breaker_.RecordFailure();
|
||||
}
|
||||
return;
|
||||
}
|
||||
if (p.payload.size() > 0xFFFFFFu) {
|
||||
report_error("Payload too large: " + std::to_string(p.payload.size()) + " bytes", p.buf);
|
||||
{
|
||||
std::lock_guard<std::mutex> lg(mtx_);
|
||||
circuit_breaker_.RecordFailure();
|
||||
}
|
||||
return;
|
||||
}
|
||||
|
||||
@@ -1123,6 +1188,10 @@ SwapManager::process_one(const Pending &p)
|
||||
if (!ok) {
|
||||
int err = errno;
|
||||
report_error("Failed to write swap record to '" + path + "': " + std::strerror(err), p.buf);
|
||||
{
|
||||
std::lock_guard<std::mutex> lg(mtx_);
|
||||
circuit_breaker_.RecordFailure();
|
||||
}
|
||||
return;
|
||||
}
|
||||
ctxp->approx_size_bytes += static_cast<std::uint64_t>(rec.size());
|
||||
@@ -1138,12 +1207,27 @@ SwapManager::process_one(const Pending &p)
|
||||
std::string compact_err;
|
||||
if (!compact_to_checkpoint(*ctxp, rec, compact_err)) {
|
||||
report_error(compact_err, p.buf);
|
||||
// Note: compaction failure is not fatal, don't record circuit breaker failure
|
||||
}
|
||||
}
|
||||
|
||||
// Record success for circuit breaker
|
||||
{
|
||||
std::lock_guard<std::mutex> lg(mtx_);
|
||||
circuit_breaker_.RecordSuccess();
|
||||
}
|
||||
} catch (const std::exception &e) {
|
||||
report_error(std::string("Exception in process_one: ") + e.what(), p.buf);
|
||||
{
|
||||
std::lock_guard<std::mutex> lg(mtx_);
|
||||
circuit_breaker_.RecordFailure();
|
||||
}
|
||||
} catch (...) {
|
||||
report_error("Unknown exception in process_one", p.buf);
|
||||
{
|
||||
std::lock_guard<std::mutex> lg(mtx_);
|
||||
circuit_breaker_.RecordFailure();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
4
Swap.h
4
Swap.h
@@ -15,6 +15,7 @@
|
||||
#include <atomic>
|
||||
|
||||
#include "SwapRecorder.h"
|
||||
#include "ErrorRecovery.h"
|
||||
|
||||
class Buffer;
|
||||
|
||||
@@ -245,5 +246,8 @@ private:
|
||||
// Error tracking (protected by mtx_)
|
||||
std::deque<SwapError> errors_; // bounded to max 100 entries
|
||||
std::size_t total_error_count_{0};
|
||||
|
||||
// Circuit breaker for swap operations (protected by mtx_)
|
||||
CircuitBreaker circuit_breaker_;
|
||||
};
|
||||
} // namespace kte
|
||||
@@ -852,6 +852,177 @@ When updating existing code to follow these conventions:
|
||||
5. **Update callers** to handle the error parameter
|
||||
6. **Write tests** that verify error handling
|
||||
|
||||
### Error Recovery Mechanisms
|
||||
|
||||
kte implements automatic error recovery for transient failures using
|
||||
retry logic and circuit breaker patterns.
|
||||
|
||||
#### Transient Error Classification
|
||||
|
||||
Transient errors are temporary failures that may succeed on retry:
|
||||
|
||||
```cpp
|
||||
#include "ErrorRecovery.h"
|
||||
|
||||
bool IsTransientError(int err); // Returns true for EAGAIN, EWOULDBLOCK, EBUSY, EIO, ETIMEDOUT, ENOSPC, EDQUOT
|
||||
```
|
||||
|
||||
**Transient errors**:
|
||||
|
||||
- `EAGAIN` / `EWOULDBLOCK` - Resource temporarily unavailable
|
||||
- `EBUSY` - Device or resource busy
|
||||
- `EIO` - I/O error (may be transient on network filesystems)
|
||||
- `ETIMEDOUT` - Operation timed out
|
||||
- `ENOSPC` - No space left on device (may become available)
|
||||
- `EDQUOT` - Disk quota exceeded (may become available)
|
||||
|
||||
**Permanent errors** (don't retry):
|
||||
|
||||
- `ENOENT` - File not found
|
||||
- `EACCES` - Permission denied
|
||||
- `EINVAL` - Invalid argument
|
||||
- `ENOTDIR` - Not a directory
|
||||
|
||||
#### Retry Policies
|
||||
|
||||
Three predefined retry policies are available:
|
||||
|
||||
```cpp
|
||||
// Default: 3 attempts, 100ms initial delay, 2x backoff, 5s max delay
|
||||
RetryPolicy::Default()
|
||||
|
||||
// Aggressive: 5 attempts, 50ms initial delay, 1.5x backoff, 2s max delay
|
||||
// Use for critical operations (swap files, file saves)
|
||||
RetryPolicy::Aggressive()
|
||||
|
||||
// Conservative: 2 attempts, 200ms initial delay, 2.5x backoff, 10s max delay
|
||||
// Use for non-critical operations
|
||||
RetryPolicy::Conservative()
|
||||
```
|
||||
|
||||
#### Using RetryOnTransientError
|
||||
|
||||
Wrap syscalls with automatic retry on transient errors:
|
||||
|
||||
```cpp
|
||||
#include "ErrorRecovery.h"
|
||||
#include "SyscallWrappers.h"
|
||||
|
||||
bool save_file(const std::string &path, std::string &err) {
|
||||
int fd = -1;
|
||||
auto open_fn = [&]() -> bool {
|
||||
fd = kte::syscall::Open(path.c_str(), O_CREAT | O_WRONLY, 0644);
|
||||
return fd >= 0;
|
||||
};
|
||||
|
||||
if (!kte::RetryOnTransientError(open_fn, kte::RetryPolicy::Aggressive(), err)) {
|
||||
if (fd < 0) {
|
||||
int saved_errno = errno;
|
||||
err = "Failed to open file '" + path + "': " + std::strerror(saved_errno) + err;
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
// ... use fd
|
||||
kte::syscall::Close(fd);
|
||||
return true;
|
||||
}
|
||||
```
|
||||
|
||||
**Key points**:
|
||||
|
||||
- Lambda must return `bool` (true = success, false = failure)
|
||||
- Lambda must set `errno` on failure for transient error detection
|
||||
- Use EINTR-safe syscall wrappers (`kte::syscall::*`) inside lambdas
|
||||
- Capture errno immediately after failure
|
||||
- Append retry info to error message (automatically added by
|
||||
RetryOnTransientError)
|
||||
|
||||
#### Circuit Breaker Pattern
|
||||
|
||||
The circuit breaker prevents repeated attempts to failing operations,
|
||||
enabling graceful degradation.
|
||||
|
||||
**States**:
|
||||
|
||||
- **Closed** (normal): All requests allowed
|
||||
- **Open** (failing): Requests rejected immediately, operation disabled
|
||||
- **HalfOpen** (testing): Limited requests allowed to test recovery
|
||||
|
||||
**Configuration** (SwapManager example):
|
||||
|
||||
```cpp
|
||||
CircuitBreaker::Config cfg;
|
||||
cfg.failure_threshold = 5; // Open after 5 failures
|
||||
cfg.open_timeout = std::chrono::seconds(30); // Try recovery after 30s
|
||||
cfg.success_threshold = 2; // Close after 2 successes in HalfOpen
|
||||
cfg.window = std::chrono::seconds(60); // Count failures in 60s window
|
||||
|
||||
CircuitBreaker breaker(cfg);
|
||||
```
|
||||
|
||||
**Usage**:
|
||||
|
||||
```cpp
|
||||
// Check before operation
|
||||
if (!breaker.AllowRequest()) {
|
||||
// Circuit is open - graceful degradation
|
||||
log_warning("Operation disabled due to repeated failures");
|
||||
return; // Skip operation
|
||||
}
|
||||
|
||||
// Perform operation
|
||||
if (operation_succeeds()) {
|
||||
breaker.RecordSuccess();
|
||||
} else {
|
||||
breaker.RecordFailure();
|
||||
}
|
||||
```
|
||||
|
||||
**SwapManager Integration**:
|
||||
|
||||
The SwapManager uses a circuit breaker to handle repeated swap file
|
||||
failures:
|
||||
|
||||
1. After 5 swap write failures in 60 seconds, circuit opens
|
||||
2. Swap recording is disabled (graceful degradation)
|
||||
3. Warning logged once per 60 seconds to avoid spam
|
||||
4. After 30 seconds, circuit enters HalfOpen state
|
||||
5. If 2 consecutive operations succeed, circuit closes and swap
|
||||
recording resumes
|
||||
|
||||
This ensures the editor remains functional even when swap files are
|
||||
unavailable (disk full, quota exceeded, filesystem errors).
|
||||
|
||||
#### Graceful Degradation Strategies
|
||||
|
||||
When operations fail repeatedly:
|
||||
|
||||
1. **Disable non-critical features** - Swap recording can be disabled
|
||||
without affecting editing
|
||||
2. **Log warnings** - Inform user of degraded operation via ErrorHandler
|
||||
3. **Rate-limit warnings** - Avoid log spam (e.g., once per 60 seconds)
|
||||
4. **Automatic recovery** - Circuit breaker automatically tests recovery
|
||||
5. **Preserve core functionality** - Editor remains usable without swap
|
||||
files
|
||||
|
||||
**Example** (from SwapManager):
|
||||
|
||||
```cpp
|
||||
if (circuit_open) {
|
||||
// Graceful degradation: skip swap write
|
||||
static std::atomic<std::uint64_t> last_warning_ns{0};
|
||||
const std::uint64_t now = now_ns();
|
||||
if (now - last_warning_ns.load() > 60000000000ULL) {
|
||||
last_warning_ns.store(now);
|
||||
ErrorHandler::Instance().Warning("SwapManager",
|
||||
"Swap operations temporarily disabled due to repeated failures",
|
||||
buffer_name);
|
||||
}
|
||||
return; // Skip operation, editor continues normally
|
||||
}
|
||||
```
|
||||
|
||||
## Common Tasks
|
||||
|
||||
### Adding a New Command
|
||||
|
||||
Reference in New Issue
Block a user