From 0d87bc0b25d0d818fddb924201a2ee55f058f549 Mon Sep 17 00:00:00 2001 From: Kyle Isom Date: Tue, 17 Feb 2026 21:38:40 -0800 Subject: [PATCH] Introduce error recovery mechanisms with retry logic and circuit breaker integration. - Added `ErrorRecovery.cc` and `ErrorRecovery.h` for retry and circuit breaker implementations. - Enhanced swap file handling with transient error retries and exponential backoff (e.g., ENOSPC, EDQUOT). - Integrated circuit breaker into SwapManager to gracefully handle repeated failures, prevent system overload, and enable automatic recovery. - Updated `DEVELOPER_GUIDE.md` with comprehensive documentation on error recovery patterns and graceful degradation strategies. - Refined fsync, temp file creation, and swap file logic with retry-on-failure mechanisms for improved resilience. --- Buffer.cc | 29 +++++-- CMakeLists.txt | 2 + ErrorRecovery.cc | 157 ++++++++++++++++++++++++++++++++++++ ErrorRecovery.h | 170 +++++++++++++++++++++++++++++++++++++++ Swap.cc | 108 ++++++++++++++++++++++--- Swap.h | 4 + docs/DEVELOPER_GUIDE.md | 171 ++++++++++++++++++++++++++++++++++++++++ 7 files changed, 624 insertions(+), 17 deletions(-) create mode 100644 ErrorRecovery.cc create mode 100644 ErrorRecovery.h diff --git a/Buffer.cc b/Buffer.cc index d1c3499..3f706e4 100644 --- a/Buffer.cc +++ b/Buffer.cc @@ -20,6 +20,7 @@ #include "UndoTree.h" #include "ErrorHandler.h" #include "SyscallWrappers.h" +#include "ErrorRecovery.h" // For reconstructing highlighter state on copies #include "syntax/HighlighterRegistry.h" #include "syntax/NullHighlighter.h" @@ -148,9 +149,21 @@ atomic_write_file(const std::string &path, const char *data, std::size_t len, st // mkstemp requires a mutable buffer. 
std::vector buf(tmpl_s.begin(), tmpl_s.end()); buf.push_back('\0'); - int fd = kte::syscall::Mkstemp(buf.data()); - if (fd < 0) { - err = std::string("Failed to create temp file for save: ") + std::strerror(errno); + + // Retry on transient errors for temp file creation + int fd = -1; + auto mkstemp_fn = [&]() -> bool { + // Reset buffer for each retry attempt + buf.assign(tmpl_s.begin(), tmpl_s.end()); + buf.push_back('\0'); + fd = kte::syscall::Mkstemp(buf.data()); + return fd >= 0; + }; + + if (!kte::RetryOnTransientError(mkstemp_fn, kte::RetryPolicy::Aggressive(), err)) { + if (fd < 0) { + err = std::string("Failed to create temp file for save: ") + std::strerror(errno) + err; + } return false; } std::string tmp_path(buf.data()); @@ -163,8 +176,14 @@ atomic_write_file(const std::string &path, const char *data, std::size_t len, st bool ok = write_all_fd(fd, data, len, err); if (ok) { - if (kte::syscall::Fsync(fd) != 0) { - err = std::string("fsync failed: ") + std::strerror(errno); + // Retry fsync on transient errors + auto fsync_fn = [&]() -> bool { + return kte::syscall::Fsync(fd) == 0; + }; + + std::string fsync_err; + if (!kte::RetryOnTransientError(fsync_fn, kte::RetryPolicy::Aggressive(), fsync_err)) { + err = std::string("fsync failed: ") + std::strerror(errno) + fsync_err; ok = false; } } diff --git a/CMakeLists.txt b/CMakeLists.txt index 2ce70e1..eb85db7 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -144,6 +144,7 @@ set(COMMON_SOURCES Swap.cc ErrorHandler.cc SyscallWrappers.cc + ErrorRecovery.cc TerminalInputHandler.cc TerminalRenderer.cc TerminalFrontend.cc @@ -341,6 +342,7 @@ if (BUILD_TESTS) Swap.cc ErrorHandler.cc SyscallWrappers.cc + ErrorRecovery.cc KKeymap.cc SwapRecorder.h OptimizedSearch.cc diff --git a/ErrorRecovery.cc b/ErrorRecovery.cc new file mode 100644 index 0000000..bfa6985 --- /dev/null +++ b/ErrorRecovery.cc @@ -0,0 +1,157 @@ +// ErrorRecovery.cc - Error recovery mechanisms implementation +#include "ErrorRecovery.h" +#include + 
+namespace kte { +CircuitBreaker::CircuitBreaker(const Config &cfg) + : config_(cfg), state_(State::Closed), failure_count_(0), success_count_(0), + last_failure_time_(std::chrono::steady_clock::time_point::min()), + state_change_time_(std::chrono::steady_clock::now()) {} + + +bool +CircuitBreaker::AllowRequest() +{ + std::lock_guard lg(mtx_); + + const auto now = std::chrono::steady_clock::now(); + + switch (state_) { + case State::Closed: + // Normal operation, allow all requests + return true; + + case State::Open: { + // Check if timeout has elapsed to transition to HalfOpen + const auto elapsed = std::chrono::duration_cast( + now - state_change_time_ + ); + if (elapsed >= config_.open_timeout) { + TransitionTo(State::HalfOpen); + return true; // Allow one request to test recovery + } + return false; // Circuit is open, reject request + } + + case State::HalfOpen: + // Allow limited requests to test recovery + return true; + } + + return false; +} + + +void +CircuitBreaker::RecordSuccess() +{ + std::lock_guard lg(mtx_); + + switch (state_) { + case State::Closed: + // Reset failure count on success in normal operation + failure_count_ = 0; + break; + + case State::HalfOpen: + ++success_count_; + if (success_count_ >= config_.success_threshold) { + // Enough successes, close the circuit + TransitionTo(State::Closed); + } + break; + + case State::Open: + // Shouldn't happen (requests rejected), but handle gracefully + break; + } +} + + +void +CircuitBreaker::RecordFailure() +{ + std::lock_guard lg(mtx_); + + const auto now = std::chrono::steady_clock::now(); + last_failure_time_ = now; + + switch (state_) { + case State::Closed: + // Check if we need to reset the failure count (window expired) + if (IsWindowExpired()) { + failure_count_ = 0; + } + + ++failure_count_; + if (failure_count_ >= config_.failure_threshold) { + // Too many failures, open the circuit + TransitionTo(State::Open); + } + break; + + case State::HalfOpen: + // Failure during recovery test, 
reopen the circuit + TransitionTo(State::Open); + break; + + case State::Open: + // Already open, just track the failure + ++failure_count_; + break; + } +} + + +void +CircuitBreaker::Reset() +{ + std::lock_guard lg(mtx_); + TransitionTo(State::Closed); +} + + +void +CircuitBreaker::TransitionTo(State new_state) +{ + if (state_ == new_state) { + return; + } + + state_ = new_state; + state_change_time_ = std::chrono::steady_clock::now(); + + switch (new_state) { + case State::Closed: + failure_count_ = 0; + success_count_ = 0; + break; + + case State::Open: + success_count_ = 0; + // Keep failure_count_ for diagnostics + break; + + case State::HalfOpen: + success_count_ = 0; + // Keep failure_count_ for diagnostics + break; + } +} + + +bool +CircuitBreaker::IsWindowExpired() const +{ + if (failure_count_ == 0) { + return false; + } + + const auto now = std::chrono::steady_clock::now(); + const auto elapsed = std::chrono::duration_cast( + now - last_failure_time_ + ); + + return elapsed >= config_.window; +} +} // namespace kte \ No newline at end of file diff --git a/ErrorRecovery.h b/ErrorRecovery.h new file mode 100644 index 0000000..a98ce17 --- /dev/null +++ b/ErrorRecovery.h @@ -0,0 +1,170 @@ +// ErrorRecovery.h - Error recovery mechanisms for kte +#pragma once + +#include +#include +#include +#include +#include +#include +#include + +namespace kte { +// Classify errno values as transient (retryable) or permanent +inline bool +IsTransientError(int err) +{ + switch (err) { + case EAGAIN: +#if EAGAIN != EWOULDBLOCK + case EWOULDBLOCK: +#endif + case EBUSY: + case EIO: // I/O error (may be transient on network filesystems) + case ETIMEDOUT: + case ENOSPC: // Disk full (may become available) + case EDQUOT: // Quota exceeded (may become available) + return true; + default: + return false; + } +} + + +// RetryPolicy defines retry behavior for transient failures +struct RetryPolicy { + std::size_t max_attempts{3}; // Maximum retry attempts + std::chrono::milliseconds 
initial_delay{100}; // Initial delay before first retry + double backoff_multiplier{2.0}; // Exponential backoff multiplier + std::chrono::milliseconds max_delay{5000}; // Maximum delay between retries + + // Default policy: 3 attempts, 100ms initial, 2x backoff, 5s max + static RetryPolicy Default() + { + return RetryPolicy{}; + } + + + // Aggressive policy for critical operations: more attempts, faster retries + static RetryPolicy Aggressive() + { + return RetryPolicy{5, std::chrono::milliseconds(50), 1.5, std::chrono::milliseconds(2000)}; + } + + + // Conservative policy for non-critical operations: fewer attempts, slower retries + static RetryPolicy Conservative() + { + return RetryPolicy{2, std::chrono::milliseconds(200), 2.5, std::chrono::milliseconds(10000)}; + } +}; + +// Retry a function with exponential backoff for transient errors +// Returns true on success, false on permanent failure or exhausted retries +// The function `fn` should return true on success, false on failure, and set errno on failure +template +bool +RetryOnTransientError(Func fn, const RetryPolicy &policy, std::string &err) +{ + std::size_t attempt = 0; + std::chrono::milliseconds delay = policy.initial_delay; + + while (attempt < policy.max_attempts) { + ++attempt; + errno = 0; + if (fn()) { + return true; // Success + } + + int saved_errno = errno; + if (!IsTransientError(saved_errno)) { + // Permanent error, don't retry + return false; + } + + if (attempt >= policy.max_attempts) { + // Exhausted retries + err += " (exhausted " + std::to_string(policy.max_attempts) + " retry attempts)"; + return false; + } + + // Sleep before retry + std::this_thread::sleep_for(delay); + + // Exponential backoff + delay = std::chrono::milliseconds( + static_cast(delay.count() * policy.backoff_multiplier) + ); + if (delay > policy.max_delay) { + delay = policy.max_delay; + } + } + + return false; +} + + +// CircuitBreaker prevents repeated attempts to failing operations +// States: Closed (normal), 
Open (failing, reject immediately), HalfOpen (testing recovery) +class CircuitBreaker { +public: + enum class State { + Closed, // Normal operation, allow all requests + Open, // Failing, reject requests immediately + HalfOpen // Testing recovery, allow limited requests + }; + + struct Config { + std::size_t failure_threshold; // Failures before opening circuit + std::chrono::seconds open_timeout; // Time before attempting recovery (Open → HalfOpen) + std::size_t success_threshold; // Successes in HalfOpen before closing + std::chrono::seconds window; // Time window for counting failures + + Config() + : failure_threshold(5), open_timeout(30), success_threshold(2), window(60) {} + }; + + + explicit CircuitBreaker(const Config &cfg = Config()); + + + // Check if operation is allowed (returns false if circuit is Open) + bool AllowRequest(); + + // Record successful operation + void RecordSuccess(); + + // Record failed operation + void RecordFailure(); + + // Get current state + State GetState() const + { + return state_; + } + + + // Get failure count in current window + std::size_t GetFailureCount() const + { + return failure_count_; + } + + + // Reset circuit to Closed state (for testing or manual intervention) + void Reset(); + +private: + void TransitionTo(State new_state); + + bool IsWindowExpired() const; + + Config config_; + State state_; + std::size_t failure_count_; + std::size_t success_count_; + std::chrono::steady_clock::time_point last_failure_time_; + std::chrono::steady_clock::time_point state_change_time_; + mutable std::mutex mtx_; +}; +} // namespace kte \ No newline at end of file diff --git a/Swap.cc b/Swap.cc index b262943..e036d0e 100644 --- a/Swap.cc +++ b/Swap.cc @@ -2,6 +2,7 @@ #include "Buffer.h" #include "ErrorHandler.h" #include "SyscallWrappers.h" +#include "ErrorRecovery.h" #include #include @@ -613,10 +614,19 @@ SwapManager::open_ctx(JournalCtx &ctx, const std::string &path, std::string &err #ifdef O_CLOEXEC flags |= O_CLOEXEC; #endif 
- int fd = kte::syscall::Open(path.c_str(), flags, 0600); - if (fd < 0) { - int saved_errno = errno; - err = "Failed to open swap file '" + path + "': " + std::strerror(saved_errno); + + // Retry on transient errors (ENOSPC, EDQUOT, EBUSY, etc.) + int fd = -1; + auto open_fn = [&]() -> bool { + fd = kte::syscall::Open(path.c_str(), flags, 0600); + return fd >= 0; + }; + + if (!RetryOnTransientError(open_fn, RetryPolicy::Aggressive(), err)) { + if (fd < 0) { + int saved_errno = errno; + err = "Failed to open swap file '" + path + "': " + std::strerror(saved_errno) + err; + } return false; } // Ensure permissions even if file already existed. @@ -636,10 +646,20 @@ SwapManager::open_ctx(JournalCtx &ctx, const std::string &path, std::string &err #ifdef O_CLOEXEC tflags |= O_CLOEXEC; #endif - fd = kte::syscall::Open(path.c_str(), tflags, 0600); - if (fd < 0) { - int saved_errno = errno; - err = "Failed to reopen swap file for truncation '" + path + "': " + std::strerror(saved_errno); + + // Retry on transient errors for truncation open + fd = -1; + auto reopen_fn = [&]() -> bool { + fd = kte::syscall::Open(path.c_str(), tflags, 0600); + return fd >= 0; + }; + + if (!RetryOnTransientError(reopen_fn, RetryPolicy::Aggressive(), err)) { + if (fd < 0) { + int saved_errno = errno; + err = "Failed to reopen swap file for truncation '" + path + "': " + std::strerror( + saved_errno) + err; + } return false; } (void) kte::syscall::Fchmod(fd, 0600); @@ -705,10 +725,19 @@ SwapManager::compact_to_checkpoint(JournalCtx &ctx, const std::vector bool { + tfd = kte::syscall::Open(tmp_path.c_str(), flags, 0600); + return tfd >= 0; + }; + + if (!RetryOnTransientError(open_tmp_fn, RetryPolicy::Aggressive(), err)) { + if (tfd < 0) { + int saved_errno = errno; + err = "Failed to open temp swap file '" + tmp_path + "': " + std::strerror(saved_errno) + err; + } return false; } (void) kte::syscall::Fchmod(tfd, 0600); @@ -1062,6 +1091,34 @@ SwapManager::process_one(const Pending &p) if (!p.buf) 
return; + // Check circuit breaker before processing + bool circuit_open = false; + { + std::lock_guard lg(mtx_); + if (!circuit_breaker_.AllowRequest()) { + circuit_open = true; + } + } + + if (circuit_open) { + // Circuit is open - graceful degradation: skip swap write + // This prevents repeated failures from overwhelming the system + // Swap recording will resume when circuit closes + static std::atomic last_warning_ns{0}; + const std::uint64_t now = now_ns(); + const std::uint64_t last = last_warning_ns.load(); + // Log warning at most once per 60 seconds to avoid spam + if (now - last > 60000000000ULL) { + last_warning_ns.store(now); + ErrorHandler::Instance().Warning("SwapManager", + "Swap operations temporarily disabled due to repeated failures (circuit breaker open)", + p.buf && !p.buf->Filename().empty() + ? p.buf->Filename() + : ""); + } + return; + } + try { Buffer &buf = *p.buf; @@ -1084,10 +1141,18 @@ SwapManager::process_one(const Pending &p) std::string open_err; if (!open_ctx(*ctxp, path, open_err)) { report_error(open_err, p.buf); + { + std::lock_guard lg(mtx_); + circuit_breaker_.RecordFailure(); + } return; } if (p.payload.size() > 0xFFFFFFu) { report_error("Payload too large: " + std::to_string(p.payload.size()) + " bytes", p.buf); + { + std::lock_guard lg(mtx_); + circuit_breaker_.RecordFailure(); + } return; } @@ -1123,6 +1188,10 @@ SwapManager::process_one(const Pending &p) if (!ok) { int err = errno; report_error("Failed to write swap record to '" + path + "': " + std::strerror(err), p.buf); + { + std::lock_guard lg(mtx_); + circuit_breaker_.RecordFailure(); + } return; } ctxp->approx_size_bytes += static_cast(rec.size()); @@ -1138,12 +1207,27 @@ SwapManager::process_one(const Pending &p) std::string compact_err; if (!compact_to_checkpoint(*ctxp, rec, compact_err)) { report_error(compact_err, p.buf); + // Note: compaction failure is not fatal, don't record circuit breaker failure } } + + // Record success for circuit breaker + { + 
std::lock_guard lg(mtx_); + circuit_breaker_.RecordSuccess(); + } } catch (const std::exception &e) { report_error(std::string("Exception in process_one: ") + e.what(), p.buf); + { + std::lock_guard lg(mtx_); + circuit_breaker_.RecordFailure(); + } } catch (...) { report_error("Unknown exception in process_one", p.buf); + { + std::lock_guard lg(mtx_); + circuit_breaker_.RecordFailure(); + } } } diff --git a/Swap.h b/Swap.h index 8b991b3..5f4f872 100644 --- a/Swap.h +++ b/Swap.h @@ -15,6 +15,7 @@ #include #include "SwapRecorder.h" +#include "ErrorRecovery.h" class Buffer; @@ -245,5 +246,8 @@ private: // Error tracking (protected by mtx_) std::deque errors_; // bounded to max 100 entries std::size_t total_error_count_{0}; + + // Circuit breaker for swap operations (protected by mtx_) + CircuitBreaker circuit_breaker_; }; } // namespace kte \ No newline at end of file diff --git a/docs/DEVELOPER_GUIDE.md b/docs/DEVELOPER_GUIDE.md index a0daf37..6c2e6f0 100644 --- a/docs/DEVELOPER_GUIDE.md +++ b/docs/DEVELOPER_GUIDE.md @@ -852,6 +852,177 @@ When updating existing code to follow these conventions: 5. **Update callers** to handle the error parameter 6. **Write tests** that verify error handling +### Error Recovery Mechanisms + +kte implements automatic error recovery for transient failures using +retry logic and circuit breaker patterns. 
+ +#### Transient Error Classification + +Transient errors are temporary failures that may succeed on retry: + +```cpp +#include "ErrorRecovery.h" + +bool IsTransientError(int err); // Returns true for EAGAIN, EWOULDBLOCK, EBUSY, EIO, ETIMEDOUT, ENOSPC, EDQUOT +``` + +**Transient errors**: + +- `EAGAIN` / `EWOULDBLOCK` - Resource temporarily unavailable +- `EBUSY` - Device or resource busy +- `EIO` - I/O error (may be transient on network filesystems) +- `ETIMEDOUT` - Operation timed out +- `ENOSPC` - No space left on device (may become available) +- `EDQUOT` - Disk quota exceeded (may become available) + +**Permanent errors** (don't retry): + +- `ENOENT` - File not found +- `EACCES` - Permission denied +- `EINVAL` - Invalid argument +- `ENOTDIR` - Not a directory + +#### Retry Policies + +Three predefined retry policies are available: + +```cpp +// Default: 3 attempts, 100ms initial delay, 2x backoff, 5s max delay +RetryPolicy::Default() + +// Aggressive: 5 attempts, 50ms initial delay, 1.5x backoff, 2s max delay +// Use for critical operations (swap files, file saves) +RetryPolicy::Aggressive() + +// Conservative: 2 attempts, 200ms initial delay, 2.5x backoff, 10s max delay +// Use for non-critical operations +RetryPolicy::Conservative() +``` + +#### Using RetryOnTransientError + +Wrap syscalls with automatic retry on transient errors: + +```cpp +#include "ErrorRecovery.h" +#include "SyscallWrappers.h" + +bool save_file(const std::string &path, std::string &err) { + int fd = -1; + auto open_fn = [&]() -> bool { + fd = kte::syscall::Open(path.c_str(), O_CREAT | O_WRONLY, 0644); + return fd >= 0; + }; + + if (!kte::RetryOnTransientError(open_fn, kte::RetryPolicy::Aggressive(), err)) { + if (fd < 0) { + int saved_errno = errno; + err = "Failed to open file '" + path + "': " + std::strerror(saved_errno) + err; + } + return false; + } + + // ... 
use fd + kte::syscall::Close(fd); + return true; +} +``` + +**Key points**: + +- Lambda must return `bool` (true = success, false = failure) +- Lambda must set `errno` on failure for transient error detection +- Use EINTR-safe syscall wrappers (`kte::syscall::*`) inside lambdas +- Capture errno immediately after failure +- Append retry info to error message (automatically added by + RetryOnTransientError) + +#### Circuit Breaker Pattern + +The circuit breaker prevents repeated attempts to failing operations, +enabling graceful degradation. + +**States**: + +- **Closed** (normal): All requests allowed +- **Open** (failing): Requests rejected immediately, operation disabled +- **HalfOpen** (testing): Limited requests allowed to test recovery + +**Configuration** (SwapManager example): + +```cpp +CircuitBreaker::Config cfg; +cfg.failure_threshold = 5; // Open after 5 failures +cfg.open_timeout = std::chrono::seconds(30); // Try recovery after 30s +cfg.success_threshold = 2; // Close after 2 successes in HalfOpen +cfg.window = std::chrono::seconds(60); // Count failures in 60s window + +CircuitBreaker breaker(cfg); +``` + +**Usage**: + +```cpp +// Check before operation +if (!breaker.AllowRequest()) { + // Circuit is open - graceful degradation + log_warning("Operation disabled due to repeated failures"); + return; // Skip operation +} + +// Perform operation +if (operation_succeeds()) { + breaker.RecordSuccess(); +} else { + breaker.RecordFailure(); +} +``` + +**SwapManager Integration**: + +The SwapManager uses a circuit breaker to handle repeated swap file +failures: + +1. After 5 swap write failures in 60 seconds, circuit opens +2. Swap recording is disabled (graceful degradation) +3. Warning logged once per 60 seconds to avoid spam +4. After 30 seconds, circuit enters HalfOpen state +5. 
If 2 consecutive operations succeed, circuit closes and swap + recording resumes + +This ensures the editor remains functional even when swap files are +unavailable (disk full, quota exceeded, filesystem errors). + +#### Graceful Degradation Strategies + +When operations fail repeatedly: + +1. **Disable non-critical features** - Swap recording can be disabled + without affecting editing +2. **Log warnings** - Inform user of degraded operation via ErrorHandler +3. **Rate-limit warnings** - Avoid log spam (e.g., once per 60 seconds) +4. **Automatic recovery** - Circuit breaker automatically tests recovery +5. **Preserve core functionality** - Editor remains usable without swap + files + +**Example** (from SwapManager): + +```cpp +if (circuit_open) { + // Graceful degradation: skip swap write + static std::atomic last_warning_ns{0}; + const std::uint64_t now = now_ns(); + if (now - last_warning_ns.load() > 60000000000ULL) { + last_warning_ns.store(now); + ErrorHandler::Instance().Warning("SwapManager", + "Swap operations temporarily disabled due to repeated failures", + buffer_name); + } + return; // Skip operation, editor continues normally +} +``` + ## Common Tasks ### Adding a New Command