Introduce error recovery mechanisms with retry logic and circuit breaker integration.
- Added `ErrorRecovery.cc` and `ErrorRecovery.h` for retry and circuit breaker implementations.
- Enhanced swap file handling with transient error retries and exponential backoff (e.g., ENOSPC, EDQUOT).
- Integrated circuit breaker into SwapManager to gracefully handle repeated failures, prevent system overload, and enable automatic recovery.
- Updated `DEVELOPER_GUIDE.md` with comprehensive documentation on error recovery patterns and graceful degradation strategies.
- Refined fsync, temp file creation, and swap file logic with retry-on-failure mechanisms for improved resilience.
This commit is contained in:
27
Buffer.cc
27
Buffer.cc
@@ -20,6 +20,7 @@
|
|||||||
#include "UndoTree.h"
|
#include "UndoTree.h"
|
||||||
#include "ErrorHandler.h"
|
#include "ErrorHandler.h"
|
||||||
#include "SyscallWrappers.h"
|
#include "SyscallWrappers.h"
|
||||||
|
#include "ErrorRecovery.h"
|
||||||
// For reconstructing highlighter state on copies
|
// For reconstructing highlighter state on copies
|
||||||
#include "syntax/HighlighterRegistry.h"
|
#include "syntax/HighlighterRegistry.h"
|
||||||
#include "syntax/NullHighlighter.h"
|
#include "syntax/NullHighlighter.h"
|
||||||
@@ -148,9 +149,21 @@ atomic_write_file(const std::string &path, const char *data, std::size_t len, st
|
|||||||
// mkstemp requires a mutable buffer.
|
// mkstemp requires a mutable buffer.
|
||||||
std::vector<char> buf(tmpl_s.begin(), tmpl_s.end());
|
std::vector<char> buf(tmpl_s.begin(), tmpl_s.end());
|
||||||
buf.push_back('\0');
|
buf.push_back('\0');
|
||||||
int fd = kte::syscall::Mkstemp(buf.data());
|
|
||||||
|
// Retry on transient errors for temp file creation
|
||||||
|
int fd = -1;
|
||||||
|
auto mkstemp_fn = [&]() -> bool {
|
||||||
|
// Reset buffer for each retry attempt
|
||||||
|
buf.assign(tmpl_s.begin(), tmpl_s.end());
|
||||||
|
buf.push_back('\0');
|
||||||
|
fd = kte::syscall::Mkstemp(buf.data());
|
||||||
|
return fd >= 0;
|
||||||
|
};
|
||||||
|
|
||||||
|
if (!kte::RetryOnTransientError(mkstemp_fn, kte::RetryPolicy::Aggressive(), err)) {
|
||||||
if (fd < 0) {
|
if (fd < 0) {
|
||||||
err = std::string("Failed to create temp file for save: ") + std::strerror(errno);
|
err = std::string("Failed to create temp file for save: ") + std::strerror(errno) + err;
|
||||||
|
}
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
std::string tmp_path(buf.data());
|
std::string tmp_path(buf.data());
|
||||||
@@ -163,8 +176,14 @@ atomic_write_file(const std::string &path, const char *data, std::size_t len, st
|
|||||||
|
|
||||||
bool ok = write_all_fd(fd, data, len, err);
|
bool ok = write_all_fd(fd, data, len, err);
|
||||||
if (ok) {
|
if (ok) {
|
||||||
if (kte::syscall::Fsync(fd) != 0) {
|
// Retry fsync on transient errors
|
||||||
err = std::string("fsync failed: ") + std::strerror(errno);
|
auto fsync_fn = [&]() -> bool {
|
||||||
|
return kte::syscall::Fsync(fd) == 0;
|
||||||
|
};
|
||||||
|
|
||||||
|
std::string fsync_err;
|
||||||
|
if (!kte::RetryOnTransientError(fsync_fn, kte::RetryPolicy::Aggressive(), fsync_err)) {
|
||||||
|
err = std::string("fsync failed: ") + std::strerror(errno) + fsync_err;
|
||||||
ok = false;
|
ok = false;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -144,6 +144,7 @@ set(COMMON_SOURCES
|
|||||||
Swap.cc
|
Swap.cc
|
||||||
ErrorHandler.cc
|
ErrorHandler.cc
|
||||||
SyscallWrappers.cc
|
SyscallWrappers.cc
|
||||||
|
ErrorRecovery.cc
|
||||||
TerminalInputHandler.cc
|
TerminalInputHandler.cc
|
||||||
TerminalRenderer.cc
|
TerminalRenderer.cc
|
||||||
TerminalFrontend.cc
|
TerminalFrontend.cc
|
||||||
@@ -341,6 +342,7 @@ if (BUILD_TESTS)
|
|||||||
Swap.cc
|
Swap.cc
|
||||||
ErrorHandler.cc
|
ErrorHandler.cc
|
||||||
SyscallWrappers.cc
|
SyscallWrappers.cc
|
||||||
|
ErrorRecovery.cc
|
||||||
KKeymap.cc
|
KKeymap.cc
|
||||||
SwapRecorder.h
|
SwapRecorder.h
|
||||||
OptimizedSearch.cc
|
OptimizedSearch.cc
|
||||||
|
|||||||
157
ErrorRecovery.cc
Normal file
157
ErrorRecovery.cc
Normal file
@@ -0,0 +1,157 @@
|
|||||||
|
// ErrorRecovery.cc - Error recovery mechanisms implementation
|
||||||
|
#include "ErrorRecovery.h"
|
||||||
|
#include <mutex>
|
||||||
|
|
||||||
|
namespace kte {
|
||||||
|
CircuitBreaker::CircuitBreaker(const Config &cfg)
|
||||||
|
: config_(cfg), state_(State::Closed), failure_count_(0), success_count_(0),
|
||||||
|
last_failure_time_(std::chrono::steady_clock::time_point::min()),
|
||||||
|
state_change_time_(std::chrono::steady_clock::now()) {}
|
||||||
|
|
||||||
|
|
||||||
|
bool
|
||||||
|
CircuitBreaker::AllowRequest()
|
||||||
|
{
|
||||||
|
std::lock_guard<std::mutex> lg(mtx_);
|
||||||
|
|
||||||
|
const auto now = std::chrono::steady_clock::now();
|
||||||
|
|
||||||
|
switch (state_) {
|
||||||
|
case State::Closed:
|
||||||
|
// Normal operation, allow all requests
|
||||||
|
return true;
|
||||||
|
|
||||||
|
case State::Open: {
|
||||||
|
// Check if timeout has elapsed to transition to HalfOpen
|
||||||
|
const auto elapsed = std::chrono::duration_cast<std::chrono::seconds>(
|
||||||
|
now - state_change_time_
|
||||||
|
);
|
||||||
|
if (elapsed >= config_.open_timeout) {
|
||||||
|
TransitionTo(State::HalfOpen);
|
||||||
|
return true; // Allow one request to test recovery
|
||||||
|
}
|
||||||
|
return false; // Circuit is open, reject request
|
||||||
|
}
|
||||||
|
|
||||||
|
case State::HalfOpen:
|
||||||
|
// Allow limited requests to test recovery
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
void
|
||||||
|
CircuitBreaker::RecordSuccess()
|
||||||
|
{
|
||||||
|
std::lock_guard<std::mutex> lg(mtx_);
|
||||||
|
|
||||||
|
switch (state_) {
|
||||||
|
case State::Closed:
|
||||||
|
// Reset failure count on success in normal operation
|
||||||
|
failure_count_ = 0;
|
||||||
|
break;
|
||||||
|
|
||||||
|
case State::HalfOpen:
|
||||||
|
++success_count_;
|
||||||
|
if (success_count_ >= config_.success_threshold) {
|
||||||
|
// Enough successes, close the circuit
|
||||||
|
TransitionTo(State::Closed);
|
||||||
|
}
|
||||||
|
break;
|
||||||
|
|
||||||
|
case State::Open:
|
||||||
|
// Shouldn't happen (requests rejected), but handle gracefully
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
void
|
||||||
|
CircuitBreaker::RecordFailure()
|
||||||
|
{
|
||||||
|
std::lock_guard<std::mutex> lg(mtx_);
|
||||||
|
|
||||||
|
const auto now = std::chrono::steady_clock::now();
|
||||||
|
last_failure_time_ = now;
|
||||||
|
|
||||||
|
switch (state_) {
|
||||||
|
case State::Closed:
|
||||||
|
// Check if we need to reset the failure count (window expired)
|
||||||
|
if (IsWindowExpired()) {
|
||||||
|
failure_count_ = 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
++failure_count_;
|
||||||
|
if (failure_count_ >= config_.failure_threshold) {
|
||||||
|
// Too many failures, open the circuit
|
||||||
|
TransitionTo(State::Open);
|
||||||
|
}
|
||||||
|
break;
|
||||||
|
|
||||||
|
case State::HalfOpen:
|
||||||
|
// Failure during recovery test, reopen the circuit
|
||||||
|
TransitionTo(State::Open);
|
||||||
|
break;
|
||||||
|
|
||||||
|
case State::Open:
|
||||||
|
// Already open, just track the failure
|
||||||
|
++failure_count_;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
void
|
||||||
|
CircuitBreaker::Reset()
|
||||||
|
{
|
||||||
|
std::lock_guard<std::mutex> lg(mtx_);
|
||||||
|
TransitionTo(State::Closed);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
void
|
||||||
|
CircuitBreaker::TransitionTo(State new_state)
|
||||||
|
{
|
||||||
|
if (state_ == new_state) {
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
state_ = new_state;
|
||||||
|
state_change_time_ = std::chrono::steady_clock::now();
|
||||||
|
|
||||||
|
switch (new_state) {
|
||||||
|
case State::Closed:
|
||||||
|
failure_count_ = 0;
|
||||||
|
success_count_ = 0;
|
||||||
|
break;
|
||||||
|
|
||||||
|
case State::Open:
|
||||||
|
success_count_ = 0;
|
||||||
|
// Keep failure_count_ for diagnostics
|
||||||
|
break;
|
||||||
|
|
||||||
|
case State::HalfOpen:
|
||||||
|
success_count_ = 0;
|
||||||
|
// Keep failure_count_ for diagnostics
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
bool
|
||||||
|
CircuitBreaker::IsWindowExpired() const
|
||||||
|
{
|
||||||
|
if (failure_count_ == 0) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
const auto now = std::chrono::steady_clock::now();
|
||||||
|
const auto elapsed = std::chrono::duration_cast<std::chrono::seconds>(
|
||||||
|
now - last_failure_time_
|
||||||
|
);
|
||||||
|
|
||||||
|
return elapsed >= config_.window;
|
||||||
|
}
|
||||||
|
} // namespace kte
|
||||||
170
ErrorRecovery.h
Normal file
170
ErrorRecovery.h
Normal file
@@ -0,0 +1,170 @@
|
|||||||
|
// ErrorRecovery.h - Error recovery mechanisms for kte
|
||||||
|
#pragma once
|
||||||
|
|
||||||
|
#include <chrono>
|
||||||
|
#include <cstddef>
|
||||||
|
#include <functional>
|
||||||
|
#include <string>
|
||||||
|
#include <thread>
|
||||||
|
#include <mutex>
|
||||||
|
#include <cerrno>
|
||||||
|
|
||||||
|
namespace kte {
|
||||||
|
// Classify an errno value as transient (worth retrying) or permanent.
// Returns true for EAGAIN, EWOULDBLOCK, EBUSY, EIO, ETIMEDOUT, ENOSPC and
// EDQUOT; every other value is treated as permanent.
inline bool
IsTransientError(int err)
{
	// Resource temporarily unavailable (the two codes may share one value)
	if (err == EAGAIN || err == EWOULDBLOCK) {
		return true;
	}
	// Busy resource, I/O error (may be transient on network filesystems),
	// or timeout
	if (err == EBUSY || err == EIO || err == ETIMEDOUT) {
		return true;
	}
	// Disk full / quota exceeded (space may become available)
	return err == ENOSPC || err == EDQUOT;
}
|
||||||
|
|
||||||
|
|
||||||
|
// RetryPolicy describes how a retry loop behaves on transient failures:
// how many attempts to make and how the pause between them grows.
struct RetryPolicy {
	std::size_t max_attempts = 3;                     // Maximum retry attempts
	std::chrono::milliseconds initial_delay{100};     // Initial delay before first retry
	double backoff_multiplier = 2.0;                  // Exponential backoff multiplier
	std::chrono::milliseconds max_delay{5000};        // Maximum delay between retries

	// Default policy: 3 attempts, 100ms initial, 2x backoff, 5s max
	static RetryPolicy Default()
	{
		RetryPolicy policy;
		return policy;
	}


	// Aggressive policy for critical operations: 5 attempts, 50ms initial,
	// 1.5x backoff, 2s max
	static RetryPolicy Aggressive()
	{
		RetryPolicy policy;
		policy.max_attempts       = 5;
		policy.initial_delay      = std::chrono::milliseconds(50);
		policy.backoff_multiplier = 1.5;
		policy.max_delay          = std::chrono::milliseconds(2000);
		return policy;
	}


	// Conservative policy for non-critical operations: 2 attempts, 200ms
	// initial, 2.5x backoff, 10s max
	static RetryPolicy Conservative()
	{
		RetryPolicy policy;
		policy.max_attempts       = 2;
		policy.initial_delay      = std::chrono::milliseconds(200);
		policy.backoff_multiplier = 2.5;
		policy.max_delay          = std::chrono::milliseconds(10000);
		return policy;
	}
};
|
||||||
|
|
||||||
|
// Retry a function with exponential backoff for transient errors
|
||||||
|
// Returns true on success, false on permanent failure or exhausted retries
|
||||||
|
// The function `fn` should return true on success, false on failure, and set errno on failure
|
||||||
|
template<typename Func>
|
||||||
|
bool
|
||||||
|
RetryOnTransientError(Func fn, const RetryPolicy &policy, std::string &err)
|
||||||
|
{
|
||||||
|
std::size_t attempt = 0;
|
||||||
|
std::chrono::milliseconds delay = policy.initial_delay;
|
||||||
|
|
||||||
|
while (attempt < policy.max_attempts) {
|
||||||
|
++attempt;
|
||||||
|
errno = 0;
|
||||||
|
if (fn()) {
|
||||||
|
return true; // Success
|
||||||
|
}
|
||||||
|
|
||||||
|
int saved_errno = errno;
|
||||||
|
if (!IsTransientError(saved_errno)) {
|
||||||
|
// Permanent error, don't retry
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (attempt >= policy.max_attempts) {
|
||||||
|
// Exhausted retries
|
||||||
|
err += " (exhausted " + std::to_string(policy.max_attempts) + " retry attempts)";
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Sleep before retry
|
||||||
|
std::this_thread::sleep_for(delay);
|
||||||
|
|
||||||
|
// Exponential backoff
|
||||||
|
delay = std::chrono::milliseconds(
|
||||||
|
static_cast<long long>(delay.count() * policy.backoff_multiplier)
|
||||||
|
);
|
||||||
|
if (delay > policy.max_delay) {
|
||||||
|
delay = policy.max_delay;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
// CircuitBreaker prevents repeated attempts at an operation that keeps failing.
// States: Closed (normal), Open (failing, reject immediately), HalfOpen (testing recovery)
//
// All public member functions take mtx_, including the const accessors, so
// they may be called while another thread is recording results.
class CircuitBreaker {
public:
	enum class State {
		Closed,  // Normal operation, allow all requests
		Open,    // Failing, reject requests immediately
		HalfOpen // Testing recovery, allow limited requests
	};

	struct Config {
		std::size_t failure_threshold;     // Failures before opening circuit
		std::chrono::seconds open_timeout; // Time before attempting recovery (Open -> HalfOpen)
		std::size_t success_threshold;     // Successes in HalfOpen before closing
		std::chrono::seconds window;       // Time window for counting failures

		// Defaults: 5 failures, 30s open timeout, 2 successes, 60s window
		Config()
			: failure_threshold(5), open_timeout(30), success_threshold(2), window(60) {}
	};


	explicit CircuitBreaker(const Config &cfg = Config());


	// Check if operation is allowed (returns false if circuit is Open)
	bool AllowRequest();

	// Record successful operation
	void RecordSuccess();

	// Record failed operation
	void RecordFailure();

	// Get current state. Takes the lock: state_ is written under mtx_ by the
	// mutating members, so an unlocked read would be a data race.
	State GetState() const
	{
		std::lock_guard<std::mutex> lg(mtx_);
		return state_;
	}


	// Get the current failure count. Takes the lock for the same reason as
	// GetState().
	std::size_t GetFailureCount() const
	{
		std::lock_guard<std::mutex> lg(mtx_);
		return failure_count_;
	}


	// Reset circuit to Closed state (for testing or manual intervention)
	void Reset();

private:
	// Internal helpers; they do not lock and are called with mtx_ already held.
	void TransitionTo(State new_state);

	bool IsWindowExpired() const;

	Config config_;
	State state_;
	std::size_t failure_count_;
	std::size_t success_count_;
	std::chrono::steady_clock::time_point last_failure_time_;
	std::chrono::steady_clock::time_point state_change_time_;
	mutable std::mutex mtx_; // mutable so const accessors can lock
};
|
||||||
|
} // namespace kte
|
||||||
94
Swap.cc
94
Swap.cc
@@ -2,6 +2,7 @@
|
|||||||
#include "Buffer.h"
|
#include "Buffer.h"
|
||||||
#include "ErrorHandler.h"
|
#include "ErrorHandler.h"
|
||||||
#include "SyscallWrappers.h"
|
#include "SyscallWrappers.h"
|
||||||
|
#include "ErrorRecovery.h"
|
||||||
|
|
||||||
#include <algorithm>
|
#include <algorithm>
|
||||||
#include <chrono>
|
#include <chrono>
|
||||||
@@ -613,10 +614,19 @@ SwapManager::open_ctx(JournalCtx &ctx, const std::string &path, std::string &err
|
|||||||
#ifdef O_CLOEXEC
|
#ifdef O_CLOEXEC
|
||||||
flags |= O_CLOEXEC;
|
flags |= O_CLOEXEC;
|
||||||
#endif
|
#endif
|
||||||
int fd = kte::syscall::Open(path.c_str(), flags, 0600);
|
|
||||||
|
// Retry on transient errors (ENOSPC, EDQUOT, EBUSY, etc.)
|
||||||
|
int fd = -1;
|
||||||
|
auto open_fn = [&]() -> bool {
|
||||||
|
fd = kte::syscall::Open(path.c_str(), flags, 0600);
|
||||||
|
return fd >= 0;
|
||||||
|
};
|
||||||
|
|
||||||
|
if (!RetryOnTransientError(open_fn, RetryPolicy::Aggressive(), err)) {
|
||||||
if (fd < 0) {
|
if (fd < 0) {
|
||||||
int saved_errno = errno;
|
int saved_errno = errno;
|
||||||
err = "Failed to open swap file '" + path + "': " + std::strerror(saved_errno);
|
err = "Failed to open swap file '" + path + "': " + std::strerror(saved_errno) + err;
|
||||||
|
}
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
// Ensure permissions even if file already existed.
|
// Ensure permissions even if file already existed.
|
||||||
@@ -636,10 +646,20 @@ SwapManager::open_ctx(JournalCtx &ctx, const std::string &path, std::string &err
|
|||||||
#ifdef O_CLOEXEC
|
#ifdef O_CLOEXEC
|
||||||
tflags |= O_CLOEXEC;
|
tflags |= O_CLOEXEC;
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
// Retry on transient errors for truncation open
|
||||||
|
fd = -1;
|
||||||
|
auto reopen_fn = [&]() -> bool {
|
||||||
fd = kte::syscall::Open(path.c_str(), tflags, 0600);
|
fd = kte::syscall::Open(path.c_str(), tflags, 0600);
|
||||||
|
return fd >= 0;
|
||||||
|
};
|
||||||
|
|
||||||
|
if (!RetryOnTransientError(reopen_fn, RetryPolicy::Aggressive(), err)) {
|
||||||
if (fd < 0) {
|
if (fd < 0) {
|
||||||
int saved_errno = errno;
|
int saved_errno = errno;
|
||||||
err = "Failed to reopen swap file for truncation '" + path + "': " + std::strerror(saved_errno);
|
err = "Failed to reopen swap file for truncation '" + path + "': " + std::strerror(
|
||||||
|
saved_errno) + err;
|
||||||
|
}
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
(void) kte::syscall::Fchmod(fd, 0600);
|
(void) kte::syscall::Fchmod(fd, 0600);
|
||||||
@@ -705,10 +725,19 @@ SwapManager::compact_to_checkpoint(JournalCtx &ctx, const std::vector<std::uint8
|
|||||||
#ifdef O_CLOEXEC
|
#ifdef O_CLOEXEC
|
||||||
flags |= O_CLOEXEC;
|
flags |= O_CLOEXEC;
|
||||||
#endif
|
#endif
|
||||||
int tfd = kte::syscall::Open(tmp_path.c_str(), flags, 0600);
|
|
||||||
|
// Retry on transient errors for temp file creation
|
||||||
|
int tfd = -1;
|
||||||
|
auto open_tmp_fn = [&]() -> bool {
|
||||||
|
tfd = kte::syscall::Open(tmp_path.c_str(), flags, 0600);
|
||||||
|
return tfd >= 0;
|
||||||
|
};
|
||||||
|
|
||||||
|
if (!RetryOnTransientError(open_tmp_fn, RetryPolicy::Aggressive(), err)) {
|
||||||
if (tfd < 0) {
|
if (tfd < 0) {
|
||||||
int saved_errno = errno;
|
int saved_errno = errno;
|
||||||
err = "Failed to open temp swap file '" + tmp_path + "': " + std::strerror(saved_errno);
|
err = "Failed to open temp swap file '" + tmp_path + "': " + std::strerror(saved_errno) + err;
|
||||||
|
}
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
(void) kte::syscall::Fchmod(tfd, 0600);
|
(void) kte::syscall::Fchmod(tfd, 0600);
|
||||||
@@ -1062,6 +1091,34 @@ SwapManager::process_one(const Pending &p)
|
|||||||
if (!p.buf)
|
if (!p.buf)
|
||||||
return;
|
return;
|
||||||
|
|
||||||
|
// Check circuit breaker before processing
|
||||||
|
bool circuit_open = false;
|
||||||
|
{
|
||||||
|
std::lock_guard<std::mutex> lg(mtx_);
|
||||||
|
if (!circuit_breaker_.AllowRequest()) {
|
||||||
|
circuit_open = true;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if (circuit_open) {
|
||||||
|
// Circuit is open - graceful degradation: skip swap write
|
||||||
|
// This prevents repeated failures from overwhelming the system
|
||||||
|
// Swap recording will resume when circuit closes
|
||||||
|
static std::atomic<std::uint64_t> last_warning_ns{0};
|
||||||
|
const std::uint64_t now = now_ns();
|
||||||
|
const std::uint64_t last = last_warning_ns.load();
|
||||||
|
// Log warning at most once per 60 seconds to avoid spam
|
||||||
|
if (now - last > 60000000000ULL) {
|
||||||
|
last_warning_ns.store(now);
|
||||||
|
ErrorHandler::Instance().Warning("SwapManager",
|
||||||
|
"Swap operations temporarily disabled due to repeated failures (circuit breaker open)",
|
||||||
|
p.buf && !p.buf->Filename().empty()
|
||||||
|
? p.buf->Filename()
|
||||||
|
: "<unnamed>");
|
||||||
|
}
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
try {
|
try {
|
||||||
Buffer &buf = *p.buf;
|
Buffer &buf = *p.buf;
|
||||||
|
|
||||||
@@ -1084,10 +1141,18 @@ SwapManager::process_one(const Pending &p)
|
|||||||
std::string open_err;
|
std::string open_err;
|
||||||
if (!open_ctx(*ctxp, path, open_err)) {
|
if (!open_ctx(*ctxp, path, open_err)) {
|
||||||
report_error(open_err, p.buf);
|
report_error(open_err, p.buf);
|
||||||
|
{
|
||||||
|
std::lock_guard<std::mutex> lg(mtx_);
|
||||||
|
circuit_breaker_.RecordFailure();
|
||||||
|
}
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
if (p.payload.size() > 0xFFFFFFu) {
|
if (p.payload.size() > 0xFFFFFFu) {
|
||||||
report_error("Payload too large: " + std::to_string(p.payload.size()) + " bytes", p.buf);
|
report_error("Payload too large: " + std::to_string(p.payload.size()) + " bytes", p.buf);
|
||||||
|
{
|
||||||
|
std::lock_guard<std::mutex> lg(mtx_);
|
||||||
|
circuit_breaker_.RecordFailure();
|
||||||
|
}
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -1123,6 +1188,10 @@ SwapManager::process_one(const Pending &p)
|
|||||||
if (!ok) {
|
if (!ok) {
|
||||||
int err = errno;
|
int err = errno;
|
||||||
report_error("Failed to write swap record to '" + path + "': " + std::strerror(err), p.buf);
|
report_error("Failed to write swap record to '" + path + "': " + std::strerror(err), p.buf);
|
||||||
|
{
|
||||||
|
std::lock_guard<std::mutex> lg(mtx_);
|
||||||
|
circuit_breaker_.RecordFailure();
|
||||||
|
}
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
ctxp->approx_size_bytes += static_cast<std::uint64_t>(rec.size());
|
ctxp->approx_size_bytes += static_cast<std::uint64_t>(rec.size());
|
||||||
@@ -1138,12 +1207,27 @@ SwapManager::process_one(const Pending &p)
|
|||||||
std::string compact_err;
|
std::string compact_err;
|
||||||
if (!compact_to_checkpoint(*ctxp, rec, compact_err)) {
|
if (!compact_to_checkpoint(*ctxp, rec, compact_err)) {
|
||||||
report_error(compact_err, p.buf);
|
report_error(compact_err, p.buf);
|
||||||
|
// Note: compaction failure is not fatal, don't record circuit breaker failure
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Record success for circuit breaker
|
||||||
|
{
|
||||||
|
std::lock_guard<std::mutex> lg(mtx_);
|
||||||
|
circuit_breaker_.RecordSuccess();
|
||||||
|
}
|
||||||
} catch (const std::exception &e) {
|
} catch (const std::exception &e) {
|
||||||
report_error(std::string("Exception in process_one: ") + e.what(), p.buf);
|
report_error(std::string("Exception in process_one: ") + e.what(), p.buf);
|
||||||
|
{
|
||||||
|
std::lock_guard<std::mutex> lg(mtx_);
|
||||||
|
circuit_breaker_.RecordFailure();
|
||||||
|
}
|
||||||
} catch (...) {
|
} catch (...) {
|
||||||
report_error("Unknown exception in process_one", p.buf);
|
report_error("Unknown exception in process_one", p.buf);
|
||||||
|
{
|
||||||
|
std::lock_guard<std::mutex> lg(mtx_);
|
||||||
|
circuit_breaker_.RecordFailure();
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
4
Swap.h
4
Swap.h
@@ -15,6 +15,7 @@
|
|||||||
#include <atomic>
|
#include <atomic>
|
||||||
|
|
||||||
#include "SwapRecorder.h"
|
#include "SwapRecorder.h"
|
||||||
|
#include "ErrorRecovery.h"
|
||||||
|
|
||||||
class Buffer;
|
class Buffer;
|
||||||
|
|
||||||
@@ -245,5 +246,8 @@ private:
|
|||||||
// Error tracking (protected by mtx_)
|
// Error tracking (protected by mtx_)
|
||||||
std::deque<SwapError> errors_; // bounded to max 100 entries
|
std::deque<SwapError> errors_; // bounded to max 100 entries
|
||||||
std::size_t total_error_count_{0};
|
std::size_t total_error_count_{0};
|
||||||
|
|
||||||
|
// Circuit breaker for swap operations (protected by mtx_)
|
||||||
|
CircuitBreaker circuit_breaker_;
|
||||||
};
|
};
|
||||||
} // namespace kte
|
} // namespace kte
|
||||||
@@ -852,6 +852,177 @@ When updating existing code to follow these conventions:
|
|||||||
5. **Update callers** to handle the error parameter
|
5. **Update callers** to handle the error parameter
|
||||||
6. **Write tests** that verify error handling
|
6. **Write tests** that verify error handling
|
||||||
|
|
||||||
|
### Error Recovery Mechanisms
|
||||||
|
|
||||||
|
kte implements automatic error recovery for transient failures using
|
||||||
|
retry logic and circuit breaker patterns.
|
||||||
|
|
||||||
|
#### Transient Error Classification
|
||||||
|
|
||||||
|
Transient errors are temporary failures that may succeed on retry:
|
||||||
|
|
||||||
|
```cpp
|
||||||
|
#include "ErrorRecovery.h"
|
||||||
|
|
||||||
|
bool IsTransientError(int err); // Returns true for EAGAIN, EWOULDBLOCK, EBUSY, EIO, ETIMEDOUT, ENOSPC, EDQUOT
|
||||||
|
```
|
||||||
|
|
||||||
|
**Transient errors**:
|
||||||
|
|
||||||
|
- `EAGAIN` / `EWOULDBLOCK` - Resource temporarily unavailable
|
||||||
|
- `EBUSY` - Device or resource busy
|
||||||
|
- `EIO` - I/O error (may be transient on network filesystems)
|
||||||
|
- `ETIMEDOUT` - Operation timed out
|
||||||
|
- `ENOSPC` - No space left on device (may become available)
|
||||||
|
- `EDQUOT` - Disk quota exceeded (may become available)
|
||||||
|
|
||||||
|
**Permanent errors** (don't retry):
|
||||||
|
|
||||||
|
- `ENOENT` - File not found
|
||||||
|
- `EACCES` - Permission denied
|
||||||
|
- `EINVAL` - Invalid argument
|
||||||
|
- `ENOTDIR` - Not a directory
|
||||||
|
|
||||||
|
#### Retry Policies
|
||||||
|
|
||||||
|
Three predefined retry policies are available:
|
||||||
|
|
||||||
|
```cpp
|
||||||
|
// Default: 3 attempts, 100ms initial delay, 2x backoff, 5s max delay
|
||||||
|
RetryPolicy::Default()
|
||||||
|
|
||||||
|
// Aggressive: 5 attempts, 50ms initial delay, 1.5x backoff, 2s max delay
|
||||||
|
// Use for critical operations (swap files, file saves)
|
||||||
|
RetryPolicy::Aggressive()
|
||||||
|
|
||||||
|
// Conservative: 2 attempts, 200ms initial delay, 2.5x backoff, 10s max delay
|
||||||
|
// Use for non-critical operations
|
||||||
|
RetryPolicy::Conservative()
|
||||||
|
```
|
||||||
|
|
||||||
|
#### Using RetryOnTransientError
|
||||||
|
|
||||||
|
Wrap syscalls with automatic retry on transient errors:
|
||||||
|
|
||||||
|
```cpp
|
||||||
|
#include "ErrorRecovery.h"
|
||||||
|
#include "SyscallWrappers.h"
|
||||||
|
|
||||||
|
bool save_file(const std::string &path, std::string &err) {
|
||||||
|
int fd = -1;
|
||||||
|
auto open_fn = [&]() -> bool {
|
||||||
|
fd = kte::syscall::Open(path.c_str(), O_CREAT | O_WRONLY, 0644);
|
||||||
|
return fd >= 0;
|
||||||
|
};
|
||||||
|
|
||||||
|
if (!kte::RetryOnTransientError(open_fn, kte::RetryPolicy::Aggressive(), err)) {
|
||||||
|
if (fd < 0) {
|
||||||
|
int saved_errno = errno;
|
||||||
|
err = "Failed to open file '" + path + "': " + std::strerror(saved_errno) + err;
|
||||||
|
}
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
// ... use fd
|
||||||
|
kte::syscall::Close(fd);
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
**Key points**:
|
||||||
|
|
||||||
|
- Lambda must return `bool` (true = success, false = failure)
|
||||||
|
- Lambda must set `errno` on failure for transient error detection
|
||||||
|
- Use EINTR-safe syscall wrappers (`kte::syscall::*`) inside lambdas
|
||||||
|
- Capture errno immediately after failure
|
||||||
|
- Append retry info to error message (automatically added by
|
||||||
|
RetryOnTransientError)
|
||||||
|
|
||||||
|
#### Circuit Breaker Pattern
|
||||||
|
|
||||||
|
The circuit breaker prevents repeated attempts to failing operations,
|
||||||
|
enabling graceful degradation.
|
||||||
|
|
||||||
|
**States**:
|
||||||
|
|
||||||
|
- **Closed** (normal): All requests allowed
|
||||||
|
- **Open** (failing): Requests rejected immediately, operation disabled
|
||||||
|
- **HalfOpen** (testing): Limited requests allowed to test recovery
|
||||||
|
|
||||||
|
**Configuration** (SwapManager example):
|
||||||
|
|
||||||
|
```cpp
|
||||||
|
CircuitBreaker::Config cfg;
|
||||||
|
cfg.failure_threshold = 5; // Open after 5 failures
|
||||||
|
cfg.open_timeout = std::chrono::seconds(30); // Try recovery after 30s
|
||||||
|
cfg.success_threshold = 2; // Close after 2 successes in HalfOpen
|
||||||
|
cfg.window = std::chrono::seconds(60); // Count failures in 60s window
|
||||||
|
|
||||||
|
CircuitBreaker breaker(cfg);
|
||||||
|
```
|
||||||
|
|
||||||
|
**Usage**:
|
||||||
|
|
||||||
|
```cpp
|
||||||
|
// Check before operation
|
||||||
|
if (!breaker.AllowRequest()) {
|
||||||
|
// Circuit is open - graceful degradation
|
||||||
|
log_warning("Operation disabled due to repeated failures");
|
||||||
|
return; // Skip operation
|
||||||
|
}
|
||||||
|
|
||||||
|
// Perform operation
|
||||||
|
if (operation_succeeds()) {
|
||||||
|
breaker.RecordSuccess();
|
||||||
|
} else {
|
||||||
|
breaker.RecordFailure();
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
**SwapManager Integration**:
|
||||||
|
|
||||||
|
The SwapManager uses a circuit breaker to handle repeated swap file
|
||||||
|
failures:
|
||||||
|
|
||||||
|
1. After 5 swap write failures in 60 seconds, circuit opens
|
||||||
|
2. Swap recording is disabled (graceful degradation)
|
||||||
|
3. Warning logged once per 60 seconds to avoid spam
|
||||||
|
4. After 30 seconds, circuit enters HalfOpen state
|
||||||
|
5. If 2 consecutive operations succeed, circuit closes and swap
|
||||||
|
recording resumes
|
||||||
|
|
||||||
|
This ensures the editor remains functional even when swap files are
|
||||||
|
unavailable (disk full, quota exceeded, filesystem errors).
|
||||||
|
|
||||||
|
#### Graceful Degradation Strategies
|
||||||
|
|
||||||
|
When operations fail repeatedly:
|
||||||
|
|
||||||
|
1. **Disable non-critical features** - Swap recording can be disabled
|
||||||
|
without affecting editing
|
||||||
|
2. **Log warnings** - Inform user of degraded operation via ErrorHandler
|
||||||
|
3. **Rate-limit warnings** - Avoid log spam (e.g., once per 60 seconds)
|
||||||
|
4. **Automatic recovery** - Circuit breaker automatically tests recovery
|
||||||
|
5. **Preserve core functionality** - Editor remains usable without swap
|
||||||
|
files
|
||||||
|
|
||||||
|
**Example** (from SwapManager):
|
||||||
|
|
||||||
|
```cpp
|
||||||
|
if (circuit_open) {
|
||||||
|
// Graceful degradation: skip swap write
|
||||||
|
static std::atomic<std::uint64_t> last_warning_ns{0};
|
||||||
|
const std::uint64_t now = now_ns();
|
||||||
|
if (now - last_warning_ns.load() > 60000000000ULL) {
|
||||||
|
last_warning_ns.store(now);
|
||||||
|
ErrorHandler::Instance().Warning("SwapManager",
|
||||||
|
"Swap operations temporarily disabled due to repeated failures",
|
||||||
|
buffer_name);
|
||||||
|
}
|
||||||
|
return; // Skip operation, editor continues normally
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
## Common Tasks
|
## Common Tasks
|
||||||
|
|
||||||
### Adding a New Command
|
### Adding a New Command
|
||||||
|
|||||||
Reference in New Issue
Block a user