Introduce error recovery mechanisms with retry logic and circuit breaker integration.
- Added `ErrorRecovery.cc` and `ErrorRecovery.h` for retry and circuit breaker implementations.
- Enhanced swap file handling with transient-error retries and exponential backoff (e.g., ENOSPC, EDQUOT).
- Integrated a circuit breaker into SwapManager to gracefully handle repeated failures, prevent system overload, and enable automatic recovery.
- Updated `DEVELOPER_GUIDE.md` with comprehensive documentation on error recovery patterns and graceful degradation strategies.
- Refined fsync, temp file creation, and swap file logic with retry-on-failure mechanisms for improved resilience.
This commit is contained in:
27
Buffer.cc
27
Buffer.cc
@@ -20,6 +20,7 @@
|
||||
#include "UndoTree.h"
|
||||
#include "ErrorHandler.h"
|
||||
#include "SyscallWrappers.h"
|
||||
#include "ErrorRecovery.h"
|
||||
// For reconstructing highlighter state on copies
|
||||
#include "syntax/HighlighterRegistry.h"
|
||||
#include "syntax/NullHighlighter.h"
|
||||
@@ -148,9 +149,21 @@ atomic_write_file(const std::string &path, const char *data, std::size_t len, st
|
||||
// mkstemp requires a mutable buffer.
|
||||
std::vector<char> buf(tmpl_s.begin(), tmpl_s.end());
|
||||
buf.push_back('\0');
|
||||
int fd = kte::syscall::Mkstemp(buf.data());
|
||||
|
||||
// Retry on transient errors for temp file creation
|
||||
int fd = -1;
|
||||
auto mkstemp_fn = [&]() -> bool {
|
||||
// Reset buffer for each retry attempt
|
||||
buf.assign(tmpl_s.begin(), tmpl_s.end());
|
||||
buf.push_back('\0');
|
||||
fd = kte::syscall::Mkstemp(buf.data());
|
||||
return fd >= 0;
|
||||
};
|
||||
|
||||
if (!kte::RetryOnTransientError(mkstemp_fn, kte::RetryPolicy::Aggressive(), err)) {
|
||||
if (fd < 0) {
|
||||
err = std::string("Failed to create temp file for save: ") + std::strerror(errno);
|
||||
err = std::string("Failed to create temp file for save: ") + std::strerror(errno) + err;
|
||||
}
|
||||
return false;
|
||||
}
|
||||
std::string tmp_path(buf.data());
|
||||
@@ -163,8 +176,14 @@ atomic_write_file(const std::string &path, const char *data, std::size_t len, st
|
||||
|
||||
bool ok = write_all_fd(fd, data, len, err);
|
||||
if (ok) {
|
||||
if (kte::syscall::Fsync(fd) != 0) {
|
||||
err = std::string("fsync failed: ") + std::strerror(errno);
|
||||
// Retry fsync on transient errors
|
||||
auto fsync_fn = [&]() -> bool {
|
||||
return kte::syscall::Fsync(fd) == 0;
|
||||
};
|
||||
|
||||
std::string fsync_err;
|
||||
if (!kte::RetryOnTransientError(fsync_fn, kte::RetryPolicy::Aggressive(), fsync_err)) {
|
||||
err = std::string("fsync failed: ") + std::strerror(errno) + fsync_err;
|
||||
ok = false;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -144,6 +144,7 @@ set(COMMON_SOURCES
|
||||
Swap.cc
|
||||
ErrorHandler.cc
|
||||
SyscallWrappers.cc
|
||||
ErrorRecovery.cc
|
||||
TerminalInputHandler.cc
|
||||
TerminalRenderer.cc
|
||||
TerminalFrontend.cc
|
||||
@@ -341,6 +342,7 @@ if (BUILD_TESTS)
|
||||
Swap.cc
|
||||
ErrorHandler.cc
|
||||
SyscallWrappers.cc
|
||||
ErrorRecovery.cc
|
||||
KKeymap.cc
|
||||
SwapRecorder.h
|
||||
OptimizedSearch.cc
|
||||
|
||||
157
ErrorRecovery.cc
Normal file
157
ErrorRecovery.cc
Normal file
@@ -0,0 +1,157 @@
|
||||
// ErrorRecovery.cc - Error recovery mechanisms implementation
|
||||
#include "ErrorRecovery.h"
|
||||
#include <mutex>
|
||||
|
||||
namespace kte {
|
||||
// Construct a breaker in the Closed (normal) state with the supplied
// configuration. Counters start at zero; last_failure_time_ is the
// sentinel "min" so no failure window is considered active yet.
CircuitBreaker::CircuitBreaker(const Config &cfg)
    : config_(cfg),
      state_(State::Closed),
      failure_count_(0),
      success_count_(0),
      last_failure_time_(std::chrono::steady_clock::time_point::min()),
      state_change_time_(std::chrono::steady_clock::now())
{
}
|
||||
|
||||
|
||||
bool
|
||||
CircuitBreaker::AllowRequest()
|
||||
{
|
||||
std::lock_guard<std::mutex> lg(mtx_);
|
||||
|
||||
const auto now = std::chrono::steady_clock::now();
|
||||
|
||||
switch (state_) {
|
||||
case State::Closed:
|
||||
// Normal operation, allow all requests
|
||||
return true;
|
||||
|
||||
case State::Open: {
|
||||
// Check if timeout has elapsed to transition to HalfOpen
|
||||
const auto elapsed = std::chrono::duration_cast<std::chrono::seconds>(
|
||||
now - state_change_time_
|
||||
);
|
||||
if (elapsed >= config_.open_timeout) {
|
||||
TransitionTo(State::HalfOpen);
|
||||
return true; // Allow one request to test recovery
|
||||
}
|
||||
return false; // Circuit is open, reject request
|
||||
}
|
||||
|
||||
case State::HalfOpen:
|
||||
// Allow limited requests to test recovery
|
||||
return true;
|
||||
}
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
|
||||
void
|
||||
CircuitBreaker::RecordSuccess()
|
||||
{
|
||||
std::lock_guard<std::mutex> lg(mtx_);
|
||||
|
||||
switch (state_) {
|
||||
case State::Closed:
|
||||
// Reset failure count on success in normal operation
|
||||
failure_count_ = 0;
|
||||
break;
|
||||
|
||||
case State::HalfOpen:
|
||||
++success_count_;
|
||||
if (success_count_ >= config_.success_threshold) {
|
||||
// Enough successes, close the circuit
|
||||
TransitionTo(State::Closed);
|
||||
}
|
||||
break;
|
||||
|
||||
case State::Open:
|
||||
// Shouldn't happen (requests rejected), but handle gracefully
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
void
|
||||
CircuitBreaker::RecordFailure()
|
||||
{
|
||||
std::lock_guard<std::mutex> lg(mtx_);
|
||||
|
||||
const auto now = std::chrono::steady_clock::now();
|
||||
last_failure_time_ = now;
|
||||
|
||||
switch (state_) {
|
||||
case State::Closed:
|
||||
// Check if we need to reset the failure count (window expired)
|
||||
if (IsWindowExpired()) {
|
||||
failure_count_ = 0;
|
||||
}
|
||||
|
||||
++failure_count_;
|
||||
if (failure_count_ >= config_.failure_threshold) {
|
||||
// Too many failures, open the circuit
|
||||
TransitionTo(State::Open);
|
||||
}
|
||||
break;
|
||||
|
||||
case State::HalfOpen:
|
||||
// Failure during recovery test, reopen the circuit
|
||||
TransitionTo(State::Open);
|
||||
break;
|
||||
|
||||
case State::Open:
|
||||
// Already open, just track the failure
|
||||
++failure_count_;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
void
|
||||
CircuitBreaker::Reset()
|
||||
{
|
||||
std::lock_guard<std::mutex> lg(mtx_);
|
||||
TransitionTo(State::Closed);
|
||||
}
|
||||
|
||||
|
||||
void
|
||||
CircuitBreaker::TransitionTo(State new_state)
|
||||
{
|
||||
if (state_ == new_state) {
|
||||
return;
|
||||
}
|
||||
|
||||
state_ = new_state;
|
||||
state_change_time_ = std::chrono::steady_clock::now();
|
||||
|
||||
switch (new_state) {
|
||||
case State::Closed:
|
||||
failure_count_ = 0;
|
||||
success_count_ = 0;
|
||||
break;
|
||||
|
||||
case State::Open:
|
||||
success_count_ = 0;
|
||||
// Keep failure_count_ for diagnostics
|
||||
break;
|
||||
|
||||
case State::HalfOpen:
|
||||
success_count_ = 0;
|
||||
// Keep failure_count_ for diagnostics
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
bool
|
||||
CircuitBreaker::IsWindowExpired() const
|
||||
{
|
||||
if (failure_count_ == 0) {
|
||||
return false;
|
||||
}
|
||||
|
||||
const auto now = std::chrono::steady_clock::now();
|
||||
const auto elapsed = std::chrono::duration_cast<std::chrono::seconds>(
|
||||
now - last_failure_time_
|
||||
);
|
||||
|
||||
return elapsed >= config_.window;
|
||||
}
|
||||
} // namespace kte
|
||||
170
ErrorRecovery.h
Normal file
170
ErrorRecovery.h
Normal file
@@ -0,0 +1,170 @@
|
||||
// ErrorRecovery.h - Error recovery mechanisms for kte
|
||||
#pragma once
|
||||
|
||||
#include <chrono>
|
||||
#include <cstddef>
|
||||
#include <functional>
|
||||
#include <string>
|
||||
#include <thread>
|
||||
#include <mutex>
|
||||
#include <cerrno>
|
||||
|
||||
namespace kte {
|
||||
// Classify an errno value as transient (worth retrying) or permanent.
//
// Transient errors are temporary conditions that may clear on their own:
// resource contention (EAGAIN / EWOULDBLOCK, EBUSY), flaky I/O (EIO,
// ETIMEDOUT) and exhausted storage that may be freed (ENOSPC, EDQUOT).
// Everything else (ENOENT, EACCES, EINVAL, ...) is treated as permanent.
//
// FIX: EDQUOT is an XSI extension and is not defined on every platform,
// so it is now guarded with #ifdef. The old `#if EAGAIN != EWOULDBLOCK`
// also misfired when EWOULDBLOCK was not a macro (the preprocessor
// evaluates an undefined identifier as 0, making the condition true and
// emitting a `case EWOULDBLOCK:` that cannot compile); the guard now
// checks that the macro is defined first.
inline bool
IsTransientError(int err)
{
	switch (err) {
	case EAGAIN: // Resource temporarily unavailable
#if defined(EWOULDBLOCK) && EWOULDBLOCK != EAGAIN
	case EWOULDBLOCK: // Distinct from EAGAIN on some platforms
#endif
	case EBUSY:     // Device or resource busy
	case EIO:       // I/O error (may be transient on network filesystems)
	case ETIMEDOUT: // Operation timed out
	case ENOSPC:    // Disk full (space may become available)
#ifdef EDQUOT
	case EDQUOT:    // Quota exceeded (not defined on every platform)
#endif
		return true;
	default:
		return false;
	}
}
|
||||
|
||||
|
||||
// RetryPolicy describes how transient failures are retried: how many
// attempts to make and how the delay between them grows.
struct RetryPolicy {
	std::size_t max_attempts{3};                  // Total attempts (first try included)
	std::chrono::milliseconds initial_delay{100}; // Delay before the first retry
	double backoff_multiplier{2.0};               // Growth factor applied after each retry
	std::chrono::milliseconds max_delay{5000};    // Upper bound on any single delay

	// Balanced policy: 3 attempts, 100ms initial, 2x backoff, 5s cap.
	static RetryPolicy Default()
	{
		return {};
	}


	// For critical operations (swap files, saves): more attempts with
	// shorter, gentler-growing delays. 5 attempts, 50ms, 1.5x, 2s cap.
	static RetryPolicy Aggressive()
	{
		return {5, std::chrono::milliseconds(50), 1.5,
		        std::chrono::milliseconds(2000)};
	}


	// For non-critical operations: give up sooner but wait longer
	// between tries. 2 attempts, 200ms, 2.5x, 10s cap.
	static RetryPolicy Conservative()
	{
		return {2, std::chrono::milliseconds(200), 2.5,
		        std::chrono::milliseconds(10000)};
	}
};
|
||||
|
||||
// Retry a function with exponential backoff for transient errors
|
||||
// Returns true on success, false on permanent failure or exhausted retries
|
||||
// The function `fn` should return true on success, false on failure, and set errno on failure
|
||||
template<typename Func>
|
||||
bool
|
||||
RetryOnTransientError(Func fn, const RetryPolicy &policy, std::string &err)
|
||||
{
|
||||
std::size_t attempt = 0;
|
||||
std::chrono::milliseconds delay = policy.initial_delay;
|
||||
|
||||
while (attempt < policy.max_attempts) {
|
||||
++attempt;
|
||||
errno = 0;
|
||||
if (fn()) {
|
||||
return true; // Success
|
||||
}
|
||||
|
||||
int saved_errno = errno;
|
||||
if (!IsTransientError(saved_errno)) {
|
||||
// Permanent error, don't retry
|
||||
return false;
|
||||
}
|
||||
|
||||
if (attempt >= policy.max_attempts) {
|
||||
// Exhausted retries
|
||||
err += " (exhausted " + std::to_string(policy.max_attempts) + " retry attempts)";
|
||||
return false;
|
||||
}
|
||||
|
||||
// Sleep before retry
|
||||
std::this_thread::sleep_for(delay);
|
||||
|
||||
// Exponential backoff
|
||||
delay = std::chrono::milliseconds(
|
||||
static_cast<long long>(delay.count() * policy.backoff_multiplier)
|
||||
);
|
||||
if (delay > policy.max_delay) {
|
||||
delay = policy.max_delay;
|
||||
}
|
||||
}
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
|
||||
// CircuitBreaker guards an operation that may start failing repeatedly,
// enabling graceful degradation instead of hammering a broken resource.
//
// It is a small state machine:
//   Closed   - normal operation; every request is allowed.
//   Open     - too many recent failures; requests are rejected
//              immediately until open_timeout elapses.
//   HalfOpen - probing recovery; requests are allowed and the circuit
//              closes again after success_threshold successes.
//
// All public member functions are thread-safe (guarded by mtx_).
// FIX: GetState() and GetFailureCount() previously read state_ /
// failure_count_ without taking mtx_, a data race when other threads
// mutate the breaker concurrently; both now lock the (mutable) mutex.
class CircuitBreaker {
public:
	enum class State {
		Closed,  // Normal operation, allow all requests
		Open,    // Failing, reject requests immediately
		HalfOpen // Testing recovery, allow limited requests
	};

	struct Config {
		std::size_t failure_threshold;     // Failures before opening circuit
		std::chrono::seconds open_timeout; // Time before attempting recovery (Open -> HalfOpen)
		std::size_t success_threshold;     // Successes in HalfOpen before closing
		std::chrono::seconds window;       // Time window for counting failures

		// Defaults: open after 5 failures in 60s, probe after 30s,
		// close after 2 consecutive successes.
		Config()
		    : failure_threshold(5), open_timeout(30), success_threshold(2), window(60) {}
	};


	explicit CircuitBreaker(const Config &cfg = Config());


	// Check if operation is allowed (returns false while the circuit is
	// Open and the open_timeout has not yet elapsed).
	bool AllowRequest();

	// Record successful operation.
	void RecordSuccess();

	// Record failed operation.
	void RecordFailure();

	// Get current state. Locks mtx_ so the read is coherent with
	// concurrent RecordSuccess/RecordFailure/AllowRequest calls.
	State GetState() const
	{
		std::lock_guard<std::mutex> lg(mtx_);
		return state_;
	}


	// Get failure count in current window (locked for the same reason
	// as GetState).
	std::size_t GetFailureCount() const
	{
		std::lock_guard<std::mutex> lg(mtx_);
		return failure_count_;
	}


	// Reset circuit to Closed state (for testing or manual intervention).
	void Reset();

private:
	// Unconditionally move to new_state, stamping the transition time
	// and resetting per-state counters. Caller must hold mtx_.
	void TransitionTo(State new_state);

	// True when the last failure is older than config_.window. Caller
	// must hold mtx_.
	bool IsWindowExpired() const;

	Config config_;             // Immutable after construction
	State state_;               // Current circuit state (guarded by mtx_)
	std::size_t failure_count_; // Failures in current window (guarded by mtx_)
	std::size_t success_count_; // Successes while HalfOpen (guarded by mtx_)
	std::chrono::steady_clock::time_point last_failure_time_; // Guarded by mtx_
	std::chrono::steady_clock::time_point state_change_time_; // Guarded by mtx_
	mutable std::mutex mtx_;    // Protects all mutable state above
};
|
||||
} // namespace kte
|
||||
94
Swap.cc
94
Swap.cc
@@ -2,6 +2,7 @@
|
||||
#include "Buffer.h"
|
||||
#include "ErrorHandler.h"
|
||||
#include "SyscallWrappers.h"
|
||||
#include "ErrorRecovery.h"
|
||||
|
||||
#include <algorithm>
|
||||
#include <chrono>
|
||||
@@ -613,10 +614,19 @@ SwapManager::open_ctx(JournalCtx &ctx, const std::string &path, std::string &err
|
||||
#ifdef O_CLOEXEC
|
||||
flags |= O_CLOEXEC;
|
||||
#endif
|
||||
int fd = kte::syscall::Open(path.c_str(), flags, 0600);
|
||||
|
||||
// Retry on transient errors (ENOSPC, EDQUOT, EBUSY, etc.)
|
||||
int fd = -1;
|
||||
auto open_fn = [&]() -> bool {
|
||||
fd = kte::syscall::Open(path.c_str(), flags, 0600);
|
||||
return fd >= 0;
|
||||
};
|
||||
|
||||
if (!RetryOnTransientError(open_fn, RetryPolicy::Aggressive(), err)) {
|
||||
if (fd < 0) {
|
||||
int saved_errno = errno;
|
||||
err = "Failed to open swap file '" + path + "': " + std::strerror(saved_errno);
|
||||
err = "Failed to open swap file '" + path + "': " + std::strerror(saved_errno) + err;
|
||||
}
|
||||
return false;
|
||||
}
|
||||
// Ensure permissions even if file already existed.
|
||||
@@ -636,10 +646,20 @@ SwapManager::open_ctx(JournalCtx &ctx, const std::string &path, std::string &err
|
||||
#ifdef O_CLOEXEC
|
||||
tflags |= O_CLOEXEC;
|
||||
#endif
|
||||
|
||||
// Retry on transient errors for truncation open
|
||||
fd = -1;
|
||||
auto reopen_fn = [&]() -> bool {
|
||||
fd = kte::syscall::Open(path.c_str(), tflags, 0600);
|
||||
return fd >= 0;
|
||||
};
|
||||
|
||||
if (!RetryOnTransientError(reopen_fn, RetryPolicy::Aggressive(), err)) {
|
||||
if (fd < 0) {
|
||||
int saved_errno = errno;
|
||||
err = "Failed to reopen swap file for truncation '" + path + "': " + std::strerror(saved_errno);
|
||||
err = "Failed to reopen swap file for truncation '" + path + "': " + std::strerror(
|
||||
saved_errno) + err;
|
||||
}
|
||||
return false;
|
||||
}
|
||||
(void) kte::syscall::Fchmod(fd, 0600);
|
||||
@@ -705,10 +725,19 @@ SwapManager::compact_to_checkpoint(JournalCtx &ctx, const std::vector<std::uint8
|
||||
#ifdef O_CLOEXEC
|
||||
flags |= O_CLOEXEC;
|
||||
#endif
|
||||
int tfd = kte::syscall::Open(tmp_path.c_str(), flags, 0600);
|
||||
|
||||
// Retry on transient errors for temp file creation
|
||||
int tfd = -1;
|
||||
auto open_tmp_fn = [&]() -> bool {
|
||||
tfd = kte::syscall::Open(tmp_path.c_str(), flags, 0600);
|
||||
return tfd >= 0;
|
||||
};
|
||||
|
||||
if (!RetryOnTransientError(open_tmp_fn, RetryPolicy::Aggressive(), err)) {
|
||||
if (tfd < 0) {
|
||||
int saved_errno = errno;
|
||||
err = "Failed to open temp swap file '" + tmp_path + "': " + std::strerror(saved_errno);
|
||||
err = "Failed to open temp swap file '" + tmp_path + "': " + std::strerror(saved_errno) + err;
|
||||
}
|
||||
return false;
|
||||
}
|
||||
(void) kte::syscall::Fchmod(tfd, 0600);
|
||||
@@ -1062,6 +1091,34 @@ SwapManager::process_one(const Pending &p)
|
||||
if (!p.buf)
|
||||
return;
|
||||
|
||||
// Check circuit breaker before processing
|
||||
bool circuit_open = false;
|
||||
{
|
||||
std::lock_guard<std::mutex> lg(mtx_);
|
||||
if (!circuit_breaker_.AllowRequest()) {
|
||||
circuit_open = true;
|
||||
}
|
||||
}
|
||||
|
||||
if (circuit_open) {
|
||||
// Circuit is open - graceful degradation: skip swap write
|
||||
// This prevents repeated failures from overwhelming the system
|
||||
// Swap recording will resume when circuit closes
|
||||
static std::atomic<std::uint64_t> last_warning_ns{0};
|
||||
const std::uint64_t now = now_ns();
|
||||
const std::uint64_t last = last_warning_ns.load();
|
||||
// Log warning at most once per 60 seconds to avoid spam
|
||||
if (now - last > 60000000000ULL) {
|
||||
last_warning_ns.store(now);
|
||||
ErrorHandler::Instance().Warning("SwapManager",
|
||||
"Swap operations temporarily disabled due to repeated failures (circuit breaker open)",
|
||||
p.buf && !p.buf->Filename().empty()
|
||||
? p.buf->Filename()
|
||||
: "<unnamed>");
|
||||
}
|
||||
return;
|
||||
}
|
||||
|
||||
try {
|
||||
Buffer &buf = *p.buf;
|
||||
|
||||
@@ -1084,10 +1141,18 @@ SwapManager::process_one(const Pending &p)
|
||||
std::string open_err;
|
||||
if (!open_ctx(*ctxp, path, open_err)) {
|
||||
report_error(open_err, p.buf);
|
||||
{
|
||||
std::lock_guard<std::mutex> lg(mtx_);
|
||||
circuit_breaker_.RecordFailure();
|
||||
}
|
||||
return;
|
||||
}
|
||||
if (p.payload.size() > 0xFFFFFFu) {
|
||||
report_error("Payload too large: " + std::to_string(p.payload.size()) + " bytes", p.buf);
|
||||
{
|
||||
std::lock_guard<std::mutex> lg(mtx_);
|
||||
circuit_breaker_.RecordFailure();
|
||||
}
|
||||
return;
|
||||
}
|
||||
|
||||
@@ -1123,6 +1188,10 @@ SwapManager::process_one(const Pending &p)
|
||||
if (!ok) {
|
||||
int err = errno;
|
||||
report_error("Failed to write swap record to '" + path + "': " + std::strerror(err), p.buf);
|
||||
{
|
||||
std::lock_guard<std::mutex> lg(mtx_);
|
||||
circuit_breaker_.RecordFailure();
|
||||
}
|
||||
return;
|
||||
}
|
||||
ctxp->approx_size_bytes += static_cast<std::uint64_t>(rec.size());
|
||||
@@ -1138,12 +1207,27 @@ SwapManager::process_one(const Pending &p)
|
||||
std::string compact_err;
|
||||
if (!compact_to_checkpoint(*ctxp, rec, compact_err)) {
|
||||
report_error(compact_err, p.buf);
|
||||
// Note: compaction failure is not fatal, don't record circuit breaker failure
|
||||
}
|
||||
}
|
||||
|
||||
// Record success for circuit breaker
|
||||
{
|
||||
std::lock_guard<std::mutex> lg(mtx_);
|
||||
circuit_breaker_.RecordSuccess();
|
||||
}
|
||||
} catch (const std::exception &e) {
|
||||
report_error(std::string("Exception in process_one: ") + e.what(), p.buf);
|
||||
{
|
||||
std::lock_guard<std::mutex> lg(mtx_);
|
||||
circuit_breaker_.RecordFailure();
|
||||
}
|
||||
} catch (...) {
|
||||
report_error("Unknown exception in process_one", p.buf);
|
||||
{
|
||||
std::lock_guard<std::mutex> lg(mtx_);
|
||||
circuit_breaker_.RecordFailure();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
4
Swap.h
4
Swap.h
@@ -15,6 +15,7 @@
|
||||
#include <atomic>
|
||||
|
||||
#include "SwapRecorder.h"
|
||||
#include "ErrorRecovery.h"
|
||||
|
||||
class Buffer;
|
||||
|
||||
@@ -245,5 +246,8 @@ private:
|
||||
// Error tracking (protected by mtx_)
|
||||
std::deque<SwapError> errors_; // bounded to max 100 entries
|
||||
std::size_t total_error_count_{0};
|
||||
|
||||
// Circuit breaker for swap operations (protected by mtx_)
|
||||
CircuitBreaker circuit_breaker_;
|
||||
};
|
||||
} // namespace kte
|
||||
@@ -852,6 +852,177 @@ When updating existing code to follow these conventions:
|
||||
5. **Update callers** to handle the error parameter
|
||||
6. **Write tests** that verify error handling
|
||||
|
||||
### Error Recovery Mechanisms
|
||||
|
||||
kte implements automatic error recovery for transient failures using
|
||||
retry logic and circuit breaker patterns.
|
||||
|
||||
#### Transient Error Classification
|
||||
|
||||
Transient errors are temporary failures that may succeed on retry:
|
||||
|
||||
```cpp
|
||||
#include "ErrorRecovery.h"
|
||||
|
||||
bool IsTransientError(int err); // Returns true for EAGAIN, EWOULDBLOCK, EBUSY, EIO, ETIMEDOUT, ENOSPC, EDQUOT
|
||||
```
|
||||
|
||||
**Transient errors**:
|
||||
|
||||
- `EAGAIN` / `EWOULDBLOCK` - Resource temporarily unavailable
|
||||
- `EBUSY` - Device or resource busy
|
||||
- `EIO` - I/O error (may be transient on network filesystems)
|
||||
- `ETIMEDOUT` - Operation timed out
|
||||
- `ENOSPC` - No space left on device (may become available)
|
||||
- `EDQUOT` - Disk quota exceeded (may become available)
|
||||
|
||||
**Permanent errors** (don't retry):
|
||||
|
||||
- `ENOENT` - File not found
|
||||
- `EACCES` - Permission denied
|
||||
- `EINVAL` - Invalid argument
|
||||
- `ENOTDIR` - Not a directory
|
||||
|
||||
#### Retry Policies
|
||||
|
||||
Three predefined retry policies are available:
|
||||
|
||||
```cpp
|
||||
// Default: 3 attempts, 100ms initial delay, 2x backoff, 5s max delay
|
||||
RetryPolicy::Default()
|
||||
|
||||
// Aggressive: 5 attempts, 50ms initial delay, 1.5x backoff, 2s max delay
|
||||
// Use for critical operations (swap files, file saves)
|
||||
RetryPolicy::Aggressive()
|
||||
|
||||
// Conservative: 2 attempts, 200ms initial delay, 2.5x backoff, 10s max delay
|
||||
// Use for non-critical operations
|
||||
RetryPolicy::Conservative()
|
||||
```
|
||||
|
||||
#### Using RetryOnTransientError
|
||||
|
||||
Wrap syscalls with automatic retry on transient errors:
|
||||
|
||||
```cpp
|
||||
#include "ErrorRecovery.h"
|
||||
#include "SyscallWrappers.h"
|
||||
|
||||
bool save_file(const std::string &path, std::string &err) {
|
||||
int fd = -1;
|
||||
auto open_fn = [&]() -> bool {
|
||||
fd = kte::syscall::Open(path.c_str(), O_CREAT | O_WRONLY, 0644);
|
||||
return fd >= 0;
|
||||
};
|
||||
|
||||
if (!kte::RetryOnTransientError(open_fn, kte::RetryPolicy::Aggressive(), err)) {
|
||||
if (fd < 0) {
|
||||
int saved_errno = errno;
|
||||
err = "Failed to open file '" + path + "': " + std::strerror(saved_errno) + err;
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
// ... use fd
|
||||
kte::syscall::Close(fd);
|
||||
return true;
|
||||
}
|
||||
```
|
||||
|
||||
**Key points**:
|
||||
|
||||
- Lambda must return `bool` (true = success, false = failure)
|
||||
- Lambda must set `errno` on failure for transient error detection
|
||||
- Use EINTR-safe syscall wrappers (`kte::syscall::*`) inside lambdas
|
||||
- Capture errno immediately after failure
|
||||
- Append retry info to error message (automatically added by
|
||||
RetryOnTransientError)
|
||||
|
||||
#### Circuit Breaker Pattern
|
||||
|
||||
The circuit breaker prevents repeated attempts to failing operations,
|
||||
enabling graceful degradation.
|
||||
|
||||
**States**:
|
||||
|
||||
- **Closed** (normal): All requests allowed
|
||||
- **Open** (failing): Requests rejected immediately, operation disabled
|
||||
- **HalfOpen** (testing): Limited requests allowed to test recovery
|
||||
|
||||
**Configuration** (SwapManager example):
|
||||
|
||||
```cpp
|
||||
CircuitBreaker::Config cfg;
|
||||
cfg.failure_threshold = 5; // Open after 5 failures
|
||||
cfg.open_timeout = std::chrono::seconds(30); // Try recovery after 30s
|
||||
cfg.success_threshold = 2; // Close after 2 successes in HalfOpen
|
||||
cfg.window = std::chrono::seconds(60); // Count failures in 60s window
|
||||
|
||||
CircuitBreaker breaker(cfg);
|
||||
```
|
||||
|
||||
**Usage**:
|
||||
|
||||
```cpp
|
||||
// Check before operation
|
||||
if (!breaker.AllowRequest()) {
|
||||
// Circuit is open - graceful degradation
|
||||
log_warning("Operation disabled due to repeated failures");
|
||||
return; // Skip operation
|
||||
}
|
||||
|
||||
// Perform operation
|
||||
if (operation_succeeds()) {
|
||||
breaker.RecordSuccess();
|
||||
} else {
|
||||
breaker.RecordFailure();
|
||||
}
|
||||
```
|
||||
|
||||
**SwapManager Integration**:
|
||||
|
||||
The SwapManager uses a circuit breaker to handle repeated swap file
|
||||
failures:
|
||||
|
||||
1. After 5 swap write failures in 60 seconds, circuit opens
|
||||
2. Swap recording is disabled (graceful degradation)
|
||||
3. Warning logged once per 60 seconds to avoid spam
|
||||
4. After 30 seconds, circuit enters HalfOpen state
|
||||
5. If 2 consecutive operations succeed, circuit closes and swap
|
||||
recording resumes
|
||||
|
||||
This ensures the editor remains functional even when swap files are
|
||||
unavailable (disk full, quota exceeded, filesystem errors).
|
||||
|
||||
#### Graceful Degradation Strategies
|
||||
|
||||
When operations fail repeatedly:
|
||||
|
||||
1. **Disable non-critical features** - Swap recording can be disabled
|
||||
without affecting editing
|
||||
2. **Log warnings** - Inform user of degraded operation via ErrorHandler
|
||||
3. **Rate-limit warnings** - Avoid log spam (e.g., once per 60 seconds)
|
||||
4. **Automatic recovery** - Circuit breaker automatically tests recovery
|
||||
5. **Preserve core functionality** - Editor remains usable without swap
|
||||
files
|
||||
|
||||
**Example** (from SwapManager):
|
||||
|
||||
```cpp
|
||||
if (circuit_open) {
|
||||
// Graceful degradation: skip swap write
|
||||
static std::atomic<std::uint64_t> last_warning_ns{0};
|
||||
const std::uint64_t now = now_ns();
|
||||
if (now - last_warning_ns.load() > 60000000000ULL) {
|
||||
last_warning_ns.store(now);
|
||||
ErrorHandler::Instance().Warning("SwapManager",
|
||||
"Swap operations temporarily disabled due to repeated failures",
|
||||
buffer_name);
|
||||
}
|
||||
return; // Skip operation, editor continues normally
|
||||
}
|
||||
```
|
||||
|
||||
## Common Tasks
|
||||
|
||||
### Adding a New Command
|
||||
|
||||
Reference in New Issue
Block a user