Introduce error recovery mechanisms with retry logic and circuit breaker integration.
- Added `ErrorRecovery.cc` and `ErrorRecovery.h` for retry and circuit breaker implementations.
- Enhanced swap file handling with transient error retries and exponential backoff (e.g., ENOSPC, EDQUOT).
- Integrated circuit breaker into SwapManager to gracefully handle repeated failures, prevent system overload, and enable automatic recovery.
- Updated `DEVELOPER_GUIDE.md` with comprehensive documentation on error recovery patterns and graceful degradation strategies.
- Refined fsync, temp file creation, and swap file logic with retry-on-failure mechanisms for improved resilience.
This commit is contained in:
157
ErrorRecovery.cc
Normal file
157
ErrorRecovery.cc
Normal file
@@ -0,0 +1,157 @@
|
||||
// ErrorRecovery.cc - Error recovery mechanisms implementation
|
||||
#include "ErrorRecovery.h"
|
||||
#include <mutex>
|
||||
|
||||
namespace kte {
|
||||
CircuitBreaker::CircuitBreaker(const Config &cfg)
|
||||
: config_(cfg), state_(State::Closed), failure_count_(0), success_count_(0),
|
||||
last_failure_time_(std::chrono::steady_clock::time_point::min()),
|
||||
state_change_time_(std::chrono::steady_clock::now()) {}
|
||||
|
||||
|
||||
bool
|
||||
CircuitBreaker::AllowRequest()
|
||||
{
|
||||
std::lock_guard<std::mutex> lg(mtx_);
|
||||
|
||||
const auto now = std::chrono::steady_clock::now();
|
||||
|
||||
switch (state_) {
|
||||
case State::Closed:
|
||||
// Normal operation, allow all requests
|
||||
return true;
|
||||
|
||||
case State::Open: {
|
||||
// Check if timeout has elapsed to transition to HalfOpen
|
||||
const auto elapsed = std::chrono::duration_cast<std::chrono::seconds>(
|
||||
now - state_change_time_
|
||||
);
|
||||
if (elapsed >= config_.open_timeout) {
|
||||
TransitionTo(State::HalfOpen);
|
||||
return true; // Allow one request to test recovery
|
||||
}
|
||||
return false; // Circuit is open, reject request
|
||||
}
|
||||
|
||||
case State::HalfOpen:
|
||||
// Allow limited requests to test recovery
|
||||
return true;
|
||||
}
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
|
||||
void
|
||||
CircuitBreaker::RecordSuccess()
|
||||
{
|
||||
std::lock_guard<std::mutex> lg(mtx_);
|
||||
|
||||
switch (state_) {
|
||||
case State::Closed:
|
||||
// Reset failure count on success in normal operation
|
||||
failure_count_ = 0;
|
||||
break;
|
||||
|
||||
case State::HalfOpen:
|
||||
++success_count_;
|
||||
if (success_count_ >= config_.success_threshold) {
|
||||
// Enough successes, close the circuit
|
||||
TransitionTo(State::Closed);
|
||||
}
|
||||
break;
|
||||
|
||||
case State::Open:
|
||||
// Shouldn't happen (requests rejected), but handle gracefully
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
void
|
||||
CircuitBreaker::RecordFailure()
|
||||
{
|
||||
std::lock_guard<std::mutex> lg(mtx_);
|
||||
|
||||
const auto now = std::chrono::steady_clock::now();
|
||||
last_failure_time_ = now;
|
||||
|
||||
switch (state_) {
|
||||
case State::Closed:
|
||||
// Check if we need to reset the failure count (window expired)
|
||||
if (IsWindowExpired()) {
|
||||
failure_count_ = 0;
|
||||
}
|
||||
|
||||
++failure_count_;
|
||||
if (failure_count_ >= config_.failure_threshold) {
|
||||
// Too many failures, open the circuit
|
||||
TransitionTo(State::Open);
|
||||
}
|
||||
break;
|
||||
|
||||
case State::HalfOpen:
|
||||
// Failure during recovery test, reopen the circuit
|
||||
TransitionTo(State::Open);
|
||||
break;
|
||||
|
||||
case State::Open:
|
||||
// Already open, just track the failure
|
||||
++failure_count_;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
void
|
||||
CircuitBreaker::Reset()
|
||||
{
|
||||
std::lock_guard<std::mutex> lg(mtx_);
|
||||
TransitionTo(State::Closed);
|
||||
}
|
||||
|
||||
|
||||
void
|
||||
CircuitBreaker::TransitionTo(State new_state)
|
||||
{
|
||||
if (state_ == new_state) {
|
||||
return;
|
||||
}
|
||||
|
||||
state_ = new_state;
|
||||
state_change_time_ = std::chrono::steady_clock::now();
|
||||
|
||||
switch (new_state) {
|
||||
case State::Closed:
|
||||
failure_count_ = 0;
|
||||
success_count_ = 0;
|
||||
break;
|
||||
|
||||
case State::Open:
|
||||
success_count_ = 0;
|
||||
// Keep failure_count_ for diagnostics
|
||||
break;
|
||||
|
||||
case State::HalfOpen:
|
||||
success_count_ = 0;
|
||||
// Keep failure_count_ for diagnostics
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
bool
|
||||
CircuitBreaker::IsWindowExpired() const
|
||||
{
|
||||
if (failure_count_ == 0) {
|
||||
return false;
|
||||
}
|
||||
|
||||
const auto now = std::chrono::steady_clock::now();
|
||||
const auto elapsed = std::chrono::duration_cast<std::chrono::seconds>(
|
||||
now - last_failure_time_
|
||||
);
|
||||
|
||||
return elapsed >= config_.window;
|
||||
}
|
||||
} // namespace kte
|
||||
Reference in New Issue
Block a user