feat: create v2-prep branch with comprehensive planning
Restructured the project for the V2 refactor.

**Structure Changes:**
- Moved all V1 code to orig/ (preserved with `git mv`)
- Created docs/planning/ directory
- Added orig/README_V1.md explaining V1 preservation

**Planning Documents:**
- 00_V2_MASTER_PLAN.md: complete architecture overview
  - Executive summary of critical V1 issues
  - High-level component architecture diagrams
  - 5-phase implementation roadmap
  - Success metrics and risk mitigation
- 07_TASK_BREAKDOWN.md: atomic task breakdown
  - 99+ hours of detailed tasks
  - Every task < 2 hours (atomic)
  - Clear dependencies and success criteria
  - Organized by implementation phase

**V2 Key Improvements:**
- Per-exchange parsers (factory pattern; see the sketch below)
- Multi-layer strict validation
- Multi-index pool cache
- Background validation pipeline
- Comprehensive observability

**Critical Issues Addressed:**
- Zero-address tokens (strict validation + cache enrichment)
- Parsing accuracy (protocol-specific parsers)
- No audit trail (background validation channel)
- Inefficient lookups (multi-index cache)
- Stats disconnection (event-driven metrics)

**Next Steps:**
1. Review the planning documents
2. Begin Phase 1: Foundation (P1-001 through P1-010)
3. Implement parsers in Phase 2
4. Build the cache system in Phase 3
5. Add the validation pipeline in Phase 4
6. Migrate and test in Phase 5

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
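The per-exchange parser factory called out above ships in later phases, not in this commit. As a rough sketch of the intended shape (every type and name below is hypothetical, shown only to make the bullet concrete):

```go
// Hypothetical sketch of the V2 per-exchange parser factory; nothing here
// exists in this commit, and all names are illustrative.
package parser

import (
	"errors"

	"github.com/ethereum/go-ethereum/core/types"
)

// SwapEvent is a protocol-neutral swap record.
type SwapEvent struct {
	Pool, Token0, Token1 string
}

// Parser is implemented once per exchange protocol (UniswapV2, UniswapV3, ...).
type Parser interface {
	Protocol() string
	ParseLog(log types.Log) (*SwapEvent, error)
}

// Factory maps an event signature (topic0) to its protocol-specific parser.
type Factory struct {
	bySignature map[string]Parser
}

func (f *Factory) ParserFor(topic0 string) (Parser, error) {
	p, ok := f.bySignature[topic0]
	if !ok {
		return nil, errors.New("no parser registered for event signature")
	}
	return p, nil
}
```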
orig/internal/recovery/error_handler.go · new file · 621 lines
@@ -0,0 +1,621 @@
package recovery

import (
	"context"
	"sync"
	"time"

	"github.com/ethereum/go-ethereum/common"

	"github.com/fraktal/mev-beta/internal/logger"
)

// ErrorSeverity represents the severity level of an error
type ErrorSeverity int

const (
	SeverityLow ErrorSeverity = iota
	SeverityMedium
	SeverityHigh
	SeverityCritical
)

func (s ErrorSeverity) String() string {
	switch s {
	case SeverityLow:
		return "LOW"
	case SeverityMedium:
		return "MEDIUM"
	case SeverityHigh:
		return "HIGH"
	case SeverityCritical:
		return "CRITICAL"
	default:
		return "UNKNOWN"
	}
}

// ErrorType categorizes different types of errors
type ErrorType int

const (
	ErrorTypeAddressCorruption ErrorType = iota
	ErrorTypeContractCallFailed
	ErrorTypeRPCConnectionFailed
	ErrorTypeDataParsingFailed
	ErrorTypeValidationFailed
	ErrorTypeTimeoutError
)

func (e ErrorType) String() string {
	switch e {
	case ErrorTypeAddressCorruption:
		return "ADDRESS_CORRUPTION"
	case ErrorTypeContractCallFailed:
		return "CONTRACT_CALL_FAILED"
	case ErrorTypeRPCConnectionFailed:
		return "RPC_CONNECTION_FAILED"
	case ErrorTypeDataParsingFailed:
		return "DATA_PARSING_FAILED"
	case ErrorTypeValidationFailed:
		return "VALIDATION_FAILED"
	case ErrorTypeTimeoutError:
		return "TIMEOUT_ERROR"
	default:
		return "UNKNOWN_ERROR"
	}
}

// RecoveryAction represents an action to take when an error occurs
type RecoveryAction int

const (
	ActionSkipAndContinue RecoveryAction = iota
	ActionRetryWithBackoff
	ActionUseFallbackData
	ActionCircuitBreaker
	ActionEmergencyStop
)

func (a RecoveryAction) String() string {
	switch a {
	case ActionSkipAndContinue:
		return "SKIP_AND_CONTINUE"
	case ActionRetryWithBackoff:
		return "RETRY_WITH_BACKOFF"
	case ActionUseFallbackData:
		return "USE_FALLBACK_DATA"
	case ActionCircuitBreaker:
		return "CIRCUIT_BREAKER"
	case ActionEmergencyStop:
		return "EMERGENCY_STOP"
	default:
		return "UNKNOWN_ACTION"
	}
}

// ErrorEvent represents a specific error occurrence
type ErrorEvent struct {
	Timestamp    time.Time
	Type         ErrorType
	Severity     ErrorSeverity
	Component    string
	Address      common.Address
	Message      string
	Context      map[string]interface{}
	AttemptCount int
	LastAttempt  time.Time
	Resolved     bool
	ResolvedAt   time.Time
}

// RecoveryRule defines how to handle specific error patterns
type RecoveryRule struct {
	ErrorType               ErrorType
	MaxSeverity             ErrorSeverity
	Action                  RecoveryAction
	MaxRetries              int
	BackoffInterval         time.Duration
	CircuitBreakerThreshold int
	ContextMatchers         map[string]interface{}
}

// ErrorHandler provides comprehensive error handling and recovery capabilities
type ErrorHandler struct {
	mu               sync.RWMutex
	logger           *logger.Logger
	errorHistory     []ErrorEvent
	componentStats   map[string]*ComponentStats
	circuitBreakers  map[string]*CircuitBreaker
	recoveryRules    []RecoveryRule
	fallbackProvider FallbackDataProvider
	maxHistorySize   int
	alertThresholds  map[ErrorType]int
	enabled          bool
}

// ComponentStats tracks error statistics for components
type ComponentStats struct {
	mu                  sync.RWMutex
	Component           string
	TotalErrors         int
	ErrorsByType        map[ErrorType]int
	ErrorsBySeverity    map[ErrorSeverity]int
	LastError           time.Time
	ConsecutiveFailures int
	SuccessCount        int
	IsHealthy           bool
	LastHealthCheck     time.Time
}

// CircuitBreaker implements the circuit breaker pattern for failing components
type CircuitBreaker struct {
	mu              sync.RWMutex
	Name            string
	State           CircuitState
	FailureCount    int
	Threshold       int
	Timeout         time.Duration
	LastFailure     time.Time
	LastSuccess     time.Time
	HalfOpenAllowed bool
}

type CircuitState int

const (
	CircuitClosed CircuitState = iota
	CircuitOpen
	CircuitHalfOpen
)

func (s CircuitState) String() string {
	switch s {
	case CircuitClosed:
		return "CLOSED"
	case CircuitOpen:
		return "OPEN"
	case CircuitHalfOpen:
		return "HALF_OPEN"
	default:
		return "UNKNOWN"
	}
}

// FallbackDataProvider is the interface for providing fallback data when primary sources fail
type FallbackDataProvider interface {
	GetFallbackTokenInfo(ctx context.Context, address common.Address) (*FallbackTokenInfo, error)
	GetFallbackPoolInfo(ctx context.Context, address common.Address) (*FallbackPoolInfo, error)
	GetFallbackContractType(ctx context.Context, address common.Address) (string, error)
}

type FallbackTokenInfo struct {
	Address    common.Address
	Symbol     string
	Name       string
	Decimals   uint8
	IsVerified bool
	Source     string
	Confidence float64
}

type FallbackPoolInfo struct {
	Address    common.Address
	Token0     common.Address
	Token1     common.Address
	Protocol   string
	Fee        uint32
	IsVerified bool
	Source     string
	Confidence float64
}

// NewErrorHandler creates a new error handler with default configuration
func NewErrorHandler(logger *logger.Logger) *ErrorHandler {
	handler := &ErrorHandler{
		logger:          logger,
		errorHistory:    make([]ErrorEvent, 0),
		componentStats:  make(map[string]*ComponentStats),
		circuitBreakers: make(map[string]*CircuitBreaker),
		maxHistorySize:  1000,
		alertThresholds: make(map[ErrorType]int),
		enabled:         true,
	}

	// Initialize default recovery rules
	handler.initializeDefaultRules()

	// Initialize default alert thresholds
	handler.initializeAlertThresholds()

	return handler
}

// initializeDefaultRules sets up default recovery rules for common error scenarios
func (eh *ErrorHandler) initializeDefaultRules() {
	eh.recoveryRules = []RecoveryRule{
		{
			ErrorType:       ErrorTypeAddressCorruption,
			MaxSeverity:     SeverityMedium,
			Action:          ActionRetryWithBackoff,
			MaxRetries:      2,
			BackoffInterval: 500 * time.Millisecond,
		},
		{
			ErrorType:       ErrorTypeAddressCorruption,
			MaxSeverity:     SeverityCritical,
			Action:          ActionUseFallbackData,
			MaxRetries:      0,
			BackoffInterval: 0,
		},
		{
			ErrorType:       ErrorTypeContractCallFailed,
			MaxSeverity:     SeverityMedium,
			Action:          ActionRetryWithBackoff,
			MaxRetries:      3,
			BackoffInterval: 2 * time.Second,
		},
		{
			ErrorType:               ErrorTypeRPCConnectionFailed,
			MaxSeverity:             SeverityHigh,
			Action:                  ActionCircuitBreaker,
			MaxRetries:              5,
			BackoffInterval:         5 * time.Second,
			CircuitBreakerThreshold: 10,
		},
		{
			ErrorType:       ErrorTypeDataParsingFailed,
			MaxSeverity:     SeverityMedium,
			Action:          ActionUseFallbackData,
			MaxRetries:      2,
			BackoffInterval: 1 * time.Second,
		},
		{
			ErrorType:       ErrorTypeValidationFailed,
			MaxSeverity:     SeverityLow,
			Action:          ActionSkipAndContinue,
			MaxRetries:      0,
			BackoffInterval: 0,
		},
		{
			ErrorType:       ErrorTypeValidationFailed,
			MaxSeverity:     SeverityHigh,
			Action:          ActionRetryWithBackoff,
			MaxRetries:      1,
			BackoffInterval: 500 * time.Millisecond,
		},
		{
			ErrorType:       ErrorTypeTimeoutError,
			MaxSeverity:     SeverityMedium,
			Action:          ActionRetryWithBackoff,
			MaxRetries:      3,
			BackoffInterval: 3 * time.Second,
		},
	}
}

// initializeAlertThresholds sets up alert thresholds for different error types
func (eh *ErrorHandler) initializeAlertThresholds() {
	eh.alertThresholds[ErrorTypeAddressCorruption] = 5
	eh.alertThresholds[ErrorTypeContractCallFailed] = 20
	eh.alertThresholds[ErrorTypeRPCConnectionFailed] = 10
	eh.alertThresholds[ErrorTypeDataParsingFailed] = 15
	eh.alertThresholds[ErrorTypeValidationFailed] = 25
	eh.alertThresholds[ErrorTypeTimeoutError] = 30
}

// HandleError processes an error and determines the appropriate recovery action.
// The last parameter is named errContext rather than context so it does not
// shadow the context package.
func (eh *ErrorHandler) HandleError(ctx context.Context, errorType ErrorType, severity ErrorSeverity, component string, address common.Address, message string, errContext map[string]interface{}) RecoveryAction {
	if !eh.enabled {
		return ActionSkipAndContinue
	}

	eh.mu.Lock()
	defer eh.mu.Unlock()

	// Record the error event
	event := ErrorEvent{
		Timestamp:    time.Now(),
		Type:         errorType,
		Severity:     severity,
		Component:    component,
		Address:      address,
		Message:      message,
		Context:      errContext,
		AttemptCount: 1,
		LastAttempt:  time.Now(),
	}

	// Update error history
	eh.addToHistory(event)

	// Update component statistics
	eh.updateComponentStats(component, errorType, severity)

	// Check circuit breakers
	if eh.shouldTriggerCircuitBreaker(component, errorType) {
		eh.triggerCircuitBreaker(component)
		return ActionCircuitBreaker
	}

	// Find a matching recovery rule
	rule := eh.findRecoveryRule(errorType, severity, errContext)
	if rule == nil {
		// Default action for unmatched errors
		return ActionSkipAndContinue
	}

	// Log the error and recovery action
	eh.logger.Error("Error handled by recovery system",
		"type", errorType.String(),
		"severity", severity.String(),
		"component", component,
		"address", address.Hex(),
		"message", message,
		"action", rule.Action.String())

	// Check if an alert threshold has been reached
	eh.checkAlertThresholds(errorType)

	return rule.Action
}

// addToHistory adds an error event to the history buffer
func (eh *ErrorHandler) addToHistory(event ErrorEvent) {
	eh.errorHistory = append(eh.errorHistory, event)

	// Trim history if it exceeds the max size
	if len(eh.errorHistory) > eh.maxHistorySize {
		eh.errorHistory = eh.errorHistory[len(eh.errorHistory)-eh.maxHistorySize:]
	}
}

// updateComponentStats updates statistics for a component
func (eh *ErrorHandler) updateComponentStats(component string, errorType ErrorType, severity ErrorSeverity) {
	stats, exists := eh.componentStats[component]
	if !exists {
		stats = &ComponentStats{
			Component:        component,
			ErrorsByType:     make(map[ErrorType]int),
			ErrorsBySeverity: make(map[ErrorSeverity]int),
			IsHealthy:        true,
		}
		eh.componentStats[component] = stats
	}

	stats.mu.Lock()
	defer stats.mu.Unlock()

	stats.TotalErrors++
	stats.ErrorsByType[errorType]++
	stats.ErrorsBySeverity[severity]++
	stats.LastError = time.Now()
	stats.ConsecutiveFailures++

	// Mark as unhealthy after too many consecutive failures
	if stats.ConsecutiveFailures > 10 {
		stats.IsHealthy = false
	}
}

// findRecoveryRule finds the best matching recovery rule for an error.
// Rules are evaluated in declaration order and the first match wins, so
// narrower rules must be listed before broader ones.
func (eh *ErrorHandler) findRecoveryRule(errorType ErrorType, severity ErrorSeverity, errContext map[string]interface{}) *RecoveryRule {
	for _, rule := range eh.recoveryRules {
		if rule.ErrorType == errorType && severity <= rule.MaxSeverity {
			// Check context matchers if present
			if len(rule.ContextMatchers) > 0 {
				if !eh.matchesContext(errContext, rule.ContextMatchers) {
					continue
				}
			}
			return &rule
		}
	}
	return nil
}
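Rule matching is first-match-wins in declaration order, so the relative order of the two ErrorTypeAddressCorruption rules in initializeDefaultRules matters. An illustrative in-package sketch (assuming the default rules and a logger `log` in scope):

```go
// Illustrative only: severity decides which ErrorTypeAddressCorruption rule
// matches first under the default rule set.
eh := NewErrorHandler(log)

low := eh.findRecoveryRule(ErrorTypeAddressCorruption, SeverityLow, nil)
// low.Action == ActionRetryWithBackoff (first rule, MaxSeverity: SeverityMedium)

crit := eh.findRecoveryRule(ErrorTypeAddressCorruption, SeverityCritical, nil)
// crit.Action == ActionUseFallbackData (only the MaxSeverity: SeverityCritical rule admits it)
```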
// matchesContext checks if the error context matches the rule's context matchers
func (eh *ErrorHandler) matchesContext(errorContext, ruleMatchers map[string]interface{}) bool {
	for key, expectedValue := range ruleMatchers {
		if actualValue, exists := errorContext[key]; !exists || actualValue != expectedValue {
			return false
		}
	}
	return true
}

// shouldTriggerCircuitBreaker determines if a circuit breaker should be triggered
func (eh *ErrorHandler) shouldTriggerCircuitBreaker(component string, errorType ErrorType) bool {
	stats, exists := eh.componentStats[component]
	if !exists {
		return false
	}

	stats.mu.RLock()
	defer stats.mu.RUnlock()

	// Trigger if consecutive failures exceed the threshold for critical error types
	if errorType == ErrorTypeRPCConnectionFailed && stats.ConsecutiveFailures >= 5 {
		return true
	}

	if errorType == ErrorTypeAddressCorruption && stats.ConsecutiveFailures >= 3 {
		return true
	}

	return false
}

// triggerCircuitBreaker activates a circuit breaker for a component
func (eh *ErrorHandler) triggerCircuitBreaker(component string) {
	breaker := &CircuitBreaker{
		Name:         component,
		State:        CircuitOpen,
		FailureCount: 0,
		Threshold:    5,
		Timeout:      30 * time.Second,
		LastFailure:  time.Now(),
	}

	eh.circuitBreakers[component] = breaker

	eh.logger.Warn("Circuit breaker triggered",
		"component", component,
		"timeout", breaker.Timeout)
}

// checkAlertThresholds checks if error counts have reached alert thresholds
func (eh *ErrorHandler) checkAlertThresholds(errorType ErrorType) {
	threshold, exists := eh.alertThresholds[errorType]
	if !exists {
		return
	}

	// Count recent errors of this type (last hour)
	recentCount := 0
	cutoff := time.Now().Add(-1 * time.Hour)

	for _, event := range eh.errorHistory {
		if event.Type == errorType && event.Timestamp.After(cutoff) {
			recentCount++
		}
	}

	if recentCount >= threshold {
		eh.logger.Warn("Error threshold reached - alert triggered",
			"error_type", errorType.String(),
			"count", recentCount,
			"threshold", threshold)
		// Here you would trigger your alerting system
	}
}

// GetComponentHealth returns the health status of all components
func (eh *ErrorHandler) GetComponentHealth() map[string]*ComponentStats {
	eh.mu.RLock()
	defer eh.mu.RUnlock()

	// Return a copy to prevent external modification
	result := make(map[string]*ComponentStats)
	for name, stats := range eh.componentStats {
		stats.mu.RLock() // hold the per-component lock while copying its fields
		result[name] = &ComponentStats{
			Component:           stats.Component,
			TotalErrors:         stats.TotalErrors,
			ErrorsByType:        make(map[ErrorType]int),
			ErrorsBySeverity:    make(map[ErrorSeverity]int),
			LastError:           stats.LastError,
			ConsecutiveFailures: stats.ConsecutiveFailures,
			SuccessCount:        stats.SuccessCount,
			IsHealthy:           stats.IsHealthy,
			LastHealthCheck:     stats.LastHealthCheck,
		}

		// Copy maps
		for k, v := range stats.ErrorsByType {
			result[name].ErrorsByType[k] = v
		}
		for k, v := range stats.ErrorsBySeverity {
			result[name].ErrorsBySeverity[k] = v
		}
		stats.mu.RUnlock()
	}

	return result
}

// RecordSuccess records a successful operation for a component
func (eh *ErrorHandler) RecordSuccess(component string) {
	eh.mu.Lock()
	defer eh.mu.Unlock()

	stats, exists := eh.componentStats[component]
	if !exists {
		stats = &ComponentStats{
			Component:        component,
			ErrorsByType:     make(map[ErrorType]int),
			ErrorsBySeverity: make(map[ErrorSeverity]int),
			IsHealthy:        true,
		}
		eh.componentStats[component] = stats
	}

	stats.mu.Lock()
	defer stats.mu.Unlock()

	stats.SuccessCount++
	stats.ConsecutiveFailures = 0
	stats.IsHealthy = true
	stats.LastHealthCheck = time.Now()

	// Reset the circuit breaker if it exists
	if breaker, exists := eh.circuitBreakers[component]; exists {
		breaker.mu.Lock()
		breaker.State = CircuitClosed
		breaker.FailureCount = 0
		breaker.LastSuccess = time.Now()
		breaker.mu.Unlock()
	}
}

// IsCircuitOpen checks if a circuit breaker is open for a component
func (eh *ErrorHandler) IsCircuitOpen(component string) bool {
	eh.mu.RLock()
	defer eh.mu.RUnlock()

	breaker, exists := eh.circuitBreakers[component]
	if !exists {
		return false
	}

	// Take the write lock: the open -> half-open transition below mutates
	// the breaker, which would be a data race under a read lock.
	breaker.mu.Lock()
	defer breaker.mu.Unlock()

	if breaker.State == CircuitOpen {
		// Check if the timeout has passed
		if time.Since(breaker.LastFailure) > breaker.Timeout {
			breaker.State = CircuitHalfOpen
			breaker.HalfOpenAllowed = true
			return false
		}
		return true
	}

	return false
}

// SetFallbackProvider sets the fallback data provider
func (eh *ErrorHandler) SetFallbackProvider(provider FallbackDataProvider) {
	eh.mu.Lock()
	defer eh.mu.Unlock()
	eh.fallbackProvider = provider
}

// GetErrorSummary returns a summary of recent errors
func (eh *ErrorHandler) GetErrorSummary(duration time.Duration) map[string]interface{} {
	eh.mu.RLock()
	defer eh.mu.RUnlock()

	cutoff := time.Now().Add(-duration)
	summary := map[string]interface{}{
		"total_errors":        0,
		"errors_by_type":      make(map[string]int),
		"errors_by_severity":  make(map[string]int),
		"errors_by_component": make(map[string]int),
		"time_range":          duration.String(),
	}

	for _, event := range eh.errorHistory {
		if event.Timestamp.After(cutoff) {
			summary["total_errors"] = summary["total_errors"].(int) + 1

			typeKey := event.Type.String()
			summary["errors_by_type"].(map[string]int)[typeKey]++

			severityKey := event.Severity.String()
			summary["errors_by_severity"].(map[string]int)[severityKey]++

			summary["errors_by_component"].(map[string]int)[event.Component]++
		}
	}

	return summary
}
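For orientation, a minimal caller-side sketch of how the handler's verdict might be consumed. The component name, `poolAddr`, and the retry/fallback wiring are assumptions; only ErrorHandler's own API comes from the file above.

```go
// Sketch: dispatching on the recovery verdict for a failed contract call.
action := handler.HandleError(ctx, ErrorTypeContractCallFailed, SeverityMedium,
	"pool_scanner", poolAddr, "token0() call reverted", nil)

switch action {
case ActionRetryWithBackoff:
	// hand the operation to a RetryHandler (see retry_handler.go below)
case ActionUseFallbackData:
	info, err := fallback.GetFallbackPoolInfo(ctx, poolAddr)
	_ = info
	_ = err
case ActionCircuitBreaker, ActionEmergencyStop:
	// stop calling the component until IsCircuitOpen("pool_scanner") is false
default: // ActionSkipAndContinue
	// drop this item and move on
}

// On success paths, report health so consecutive-failure counters reset
// and the circuit breaker can close again:
handler.RecordSuccess("pool_scanner")
```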
orig/internal/recovery/fallback_provider.go · new file · 384 lines
@@ -0,0 +1,384 @@
package recovery

import (
	"context"
	"fmt"
	"sync"
	"time"

	"github.com/ethereum/go-ethereum/common"

	"github.com/fraktal/mev-beta/internal/logger"
	"github.com/fraktal/mev-beta/internal/registry"
)

// DefaultFallbackProvider implements FallbackDataProvider with multiple data sources
type DefaultFallbackProvider struct {
	mu               sync.RWMutex
	logger           *logger.Logger
	contractRegistry *registry.ContractRegistry
	staticTokenData  map[common.Address]*FallbackTokenInfo
	staticPoolData   map[common.Address]*FallbackPoolInfo
	cacheTimeout     time.Duration
	enabled          bool
}

// NewDefaultFallbackProvider creates a new fallback data provider
func NewDefaultFallbackProvider(logger *logger.Logger, contractRegistry *registry.ContractRegistry) *DefaultFallbackProvider {
	provider := &DefaultFallbackProvider{
		logger:           logger,
		contractRegistry: contractRegistry,
		staticTokenData:  make(map[common.Address]*FallbackTokenInfo),
		staticPoolData:   make(map[common.Address]*FallbackPoolInfo),
		cacheTimeout:     5 * time.Minute,
		enabled:          true,
	}

	// Initialize with known safe data
	provider.initializeStaticData()

	return provider
}

// initializeStaticData populates the provider with known good data for critical Arbitrum contracts
func (fp *DefaultFallbackProvider) initializeStaticData() {
	fp.mu.Lock()
	defer fp.mu.Unlock()

	// Major Arbitrum tokens with verified addresses
	fp.staticTokenData[common.HexToAddress("0x82aF49447D8a07e3bd95BD0d56f35241523fBab1")] = &FallbackTokenInfo{
		Address:    common.HexToAddress("0x82aF49447D8a07e3bd95BD0d56f35241523fBab1"),
		Symbol:     "WETH",
		Name:       "Wrapped Ether",
		Decimals:   18,
		IsVerified: true,
		Source:     "static_fallback",
		Confidence: 1.0,
	}

	fp.staticTokenData[common.HexToAddress("0xaf88d065e77c8cC2239327C5EDb3A432268e5831")] = &FallbackTokenInfo{
		Address:    common.HexToAddress("0xaf88d065e77c8cC2239327C5EDb3A432268e5831"),
		Symbol:     "USDC",
		Name:       "USD Coin",
		Decimals:   6,
		IsVerified: true,
		Source:     "static_fallback",
		Confidence: 1.0,
	}

	fp.staticTokenData[common.HexToAddress("0xFd086bC7CD5C481DCC9C85ebE478A1C0b69FCbb9")] = &FallbackTokenInfo{
		Address:    common.HexToAddress("0xFd086bC7CD5C481DCC9C85ebE478A1C0b69FCbb9"),
		Symbol:     "USDT",
		Name:       "Tether USD",
		Decimals:   6,
		IsVerified: true,
		Source:     "static_fallback",
		Confidence: 1.0,
	}

	fp.staticTokenData[common.HexToAddress("0x2f2a2543B76A4166549F7aaB2e75Bef0aefC5B0f")] = &FallbackTokenInfo{
		Address:    common.HexToAddress("0x2f2a2543B76A4166549F7aaB2e75Bef0aefC5B0f"),
		Symbol:     "WBTC",
		Name:       "Wrapped BTC",
		Decimals:   8,
		IsVerified: true,
		Source:     "static_fallback",
		Confidence: 1.0,
	}

	fp.staticTokenData[common.HexToAddress("0x912CE59144191C1204E64559FE8253a0e49E6548")] = &FallbackTokenInfo{
		Address:    common.HexToAddress("0x912CE59144191C1204E64559FE8253a0e49E6548"),
		Symbol:     "ARB",
		Name:       "Arbitrum",
		Decimals:   18,
		IsVerified: true,
		Source:     "static_fallback",
		Confidence: 1.0,
	}

	// High-volume Uniswap V3 pools with verified addresses and token pairs
	fp.staticPoolData[common.HexToAddress("0xC6962004f452bE9203591991D15f6b388e09E8D0")] = &FallbackPoolInfo{
		Address:    common.HexToAddress("0xC6962004f452bE9203591991D15f6b388e09E8D0"),
		Token0:     common.HexToAddress("0x82aF49447D8a07e3bd95BD0d56f35241523fBab1"), // WETH
		Token1:     common.HexToAddress("0xaf88d065e77c8cC2239327C5EDb3A432268e5831"), // USDC
		Protocol:   "UniswapV3",
		Fee:        500, // 0.05%
		IsVerified: true,
		Source:     "static_fallback",
		Confidence: 1.0,
	}

	fp.staticPoolData[common.HexToAddress("0x641C00A822e8b671738d32a431a4Fb6074E5c79d")] = &FallbackPoolInfo{
		Address:    common.HexToAddress("0x641C00A822e8b671738d32a431a4Fb6074E5c79d"),
		Token0:     common.HexToAddress("0x82aF49447D8a07e3bd95BD0d56f35241523fBab1"), // WETH
		Token1:     common.HexToAddress("0xaf88d065e77c8cC2239327C5EDb3A432268e5831"), // USDC
		Protocol:   "UniswapV3",
		Fee:        3000, // 0.3%
		IsVerified: true,
		Source:     "static_fallback",
		Confidence: 1.0,
	}

	fp.staticPoolData[common.HexToAddress("0x17c14D2c404D167802b16C450d3c99F88F2c4F4d")] = &FallbackPoolInfo{
		Address:    common.HexToAddress("0x17c14D2c404D167802b16C450d3c99F88F2c4F4d"),
		Token0:     common.HexToAddress("0xaf88d065e77c8cC2239327C5EDb3A432268e5831"), // USDC
		Token1:     common.HexToAddress("0xFd086bC7CD5C481DCC9C85ebE478A1C0b69FCbb9"), // USDT
		Protocol:   "UniswapV3",
		Fee:        100, // 0.01%
		IsVerified: true,
		Source:     "static_fallback",
		Confidence: 1.0,
	}

	fp.staticPoolData[common.HexToAddress("0x2f5e87C032bc4F8526F320c012A4e678F1fa6cAB")] = &FallbackPoolInfo{
		Address:    common.HexToAddress("0x2f5e87C032bc4F8526F320c012A4e678F1fa6cAB"),
		Token0:     common.HexToAddress("0x2f2a2543B76A4166549F7aaB2e75Bef0aefC5B0f"), // WBTC
		Token1:     common.HexToAddress("0x82aF49447D8a07e3bd95BD0d56f35241523fBab1"), // WETH
		Protocol:   "UniswapV3",
		Fee:        500, // 0.05%
		IsVerified: true,
		Source:     "static_fallback",
		Confidence: 1.0,
	}

	fp.logger.Info("Initialized fallback provider with static data",
		"tokens", len(fp.staticTokenData),
		"pools", len(fp.staticPoolData))
}

// GetFallbackTokenInfo provides fallback token information
func (fp *DefaultFallbackProvider) GetFallbackTokenInfo(ctx context.Context, address common.Address) (*FallbackTokenInfo, error) {
	if !fp.enabled {
		return nil, fmt.Errorf("fallback provider disabled")
	}

	fp.mu.RLock()
	defer fp.mu.RUnlock()

	// First, try static data
	if tokenInfo, exists := fp.staticTokenData[address]; exists {
		fp.logger.Debug("Fallback token info from static data",
			"address", address.Hex(),
			"symbol", tokenInfo.Symbol,
			"source", tokenInfo.Source)
		return tokenInfo, nil
	}

	// Second, try the contract registry if available
	if fp.contractRegistry != nil {
		if contractInfo, err := fp.contractRegistry.GetContractInfo(ctx, address); err == nil && contractInfo != nil {
			tokenInfo := &FallbackTokenInfo{
				Address:    address,
				Symbol:     contractInfo.Symbol,
				Name:       contractInfo.Name,
				Decimals:   contractInfo.Decimals,
				IsVerified: contractInfo.IsVerified,
				Source:     "contract_registry",
				Confidence: contractInfo.Confidence,
			}

			fp.logger.Debug("Fallback token info from registry",
				"address", address.Hex(),
				"symbol", tokenInfo.Symbol,
				"confidence", tokenInfo.Confidence)

			return tokenInfo, nil
		}
	}

	// Third, provide a minimal safe fallback for unknown tokens
	tokenInfo := &FallbackTokenInfo{
		Address:    address,
		Symbol:     fmt.Sprintf("UNK_%s", address.Hex()[:8]),
		Name:       "Unknown Token",
		Decimals:   18, // Safe default
		IsVerified: false,
		Source:     "generated_fallback",
		Confidence: 0.1,
	}

	fp.logger.Warn("Using generated fallback token info",
		"address", address.Hex(),
		"symbol", tokenInfo.Symbol)

	return tokenInfo, nil
}

// GetFallbackPoolInfo provides fallback pool information
func (fp *DefaultFallbackProvider) GetFallbackPoolInfo(ctx context.Context, address common.Address) (*FallbackPoolInfo, error) {
	if !fp.enabled {
		return nil, fmt.Errorf("fallback provider disabled")
	}

	fp.mu.RLock()
	defer fp.mu.RUnlock()

	// First, try static data
	if poolInfo, exists := fp.staticPoolData[address]; exists {
		fp.logger.Debug("Fallback pool info from static data",
			"address", address.Hex(),
			"protocol", poolInfo.Protocol,
			"token0", poolInfo.Token0.Hex(),
			"token1", poolInfo.Token1.Hex())
		return poolInfo, nil
	}

	// Second, try the contract registry if available
	if fp.contractRegistry != nil {
		if poolInfo := fp.contractRegistry.GetPoolInfo(address); poolInfo != nil {
			fallbackInfo := &FallbackPoolInfo{
				Address:    address,
				Token0:     poolInfo.Token0,
				Token1:     poolInfo.Token1,
				Protocol:   poolInfo.Protocol,
				Fee:        poolInfo.Fee,
				IsVerified: poolInfo.IsVerified,
				Source:     "contract_registry",
				Confidence: poolInfo.Confidence,
			}

			fp.logger.Debug("Fallback pool info from registry",
				"address", address.Hex(),
				"protocol", fallbackInfo.Protocol,
				"confidence", fallbackInfo.Confidence)

			return fallbackInfo, nil
		}
	}

	// No fallback available for unknown pools - return an error
	return nil, fmt.Errorf("no fallback data available for pool %s", address.Hex())
}

// GetFallbackContractType provides fallback contract type information
func (fp *DefaultFallbackProvider) GetFallbackContractType(ctx context.Context, address common.Address) (string, error) {
	if !fp.enabled {
		return "", fmt.Errorf("fallback provider disabled")
	}

	fp.mu.RLock()
	defer fp.mu.RUnlock()

	// Check if it's a known token
	if _, exists := fp.staticTokenData[address]; exists {
		return "ERC20", nil
	}

	// Check if it's a known pool
	if _, exists := fp.staticPoolData[address]; exists {
		return "Pool", nil
	}

	// Try the contract registry
	if fp.contractRegistry != nil {
		if contractInfo, err := fp.contractRegistry.GetContractInfo(ctx, address); err == nil && contractInfo != nil {
			return contractInfo.Type.String(), nil
		}
	}

	// Default to unknown
	return "Unknown", nil
}

// AddStaticTokenData adds static token data for fallback use
func (fp *DefaultFallbackProvider) AddStaticTokenData(address common.Address, info *FallbackTokenInfo) {
	fp.mu.Lock()
	defer fp.mu.Unlock()

	fp.staticTokenData[address] = info
	fp.logger.Debug("Added static token data",
		"address", address.Hex(),
		"symbol", info.Symbol)
}

// AddStaticPoolData adds static pool data for fallback use
func (fp *DefaultFallbackProvider) AddStaticPoolData(address common.Address, info *FallbackPoolInfo) {
	fp.mu.Lock()
	defer fp.mu.Unlock()

	fp.staticPoolData[address] = info
	fp.logger.Debug("Added static pool data",
		"address", address.Hex(),
		"protocol", info.Protocol)
}

// IsAddressKnown checks if an address is in the static fallback data
func (fp *DefaultFallbackProvider) IsAddressKnown(address common.Address) bool {
	fp.mu.RLock()
	defer fp.mu.RUnlock()

	_, isToken := fp.staticTokenData[address]
	_, isPool := fp.staticPoolData[address]

	return isToken || isPool
}

// GetKnownAddresses returns all known addresses in the fallback provider
func (fp *DefaultFallbackProvider) GetKnownAddresses() (tokens []common.Address, pools []common.Address) {
	fp.mu.RLock()
	defer fp.mu.RUnlock()

	for addr := range fp.staticTokenData {
		tokens = append(tokens, addr)
	}

	for addr := range fp.staticPoolData {
		pools = append(pools, addr)
	}

	return tokens, pools
}

// ValidateAddressWithFallback performs validation using fallback data
func (fp *DefaultFallbackProvider) ValidateAddressWithFallback(ctx context.Context, address common.Address, expectedType string) (bool, float64, error) {
	if !fp.enabled {
		return false, 0.0, fmt.Errorf("fallback provider disabled")
	}

	// Check if the address is known in our static data
	if fp.IsAddressKnown(address) {
		actualType, err := fp.GetFallbackContractType(ctx, address)
		if err != nil {
			return false, 0.0, err
		}

		if actualType == expectedType {
			return true, 1.0, nil // High confidence for known addresses
		}

		return false, 0.0, fmt.Errorf("type mismatch: expected %s, got %s", expectedType, actualType)
	}

	// For unknown addresses, provide low-confidence validation
	return true, 0.3, nil // Allow with low confidence
}

// GetStats returns statistics about the fallback provider
func (fp *DefaultFallbackProvider) GetStats() map[string]interface{} {
	fp.mu.RLock()
	defer fp.mu.RUnlock()

	return map[string]interface{}{
		"enabled":             fp.enabled,
		"static_tokens_count": len(fp.staticTokenData),
		"static_pools_count":  len(fp.staticPoolData),
		"cache_timeout":       fp.cacheTimeout.String(),
		"has_registry":        fp.contractRegistry != nil,
	}
}

// Enable enables the fallback provider
func (fp *DefaultFallbackProvider) Enable() {
	fp.mu.Lock()
	defer fp.mu.Unlock()
	fp.enabled = true
	fp.logger.Info("Fallback provider enabled")
}

// Disable disables the fallback provider
func (fp *DefaultFallbackProvider) Disable() {
	fp.mu.Lock()
	defer fp.mu.Unlock()
	fp.enabled = false
	fp.logger.Info("Fallback provider disabled")
}
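A short usage sketch of the provider's three-tier lookup (static map, then contract registry, then a generated low-confidence stub); `provider`, `ctx`, and `addr` are assumed to be in scope:

```go
// Sketch: token lookups only fail when the provider is disabled. Unknown
// addresses come back as "UNK_" stubs with Confidence 0.1, so callers
// should gate on Confidence rather than on the error.
info, err := provider.GetFallbackTokenInfo(ctx, addr)
if err == nil && info.Confidence < 0.5 {
	// generated fallback (Source == "generated_fallback"); treat as unverified
}

// Pool lookups, by contrast, return an error for unknown pools:
if _, err := provider.GetFallbackPoolInfo(ctx, addr); err != nil {
	// no static or registry data for this pool
}
```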
orig/internal/recovery/retry_handler.go · new file · 446 lines
@@ -0,0 +1,446 @@
package recovery

import (
	"context"
	"math"
	"sync"
	"time"

	"github.com/fraktal/mev-beta/internal/logger"
)

// RetryConfig defines retry behavior configuration
type RetryConfig struct {
	MaxAttempts       int
	InitialDelay      time.Duration
	MaxDelay          time.Duration
	BackoffFactor     float64
	JitterEnabled     bool
	TimeoutPerAttempt time.Duration
}

// DefaultRetryConfig returns a sensible default retry configuration
func DefaultRetryConfig() RetryConfig {
	return RetryConfig{
		MaxAttempts:       3,
		InitialDelay:      1 * time.Second,
		MaxDelay:          30 * time.Second,
		BackoffFactor:     2.0,
		JitterEnabled:     true,
		TimeoutPerAttempt: 10 * time.Second,
	}
}

// RetryableOperation represents an operation that can be retried
type RetryableOperation func(ctx context.Context, attempt int) error

// RetryHandler provides exponential backoff retry capabilities
type RetryHandler struct {
	mu      sync.RWMutex
	logger  *logger.Logger
	configs map[string]RetryConfig
	stats   map[string]*RetryStats
	enabled bool
}

// RetryStats tracks retry statistics for operations
type RetryStats struct {
	mu                sync.RWMutex
	OperationType     string
	TotalAttempts     int
	SuccessfulRetries int
	FailedRetries     int
	AverageAttempts   float64
	LastAttempt       time.Time
	LastSuccess       time.Time
	LastFailure       time.Time
}

// RetryResult contains the result of a retry operation
type RetryResult struct {
	Success       bool
	Attempts      int
	TotalDuration time.Duration
	LastError     error
	LastAttemptAt time.Time
}

// NewRetryHandler creates a new retry handler
func NewRetryHandler(logger *logger.Logger) *RetryHandler {
	handler := &RetryHandler{
		logger:  logger,
		configs: make(map[string]RetryConfig),
		stats:   make(map[string]*RetryStats),
		enabled: true,
	}

	// Initialize default configurations for common operations
	handler.initializeDefaultConfigs()

	return handler
}

// initializeDefaultConfigs sets up default retry configurations
func (rh *RetryHandler) initializeDefaultConfigs() {
	// Contract call retries - moderate backoff
	rh.configs["contract_call"] = RetryConfig{
		MaxAttempts:       3,
		InitialDelay:      500 * time.Millisecond,
		MaxDelay:          5 * time.Second,
		BackoffFactor:     2.0,
		JitterEnabled:     true,
		TimeoutPerAttempt: 10 * time.Second,
	}

	// RPC connection retries - aggressive backoff
	rh.configs["rpc_connection"] = RetryConfig{
		MaxAttempts:       5,
		InitialDelay:      1 * time.Second,
		MaxDelay:          30 * time.Second,
		BackoffFactor:     2.5,
		JitterEnabled:     true,
		TimeoutPerAttempt: 15 * time.Second,
	}

	// Data parsing retries - quick retries
	rh.configs["data_parsing"] = RetryConfig{
		MaxAttempts:       2,
		InitialDelay:      100 * time.Millisecond,
		MaxDelay:          1 * time.Second,
		BackoffFactor:     2.0,
		JitterEnabled:     false,
		TimeoutPerAttempt: 5 * time.Second,
	}

	// Block processing retries - conservative
	rh.configs["block_processing"] = RetryConfig{
		MaxAttempts:       3,
		InitialDelay:      2 * time.Second,
		MaxDelay:          10 * time.Second,
		BackoffFactor:     2.0,
		JitterEnabled:     true,
		TimeoutPerAttempt: 30 * time.Second,
	}

	// Token metadata retries - patient backoff
	rh.configs["token_metadata"] = RetryConfig{
		MaxAttempts:       4,
		InitialDelay:      1 * time.Second,
		MaxDelay:          20 * time.Second,
		BackoffFactor:     2.0,
		JitterEnabled:     true,
		TimeoutPerAttempt: 15 * time.Second,
	}
}

// ExecuteWithRetry executes an operation with retry logic
func (rh *RetryHandler) ExecuteWithRetry(ctx context.Context, operationType string, operation RetryableOperation) *RetryResult {
	if !rh.enabled {
		// If retries are disabled, try once
		err := operation(ctx, 1)
		return &RetryResult{
			Success:       err == nil,
			Attempts:      1,
			TotalDuration: 0,
			LastError:     err,
			LastAttemptAt: time.Now(),
		}
	}

	config := rh.getConfig(operationType)
	start := time.Now()
	var lastError error

	rh.mu.Lock()
	stats, exists := rh.stats[operationType]
	if !exists {
		stats = &RetryStats{
			OperationType: operationType,
		}
		rh.stats[operationType] = stats
	}
	rh.mu.Unlock()

retryLoop:
	for attempt := 1; attempt <= config.MaxAttempts; attempt++ {
		// Create a context with a timeout for this attempt
		attemptCtx, cancel := context.WithTimeout(ctx, config.TimeoutPerAttempt)

		rh.logger.Debug("Attempting operation with retry",
			"operation", operationType,
			"attempt", attempt,
			"max_attempts", config.MaxAttempts)

		// Execute the operation
		err := operation(attemptCtx, attempt)
		cancel()

		// Update statistics
		stats.mu.Lock()
		stats.TotalAttempts++
		stats.LastAttempt = time.Now()
		stats.mu.Unlock()

		if err == nil {
			// Success!
			duration := time.Since(start)

			stats.mu.Lock()
			stats.SuccessfulRetries++
			stats.LastSuccess = time.Now()
			denominator := stats.SuccessfulRetries + stats.FailedRetries
			if denominator > 0 {
				stats.AverageAttempts = float64(stats.TotalAttempts) / float64(denominator)
			}
			stats.mu.Unlock()

			rh.logger.Debug("Operation succeeded",
				"operation", operationType,
				"attempt", attempt,
				"duration", duration)

			return &RetryResult{
				Success:       true,
				Attempts:      attempt,
				TotalDuration: duration,
				LastError:     nil,
				LastAttemptAt: time.Now(),
			}
		}

		lastError = err

		// Check if the context was cancelled
		if ctx.Err() != nil {
			rh.logger.Debug("Operation cancelled by context",
				"operation", operationType,
				"attempt", attempt,
				"error", ctx.Err())
			break
		}

		// Don't wait after the last attempt
		if attempt < config.MaxAttempts {
			delay := rh.calculateDelay(config, attempt)

			rh.logger.Debug("Operation failed, retrying",
				"operation", operationType,
				"attempt", attempt,
				"error", err,
				"delay", delay)

			// Wait before the next attempt
			select {
			case <-time.After(delay):
				// Continue to the next attempt
			case <-ctx.Done():
				// Context cancelled during the wait. A bare break here would
				// only exit the select, so break the labeled loop instead.
				break retryLoop
			}
		} else {
			rh.logger.Warn("Operation failed after all retries",
				"operation", operationType,
				"attempts", attempt,
				"error", err)
		}
	}

	// All attempts failed
	duration := time.Since(start)

	stats.mu.Lock()
	stats.FailedRetries++
	stats.LastFailure = time.Now()
	denominator := stats.SuccessfulRetries + stats.FailedRetries
	if denominator > 0 {
		stats.AverageAttempts = float64(stats.TotalAttempts) / float64(denominator)
	}
	stats.mu.Unlock()

	return &RetryResult{
		Success:       false,
		Attempts:      config.MaxAttempts,
		TotalDuration: duration,
		LastError:     lastError,
		LastAttemptAt: time.Now(),
	}
}

// calculateDelay calculates the delay before the next retry attempt
func (rh *RetryHandler) calculateDelay(config RetryConfig, attempt int) time.Duration {
	// Calculate exponential backoff
	delay := float64(config.InitialDelay) * math.Pow(config.BackoffFactor, float64(attempt-1))

	// Apply the maximum delay cap
	if delay > float64(config.MaxDelay) {
		delay = float64(config.MaxDelay)
	}

	duration := time.Duration(delay)

	// Add jitter if enabled
	if config.JitterEnabled {
		jitter := time.Duration(float64(duration) * 0.1 * (2*rh.randomFloat() - 1))
		duration += jitter
	}

	// Ensure a minimum delay
	if duration < 0 {
		duration = config.InitialDelay
	}

	return duration
}
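As a worked example of the formula above, the "contract_call" defaults (InitialDelay 500ms, BackoffFactor 2.0, MaxDelay 5s) yield 500ms, 1s, 2s, 4s, then the 5s cap, each nudged by up to ±10% when jitter is on. A sketch, assuming an in-package `rh` and the `fmt` import:

```go
// Illustrative: print the exact backoff series with jitter disabled.
cfg := RetryConfig{
	InitialDelay:  500 * time.Millisecond,
	MaxDelay:      5 * time.Second,
	BackoffFactor: 2.0,
	JitterEnabled: false,
}
for attempt := 1; attempt <= 5; attempt++ {
	fmt.Println(attempt, rh.calculateDelay(cfg, attempt))
	// 1 500ms, 2 1s, 3 2s, 4 4s, 5 5s (capped at MaxDelay)
}
```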
// randomFloat returns a pseudo-random float between 0 and 1
func (rh *RetryHandler) randomFloat() float64 {
	// Simple pseudo-random number based on the current time
	return float64(time.Now().UnixNano()%1000) / 1000.0
}

// getConfig returns the retry configuration for an operation type
func (rh *RetryHandler) getConfig(operationType string) RetryConfig {
	rh.mu.RLock()
	defer rh.mu.RUnlock()

	if config, exists := rh.configs[operationType]; exists {
		return config
	}

	// Return the default config if no specific config is found
	return DefaultRetryConfig()
}

// SetConfig sets a custom retry configuration for an operation type
func (rh *RetryHandler) SetConfig(operationType string, config RetryConfig) {
	rh.mu.Lock()
	defer rh.mu.Unlock()

	rh.configs[operationType] = config
	rh.logger.Debug("Set retry config",
		"operation", operationType,
		"max_attempts", config.MaxAttempts,
		"initial_delay", config.InitialDelay,
		"max_delay", config.MaxDelay)
}

// GetStats returns retry statistics for all operation types
func (rh *RetryHandler) GetStats() map[string]*RetryStats {
	rh.mu.RLock()
	defer rh.mu.RUnlock()

	// Return a copy to prevent external modification
	result := make(map[string]*RetryStats)
	for opType, stats := range rh.stats {
		stats.mu.RLock()
		result[opType] = &RetryStats{
			OperationType:     stats.OperationType,
			TotalAttempts:     stats.TotalAttempts,
			SuccessfulRetries: stats.SuccessfulRetries,
			FailedRetries:     stats.FailedRetries,
			AverageAttempts:   stats.AverageAttempts,
			LastAttempt:       stats.LastAttempt,
			LastSuccess:       stats.LastSuccess,
			LastFailure:       stats.LastFailure,
		}
		stats.mu.RUnlock()
	}

	return result
}

// GetOperationStats returns statistics for a specific operation type
func (rh *RetryHandler) GetOperationStats(operationType string) *RetryStats {
	rh.mu.RLock()
	defer rh.mu.RUnlock()

	stats, exists := rh.stats[operationType]
	if !exists {
		return nil
	}

	stats.mu.RLock()
	defer stats.mu.RUnlock()

	return &RetryStats{
		OperationType:     stats.OperationType,
		TotalAttempts:     stats.TotalAttempts,
		SuccessfulRetries: stats.SuccessfulRetries,
		FailedRetries:     stats.FailedRetries,
		AverageAttempts:   stats.AverageAttempts,
		LastAttempt:       stats.LastAttempt,
		LastSuccess:       stats.LastSuccess,
		LastFailure:       stats.LastFailure,
	}
}

// ResetStats resets statistics for all operation types
func (rh *RetryHandler) ResetStats() {
	rh.mu.Lock()
	defer rh.mu.Unlock()

	rh.stats = make(map[string]*RetryStats)
	rh.logger.Info("Reset retry statistics")
}

// Enable enables the retry handler
func (rh *RetryHandler) Enable() {
	rh.mu.Lock()
	defer rh.mu.Unlock()
	rh.enabled = true
	rh.logger.Info("Retry handler enabled")
}

// Disable disables the retry handler
func (rh *RetryHandler) Disable() {
	rh.mu.Lock()
	defer rh.mu.Unlock()
	rh.enabled = false
	rh.logger.Info("Retry handler disabled")
}

// IsEnabled returns whether the retry handler is enabled
func (rh *RetryHandler) IsEnabled() bool {
	rh.mu.RLock()
	defer rh.mu.RUnlock()
	return rh.enabled
}

// GetHealthSummary returns a health summary based on retry statistics
func (rh *RetryHandler) GetHealthSummary() map[string]interface{} {
	stats := rh.GetStats()

	summary := map[string]interface{}{
		"enabled":              rh.enabled,
		"total_operations":     len(stats),
		"healthy_operations":   0,
		"unhealthy_operations": 0,
		"operation_details":    make(map[string]interface{}),
	}

	for opType, opStats := range stats {
		total := opStats.SuccessfulRetries + opStats.FailedRetries
		successRate := 0.0
		if total > 0 {
			successRate = float64(opStats.SuccessfulRetries) / float64(total)
		}

		isHealthy := successRate >= 0.9 && opStats.AverageAttempts <= 2.0

		if isHealthy {
			summary["healthy_operations"] = summary["healthy_operations"].(int) + 1
		} else {
			summary["unhealthy_operations"] = summary["unhealthy_operations"].(int) + 1
		}

		summary["operation_details"].(map[string]interface{})[opType] = map[string]interface{}{
			"success_rate":     successRate,
			"average_attempts": opStats.AverageAttempts,
			"total_operations": total,
			"is_healthy":       isHealthy,
			"last_success":     opStats.LastSuccess,
			"last_failure":     opStats.LastFailure,
		}
	}

	return summary
}
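A minimal caller sketch for ExecuteWithRetry; the `client.Ping` call inside the closure is hypothetical, everything else is the file's own API:

```go
// Sketch: wrapping a flaky RPC call. The closure receives a per-attempt
// context already bounded by the config's TimeoutPerAttempt.
result := rh.ExecuteWithRetry(ctx, "rpc_connection",
	func(ctx context.Context, attempt int) error {
		return client.Ping(ctx) // hypothetical; any non-nil error triggers backoff
	})
if !result.Success {
	log.Warn("giving up", "attempts", result.Attempts, "error", result.LastError)
}
```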
orig/internal/recovery/retry_handler_test.go · new file · 362 lines
@@ -0,0 +1,362 @@
package recovery

import (
	"context"
	"errors"
	"fmt"
	"testing"
	"time"

	"github.com/stretchr/testify/assert"
	"github.com/stretchr/testify/require"

	"github.com/fraktal/mev-beta/internal/logger"
)

func TestRetryHandler_ExecuteWithRetry_Success(t *testing.T) {
	log := logger.New("debug", "text", "")
	handler := NewRetryHandler(log)

	attempts := 0
	operation := func(ctx context.Context, attempt int) error {
		attempts++
		if attempts == 2 {
			return nil // Success on second attempt
		}
		return errors.New("temporary failure")
	}

	result := handler.ExecuteWithRetry(context.Background(), "test_operation", operation)

	assert.True(t, result.Success)
	assert.Equal(t, 2, result.Attempts)
	assert.Nil(t, result.LastError)
	assert.Equal(t, 2, attempts)
}

func TestRetryHandler_ExecuteWithRetry_MaxAttemptsReached(t *testing.T) {
	log := logger.New("debug", "text", "")
	handler := NewRetryHandler(log)

	attempts := 0
	operation := func(ctx context.Context, attempt int) error {
		attempts++
		return errors.New("persistent failure")
	}

	result := handler.ExecuteWithRetry(context.Background(), "test_operation", operation)

	assert.False(t, result.Success)
	assert.Equal(t, 3, result.Attempts) // Default max attempts
	assert.NotNil(t, result.LastError)
	assert.Equal(t, "persistent failure", result.LastError.Error())
	assert.Equal(t, 3, attempts)
}

func TestRetryHandler_ExecuteWithRetry_ContextCanceled(t *testing.T) {
	log := logger.New("debug", "text", "")
	handler := NewRetryHandler(log)

	ctx, cancel := context.WithCancel(context.Background())

	attempts := 0
	operation := func(ctx context.Context, attempt int) error {
		attempts++
		if attempts == 2 {
			cancel() // Cancel context on second attempt
		}
		return errors.New("failure")
	}

	result := handler.ExecuteWithRetry(ctx, "test_operation", operation)

	assert.False(t, result.Success)
	assert.LessOrEqual(t, result.Attempts, 3)
	assert.NotNil(t, result.LastError)
}

func TestRetryHandler_ExecuteWithRetry_CustomConfig(t *testing.T) {
	log := logger.New("debug", "text", "")
	handler := NewRetryHandler(log)

	// Set a custom configuration
	customConfig := RetryConfig{
		MaxAttempts:       5,
		InitialDelay:      10 * time.Millisecond,
		MaxDelay:          100 * time.Millisecond,
		BackoffFactor:     2.0,
		JitterEnabled:     false,
		TimeoutPerAttempt: 1 * time.Second,
	}
	handler.SetConfig("custom_operation", customConfig)

	attempts := 0
	operation := func(ctx context.Context, attempt int) error {
		attempts++
		return errors.New("persistent failure")
	}

	start := time.Now()
	result := handler.ExecuteWithRetry(context.Background(), "custom_operation", operation)
	duration := time.Since(start)

	assert.False(t, result.Success)
	assert.Equal(t, 5, result.Attempts) // Custom max attempts
	assert.Equal(t, 5, attempts)

	// Should have taken some time due to the delays (at least 150ms total)
	expectedMinDuration := 10*time.Millisecond + 20*time.Millisecond + 40*time.Millisecond + 80*time.Millisecond
	assert.GreaterOrEqual(t, duration, expectedMinDuration)
}

func TestRetryHandler_ExecuteWithRetry_Disabled(t *testing.T) {
	log := logger.New("debug", "text", "")
	handler := NewRetryHandler(log)
	handler.Disable()

	attempts := 0
	operation := func(ctx context.Context, attempt int) error {
		attempts++
		return errors.New("failure")
	}

	result := handler.ExecuteWithRetry(context.Background(), "test_operation", operation)

	assert.False(t, result.Success)
	assert.Equal(t, 1, result.Attempts) // Only one attempt when disabled
	assert.Equal(t, 1, attempts)
}

func TestRetryHandler_CalculateDelay(t *testing.T) {
	log := logger.New("debug", "text", "")
	handler := NewRetryHandler(log)

	config := RetryConfig{
		InitialDelay:  100 * time.Millisecond,
		MaxDelay:      1 * time.Second,
		BackoffFactor: 2.0,
		JitterEnabled: false,
	}

	tests := []struct {
		attempt     int
		expectedMin time.Duration
		expectedMax time.Duration
	}{
		{1, 100 * time.Millisecond, 100 * time.Millisecond},
		{2, 200 * time.Millisecond, 200 * time.Millisecond},
		{3, 400 * time.Millisecond, 400 * time.Millisecond},
		{4, 800 * time.Millisecond, 800 * time.Millisecond},
		{5, 1 * time.Second, 1 * time.Second}, // Should be capped at MaxDelay
	}

	for _, tt := range tests {
		t.Run(fmt.Sprintf("attempt_%d", tt.attempt), func(t *testing.T) {
			delay := handler.calculateDelay(config, tt.attempt)
			assert.GreaterOrEqual(t, delay, tt.expectedMin)
			assert.LessOrEqual(t, delay, tt.expectedMax)
		})
	}
}

func TestRetryHandler_CalculateDelay_WithJitter(t *testing.T) {
	log := logger.New("debug", "text", "")
	handler := NewRetryHandler(log)

	config := RetryConfig{
		InitialDelay:  100 * time.Millisecond,
		MaxDelay:      1 * time.Second,
		BackoffFactor: 2.0,
		JitterEnabled: true,
	}

	// Test jitter variation
	delays := make([]time.Duration, 10)
	for i := 0; i < 10; i++ {
		delays[i] = handler.calculateDelay(config, 2) // 200ms base
	}

	// Should have some variation due to jitter
	allSame := true
	for i := 1; i < len(delays); i++ {
		if delays[i] != delays[0] {
			allSame = false
			break
		}
	}
	assert.False(t, allSame, "Jitter should cause variation in delays")

	// All delays should be reasonable (within 10% of base)
	baseDelay := 200 * time.Millisecond
	for _, delay := range delays {
		assert.GreaterOrEqual(t, delay, baseDelay*9/10) // 10% below
		assert.LessOrEqual(t, delay, baseDelay*11/10)   // 10% above
	}
}

func TestRetryHandler_GetStats(t *testing.T) {
	log := logger.New("debug", "text", "")
	handler := NewRetryHandler(log)

	// Execute some operations
	successOp := func(ctx context.Context, attempt int) error {
		return nil
	}
	failOp := func(ctx context.Context, attempt int) error {
		return errors.New("failure")
	}

	handler.ExecuteWithRetry(context.Background(), "test_success", successOp)
	handler.ExecuteWithRetry(context.Background(), "test_success", successOp)
	handler.ExecuteWithRetry(context.Background(), "test_fail", failOp)

	stats := handler.GetStats()

	// Check success stats
	successStats := stats["test_success"]
	require.NotNil(t, successStats)
	assert.Equal(t, 2, successStats.TotalAttempts)
	assert.Equal(t, 2, successStats.SuccessfulRetries)
	assert.Equal(t, 0, successStats.FailedRetries)

	// Check failure stats
	failStats := stats["test_fail"]
	require.NotNil(t, failStats)
	assert.Equal(t, 3, failStats.TotalAttempts) // Default max attempts
	assert.Equal(t, 0, failStats.SuccessfulRetries)
	assert.Equal(t, 1, failStats.FailedRetries)
}

func TestRetryHandler_GetHealthSummary(t *testing.T) {
	log := logger.New("debug", "text", "")
	handler := NewRetryHandler(log)

	// Execute some operations to generate stats
	successOp := func(ctx context.Context, attempt int) error {
		return nil
	}
	partialFailOp := func(ctx context.Context, attempt int) error {
		if attempt < 2 {
			return errors.New("temporary failure")
		}
		return nil
	}

	// 2 immediate successes
	handler.ExecuteWithRetry(context.Background(), "immediate_success", successOp)
	handler.ExecuteWithRetry(context.Background(), "immediate_success", successOp)

	// 1 success after a retry
	handler.ExecuteWithRetry(context.Background(), "retry_success", partialFailOp)

	summary := handler.GetHealthSummary()

	assert.True(t, summary["enabled"].(bool))
	assert.Equal(t, 2, summary["total_operations"].(int))
	assert.Equal(t, 2, summary["healthy_operations"].(int))
	assert.Equal(t, 0, summary["unhealthy_operations"].(int))

	// Check operation details
	details := summary["operation_details"].(map[string]interface{})

	immediateDetails := details["immediate_success"].(map[string]interface{})
	assert.Equal(t, 1.0, immediateDetails["success_rate"].(float64))
	assert.Equal(t, 1.0, immediateDetails["average_attempts"].(float64))
	assert.True(t, immediateDetails["is_healthy"].(bool))

	retryDetails := details["retry_success"].(map[string]interface{})
	assert.Equal(t, 1.0, retryDetails["success_rate"].(float64))
	assert.Equal(t, 2.0, retryDetails["average_attempts"].(float64))
	assert.True(t, retryDetails["is_healthy"].(bool)) // Still healthy despite retries
}

func TestRetryHandler_ConcurrentExecution(t *testing.T) {
	log := logger.New("debug", "text", "")
	handler := NewRetryHandler(log)

	const numGoroutines = 50
	const operationsPerGoroutine = 20

	done := make(chan bool, numGoroutines)
	successCount := make(chan int, numGoroutines)

	operation := func(ctx context.Context, attempt int) error {
		// Roughly 80% of first attempts succeed
		if attempt <= 1 && time.Now().UnixNano()%5 != 0 {
			return nil
		}
		if attempt == 2 {
			return nil // Always succeed on the second attempt
		}
		return errors.New("failure")
	}

	// Launch concurrent retry operations
	for i := 0; i < numGoroutines; i++ {
		go func(id int) {
			defer func() { done <- true }()

			successes := 0
			for j := 0; j < operationsPerGoroutine; j++ {
				result := handler.ExecuteWithRetry(context.Background(),
					fmt.Sprintf("concurrent_op_%d", id), operation)
				if result.Success {
					successes++
				}
			}
			successCount <- successes
		}(i)
	}

	// Collect results
	totalSuccesses := 0
	for i := 0; i < numGoroutines; i++ {
		select {
		case <-done:
			totalSuccesses += <-successCount
		case <-time.After(30 * time.Second):
			t.Fatal("Concurrent retry test timed out")
		}
	}

	totalOperations := numGoroutines * operationsPerGoroutine
	successRate := float64(totalSuccesses) / float64(totalOperations)

	t.Logf("Concurrent execution: %d/%d operations succeeded (%.2f%%)",
		totalSuccesses, totalOperations, successRate*100)

	// Should have a high success rate due to retries
	assert.GreaterOrEqual(t, successRate, 0.8, "Success rate should be at least 80%")

	// Verify stats are consistent
	stats := handler.GetStats()
	assert.NotEmpty(t, stats, "Should have recorded stats")
}

func TestRetryHandler_EdgeCases(t *testing.T) {
	log := logger.New("debug", "text", "")
	handler := NewRetryHandler(log)

	t.Run("nil operation", func(t *testing.T) {
		assert.Panics(t, func() {
			handler.ExecuteWithRetry(context.Background(), "nil_op", nil)
		})
	})

	t.Run("empty operation type", func(t *testing.T) {
		operation := func(ctx context.Context, attempt int) error {
			return nil
		}
		result := handler.ExecuteWithRetry(context.Background(), "", operation)
		assert.True(t, result.Success)
	})

	t.Run("very long operation type", func(t *testing.T) {
		longName := string(make([]byte, 1000))
		operation := func(ctx context.Context, attempt int) error {
			return nil
		}
		result := handler.ExecuteWithRetry(context.Background(), longName, operation)
		assert.True(t, result.Success)
	})
}