feat: create v2-prep branch with comprehensive planning

Restructured project for V2 refactor:

**Structure Changes:**
- Moved all V1 code to orig/ folder (preserved with git mv)
- Created docs/planning/ directory
- Added orig/README_V1.md explaining V1 preservation

**Planning Documents:**
- 00_V2_MASTER_PLAN.md: Complete architecture overview
  - Executive summary of critical V1 issues
  - High-level component architecture diagrams
  - 5-phase implementation roadmap
  - Success metrics and risk mitigation

- 07_TASK_BREAKDOWN.md: Atomic task breakdown
  - 99+ hours of detailed tasks
  - Every task < 2 hours (atomic)
  - Clear dependencies and success criteria
  - Organized by implementation phase

**V2 Key Improvements:**
- Per-exchange parsers (factory pattern; see the sketch after this list)
- Multi-layer strict validation
- Multi-index pool cache
- Background validation pipeline
- Comprehensive observability
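
The per-exchange parser item above is only a plan at this point; as a rough illustration, a factory-pattern parser registry could look like the sketch below. All names (`SwapParser`, `ParsedSwap`, `Register`, `ParserFor`) are hypothetical and not taken from the V2 code.

```go
package parsers

import (
	"fmt"
	"math/big"

	"github.com/ethereum/go-ethereum/common"
	"github.com/ethereum/go-ethereum/core/types"
)

// ParsedSwap is a placeholder result type for this sketch.
type ParsedSwap struct {
	Pool      common.Address
	Token0    common.Address
	Token1    common.Address
	AmountIn  *big.Int
	AmountOut *big.Int
}

// SwapParser is a hypothetical per-exchange parser interface.
type SwapParser interface {
	Protocol() string
	ParseSwap(log types.Log) (*ParsedSwap, error)
}

// factory maps an event signature (topic0) to a protocol-specific constructor.
var factory = map[common.Hash]func() SwapParser{}

// Register is called from each parser's init() so new exchanges plug in
// without touching the dispatch code.
func Register(topic0 common.Hash, ctor func() SwapParser) {
	factory[topic0] = ctor
}

// ParserFor returns the parser responsible for a given swap event signature.
func ParserFor(topic0 common.Hash) (SwapParser, error) {
	ctor, ok := factory[topic0]
	if !ok {
		return nil, fmt.Errorf("no parser registered for topic %s", topic0.Hex())
	}
	return ctor(), nil
}
```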

**Critical Issues Addressed:**
- Zero address tokens (strict validation + cache enrichment)
- Parsing accuracy (protocol-specific parsers)
- No audit trail (background validation channel)
- Inefficient lookups (multi-index cache)
- Stats disconnection (event-driven metrics)

Next Steps:
1. Review planning documents
2. Begin Phase 1: Foundation (P1-001 through P1-010)
3. Implement parsers in Phase 2
4. Build cache system in Phase 3
5. Add validation pipeline in Phase 4
6. Migrate and test in Phase 5

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>

Author: Administrator
Date: 2025-11-10 10:14:26 +01:00
Commit: 803de231ba (parent 1773daffe7)
411 changed files with 20390 additions and 8680 deletions

@@ -0,0 +1,621 @@
package recovery
import (
"context"
"sync"
"time"
"github.com/ethereum/go-ethereum/common"
"github.com/fraktal/mev-beta/internal/logger"
)
// ErrorSeverity represents the severity level of an error
type ErrorSeverity int
const (
SeverityLow ErrorSeverity = iota
SeverityMedium
SeverityHigh
SeverityCritical
)
func (s ErrorSeverity) String() string {
switch s {
case SeverityLow:
return "LOW"
case SeverityMedium:
return "MEDIUM"
case SeverityHigh:
return "HIGH"
case SeverityCritical:
return "CRITICAL"
default:
return "UNKNOWN"
}
}
// ErrorType categorizes different types of errors
type ErrorType int
const (
ErrorTypeAddressCorruption ErrorType = iota
ErrorTypeContractCallFailed
ErrorTypeRPCConnectionFailed
ErrorTypeDataParsingFailed
ErrorTypeValidationFailed
ErrorTypeTimeoutError
)
func (e ErrorType) String() string {
switch e {
case ErrorTypeAddressCorruption:
return "ADDRESS_CORRUPTION"
case ErrorTypeContractCallFailed:
return "CONTRACT_CALL_FAILED"
case ErrorTypeRPCConnectionFailed:
return "RPC_CONNECTION_FAILED"
case ErrorTypeDataParsingFailed:
return "DATA_PARSING_FAILED"
case ErrorTypeValidationFailed:
return "VALIDATION_FAILED"
case ErrorTypeTimeoutError:
return "TIMEOUT_ERROR"
default:
return "UNKNOWN_ERROR"
}
}
// RecoveryAction represents an action to take when an error occurs
type RecoveryAction int
const (
ActionSkipAndContinue RecoveryAction = iota
ActionRetryWithBackoff
ActionUseFallbackData
ActionCircuitBreaker
ActionEmergencyStop
)
func (a RecoveryAction) String() string {
switch a {
case ActionSkipAndContinue:
return "SKIP_AND_CONTINUE"
case ActionRetryWithBackoff:
return "RETRY_WITH_BACKOFF"
case ActionUseFallbackData:
return "USE_FALLBACK_DATA"
case ActionCircuitBreaker:
return "CIRCUIT_BREAKER"
case ActionEmergencyStop:
return "EMERGENCY_STOP"
default:
return "UNKNOWN_ACTION"
}
}
// ErrorEvent represents a specific error occurrence
type ErrorEvent struct {
Timestamp time.Time
Type ErrorType
Severity ErrorSeverity
Component string
Address common.Address
Message string
Context map[string]interface{}
AttemptCount int
LastAttempt time.Time
Resolved bool
ResolvedAt time.Time
}
// RecoveryRule defines how to handle specific error patterns
type RecoveryRule struct {
ErrorType ErrorType
MaxSeverity ErrorSeverity
Action RecoveryAction
MaxRetries int
BackoffInterval time.Duration
CircuitBreakerThreshold int
ContextMatchers map[string]interface{}
}
// ErrorHandler provides comprehensive error handling and recovery capabilities
type ErrorHandler struct {
mu sync.RWMutex
logger *logger.Logger
errorHistory []ErrorEvent
componentStats map[string]*ComponentStats
circuitBreakers map[string]*CircuitBreaker
recoveryRules []RecoveryRule
fallbackProvider FallbackDataProvider
maxHistorySize int
alertThresholds map[ErrorType]int
enabled bool
}
// ComponentStats tracks error statistics for components
type ComponentStats struct {
mu sync.RWMutex
Component string
TotalErrors int
ErrorsByType map[ErrorType]int
ErrorsBySeverity map[ErrorSeverity]int
LastError time.Time
ConsecutiveFailures int
SuccessCount int
IsHealthy bool
LastHealthCheck time.Time
}
// CircuitBreaker implements circuit breaker pattern for failing components
type CircuitBreaker struct {
mu sync.RWMutex
Name string
State CircuitState
FailureCount int
Threshold int
Timeout time.Duration
LastFailure time.Time
LastSuccess time.Time
HalfOpenAllowed bool
}
type CircuitState int
const (
CircuitClosed CircuitState = iota
CircuitOpen
CircuitHalfOpen
)
func (s CircuitState) String() string {
switch s {
case CircuitClosed:
return "CLOSED"
case CircuitOpen:
return "OPEN"
case CircuitHalfOpen:
return "HALF_OPEN"
default:
return "UNKNOWN"
}
}
// FallbackDataProvider interface for providing fallback data when primary sources fail
type FallbackDataProvider interface {
GetFallbackTokenInfo(ctx context.Context, address common.Address) (*FallbackTokenInfo, error)
GetFallbackPoolInfo(ctx context.Context, address common.Address) (*FallbackPoolInfo, error)
GetFallbackContractType(ctx context.Context, address common.Address) (string, error)
}
type FallbackTokenInfo struct {
Address common.Address
Symbol string
Name string
Decimals uint8
IsVerified bool
Source string
Confidence float64
}
type FallbackPoolInfo struct {
Address common.Address
Token0 common.Address
Token1 common.Address
Protocol string
Fee uint32
IsVerified bool
Source string
Confidence float64
}
// NewErrorHandler creates a new error handler with default configuration
func NewErrorHandler(logger *logger.Logger) *ErrorHandler {
handler := &ErrorHandler{
logger: logger,
errorHistory: make([]ErrorEvent, 0),
componentStats: make(map[string]*ComponentStats),
circuitBreakers: make(map[string]*CircuitBreaker),
maxHistorySize: 1000,
alertThresholds: make(map[ErrorType]int),
enabled: true,
}
// Initialize default recovery rules
handler.initializeDefaultRules()
// Initialize default alert thresholds
handler.initializeAlertThresholds()
return handler
}
// initializeDefaultRules sets up default recovery rules for common error scenarios
func (eh *ErrorHandler) initializeDefaultRules() {
eh.recoveryRules = []RecoveryRule{
{
ErrorType: ErrorTypeAddressCorruption,
MaxSeverity: SeverityMedium,
Action: ActionRetryWithBackoff,
MaxRetries: 2,
BackoffInterval: 500 * time.Millisecond,
},
{
ErrorType: ErrorTypeAddressCorruption,
MaxSeverity: SeverityCritical,
Action: ActionUseFallbackData,
MaxRetries: 0,
BackoffInterval: 0,
},
{
ErrorType: ErrorTypeContractCallFailed,
MaxSeverity: SeverityMedium,
Action: ActionRetryWithBackoff,
MaxRetries: 3,
BackoffInterval: 2 * time.Second,
},
{
ErrorType: ErrorTypeRPCConnectionFailed,
MaxSeverity: SeverityHigh,
Action: ActionCircuitBreaker,
MaxRetries: 5,
BackoffInterval: 5 * time.Second,
CircuitBreakerThreshold: 10,
},
{
ErrorType: ErrorTypeDataParsingFailed,
MaxSeverity: SeverityMedium,
Action: ActionUseFallbackData,
MaxRetries: 2,
BackoffInterval: 1 * time.Second,
},
{
ErrorType: ErrorTypeValidationFailed,
MaxSeverity: SeverityLow,
Action: ActionSkipAndContinue,
MaxRetries: 0,
BackoffInterval: 0,
},
{
ErrorType: ErrorTypeValidationFailed,
MaxSeverity: SeverityHigh,
Action: ActionRetryWithBackoff,
MaxRetries: 1,
BackoffInterval: 500 * time.Millisecond,
},
{
ErrorType: ErrorTypeTimeoutError,
MaxSeverity: SeverityMedium,
Action: ActionRetryWithBackoff,
MaxRetries: 3,
BackoffInterval: 3 * time.Second,
},
}
}
// initializeAlertThresholds sets up alert thresholds for different error types
func (eh *ErrorHandler) initializeAlertThresholds() {
eh.alertThresholds[ErrorTypeAddressCorruption] = 5
eh.alertThresholds[ErrorTypeContractCallFailed] = 20
eh.alertThresholds[ErrorTypeRPCConnectionFailed] = 10
eh.alertThresholds[ErrorTypeDataParsingFailed] = 15
eh.alertThresholds[ErrorTypeValidationFailed] = 25
eh.alertThresholds[ErrorTypeTimeoutError] = 30
}
// HandleError processes an error and determines the appropriate recovery action
func (eh *ErrorHandler) HandleError(ctx context.Context, errorType ErrorType, severity ErrorSeverity, component string, address common.Address, message string, context map[string]interface{}) RecoveryAction {
if !eh.enabled {
return ActionSkipAndContinue
}
eh.mu.Lock()
defer eh.mu.Unlock()
// Record the error event
event := ErrorEvent{
Timestamp: time.Now(),
Type: errorType,
Severity: severity,
Component: component,
Address: address,
Message: message,
Context: context,
AttemptCount: 1,
LastAttempt: time.Now(),
}
// Update error history
eh.addToHistory(event)
// Update component statistics
eh.updateComponentStats(component, errorType, severity)
// Check circuit breakers
if eh.shouldTriggerCircuitBreaker(component, errorType) {
eh.triggerCircuitBreaker(component)
return ActionCircuitBreaker
}
// Find matching recovery rule
rule := eh.findRecoveryRule(errorType, severity, context)
if rule == nil {
// Default action for unmatched errors
return ActionSkipAndContinue
}
// Log the error and recovery action
eh.logger.Error("Error handled by recovery system",
"type", errorType.String(),
"severity", severity.String(),
"component", component,
"address", address.Hex(),
"message", message,
"action", rule.Action.String())
// Check if alert threshold is reached
eh.checkAlertThresholds(errorType)
return rule.Action
}
// addToHistory adds an error event to the history buffer
func (eh *ErrorHandler) addToHistory(event ErrorEvent) {
eh.errorHistory = append(eh.errorHistory, event)
// Trim history if it exceeds max size
if len(eh.errorHistory) > eh.maxHistorySize {
eh.errorHistory = eh.errorHistory[len(eh.errorHistory)-eh.maxHistorySize:]
}
}
// updateComponentStats updates statistics for a component
func (eh *ErrorHandler) updateComponentStats(component string, errorType ErrorType, severity ErrorSeverity) {
stats, exists := eh.componentStats[component]
if !exists {
stats = &ComponentStats{
Component: component,
ErrorsByType: make(map[ErrorType]int),
ErrorsBySeverity: make(map[ErrorSeverity]int),
IsHealthy: true,
}
eh.componentStats[component] = stats
}
stats.mu.Lock()
defer stats.mu.Unlock()
stats.TotalErrors++
stats.ErrorsByType[errorType]++
stats.ErrorsBySeverity[severity]++
stats.LastError = time.Now()
stats.ConsecutiveFailures++
// Mark as unhealthy if too many consecutive failures
if stats.ConsecutiveFailures > 10 {
stats.IsHealthy = false
}
}
// findRecoveryRule finds the best matching recovery rule for an error
func (eh *ErrorHandler) findRecoveryRule(errorType ErrorType, severity ErrorSeverity, context map[string]interface{}) *RecoveryRule {
for _, rule := range eh.recoveryRules {
if rule.ErrorType == errorType && severity <= rule.MaxSeverity {
// Check context matchers if present
if len(rule.ContextMatchers) > 0 {
if !eh.matchesContext(context, rule.ContextMatchers) {
continue
}
}
return &rule
}
}
return nil
}
// matchesContext checks if the error context matches the rule's context matchers
func (eh *ErrorHandler) matchesContext(errorContext, ruleMatchers map[string]interface{}) bool {
for key, expectedValue := range ruleMatchers {
if actualValue, exists := errorContext[key]; !exists || actualValue != expectedValue {
return false
}
}
return true
}
// shouldTriggerCircuitBreaker determines if a circuit breaker should be triggered
func (eh *ErrorHandler) shouldTriggerCircuitBreaker(component string, errorType ErrorType) bool {
stats, exists := eh.componentStats[component]
if !exists {
return false
}
stats.mu.RLock()
defer stats.mu.RUnlock()
// Trigger if consecutive failures exceed threshold for critical errors
if errorType == ErrorTypeRPCConnectionFailed && stats.ConsecutiveFailures >= 5 {
return true
}
if errorType == ErrorTypeAddressCorruption && stats.ConsecutiveFailures >= 3 {
return true
}
return false
}
// triggerCircuitBreaker activates a circuit breaker for a component
func (eh *ErrorHandler) triggerCircuitBreaker(component string) {
breaker := &CircuitBreaker{
Name: component,
State: CircuitOpen,
FailureCount: 0,
Threshold: 5,
Timeout: 30 * time.Second,
LastFailure: time.Now(),
}
eh.circuitBreakers[component] = breaker
eh.logger.Warn("Circuit breaker triggered",
"component", component,
"timeout", breaker.Timeout)
}
// checkAlertThresholds checks if error counts have reached alert thresholds
func (eh *ErrorHandler) checkAlertThresholds(errorType ErrorType) {
threshold, exists := eh.alertThresholds[errorType]
if !exists {
return
}
// Count recent errors of this type (last hour)
recentCount := 0
cutoff := time.Now().Add(-1 * time.Hour)
for _, event := range eh.errorHistory {
if event.Type == errorType && event.Timestamp.After(cutoff) {
recentCount++
}
}
if recentCount >= threshold {
eh.logger.Warn("Error threshold reached - alert triggered",
"error_type", errorType.String(),
"count", recentCount,
"threshold", threshold)
// Here you would trigger your alerting system
}
}
// GetComponentHealth returns the health status of all components
func (eh *ErrorHandler) GetComponentHealth() map[string]*ComponentStats {
eh.mu.RLock()
defer eh.mu.RUnlock()
// Return a copy to prevent external modification
result := make(map[string]*ComponentStats)
for name, stats := range eh.componentStats {
result[name] = &ComponentStats{
Component: stats.Component,
TotalErrors: stats.TotalErrors,
ErrorsByType: make(map[ErrorType]int),
ErrorsBySeverity: make(map[ErrorSeverity]int),
LastError: stats.LastError,
ConsecutiveFailures: stats.ConsecutiveFailures,
SuccessCount: stats.SuccessCount,
IsHealthy: stats.IsHealthy,
LastHealthCheck: stats.LastHealthCheck,
}
// Copy maps
for k, v := range stats.ErrorsByType {
result[name].ErrorsByType[k] = v
}
for k, v := range stats.ErrorsBySeverity {
result[name].ErrorsBySeverity[k] = v
}
}
return result
}
// RecordSuccess records a successful operation for a component
func (eh *ErrorHandler) RecordSuccess(component string) {
eh.mu.Lock()
defer eh.mu.Unlock()
stats, exists := eh.componentStats[component]
if !exists {
stats = &ComponentStats{
Component: component,
ErrorsByType: make(map[ErrorType]int),
ErrorsBySeverity: make(map[ErrorSeverity]int),
IsHealthy: true,
}
eh.componentStats[component] = stats
}
stats.mu.Lock()
defer stats.mu.Unlock()
stats.SuccessCount++
stats.ConsecutiveFailures = 0
stats.IsHealthy = true
stats.LastHealthCheck = time.Now()
// Reset circuit breaker if it exists
if breaker, exists := eh.circuitBreakers[component]; exists {
breaker.mu.Lock()
breaker.State = CircuitClosed
breaker.FailureCount = 0
breaker.LastSuccess = time.Now()
breaker.mu.Unlock()
}
}
// IsCircuitOpen checks if a circuit breaker is open for a component
func (eh *ErrorHandler) IsCircuitOpen(component string) bool {
eh.mu.RLock()
defer eh.mu.RUnlock()
breaker, exists := eh.circuitBreakers[component]
if !exists {
return false
}
// Take the write lock: the timeout check below may flip the breaker to half-open.
breaker.mu.Lock()
defer breaker.mu.Unlock()
if breaker.State == CircuitOpen {
// Check if timeout has passed
if time.Since(breaker.LastFailure) > breaker.Timeout {
breaker.State = CircuitHalfOpen
breaker.HalfOpenAllowed = true
return false
}
return true
}
return false
}
// SetFallbackProvider sets the fallback data provider
func (eh *ErrorHandler) SetFallbackProvider(provider FallbackDataProvider) {
eh.mu.Lock()
defer eh.mu.Unlock()
eh.fallbackProvider = provider
}
// GetErrorSummary returns a summary of recent errors
func (eh *ErrorHandler) GetErrorSummary(duration time.Duration) map[string]interface{} {
eh.mu.RLock()
defer eh.mu.RUnlock()
cutoff := time.Now().Add(-duration)
summary := map[string]interface{}{
"total_errors": 0,
"errors_by_type": make(map[string]int),
"errors_by_severity": make(map[string]int),
"errors_by_component": make(map[string]int),
"time_range": duration.String(),
}
for _, event := range eh.errorHistory {
if event.Timestamp.After(cutoff) {
summary["total_errors"] = summary["total_errors"].(int) + 1
typeKey := event.Type.String()
summary["errors_by_type"].(map[string]int)[typeKey]++
severityKey := event.Severity.String()
summary["errors_by_severity"].(map[string]int)[severityKey]++
summary["errors_by_component"].(map[string]int)[event.Component]++
}
}
return summary
}
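
A hedged usage sketch (hypothetical consumer, not part of this diff) showing how the handler's recovery decisions are meant to drive the retry handler and fallback provider defined in the following files; names such as `pool_scanner` and `fetch` are made up for illustration.

```go
package recovery_test

import (
	"context"

	"github.com/ethereum/go-ethereum/common"

	"github.com/fraktal/mev-beta/internal/recovery"
)

// handlePoolFetchError is a hypothetical consumer showing how the recovery
// primitives compose; identifiers outside this diff are assumptions.
func handlePoolFetchError(
	ctx context.Context,
	eh *recovery.ErrorHandler,
	rh *recovery.RetryHandler,
	fp recovery.FallbackDataProvider,
	poolAddr common.Address,
	fetch recovery.RetryableOperation,
	cause error,
) {
	// Ask the error handler which recovery action applies to this failure.
	action := eh.HandleError(ctx, recovery.ErrorTypeContractCallFailed, recovery.SeverityMedium,
		"pool_scanner", poolAddr, cause.Error(),
		map[string]interface{}{"stage": "reserves"})

	switch action {
	case recovery.ActionRetryWithBackoff:
		// Delegate backoff, jitter and per-attempt timeouts to the retry handler.
		if result := rh.ExecuteWithRetry(ctx, "contract_call", fetch); result.Success {
			eh.RecordSuccess("pool_scanner")
		}
	case recovery.ActionUseFallbackData:
		// Serve low-confidence data from the fallback provider instead of retrying.
		if info, err := fp.GetFallbackPoolInfo(ctx, poolAddr); err == nil {
			_ = info // hand off to the caller's enrichment path
		}
	case recovery.ActionCircuitBreaker, recovery.ActionEmergencyStop:
		// Back off until IsCircuitOpen reports the component closed again.
		return
	default:
		// ActionSkipAndContinue: drop this event and move on.
	}
}
```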

@@ -0,0 +1,384 @@
package recovery
import (
"context"
"fmt"
"sync"
"time"
"github.com/ethereum/go-ethereum/common"
"github.com/fraktal/mev-beta/internal/logger"
"github.com/fraktal/mev-beta/internal/registry"
)
// DefaultFallbackProvider implements FallbackDataProvider with multiple data sources
type DefaultFallbackProvider struct {
mu sync.RWMutex
logger *logger.Logger
contractRegistry *registry.ContractRegistry
staticTokenData map[common.Address]*FallbackTokenInfo
staticPoolData map[common.Address]*FallbackPoolInfo
cacheTimeout time.Duration
enabled bool
}
// NewDefaultFallbackProvider creates a new fallback data provider
func NewDefaultFallbackProvider(logger *logger.Logger, contractRegistry *registry.ContractRegistry) *DefaultFallbackProvider {
provider := &DefaultFallbackProvider{
logger: logger,
contractRegistry: contractRegistry,
staticTokenData: make(map[common.Address]*FallbackTokenInfo),
staticPoolData: make(map[common.Address]*FallbackPoolInfo),
cacheTimeout: 5 * time.Minute,
enabled: true,
}
// Initialize with known safe data
provider.initializeStaticData()
return provider
}
// initializeStaticData populates the provider with known good data for critical Arbitrum contracts
func (fp *DefaultFallbackProvider) initializeStaticData() {
fp.mu.Lock()
defer fp.mu.Unlock()
// Major Arbitrum tokens with verified addresses
fp.staticTokenData[common.HexToAddress("0x82aF49447D8a07e3bd95BD0d56f35241523fBab1")] = &FallbackTokenInfo{
Address: common.HexToAddress("0x82aF49447D8a07e3bd95BD0d56f35241523fBab1"),
Symbol: "WETH",
Name: "Wrapped Ether",
Decimals: 18,
IsVerified: true,
Source: "static_fallback",
Confidence: 1.0,
}
fp.staticTokenData[common.HexToAddress("0xaf88d065e77c8cC2239327C5EDb3A432268e5831")] = &FallbackTokenInfo{
Address: common.HexToAddress("0xaf88d065e77c8cC2239327C5EDb3A432268e5831"),
Symbol: "USDC",
Name: "USD Coin",
Decimals: 6,
IsVerified: true,
Source: "static_fallback",
Confidence: 1.0,
}
fp.staticTokenData[common.HexToAddress("0xFd086bC7CD5C481DCC9C85ebE478A1C0b69FCbb9")] = &FallbackTokenInfo{
Address: common.HexToAddress("0xFd086bC7CD5C481DCC9C85ebE478A1C0b69FCbb9"),
Symbol: "USDT",
Name: "Tether USD",
Decimals: 6,
IsVerified: true,
Source: "static_fallback",
Confidence: 1.0,
}
fp.staticTokenData[common.HexToAddress("0x2f2a2543B76A4166549F7aaB2e75Bef0aefC5B0f")] = &FallbackTokenInfo{
Address: common.HexToAddress("0x2f2a2543B76A4166549F7aaB2e75Bef0aefC5B0f"),
Symbol: "WBTC",
Name: "Wrapped BTC",
Decimals: 8,
IsVerified: true,
Source: "static_fallback",
Confidence: 1.0,
}
fp.staticTokenData[common.HexToAddress("0x912CE59144191C1204E64559FE8253a0e49E6548")] = &FallbackTokenInfo{
Address: common.HexToAddress("0x912CE59144191C1204E64559FE8253a0e49E6548"),
Symbol: "ARB",
Name: "Arbitrum",
Decimals: 18,
IsVerified: true,
Source: "static_fallback",
Confidence: 1.0,
}
// High-volume Uniswap V3 pools with verified addresses and token pairs
fp.staticPoolData[common.HexToAddress("0xC6962004f452bE9203591991D15f6b388e09E8D0")] = &FallbackPoolInfo{
Address: common.HexToAddress("0xC6962004f452bE9203591991D15f6b388e09E8D0"),
Token0: common.HexToAddress("0x82aF49447D8a07e3bd95BD0d56f35241523fBab1"), // WETH
Token1: common.HexToAddress("0xaf88d065e77c8cC2239327C5EDb3A432268e5831"), // USDC
Protocol: "UniswapV3",
Fee: 500, // 0.05%
IsVerified: true,
Source: "static_fallback",
Confidence: 1.0,
}
fp.staticPoolData[common.HexToAddress("0x641C00A822e8b671738d32a431a4Fb6074E5c79d")] = &FallbackPoolInfo{
Address: common.HexToAddress("0x641C00A822e8b671738d32a431a4Fb6074E5c79d"),
Token0: common.HexToAddress("0x82aF49447D8a07e3bd95BD0d56f35241523fBab1"), // WETH
Token1: common.HexToAddress("0xaf88d065e77c8cC2239327C5EDb3A432268e5831"), // USDC
Protocol: "UniswapV3",
Fee: 3000, // 0.3%
IsVerified: true,
Source: "static_fallback",
Confidence: 1.0,
}
fp.staticPoolData[common.HexToAddress("0x17c14D2c404D167802b16C450d3c99F88F2c4F4d")] = &FallbackPoolInfo{
Address: common.HexToAddress("0x17c14D2c404D167802b16C450d3c99F88F2c4F4d"),
Token0: common.HexToAddress("0xaf88d065e77c8cC2239327C5EDb3A432268e5831"), // USDC
Token1: common.HexToAddress("0xFd086bC7CD5C481DCC9C85ebE478A1C0b69FCbb9"), // USDT
Protocol: "UniswapV3",
Fee: 100, // 0.01%
IsVerified: true,
Source: "static_fallback",
Confidence: 1.0,
}
fp.staticPoolData[common.HexToAddress("0x2f5e87C032bc4F8526F320c012A4e678F1fa6cAB")] = &FallbackPoolInfo{
Address: common.HexToAddress("0x2f5e87C032bc4F8526F320c012A4e678F1fa6cAB"),
Token0: common.HexToAddress("0x2f2a2543B76A4166549F7aaB2e75Bef0aefC5B0f"), // WBTC
Token1: common.HexToAddress("0x82aF49447D8a07e3bd95BD0d56f35241523fBab1"), // WETH
Protocol: "UniswapV3",
Fee: 500, // 0.05%
IsVerified: true,
Source: "static_fallback",
Confidence: 1.0,
}
fp.logger.Info("Initialized fallback provider with static data",
"tokens", len(fp.staticTokenData),
"pools", len(fp.staticPoolData))
}
// GetFallbackTokenInfo provides fallback token information
func (fp *DefaultFallbackProvider) GetFallbackTokenInfo(ctx context.Context, address common.Address) (*FallbackTokenInfo, error) {
if !fp.enabled {
return nil, fmt.Errorf("fallback provider disabled")
}
fp.mu.RLock()
defer fp.mu.RUnlock()
// First, try static data
if tokenInfo, exists := fp.staticTokenData[address]; exists {
fp.logger.Debug("Fallback token info from static data",
"address", address.Hex(),
"symbol", tokenInfo.Symbol,
"source", tokenInfo.Source)
return tokenInfo, nil
}
// Second, try contract registry if available
if fp.contractRegistry != nil {
if contractInfo, err := fp.contractRegistry.GetContractInfo(ctx, address); err == nil && contractInfo != nil {
tokenInfo := &FallbackTokenInfo{
Address: address,
Symbol: contractInfo.Symbol,
Name: contractInfo.Name,
Decimals: contractInfo.Decimals,
IsVerified: contractInfo.IsVerified,
Source: "contract_registry",
Confidence: contractInfo.Confidence,
}
fp.logger.Debug("Fallback token info from registry",
"address", address.Hex(),
"symbol", tokenInfo.Symbol,
"confidence", tokenInfo.Confidence)
return tokenInfo, nil
}
}
// Third, provide minimal safe fallback for unknown tokens
tokenInfo := &FallbackTokenInfo{
Address: address,
Symbol: fmt.Sprintf("UNK_%s", address.Hex()[:8]),
Name: "Unknown Token",
Decimals: 18, // Safe default
IsVerified: false,
Source: "generated_fallback",
Confidence: 0.1,
}
fp.logger.Warn("Using generated fallback token info",
"address", address.Hex(),
"symbol", tokenInfo.Symbol)
return tokenInfo, nil
}
// GetFallbackPoolInfo provides fallback pool information
func (fp *DefaultFallbackProvider) GetFallbackPoolInfo(ctx context.Context, address common.Address) (*FallbackPoolInfo, error) {
if !fp.enabled {
return nil, fmt.Errorf("fallback provider disabled")
}
fp.mu.RLock()
defer fp.mu.RUnlock()
// First, try static data
if poolInfo, exists := fp.staticPoolData[address]; exists {
fp.logger.Debug("Fallback pool info from static data",
"address", address.Hex(),
"protocol", poolInfo.Protocol,
"token0", poolInfo.Token0.Hex(),
"token1", poolInfo.Token1.Hex())
return poolInfo, nil
}
// Second, try contract registry if available
if fp.contractRegistry != nil {
if poolInfo := fp.contractRegistry.GetPoolInfo(address); poolInfo != nil {
fallbackInfo := &FallbackPoolInfo{
Address: address,
Token0: poolInfo.Token0,
Token1: poolInfo.Token1,
Protocol: poolInfo.Protocol,
Fee: poolInfo.Fee,
IsVerified: poolInfo.IsVerified,
Source: "contract_registry",
Confidence: poolInfo.Confidence,
}
fp.logger.Debug("Fallback pool info from registry",
"address", address.Hex(),
"protocol", fallbackInfo.Protocol,
"confidence", fallbackInfo.Confidence)
return fallbackInfo, nil
}
}
// No fallback available for unknown pools - return error
return nil, fmt.Errorf("no fallback data available for pool %s", address.Hex())
}
// GetFallbackContractType provides fallback contract type information
func (fp *DefaultFallbackProvider) GetFallbackContractType(ctx context.Context, address common.Address) (string, error) {
if !fp.enabled {
return "", fmt.Errorf("fallback provider disabled")
}
fp.mu.RLock()
defer fp.mu.RUnlock()
// Check if it's a known token
if _, exists := fp.staticTokenData[address]; exists {
return "ERC20", nil
}
// Check if it's a known pool
if _, exists := fp.staticPoolData[address]; exists {
return "Pool", nil
}
// Try contract registry
if fp.contractRegistry != nil {
if contractInfo, err := fp.contractRegistry.GetContractInfo(ctx, address); err == nil && contractInfo != nil {
return contractInfo.Type.String(), nil
}
}
// Default to unknown
return "Unknown", nil
}
// AddStaticTokenData adds static token data for fallback use
func (fp *DefaultFallbackProvider) AddStaticTokenData(address common.Address, info *FallbackTokenInfo) {
fp.mu.Lock()
defer fp.mu.Unlock()
fp.staticTokenData[address] = info
fp.logger.Debug("Added static token data",
"address", address.Hex(),
"symbol", info.Symbol)
}
// AddStaticPoolData adds static pool data for fallback use
func (fp *DefaultFallbackProvider) AddStaticPoolData(address common.Address, info *FallbackPoolInfo) {
fp.mu.Lock()
defer fp.mu.Unlock()
fp.staticPoolData[address] = info
fp.logger.Debug("Added static pool data",
"address", address.Hex(),
"protocol", info.Protocol)
}
// IsAddressKnown checks if an address is in the static fallback data
func (fp *DefaultFallbackProvider) IsAddressKnown(address common.Address) bool {
fp.mu.RLock()
defer fp.mu.RUnlock()
_, isToken := fp.staticTokenData[address]
_, isPool := fp.staticPoolData[address]
return isToken || isPool
}
// GetKnownAddresses returns all known addresses in the fallback provider
func (fp *DefaultFallbackProvider) GetKnownAddresses() (tokens []common.Address, pools []common.Address) {
fp.mu.RLock()
defer fp.mu.RUnlock()
for addr := range fp.staticTokenData {
tokens = append(tokens, addr)
}
for addr := range fp.staticPoolData {
pools = append(pools, addr)
}
return tokens, pools
}
// ValidateAddressWithFallback performs validation using fallback data
func (fp *DefaultFallbackProvider) ValidateAddressWithFallback(ctx context.Context, address common.Address, expectedType string) (bool, float64, error) {
if !fp.enabled {
return false, 0.0, fmt.Errorf("fallback provider disabled")
}
// Check if address is known in our static data
if fp.IsAddressKnown(address) {
actualType, err := fp.GetFallbackContractType(ctx, address)
if err != nil {
return false, 0.0, err
}
if actualType == expectedType {
return true, 1.0, nil // High confidence for known addresses
}
return false, 0.0, fmt.Errorf("type mismatch: expected %s, got %s", expectedType, actualType)
}
// For unknown addresses, provide low confidence validation
return true, 0.3, nil // Allow with low confidence
}
// GetStats returns statistics about the fallback provider
func (fp *DefaultFallbackProvider) GetStats() map[string]interface{} {
fp.mu.RLock()
defer fp.mu.RUnlock()
return map[string]interface{}{
"enabled": fp.enabled,
"static_tokens_count": len(fp.staticTokenData),
"static_pools_count": len(fp.staticPoolData),
"cache_timeout": fp.cacheTimeout.String(),
"has_registry": fp.contractRegistry != nil,
}
}
// Enable enables the fallback provider
func (fp *DefaultFallbackProvider) Enable() {
fp.mu.Lock()
defer fp.mu.Unlock()
fp.enabled = true
fp.logger.Info("Fallback provider enabled")
}
// Disable disables the fallback provider
func (fp *DefaultFallbackProvider) Disable() {
fp.mu.Lock()
defer fp.mu.Unlock()
fp.enabled = false
fp.logger.Info("Fallback provider disabled")
}
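
A short sketch of the assumed wiring (not shown in this diff): extending the bundled static data and attaching the provider to the error handler so `ActionUseFallbackData` has somewhere to look. The helper name is hypothetical and the DAI address is illustrative only.

```go
package recovery_test

import (
	"github.com/ethereum/go-ethereum/common"

	"github.com/fraktal/mev-beta/internal/recovery"
)

// wireFallbacks is a hypothetical setup helper: it adds one extra static token
// and hooks the provider into the error handler. Verify the address before use.
func wireFallbacks(eh *recovery.ErrorHandler, fp *recovery.DefaultFallbackProvider) {
	dai := common.HexToAddress("0xDA10009cBd5D07dd0CeCc66161FC93D7c9000da1")
	fp.AddStaticTokenData(dai, &recovery.FallbackTokenInfo{
		Address:    dai,
		Symbol:     "DAI",
		Name:       "Dai Stablecoin",
		Decimals:   18,
		IsVerified: true,
		Source:     "static_fallback",
		Confidence: 1.0,
	})

	// ActionUseFallbackData only helps if the handler knows where to look.
	eh.SetFallbackProvider(fp)
}
```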

@@ -0,0 +1,446 @@
package recovery
import (
"context"
"math"
"sync"
"time"
"github.com/fraktal/mev-beta/internal/logger"
)
// RetryConfig defines retry behavior configuration
type RetryConfig struct {
MaxAttempts int
InitialDelay time.Duration
MaxDelay time.Duration
BackoffFactor float64
JitterEnabled bool
TimeoutPerAttempt time.Duration
}
// DefaultRetryConfig returns a sensible default retry configuration
func DefaultRetryConfig() RetryConfig {
return RetryConfig{
MaxAttempts: 3,
InitialDelay: 1 * time.Second,
MaxDelay: 30 * time.Second,
BackoffFactor: 2.0,
JitterEnabled: true,
TimeoutPerAttempt: 10 * time.Second,
}
}
// RetryableOperation represents an operation that can be retried
type RetryableOperation func(ctx context.Context, attempt int) error
// RetryHandler provides exponential backoff retry capabilities
type RetryHandler struct {
mu sync.RWMutex
logger *logger.Logger
configs map[string]RetryConfig
stats map[string]*RetryStats
enabled bool
}
// RetryStats tracks retry statistics for operations
type RetryStats struct {
mu sync.RWMutex
OperationType string
TotalAttempts int
SuccessfulRetries int
FailedRetries int
AverageAttempts float64
LastAttempt time.Time
LastSuccess time.Time
LastFailure time.Time
}
// RetryResult contains the result of a retry operation
type RetryResult struct {
Success bool
Attempts int
TotalDuration time.Duration
LastError error
LastAttemptAt time.Time
}
// NewRetryHandler creates a new retry handler
func NewRetryHandler(logger *logger.Logger) *RetryHandler {
handler := &RetryHandler{
logger: logger,
configs: make(map[string]RetryConfig),
stats: make(map[string]*RetryStats),
enabled: true,
}
// Initialize default configurations for common operations
handler.initializeDefaultConfigs()
return handler
}
// initializeDefaultConfigs sets up default retry configurations
func (rh *RetryHandler) initializeDefaultConfigs() {
// Contract call retries - moderate backoff
rh.configs["contract_call"] = RetryConfig{
MaxAttempts: 3,
InitialDelay: 500 * time.Millisecond,
MaxDelay: 5 * time.Second,
BackoffFactor: 2.0,
JitterEnabled: true,
TimeoutPerAttempt: 10 * time.Second,
}
// RPC connection retries - aggressive backoff
rh.configs["rpc_connection"] = RetryConfig{
MaxAttempts: 5,
InitialDelay: 1 * time.Second,
MaxDelay: 30 * time.Second,
BackoffFactor: 2.5,
JitterEnabled: true,
TimeoutPerAttempt: 15 * time.Second,
}
// Data parsing retries - quick retries
rh.configs["data_parsing"] = RetryConfig{
MaxAttempts: 2,
InitialDelay: 100 * time.Millisecond,
MaxDelay: 1 * time.Second,
BackoffFactor: 2.0,
JitterEnabled: false,
TimeoutPerAttempt: 5 * time.Second,
}
// Block processing retries - conservative
rh.configs["block_processing"] = RetryConfig{
MaxAttempts: 3,
InitialDelay: 2 * time.Second,
MaxDelay: 10 * time.Second,
BackoffFactor: 2.0,
JitterEnabled: true,
TimeoutPerAttempt: 30 * time.Second,
}
// Token metadata retries - patient backoff
rh.configs["token_metadata"] = RetryConfig{
MaxAttempts: 4,
InitialDelay: 1 * time.Second,
MaxDelay: 20 * time.Second,
BackoffFactor: 2.0,
JitterEnabled: true,
TimeoutPerAttempt: 15 * time.Second,
}
}
// ExecuteWithRetry executes an operation with retry logic
func (rh *RetryHandler) ExecuteWithRetry(ctx context.Context, operationType string, operation RetryableOperation) *RetryResult {
if !rh.enabled {
// If retries are disabled, try once
err := operation(ctx, 1)
return &RetryResult{
Success: err == nil,
Attempts: 1,
TotalDuration: 0,
LastError: err,
LastAttemptAt: time.Now(),
}
}
config := rh.getConfig(operationType)
start := time.Now()
var lastError error
rh.mu.Lock()
stats, exists := rh.stats[operationType]
if !exists {
stats = &RetryStats{
OperationType: operationType,
}
rh.stats[operationType] = stats
}
rh.mu.Unlock()
for attempt := 1; attempt <= config.MaxAttempts; attempt++ {
// Create context with timeout for this attempt
attemptCtx, cancel := context.WithTimeout(ctx, config.TimeoutPerAttempt)
rh.logger.Debug("Attempting operation with retry",
"operation", operationType,
"attempt", attempt,
"max_attempts", config.MaxAttempts)
// Execute the operation
err := operation(attemptCtx, attempt)
cancel()
// Update statistics
stats.mu.Lock()
stats.TotalAttempts++
stats.LastAttempt = time.Now()
stats.mu.Unlock()
if err == nil {
// Success!
duration := time.Since(start)
stats.mu.Lock()
stats.SuccessfulRetries++
stats.LastSuccess = time.Now()
denominator := stats.SuccessfulRetries + stats.FailedRetries
if denominator > 0 {
stats.AverageAttempts = float64(stats.TotalAttempts) / float64(denominator)
}
stats.mu.Unlock()
rh.logger.Debug("Operation succeeded",
"operation", operationType,
"attempt", attempt,
"duration", duration)
return &RetryResult{
Success: true,
Attempts: attempt,
TotalDuration: duration,
LastError: nil,
LastAttemptAt: time.Now(),
}
}
lastError = err
// Check if context was cancelled
if ctx.Err() != nil {
rh.logger.Debug("Operation cancelled by context",
"operation", operationType,
"attempt", attempt,
"error", ctx.Err())
break
}
// Don't wait after the last attempt
if attempt < config.MaxAttempts {
delay := rh.calculateDelay(config, attempt)
rh.logger.Debug("Operation failed, retrying",
"operation", operationType,
"attempt", attempt,
"error", err,
"delay", delay)
// Wait before the next attempt
select {
case <-time.After(delay):
// Continue to next attempt
case <-ctx.Done():
// Context cancelled during the wait; a bare break here would only exit
// the select, so the cancellation is re-checked below.
}
if ctx.Err() != nil {
break
}
} else {
rh.logger.Warn("Operation failed after all retries",
"operation", operationType,
"attempts", attempt,
"error", err)
}
}
// All attempts failed
duration := time.Since(start)
stats.mu.Lock()
stats.FailedRetries++
stats.LastFailure = time.Now()
denominator := stats.SuccessfulRetries + stats.FailedRetries
if denominator > 0 {
stats.AverageAttempts = float64(stats.TotalAttempts) / float64(denominator)
}
stats.mu.Unlock()
return &RetryResult{
Success: false,
Attempts: config.MaxAttempts,
TotalDuration: duration,
LastError: lastError,
LastAttemptAt: time.Now(),
}
}
// calculateDelay calculates the delay before the next retry attempt
func (rh *RetryHandler) calculateDelay(config RetryConfig, attempt int) time.Duration {
// Calculate exponential backoff
delay := float64(config.InitialDelay) * math.Pow(config.BackoffFactor, float64(attempt-1))
// Apply maximum delay cap
if delay > float64(config.MaxDelay) {
delay = float64(config.MaxDelay)
}
duration := time.Duration(delay)
// Add jitter if enabled
if config.JitterEnabled {
jitter := time.Duration(float64(duration) * 0.1 * (2*rh.randomFloat() - 1))
duration += jitter
}
// Ensure minimum delay
if duration < 0 {
duration = config.InitialDelay
}
return duration
}
// randomFloat returns a pseudo-random float between 0 and 1
func (rh *RetryHandler) randomFloat() float64 {
// Simple pseudo-random number based on current time
return float64(time.Now().UnixNano()%1000) / 1000.0
}
// getConfig returns the retry configuration for an operation type
func (rh *RetryHandler) getConfig(operationType string) RetryConfig {
rh.mu.RLock()
defer rh.mu.RUnlock()
if config, exists := rh.configs[operationType]; exists {
return config
}
// Return default config if no specific config found
return DefaultRetryConfig()
}
// SetConfig sets a custom retry configuration for an operation type
func (rh *RetryHandler) SetConfig(operationType string, config RetryConfig) {
rh.mu.Lock()
defer rh.mu.Unlock()
rh.configs[operationType] = config
rh.logger.Debug("Set retry config",
"operation", operationType,
"max_attempts", config.MaxAttempts,
"initial_delay", config.InitialDelay,
"max_delay", config.MaxDelay)
}
// GetStats returns retry statistics for all operation types
func (rh *RetryHandler) GetStats() map[string]*RetryStats {
rh.mu.RLock()
defer rh.mu.RUnlock()
// Return a copy to prevent external modification
result := make(map[string]*RetryStats)
for opType, stats := range rh.stats {
stats.mu.RLock()
result[opType] = &RetryStats{
OperationType: stats.OperationType,
TotalAttempts: stats.TotalAttempts,
SuccessfulRetries: stats.SuccessfulRetries,
FailedRetries: stats.FailedRetries,
AverageAttempts: stats.AverageAttempts,
LastAttempt: stats.LastAttempt,
LastSuccess: stats.LastSuccess,
LastFailure: stats.LastFailure,
}
stats.mu.RUnlock()
}
return result
}
// GetOperationStats returns statistics for a specific operation type
func (rh *RetryHandler) GetOperationStats(operationType string) *RetryStats {
rh.mu.RLock()
defer rh.mu.RUnlock()
stats, exists := rh.stats[operationType]
if !exists {
return nil
}
stats.mu.RLock()
defer stats.mu.RUnlock()
return &RetryStats{
OperationType: stats.OperationType,
TotalAttempts: stats.TotalAttempts,
SuccessfulRetries: stats.SuccessfulRetries,
FailedRetries: stats.FailedRetries,
AverageAttempts: stats.AverageAttempts,
LastAttempt: stats.LastAttempt,
LastSuccess: stats.LastSuccess,
LastFailure: stats.LastFailure,
}
}
// ResetStats resets statistics for all operation types
func (rh *RetryHandler) ResetStats() {
rh.mu.Lock()
defer rh.mu.Unlock()
rh.stats = make(map[string]*RetryStats)
rh.logger.Info("Reset retry statistics")
}
// Enable enables the retry handler
func (rh *RetryHandler) Enable() {
rh.mu.Lock()
defer rh.mu.Unlock()
rh.enabled = true
rh.logger.Info("Retry handler enabled")
}
// Disable disables the retry handler
func (rh *RetryHandler) Disable() {
rh.mu.Lock()
defer rh.mu.Unlock()
rh.enabled = false
rh.logger.Info("Retry handler disabled")
}
// IsEnabled returns whether the retry handler is enabled
func (rh *RetryHandler) IsEnabled() bool {
rh.mu.RLock()
defer rh.mu.RUnlock()
return rh.enabled
}
// GetHealthSummary returns a health summary based on retry statistics
func (rh *RetryHandler) GetHealthSummary() map[string]interface{} {
stats := rh.GetStats()
summary := map[string]interface{}{
"enabled": rh.enabled,
"total_operations": len(stats),
"healthy_operations": 0,
"unhealthy_operations": 0,
"operation_details": make(map[string]interface{}),
}
for opType, opStats := range stats {
total := opStats.SuccessfulRetries + opStats.FailedRetries
successRate := 0.0
if total > 0 {
successRate = float64(opStats.SuccessfulRetries) / float64(total)
}
isHealthy := successRate >= 0.9 && opStats.AverageAttempts <= 2.0
if isHealthy {
summary["healthy_operations"] = summary["healthy_operations"].(int) + 1
} else {
summary["unhealthy_operations"] = summary["unhealthy_operations"].(int) + 1
}
summary["operation_details"].(map[string]interface{})[opType] = map[string]interface{}{
"success_rate": successRate,
"average_attempts": opStats.AverageAttempts,
"total_operations": total,
"is_healthy": isHealthy,
"last_success": opStats.LastSuccess,
"last_failure": opStats.LastFailure,
}
}
return summary
}
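
A hedged sketch of wrapping a go-ethereum RPC call in the "rpc_connection" policy (5 attempts, 1s initial delay, 2.5x backoff, 15s per-attempt timeout). The surrounding wiring (client, retry handler) is assumed and the helper name is hypothetical.

```go
package recovery_test

import (
	"context"
	"math/big"

	"github.com/ethereum/go-ethereum/ethclient"

	"github.com/fraktal/mev-beta/internal/recovery"
)

// headerWithRetry is a hypothetical helper: each attempt runs under its own
// timeout derived from the "rpc_connection" config.
func headerWithRetry(ctx context.Context, rh *recovery.RetryHandler, client *ethclient.Client, number *big.Int) error {
	result := rh.ExecuteWithRetry(ctx, "rpc_connection", func(ctx context.Context, attempt int) error {
		_, err := client.HeaderByNumber(ctx, number)
		return err
	})
	if !result.Success {
		return result.LastError
	}
	return nil
}
```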

@@ -0,0 +1,362 @@
package recovery
import (
"context"
"errors"
"fmt"
"testing"
"time"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
"github.com/fraktal/mev-beta/internal/logger"
)
func TestRetryHandler_ExecuteWithRetry_Success(t *testing.T) {
log := logger.New("debug", "text", "")
handler := NewRetryHandler(log)
attempts := 0
operation := func(ctx context.Context, attempt int) error {
attempts++
if attempts == 2 {
return nil // Success on second attempt
}
return errors.New("temporary failure")
}
result := handler.ExecuteWithRetry(context.Background(), "test_operation", operation)
assert.True(t, result.Success)
assert.Equal(t, 2, result.Attempts)
assert.Nil(t, result.LastError)
assert.Equal(t, 2, attempts)
}
func TestRetryHandler_ExecuteWithRetry_MaxAttemptsReached(t *testing.T) {
log := logger.New("debug", "text", "")
handler := NewRetryHandler(log)
attempts := 0
operation := func(ctx context.Context, attempt int) error {
attempts++
return errors.New("persistent failure")
}
result := handler.ExecuteWithRetry(context.Background(), "test_operation", operation)
assert.False(t, result.Success)
assert.Equal(t, 3, result.Attempts) // Default max attempts
assert.NotNil(t, result.LastError)
assert.Equal(t, "persistent failure", result.LastError.Error())
assert.Equal(t, 3, attempts)
}
func TestRetryHandler_ExecuteWithRetry_ContextCanceled(t *testing.T) {
log := logger.New("debug", "text", "")
handler := NewRetryHandler(log)
ctx, cancel := context.WithCancel(context.Background())
attempts := 0
operation := func(ctx context.Context, attempt int) error {
attempts++
if attempts == 2 {
cancel() // Cancel context on second attempt
}
return errors.New("failure")
}
result := handler.ExecuteWithRetry(ctx, "test_operation", operation)
assert.False(t, result.Success)
assert.LessOrEqual(t, result.Attempts, 3)
assert.NotNil(t, result.LastError)
}
func TestRetryHandler_ExecuteWithRetry_CustomConfig(t *testing.T) {
log := logger.New("debug", "text", "")
handler := NewRetryHandler(log)
// Set custom configuration
customConfig := RetryConfig{
MaxAttempts: 5,
InitialDelay: 10 * time.Millisecond,
MaxDelay: 100 * time.Millisecond,
BackoffFactor: 2.0,
JitterEnabled: false,
TimeoutPerAttempt: 1 * time.Second,
}
handler.SetConfig("custom_operation", customConfig)
attempts := 0
operation := func(ctx context.Context, attempt int) error {
attempts++
return errors.New("persistent failure")
}
start := time.Now()
result := handler.ExecuteWithRetry(context.Background(), "custom_operation", operation)
duration := time.Since(start)
assert.False(t, result.Success)
assert.Equal(t, 5, result.Attempts) // Custom max attempts
assert.Equal(t, 5, attempts)
// Should have taken some time due to delays (at least 150ms for delays)
expectedMinDuration := 10*time.Millisecond + 20*time.Millisecond + 40*time.Millisecond + 80*time.Millisecond
assert.GreaterOrEqual(t, duration, expectedMinDuration)
}
func TestRetryHandler_ExecuteWithRetry_Disabled(t *testing.T) {
log := logger.New("debug", "text", "")
handler := NewRetryHandler(log)
handler.Disable()
attempts := 0
operation := func(ctx context.Context, attempt int) error {
attempts++
return errors.New("failure")
}
result := handler.ExecuteWithRetry(context.Background(), "test_operation", operation)
assert.False(t, result.Success)
assert.Equal(t, 1, result.Attempts) // Only one attempt when disabled
assert.Equal(t, 1, attempts)
}
func TestRetryHandler_CalculateDelay(t *testing.T) {
log := logger.New("debug", "text", "")
handler := NewRetryHandler(log)
config := RetryConfig{
InitialDelay: 100 * time.Millisecond,
MaxDelay: 1 * time.Second,
BackoffFactor: 2.0,
JitterEnabled: false,
}
tests := []struct {
attempt int
expectedMin time.Duration
expectedMax time.Duration
}{
{1, 100 * time.Millisecond, 100 * time.Millisecond},
{2, 200 * time.Millisecond, 200 * time.Millisecond},
{3, 400 * time.Millisecond, 400 * time.Millisecond},
{4, 800 * time.Millisecond, 800 * time.Millisecond},
{5, 1 * time.Second, 1 * time.Second}, // Should be capped at MaxDelay
}
for _, tt := range tests {
t.Run(fmt.Sprintf("attempt_%d", tt.attempt), func(t *testing.T) {
delay := handler.calculateDelay(config, tt.attempt)
assert.GreaterOrEqual(t, delay, tt.expectedMin)
assert.LessOrEqual(t, delay, tt.expectedMax)
})
}
}
func TestRetryHandler_CalculateDelay_WithJitter(t *testing.T) {
log := logger.New("debug", "text", "")
handler := NewRetryHandler(log)
config := RetryConfig{
InitialDelay: 100 * time.Millisecond,
MaxDelay: 1 * time.Second,
BackoffFactor: 2.0,
JitterEnabled: true,
}
// Test jitter variation
delays := make([]time.Duration, 10)
for i := 0; i < 10; i++ {
delays[i] = handler.calculateDelay(config, 2) // 200ms base
}
// Should have some variation due to jitter
allSame := true
for i := 1; i < len(delays); i++ {
if delays[i] != delays[0] {
allSame = false
break
}
}
assert.False(t, allSame, "Jitter should cause variation in delays")
// All delays should be reasonable (within 10% of base)
baseDelay := 200 * time.Millisecond
for _, delay := range delays {
assert.GreaterOrEqual(t, delay, baseDelay*9/10) // 10% below
assert.LessOrEqual(t, delay, baseDelay*11/10) // 10% above
}
}
func TestRetryHandler_GetStats(t *testing.T) {
log := logger.New("debug", "text", "")
handler := NewRetryHandler(log)
// Execute some operations
successOp := func(ctx context.Context, attempt int) error {
return nil
}
failOp := func(ctx context.Context, attempt int) error {
return errors.New("failure")
}
handler.ExecuteWithRetry(context.Background(), "test_success", successOp)
handler.ExecuteWithRetry(context.Background(), "test_success", successOp)
handler.ExecuteWithRetry(context.Background(), "test_fail", failOp)
stats := handler.GetStats()
// Check success stats
successStats := stats["test_success"]
require.NotNil(t, successStats)
assert.Equal(t, 2, successStats.TotalAttempts)
assert.Equal(t, 2, successStats.SuccessfulRetries)
assert.Equal(t, 0, successStats.FailedRetries)
// Check failure stats
failStats := stats["test_fail"]
require.NotNil(t, failStats)
assert.Equal(t, 3, failStats.TotalAttempts) // Default max attempts
assert.Equal(t, 0, failStats.SuccessfulRetries)
assert.Equal(t, 1, failStats.FailedRetries)
}
func TestRetryHandler_GetHealthSummary(t *testing.T) {
log := logger.New("debug", "text", "")
handler := NewRetryHandler(log)
// Execute some operations to generate stats
successOp := func(ctx context.Context, attempt int) error {
return nil
}
partialFailOp := func(ctx context.Context, attempt int) error {
if attempt < 2 {
return errors.New("temporary failure")
}
return nil
}
// 2 immediate successes
handler.ExecuteWithRetry(context.Background(), "immediate_success", successOp)
handler.ExecuteWithRetry(context.Background(), "immediate_success", successOp)
// 1 success after retry
handler.ExecuteWithRetry(context.Background(), "retry_success", partialFailOp)
summary := handler.GetHealthSummary()
assert.True(t, summary["enabled"].(bool))
assert.Equal(t, 2, summary["total_operations"].(int))
assert.Equal(t, 2, summary["healthy_operations"].(int))
assert.Equal(t, 0, summary["unhealthy_operations"].(int))
// Check operation details
details := summary["operation_details"].(map[string]interface{})
immediateDetails := details["immediate_success"].(map[string]interface{})
assert.Equal(t, 1.0, immediateDetails["success_rate"].(float64))
assert.Equal(t, 1.0, immediateDetails["average_attempts"].(float64))
assert.True(t, immediateDetails["is_healthy"].(bool))
retryDetails := details["retry_success"].(map[string]interface{})
assert.Equal(t, 1.0, retryDetails["success_rate"].(float64))
assert.Equal(t, 2.0, retryDetails["average_attempts"].(float64))
assert.True(t, retryDetails["is_healthy"].(bool)) // Still healthy despite retries
}
func TestRetryHandler_ConcurrentExecution(t *testing.T) {
log := logger.New("debug", "text", "")
handler := NewRetryHandler(log)
const numGoroutines = 50
const operationsPerGoroutine = 20
done := make(chan bool, numGoroutines)
successCount := make(chan int, numGoroutines)
operation := func(ctx context.Context, attempt int) error {
// 80% success rate
if attempt <= 1 && time.Now().UnixNano()%5 != 0 {
return nil
}
if attempt == 2 {
return nil // Always succeed on second attempt
}
return errors.New("failure")
}
// Launch concurrent retry operations
for i := 0; i < numGoroutines; i++ {
go func(id int) {
defer func() { done <- true }()
successes := 0
for j := 0; j < operationsPerGoroutine; j++ {
result := handler.ExecuteWithRetry(context.Background(),
fmt.Sprintf("concurrent_op_%d", id), operation)
if result.Success {
successes++
}
}
successCount <- successes
}(i)
}
// Collect results
totalSuccesses := 0
for i := 0; i < numGoroutines; i++ {
select {
case <-done:
totalSuccesses += <-successCount
case <-time.After(30 * time.Second):
t.Fatal("Concurrent retry test timed out")
}
}
totalOperations := numGoroutines * operationsPerGoroutine
successRate := float64(totalSuccesses) / float64(totalOperations)
t.Logf("Concurrent execution: %d/%d operations succeeded (%.2f%%)",
totalSuccesses, totalOperations, successRate*100)
// Should have high success rate due to retries
assert.GreaterOrEqual(t, successRate, 0.8, "Success rate should be at least 80%")
// Verify stats are consistent
stats := handler.GetStats()
assert.NotEmpty(t, stats, "Should have recorded stats")
}
func TestRetryHandler_EdgeCases(t *testing.T) {
log := logger.New("debug", "text", "")
handler := NewRetryHandler(log)
t.Run("nil operation", func(t *testing.T) {
assert.Panics(t, func() {
handler.ExecuteWithRetry(context.Background(), "nil_op", nil)
})
})
t.Run("empty operation type", func(t *testing.T) {
operation := func(ctx context.Context, attempt int) error {
return nil
}
result := handler.ExecuteWithRetry(context.Background(), "", operation)
assert.True(t, result.Success)
})
t.Run("very long operation type", func(t *testing.T) {
longName := string(make([]byte, 1000))
operation := func(ctx context.Context, attempt int) error {
return nil
}
result := handler.ExecuteWithRetry(context.Background(), longName, operation)
assert.True(t, result.Success)
})
}