mev-beta/internal/recovery/retry_handler.go
Krypto Kajun 850223a953 fix(multicall): resolve critical multicall parsing corruption issues
- Added comprehensive bounds checking to prevent buffer overruns in multicall parsing
- Implemented graduated validation system (Strict/Moderate/Permissive) to reduce false positives
- Added LRU caching system for address validation with 10-minute TTL
- Enhanced ABI decoder with missing Universal Router and Arbitrum-specific DEX signatures
- Fixed duplicate function declarations and import conflicts across multiple files
- Added error recovery mechanisms with multiple fallback strategies
- Updated tests to handle new validation behavior for suspicious addresses
- Fixed parser test expectations for improved validation system
- Applied gofmt formatting fixes to ensure code style compliance
- Fixed mutex copying issues in monitoring package by introducing MetricsSnapshot
- Resolved critical security vulnerabilities in heuristic address extraction
- Progress: Updated TODO audit from 10% to 35% complete

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
2025-10-17 00:12:55 -05:00

447 lines
11 KiB
Go

package recovery

import (
	"context"
	"math"
	"math/rand"
	"sync"
	"time"

	"github.com/fraktal/mev-beta/internal/logger"
)
// RetryConfig defines retry behavior configuration
type RetryConfig struct {
	MaxAttempts       int
	InitialDelay      time.Duration
	MaxDelay          time.Duration
	BackoffFactor     float64
	JitterEnabled     bool
	TimeoutPerAttempt time.Duration
}
// DefaultRetryConfig returns a sensible default retry configuration
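//
// A typical pattern is to start from this default, override individual fields,
// and register the result with SetConfig. A minimal sketch (the "price_feed"
// operation key below is an arbitrary, hypothetical example):
//
//	cfg := DefaultRetryConfig()
//	cfg.MaxAttempts = 5
//	cfg.MaxDelay = 60 * time.Second
//	handler.SetConfig("price_feed", cfg)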
func DefaultRetryConfig() RetryConfig {
	return RetryConfig{
		MaxAttempts:       3,
		InitialDelay:      1 * time.Second,
		MaxDelay:          30 * time.Second,
		BackoffFactor:     2.0,
		JitterEnabled:     true,
		TimeoutPerAttempt: 10 * time.Second,
	}
}
// RetryableOperation represents an operation that can be retried
type RetryableOperation func(ctx context.Context, attempt int) error
// RetryHandler provides exponential backoff retry capabilities
type RetryHandler struct {
	mu      sync.RWMutex
	logger  *logger.Logger
	configs map[string]RetryConfig
	stats   map[string]*RetryStats
	enabled bool
}
// RetryStats tracks retry statistics for operations
type RetryStats struct {
	mu                sync.RWMutex
	OperationType     string
	TotalAttempts     int
	SuccessfulRetries int
	FailedRetries     int
	AverageAttempts   float64
	LastAttempt       time.Time
	LastSuccess       time.Time
	LastFailure       time.Time
}
// RetryResult contains the result of a retry operation
type RetryResult struct {
	Success       bool
	Attempts      int
	TotalDuration time.Duration
	LastError     error
	LastAttemptAt time.Time
}
// NewRetryHandler creates a new retry handler
func NewRetryHandler(logger *logger.Logger) *RetryHandler {
	handler := &RetryHandler{
		logger:  logger,
		configs: make(map[string]RetryConfig),
		stats:   make(map[string]*RetryStats),
		enabled: true,
	}

	// Initialize default configurations for common operations
	handler.initializeDefaultConfigs()
	return handler
}
// initializeDefaultConfigs sets up default retry configurations
func (rh *RetryHandler) initializeDefaultConfigs() {
	// Contract call retries - moderate backoff
	rh.configs["contract_call"] = RetryConfig{
		MaxAttempts:       3,
		InitialDelay:      500 * time.Millisecond,
		MaxDelay:          5 * time.Second,
		BackoffFactor:     2.0,
		JitterEnabled:     true,
		TimeoutPerAttempt: 10 * time.Second,
	}

	// RPC connection retries - aggressive backoff
	rh.configs["rpc_connection"] = RetryConfig{
		MaxAttempts:       5,
		InitialDelay:      1 * time.Second,
		MaxDelay:          30 * time.Second,
		BackoffFactor:     2.5,
		JitterEnabled:     true,
		TimeoutPerAttempt: 15 * time.Second,
	}

	// Data parsing retries - quick retries
	rh.configs["data_parsing"] = RetryConfig{
		MaxAttempts:       2,
		InitialDelay:      100 * time.Millisecond,
		MaxDelay:          1 * time.Second,
		BackoffFactor:     2.0,
		JitterEnabled:     false,
		TimeoutPerAttempt: 5 * time.Second,
	}

	// Block processing retries - conservative
	rh.configs["block_processing"] = RetryConfig{
		MaxAttempts:       3,
		InitialDelay:      2 * time.Second,
		MaxDelay:          10 * time.Second,
		BackoffFactor:     2.0,
		JitterEnabled:     true,
		TimeoutPerAttempt: 30 * time.Second,
	}

	// Token metadata retries - patient backoff
	rh.configs["token_metadata"] = RetryConfig{
		MaxAttempts:       4,
		InitialDelay:      1 * time.Second,
		MaxDelay:          20 * time.Second,
		BackoffFactor:     2.0,
		JitterEnabled:     true,
		TimeoutPerAttempt: 15 * time.Second,
	}
}
// ExecuteWithRetry executes an operation with retry logic
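//
// Illustrative usage sketch (the rpcClient value and its Call method are
// hypothetical placeholders, not part of this package):
//
//	result := handler.ExecuteWithRetry(ctx, "rpc_connection",
//		func(ctx context.Context, attempt int) error {
//			return rpcClient.Call(ctx, "eth_blockNumber")
//		})
//	if !result.Success {
//		// result.LastError holds the final error after result.Attempts tries.
//	}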
func (rh *RetryHandler) ExecuteWithRetry(ctx context.Context, operationType string, operation RetryableOperation) *RetryResult {
	if !rh.IsEnabled() {
		// If retries are disabled, try once
		err := operation(ctx, 1)
		return &RetryResult{
			Success:       err == nil,
			Attempts:      1,
			TotalDuration: 0,
			LastError:     err,
			LastAttemptAt: time.Now(),
		}
	}

	config := rh.getConfig(operationType)
	start := time.Now()
	var lastError error

	rh.mu.Lock()
	stats, exists := rh.stats[operationType]
	if !exists {
		stats = &RetryStats{
			OperationType: operationType,
		}
		rh.stats[operationType] = stats
	}
	rh.mu.Unlock()
	for attempt := 1; attempt <= config.MaxAttempts; attempt++ {
		// Create context with timeout for this attempt
		attemptCtx, cancel := context.WithTimeout(ctx, config.TimeoutPerAttempt)

		rh.logger.Debug("Attempting operation with retry",
			"operation", operationType,
			"attempt", attempt,
			"max_attempts", config.MaxAttempts)

		// Execute the operation
		err := operation(attemptCtx, attempt)
		cancel()

		// Update statistics
		stats.mu.Lock()
		stats.TotalAttempts++
		stats.LastAttempt = time.Now()
		stats.mu.Unlock()

		if err == nil {
			// Success!
			duration := time.Since(start)

			stats.mu.Lock()
			stats.SuccessfulRetries++
			stats.LastSuccess = time.Now()
			denominator := stats.SuccessfulRetries + stats.FailedRetries
			if denominator > 0 {
				stats.AverageAttempts = float64(stats.TotalAttempts) / float64(denominator)
			}
			stats.mu.Unlock()

			rh.logger.Debug("Operation succeeded",
				"operation", operationType,
				"attempt", attempt,
				"duration", duration)

			return &RetryResult{
				Success:       true,
				Attempts:      attempt,
				TotalDuration: duration,
				LastError:     nil,
				LastAttemptAt: time.Now(),
			}
		}

		lastError = err

		// Check if context was cancelled
		if ctx.Err() != nil {
			rh.logger.Debug("Operation cancelled by context",
				"operation", operationType,
				"attempt", attempt,
				"error", ctx.Err())
			break
		}

		// Don't wait after the last attempt
		if attempt < config.MaxAttempts {
			delay := rh.calculateDelay(config, attempt)
			rh.logger.Debug("Operation failed, retrying",
				"operation", operationType,
				"attempt", attempt,
				"error", err,
				"delay", delay)

			// Wait before the next attempt, bailing out early on cancellation.
			select {
			case <-time.After(delay):
				// Continue to the next attempt.
			case <-ctx.Done():
				// A bare break here would only exit the select, not the retry
				// loop, so cancellation is re-checked below instead.
			}
			if ctx.Err() != nil {
				break
			}
		} else {
			rh.logger.Warn("Operation failed after all retries",
				"operation", operationType,
				"attempts", attempt,
				"error", err)
		}
	}
	// All attempts failed
	duration := time.Since(start)

	stats.mu.Lock()
	stats.FailedRetries++
	stats.LastFailure = time.Now()
	denominator := stats.SuccessfulRetries + stats.FailedRetries
	if denominator > 0 {
		stats.AverageAttempts = float64(stats.TotalAttempts) / float64(denominator)
	}
	stats.mu.Unlock()

	return &RetryResult{
		Success:       false,
		Attempts:      config.MaxAttempts,
		TotalDuration: duration,
		LastError:     lastError,
		LastAttemptAt: time.Now(),
	}
}
// calculateDelay calculates the delay before the next retry attempt
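//
// For example, with the default "contract_call" configuration (500ms initial
// delay, backoff factor 2.0, 5s cap), the base delays before jitter are:
//
//	after attempt 1: 500ms * 2.0^0 = 500ms
//	after attempt 2: 500ms * 2.0^1 = 1s
//	(later attempts would be capped at 5s; with MaxAttempts=3 no further wait occurs)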
func (rh *RetryHandler) calculateDelay(config RetryConfig, attempt int) time.Duration {
	// Calculate exponential backoff
	delay := float64(config.InitialDelay) * math.Pow(config.BackoffFactor, float64(attempt-1))

	// Apply maximum delay cap
	if delay > float64(config.MaxDelay) {
		delay = float64(config.MaxDelay)
	}

	duration := time.Duration(delay)

	// Add jitter if enabled
	if config.JitterEnabled {
		jitter := time.Duration(float64(duration) * 0.1 * (2*rh.randomFloat() - 1))
		duration += jitter
	}

	// Ensure minimum delay
	if duration < 0 {
		duration = config.InitialDelay
	}

	return duration
}
// randomFloat returns a pseudo-random float in [0, 1), used to jitter retry delays
func (rh *RetryHandler) randomFloat() float64 {
	return rand.Float64()
}
// getConfig returns the retry configuration for an operation type
func (rh *RetryHandler) getConfig(operationType string) RetryConfig {
	rh.mu.RLock()
	defer rh.mu.RUnlock()

	if config, exists := rh.configs[operationType]; exists {
		return config
	}

	// Return default config if no specific config found
	return DefaultRetryConfig()
}
// SetConfig sets a custom retry configuration for an operation type
func (rh *RetryHandler) SetConfig(operationType string, config RetryConfig) {
	rh.mu.Lock()
	defer rh.mu.Unlock()

	rh.configs[operationType] = config
	rh.logger.Debug("Set retry config",
		"operation", operationType,
		"max_attempts", config.MaxAttempts,
		"initial_delay", config.InitialDelay,
		"max_delay", config.MaxDelay)
}
// GetStats returns retry statistics for all operation types
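//
// Callers receive detached copies, so the snapshot can be inspected without
// holding any handler locks, e.g.:
//
//	for op, s := range handler.GetStats() {
//		fmt.Printf("%s: %d attempts, avg %.2f per outcome\n", op, s.TotalAttempts, s.AverageAttempts)
//	}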
func (rh *RetryHandler) GetStats() map[string]*RetryStats {
	rh.mu.RLock()
	defer rh.mu.RUnlock()

	// Return a copy to prevent external modification
	result := make(map[string]*RetryStats)
	for opType, stats := range rh.stats {
		stats.mu.RLock()
		result[opType] = &RetryStats{
			OperationType:     stats.OperationType,
			TotalAttempts:     stats.TotalAttempts,
			SuccessfulRetries: stats.SuccessfulRetries,
			FailedRetries:     stats.FailedRetries,
			AverageAttempts:   stats.AverageAttempts,
			LastAttempt:       stats.LastAttempt,
			LastSuccess:       stats.LastSuccess,
			LastFailure:       stats.LastFailure,
		}
		stats.mu.RUnlock()
	}
	return result
}
// GetOperationStats returns statistics for a specific operation type
func (rh *RetryHandler) GetOperationStats(operationType string) *RetryStats {
	rh.mu.RLock()
	defer rh.mu.RUnlock()

	stats, exists := rh.stats[operationType]
	if !exists {
		return nil
	}

	stats.mu.RLock()
	defer stats.mu.RUnlock()
	return &RetryStats{
		OperationType:     stats.OperationType,
		TotalAttempts:     stats.TotalAttempts,
		SuccessfulRetries: stats.SuccessfulRetries,
		FailedRetries:     stats.FailedRetries,
		AverageAttempts:   stats.AverageAttempts,
		LastAttempt:       stats.LastAttempt,
		LastSuccess:       stats.LastSuccess,
		LastFailure:       stats.LastFailure,
	}
}
// ResetStats resets statistics for all operation types
func (rh *RetryHandler) ResetStats() {
	rh.mu.Lock()
	defer rh.mu.Unlock()

	rh.stats = make(map[string]*RetryStats)
	rh.logger.Info("Reset retry statistics")
}

// Enable enables the retry handler
func (rh *RetryHandler) Enable() {
	rh.mu.Lock()
	defer rh.mu.Unlock()
	rh.enabled = true
	rh.logger.Info("Retry handler enabled")
}

// Disable disables the retry handler
func (rh *RetryHandler) Disable() {
	rh.mu.Lock()
	defer rh.mu.Unlock()
	rh.enabled = false
	rh.logger.Info("Retry handler disabled")
}

// IsEnabled returns whether the retry handler is enabled
func (rh *RetryHandler) IsEnabled() bool {
	rh.mu.RLock()
	defer rh.mu.RUnlock()
	return rh.enabled
}
// GetHealthSummary returns a health summary based on retry statistics
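//
// An operation type is currently considered healthy when its success rate is
// at least 90% and it averages no more than two attempts per outcome. A brief
// usage sketch:
//
//	summary := handler.GetHealthSummary()
//	if summary["unhealthy_operations"].(int) > 0 {
//		// inspect summary["operation_details"] for the failing operation types
//	}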
func (rh *RetryHandler) GetHealthSummary() map[string]interface{} {
	stats := rh.GetStats()

	summary := map[string]interface{}{
		"enabled":              rh.IsEnabled(), // read under the handler lock
		"total_operations":     len(stats),
		"healthy_operations":   0,
		"unhealthy_operations": 0,
		"operation_details":    make(map[string]interface{}),
	}

	for opType, opStats := range stats {
		total := opStats.SuccessfulRetries + opStats.FailedRetries
		successRate := 0.0
		if total > 0 {
			successRate = float64(opStats.SuccessfulRetries) / float64(total)
		}

		isHealthy := successRate >= 0.9 && opStats.AverageAttempts <= 2.0
		if isHealthy {
			summary["healthy_operations"] = summary["healthy_operations"].(int) + 1
		} else {
			summary["unhealthy_operations"] = summary["unhealthy_operations"].(int) + 1
		}

		summary["operation_details"].(map[string]interface{})[opType] = map[string]interface{}{
			"success_rate":     successRate,
			"average_attempts": opStats.AverageAttempts,
			"total_operations": total,
			"is_healthy":       isHealthy,
			"last_success":     opStats.LastSuccess,
			"last_failure":     opStats.LastFailure,
		}
	}

	return summary
}