- Added comprehensive bounds checking to prevent buffer overruns in multicall parsing
- Implemented graduated validation system (Strict/Moderate/Permissive) to reduce false positives
- Added LRU caching system for address validation with 10-minute TTL
- Enhanced ABI decoder with missing Universal Router and Arbitrum-specific DEX signatures
- Fixed duplicate function declarations and import conflicts across multiple files
- Added error recovery mechanisms with multiple fallback strategies
- Updated tests to handle new validation behavior for suspicious addresses
- Fixed parser test expectations for improved validation system
- Applied gofmt formatting fixes to ensure code style compliance
- Fixed mutex copying issues in monitoring package by introducing MetricsSnapshot
- Resolved critical security vulnerabilities in heuristic address extraction
- Progress: Updated TODO audit from 10% to 35% complete

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
447 lines · 11 KiB · Go
package recovery

import (
	"context"
	"math"
	"sync"
	"time"

	"github.com/fraktal/mev-beta/internal/logger"
)

// RetryConfig defines retry behavior configuration
type RetryConfig struct {
	MaxAttempts       int
	InitialDelay      time.Duration
	MaxDelay          time.Duration
	BackoffFactor     float64
	JitterEnabled     bool
	TimeoutPerAttempt time.Duration
}

// DefaultRetryConfig returns a sensible default retry configuration
func DefaultRetryConfig() RetryConfig {
	return RetryConfig{
		MaxAttempts:       3,
		InitialDelay:      1 * time.Second,
		MaxDelay:          30 * time.Second,
		BackoffFactor:     2.0,
		JitterEnabled:     true,
		TimeoutPerAttempt: 10 * time.Second,
	}
}

// RetryableOperation represents an operation that can be retried
type RetryableOperation func(ctx context.Context, attempt int) error

// RetryHandler provides exponential backoff retry capabilities
type RetryHandler struct {
	mu      sync.RWMutex
	logger  *logger.Logger
	configs map[string]RetryConfig
	stats   map[string]*RetryStats
	enabled bool
}

// RetryStats tracks retry statistics for operations
type RetryStats struct {
	mu                sync.RWMutex
	OperationType     string
	TotalAttempts     int
	SuccessfulRetries int
	FailedRetries     int
	AverageAttempts   float64
	LastAttempt       time.Time
	LastSuccess       time.Time
	LastFailure       time.Time
}

// RetryResult contains the result of a retry operation
type RetryResult struct {
	Success       bool
	Attempts      int
	TotalDuration time.Duration
	LastError     error
	LastAttemptAt time.Time
}

// NewRetryHandler creates a new retry handler
func NewRetryHandler(logger *logger.Logger) *RetryHandler {
	handler := &RetryHandler{
		logger:  logger,
		configs: make(map[string]RetryConfig),
		stats:   make(map[string]*RetryStats),
		enabled: true,
	}

	// Initialize default configurations for common operations
	handler.initializeDefaultConfigs()

	return handler
}

// initializeDefaultConfigs sets up default retry configurations
func (rh *RetryHandler) initializeDefaultConfigs() {
	// Contract call retries - moderate backoff
	rh.configs["contract_call"] = RetryConfig{
		MaxAttempts:       3,
		InitialDelay:      500 * time.Millisecond,
		MaxDelay:          5 * time.Second,
		BackoffFactor:     2.0,
		JitterEnabled:     true,
		TimeoutPerAttempt: 10 * time.Second,
	}

	// RPC connection retries - aggressive backoff
	rh.configs["rpc_connection"] = RetryConfig{
		MaxAttempts:       5,
		InitialDelay:      1 * time.Second,
		MaxDelay:          30 * time.Second,
		BackoffFactor:     2.5,
		JitterEnabled:     true,
		TimeoutPerAttempt: 15 * time.Second,
	}

	// Data parsing retries - quick retries
	rh.configs["data_parsing"] = RetryConfig{
		MaxAttempts:       2,
		InitialDelay:      100 * time.Millisecond,
		MaxDelay:          1 * time.Second,
		BackoffFactor:     2.0,
		JitterEnabled:     false,
		TimeoutPerAttempt: 5 * time.Second,
	}

	// Block processing retries - conservative
	rh.configs["block_processing"] = RetryConfig{
		MaxAttempts:       3,
		InitialDelay:      2 * time.Second,
		MaxDelay:          10 * time.Second,
		BackoffFactor:     2.0,
		JitterEnabled:     true,
		TimeoutPerAttempt: 30 * time.Second,
	}

	// Token metadata retries - patient backoff
	rh.configs["token_metadata"] = RetryConfig{
		MaxAttempts:       4,
		InitialDelay:      1 * time.Second,
		MaxDelay:          20 * time.Second,
		BackoffFactor:     2.0,
		JitterEnabled:     true,
		TimeoutPerAttempt: 15 * time.Second,
	}
}

// ExecuteWithRetry executes an operation with retry logic
func (rh *RetryHandler) ExecuteWithRetry(ctx context.Context, operationType string, operation RetryableOperation) *RetryResult {
	if !rh.IsEnabled() {
		// If retries are disabled, try once
		err := operation(ctx, 1)
		return &RetryResult{
			Success:       err == nil,
			Attempts:      1,
			TotalDuration: 0,
			LastError:     err,
			LastAttemptAt: time.Now(),
		}
	}

	config := rh.getConfig(operationType)
	start := time.Now()
	var lastError error

	rh.mu.Lock()
	stats, exists := rh.stats[operationType]
	if !exists {
		stats = &RetryStats{
			OperationType: operationType,
		}
		rh.stats[operationType] = stats
	}
	rh.mu.Unlock()

	for attempt := 1; attempt <= config.MaxAttempts; attempt++ {
		// Create context with timeout for this attempt
		attemptCtx, cancel := context.WithTimeout(ctx, config.TimeoutPerAttempt)

		rh.logger.Debug("Attempting operation with retry",
			"operation", operationType,
			"attempt", attempt,
			"max_attempts", config.MaxAttempts)

		// Execute the operation
		err := operation(attemptCtx, attempt)
		cancel()

		// Update statistics
		stats.mu.Lock()
		stats.TotalAttempts++
		stats.LastAttempt = time.Now()
		stats.mu.Unlock()

		if err == nil {
			// Success!
			duration := time.Since(start)

			stats.mu.Lock()
			stats.SuccessfulRetries++
			stats.LastSuccess = time.Now()
			denominator := stats.SuccessfulRetries + stats.FailedRetries
			if denominator > 0 {
				stats.AverageAttempts = float64(stats.TotalAttempts) / float64(denominator)
			}
			stats.mu.Unlock()

			rh.logger.Debug("Operation succeeded",
				"operation", operationType,
				"attempt", attempt,
				"duration", duration)

			return &RetryResult{
				Success:       true,
				Attempts:      attempt,
				TotalDuration: duration,
				LastError:     nil,
				LastAttemptAt: time.Now(),
			}
		}

		lastError = err

		// Check if context was cancelled
		if ctx.Err() != nil {
			rh.logger.Debug("Operation cancelled by context",
				"operation", operationType,
				"attempt", attempt,
				"error", ctx.Err())
			break
		}

		// Don't wait after the last attempt
		if attempt < config.MaxAttempts {
			delay := rh.calculateDelay(config, attempt)

			rh.logger.Debug("Operation failed, retrying",
				"operation", operationType,
				"attempt", attempt,
				"error", err,
				"delay", delay)

			// Wait before next attempt
			select {
			case <-time.After(delay):
				// Continue to next attempt
			case <-ctx.Done():
				// Context cancelled during wait
			}
			// A bare break inside a select only exits the select, so re-check the
			// context here to actually stop retrying once it has been cancelled.
			if ctx.Err() != nil {
				break
			}
		} else {
			rh.logger.Warn("Operation failed after all retries",
				"operation", operationType,
				"attempts", attempt,
				"error", err)
		}
	}

	// All attempts failed
	duration := time.Since(start)

	stats.mu.Lock()
	stats.FailedRetries++
	stats.LastFailure = time.Now()
	denominator := stats.SuccessfulRetries + stats.FailedRetries
	if denominator > 0 {
		stats.AverageAttempts = float64(stats.TotalAttempts) / float64(denominator)
	}
	stats.mu.Unlock()

	return &RetryResult{
		Success:       false,
		Attempts:      config.MaxAttempts,
		TotalDuration: duration,
		LastError:     lastError,
		LastAttemptAt: time.Now(),
	}
}

// calculateDelay calculates the delay before the next retry attempt
func (rh *RetryHandler) calculateDelay(config RetryConfig, attempt int) time.Duration {
	// Calculate exponential backoff
	delay := float64(config.InitialDelay) * math.Pow(config.BackoffFactor, float64(attempt-1))

	// Apply maximum delay cap
	if delay > float64(config.MaxDelay) {
		delay = float64(config.MaxDelay)
	}

	duration := time.Duration(delay)

	// Add jitter if enabled
	if config.JitterEnabled {
		jitter := time.Duration(float64(duration) * 0.1 * (2*rh.randomFloat() - 1))
		duration += jitter
	}

	// Ensure minimum delay
	if duration < 0 {
		duration = config.InitialDelay
	}

	return duration
}

// randomFloat returns a pseudo-random float between 0 and 1
func (rh *RetryHandler) randomFloat() float64 {
	// Simple pseudo-random number based on current time
	return float64(time.Now().UnixNano()%1000) / 1000.0
}

// getConfig returns the retry configuration for an operation type
func (rh *RetryHandler) getConfig(operationType string) RetryConfig {
	rh.mu.RLock()
	defer rh.mu.RUnlock()

	if config, exists := rh.configs[operationType]; exists {
		return config
	}

	// Return default config if no specific config found
	return DefaultRetryConfig()
}

// SetConfig sets a custom retry configuration for an operation type
func (rh *RetryHandler) SetConfig(operationType string, config RetryConfig) {
	rh.mu.Lock()
	defer rh.mu.Unlock()

	rh.configs[operationType] = config
	rh.logger.Debug("Set retry config",
		"operation", operationType,
		"max_attempts", config.MaxAttempts,
		"initial_delay", config.InitialDelay,
		"max_delay", config.MaxDelay)
}

// GetStats returns retry statistics for all operation types
func (rh *RetryHandler) GetStats() map[string]*RetryStats {
	rh.mu.RLock()
	defer rh.mu.RUnlock()

	// Return a copy to prevent external modification
	result := make(map[string]*RetryStats)
	for opType, stats := range rh.stats {
		stats.mu.RLock()
		result[opType] = &RetryStats{
			OperationType:     stats.OperationType,
			TotalAttempts:     stats.TotalAttempts,
			SuccessfulRetries: stats.SuccessfulRetries,
			FailedRetries:     stats.FailedRetries,
			AverageAttempts:   stats.AverageAttempts,
			LastAttempt:       stats.LastAttempt,
			LastSuccess:       stats.LastSuccess,
			LastFailure:       stats.LastFailure,
		}
		stats.mu.RUnlock()
	}

	return result
}

// GetOperationStats returns statistics for a specific operation type
func (rh *RetryHandler) GetOperationStats(operationType string) *RetryStats {
	rh.mu.RLock()
	defer rh.mu.RUnlock()

	stats, exists := rh.stats[operationType]
	if !exists {
		return nil
	}

	stats.mu.RLock()
	defer stats.mu.RUnlock()

	return &RetryStats{
		OperationType:     stats.OperationType,
		TotalAttempts:     stats.TotalAttempts,
		SuccessfulRetries: stats.SuccessfulRetries,
		FailedRetries:     stats.FailedRetries,
		AverageAttempts:   stats.AverageAttempts,
		LastAttempt:       stats.LastAttempt,
		LastSuccess:       stats.LastSuccess,
		LastFailure:       stats.LastFailure,
	}
}

// ResetStats resets statistics for all operation types
func (rh *RetryHandler) ResetStats() {
	rh.mu.Lock()
	defer rh.mu.Unlock()

	rh.stats = make(map[string]*RetryStats)
	rh.logger.Info("Reset retry statistics")
}

// Enable enables the retry handler
func (rh *RetryHandler) Enable() {
	rh.mu.Lock()
	defer rh.mu.Unlock()
	rh.enabled = true
	rh.logger.Info("Retry handler enabled")
}

// Disable disables the retry handler
func (rh *RetryHandler) Disable() {
	rh.mu.Lock()
	defer rh.mu.Unlock()
	rh.enabled = false
	rh.logger.Info("Retry handler disabled")
}

// IsEnabled returns whether the retry handler is enabled
func (rh *RetryHandler) IsEnabled() bool {
	rh.mu.RLock()
	defer rh.mu.RUnlock()
	return rh.enabled
}

// GetHealthSummary returns a health summary based on retry statistics
func (rh *RetryHandler) GetHealthSummary() map[string]interface{} {
	stats := rh.GetStats()

	summary := map[string]interface{}{
		"enabled":              rh.IsEnabled(), // read under the handler lock
		"total_operations":     len(stats),
		"healthy_operations":   0,
		"unhealthy_operations": 0,
		"operation_details":    make(map[string]interface{}),
	}

	for opType, opStats := range stats {
		total := opStats.SuccessfulRetries + opStats.FailedRetries
		successRate := 0.0
		if total > 0 {
			successRate = float64(opStats.SuccessfulRetries) / float64(total)
		}

		isHealthy := successRate >= 0.9 && opStats.AverageAttempts <= 2.0

		if isHealthy {
			summary["healthy_operations"] = summary["healthy_operations"].(int) + 1
		} else {
			summary["unhealthy_operations"] = summary["unhealthy_operations"].(int) + 1
		}

		summary["operation_details"].(map[string]interface{})[opType] = map[string]interface{}{
			"success_rate":     successRate,
			"average_attempts": opStats.AverageAttempts,
			"total_operations": total,
			"is_healthy":       isHealthy,
			"last_success":     opStats.LastSuccess,
			"last_failure":     opStats.LastFailure,
		}
	}

	return summary
}
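For reference, a minimal usage sketch of the handler above. It uses only the exported API defined in this file; the import path `github.com/fraktal/mev-beta/internal/recovery` and the `logger.New()` constructor are assumptions, since neither the module layout nor the internal logger package is shown here.

```go
package main

import (
	"context"
	"errors"
	"fmt"
	"time"

	"github.com/fraktal/mev-beta/internal/logger"   // assumed path
	"github.com/fraktal/mev-beta/internal/recovery" // assumed path
)

func main() {
	// Hypothetical constructor: logger.New() stands in for whatever the
	// internal logger package actually exposes.
	log := logger.New()

	handler := recovery.NewRetryHandler(log)

	// Optionally override the built-in "rpc_connection" profile.
	handler.SetConfig("rpc_connection", recovery.RetryConfig{
		MaxAttempts:       4,
		InitialDelay:      500 * time.Millisecond,
		MaxDelay:          10 * time.Second,
		BackoffFactor:     2.0,
		JitterEnabled:     true,
		TimeoutPerAttempt: 5 * time.Second,
	})

	ctx := context.Background()

	// The operation receives a per-attempt timeout context and the attempt number.
	result := handler.ExecuteWithRetry(ctx, "rpc_connection", func(ctx context.Context, attempt int) error {
		if attempt < 3 {
			return errors.New("transient RPC failure") // fail the first two attempts
		}
		return nil // succeed on the third attempt
	})

	fmt.Printf("success=%v attempts=%d duration=%s\n",
		result.Success, result.Attempts, result.TotalDuration)
}
```

Without the override, the built-in "rpc_connection" profile (1s initial delay, backoff factor 2.5, 30s cap) yields waits of roughly 1s, 2.5s, 6.25s, and 15.6s between its five attempts, each nudged by up to ±10% jitter in calculateDelay.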