feat: create v2-prep branch with comprehensive planning
Restructured the project for the V2 refactor.

**Structure Changes:**
- Moved all V1 code to orig/ (preserved with `git mv`)
- Created docs/planning/ directory
- Added orig/README_V1.md explaining V1 preservation

**Planning Documents:**
- 00_V2_MASTER_PLAN.md: complete architecture overview
  - Executive summary of critical V1 issues
  - High-level component architecture diagrams
  - 5-phase implementation roadmap
  - Success metrics and risk mitigation
- 07_TASK_BREAKDOWN.md: atomic task breakdown
  - 99+ hours of detailed tasks
  - Every task < 2 hours (atomic)
  - Clear dependencies and success criteria
  - Organized by implementation phase

**V2 Key Improvements:**
- Per-exchange parsers (factory pattern; see the sketch below)
- Multi-layer strict validation
- Multi-index pool cache
- Background validation pipeline
- Comprehensive observability

**Critical Issues Addressed:**
- Zero-address tokens (strict validation + cache enrichment)
- Parsing accuracy (protocol-specific parsers)
- No audit trail (background validation channel)
- Inefficient lookups (multi-index cache)
- Stats disconnection (event-driven metrics)

**Next Steps:**
1. Review the planning documents
2. Begin Phase 1: Foundation (P1-001 through P1-010)
3. Implement parsers in Phase 2
4. Build the cache system in Phase 3
5. Add the validation pipeline in Phase 4
6. Migrate and test in Phase 5

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
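The per-exchange parser factory called out above ships in later phases, not in this commit. As a rough sketch of the intended shape (every type and name below is hypothetical, shown only to make the bullet concrete):

```go
// Hypothetical sketch of the V2 per-exchange parser factory; nothing here
// exists in this commit, and all names are illustrative.
package parser

import (
	"errors"

	"github.com/ethereum/go-ethereum/core/types"
)

// SwapEvent is a protocol-neutral swap record.
type SwapEvent struct {
	Pool, Token0, Token1 string
}

// Parser is implemented once per exchange protocol (UniswapV2, UniswapV3, ...).
type Parser interface {
	Protocol() string
	ParseLog(log types.Log) (*SwapEvent, error)
}

// Factory maps an event signature (topic0) to its protocol-specific parser.
type Factory struct {
	bySignature map[string]Parser
}

func (f *Factory) ParserFor(topic0 string) (Parser, error) {
	p, ok := f.bySignature[topic0]
	if !ok {
		return nil, errors.New("no parser registered for event signature")
	}
	return p, nil
}
```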
orig/internal/recovery/error_handler.go · new file · 621 lines
@@ -0,0 +1,621 @@
package recovery

import (
	"context"
	"sync"
	"time"

	"github.com/ethereum/go-ethereum/common"

	"github.com/fraktal/mev-beta/internal/logger"
)

// ErrorSeverity represents the severity level of an error
type ErrorSeverity int

const (
	SeverityLow ErrorSeverity = iota
	SeverityMedium
	SeverityHigh
	SeverityCritical
)

func (s ErrorSeverity) String() string {
	switch s {
	case SeverityLow:
		return "LOW"
	case SeverityMedium:
		return "MEDIUM"
	case SeverityHigh:
		return "HIGH"
	case SeverityCritical:
		return "CRITICAL"
	default:
		return "UNKNOWN"
	}
}

// ErrorType categorizes different types of errors
type ErrorType int

const (
	ErrorTypeAddressCorruption ErrorType = iota
	ErrorTypeContractCallFailed
	ErrorTypeRPCConnectionFailed
	ErrorTypeDataParsingFailed
	ErrorTypeValidationFailed
	ErrorTypeTimeoutError
)

func (e ErrorType) String() string {
	switch e {
	case ErrorTypeAddressCorruption:
		return "ADDRESS_CORRUPTION"
	case ErrorTypeContractCallFailed:
		return "CONTRACT_CALL_FAILED"
	case ErrorTypeRPCConnectionFailed:
		return "RPC_CONNECTION_FAILED"
	case ErrorTypeDataParsingFailed:
		return "DATA_PARSING_FAILED"
	case ErrorTypeValidationFailed:
		return "VALIDATION_FAILED"
	case ErrorTypeTimeoutError:
		return "TIMEOUT_ERROR"
	default:
		return "UNKNOWN_ERROR"
	}
}

// RecoveryAction represents an action to take when an error occurs
type RecoveryAction int

const (
	ActionSkipAndContinue RecoveryAction = iota
	ActionRetryWithBackoff
	ActionUseFallbackData
	ActionCircuitBreaker
	ActionEmergencyStop
)

func (a RecoveryAction) String() string {
	switch a {
	case ActionSkipAndContinue:
		return "SKIP_AND_CONTINUE"
	case ActionRetryWithBackoff:
		return "RETRY_WITH_BACKOFF"
	case ActionUseFallbackData:
		return "USE_FALLBACK_DATA"
	case ActionCircuitBreaker:
		return "CIRCUIT_BREAKER"
	case ActionEmergencyStop:
		return "EMERGENCY_STOP"
	default:
		return "UNKNOWN_ACTION"
	}
}

// ErrorEvent represents a specific error occurrence
type ErrorEvent struct {
	Timestamp    time.Time
	Type         ErrorType
	Severity     ErrorSeverity
	Component    string
	Address      common.Address
	Message      string
	Context      map[string]interface{}
	AttemptCount int
	LastAttempt  time.Time
	Resolved     bool
	ResolvedAt   time.Time
}

// RecoveryRule defines how to handle specific error patterns
type RecoveryRule struct {
	ErrorType               ErrorType
	MaxSeverity             ErrorSeverity
	Action                  RecoveryAction
	MaxRetries              int
	BackoffInterval         time.Duration
	CircuitBreakerThreshold int
	ContextMatchers         map[string]interface{}
}

// ErrorHandler provides comprehensive error handling and recovery capabilities
type ErrorHandler struct {
	mu               sync.RWMutex
	logger           *logger.Logger
	errorHistory     []ErrorEvent
	componentStats   map[string]*ComponentStats
	circuitBreakers  map[string]*CircuitBreaker
	recoveryRules    []RecoveryRule
	fallbackProvider FallbackDataProvider
	maxHistorySize   int
	alertThresholds  map[ErrorType]int
	enabled          bool
}

// ComponentStats tracks error statistics for components
type ComponentStats struct {
	mu                  sync.RWMutex
	Component           string
	TotalErrors         int
	ErrorsByType        map[ErrorType]int
	ErrorsBySeverity    map[ErrorSeverity]int
	LastError           time.Time
	ConsecutiveFailures int
	SuccessCount        int
	IsHealthy           bool
	LastHealthCheck     time.Time
}

// CircuitBreaker implements the circuit breaker pattern for failing components
type CircuitBreaker struct {
	mu              sync.RWMutex
	Name            string
	State           CircuitState
	FailureCount    int
	Threshold       int
	Timeout         time.Duration
	LastFailure     time.Time
	LastSuccess     time.Time
	HalfOpenAllowed bool
}

type CircuitState int

const (
	CircuitClosed CircuitState = iota
	CircuitOpen
	CircuitHalfOpen
)

func (s CircuitState) String() string {
	switch s {
	case CircuitClosed:
		return "CLOSED"
	case CircuitOpen:
		return "OPEN"
	case CircuitHalfOpen:
		return "HALF_OPEN"
	default:
		return "UNKNOWN"
	}
}

// FallbackDataProvider is the interface for providing fallback data when primary sources fail
type FallbackDataProvider interface {
	GetFallbackTokenInfo(ctx context.Context, address common.Address) (*FallbackTokenInfo, error)
	GetFallbackPoolInfo(ctx context.Context, address common.Address) (*FallbackPoolInfo, error)
	GetFallbackContractType(ctx context.Context, address common.Address) (string, error)
}

type FallbackTokenInfo struct {
	Address    common.Address
	Symbol     string
	Name       string
	Decimals   uint8
	IsVerified bool
	Source     string
	Confidence float64
}

type FallbackPoolInfo struct {
	Address    common.Address
	Token0     common.Address
	Token1     common.Address
	Protocol   string
	Fee        uint32
	IsVerified bool
	Source     string
	Confidence float64
}

// NewErrorHandler creates a new error handler with default configuration
func NewErrorHandler(logger *logger.Logger) *ErrorHandler {
	handler := &ErrorHandler{
		logger:          logger,
		errorHistory:    make([]ErrorEvent, 0),
		componentStats:  make(map[string]*ComponentStats),
		circuitBreakers: make(map[string]*CircuitBreaker),
		maxHistorySize:  1000,
		alertThresholds: make(map[ErrorType]int),
		enabled:         true,
	}

	// Initialize default recovery rules
	handler.initializeDefaultRules()

	// Initialize default alert thresholds
	handler.initializeAlertThresholds()

	return handler
}

// initializeDefaultRules sets up default recovery rules for common error scenarios
func (eh *ErrorHandler) initializeDefaultRules() {
	eh.recoveryRules = []RecoveryRule{
		{
			ErrorType:       ErrorTypeAddressCorruption,
			MaxSeverity:     SeverityMedium,
			Action:          ActionRetryWithBackoff,
			MaxRetries:      2,
			BackoffInterval: 500 * time.Millisecond,
		},
		{
			ErrorType:       ErrorTypeAddressCorruption,
			MaxSeverity:     SeverityCritical,
			Action:          ActionUseFallbackData,
			MaxRetries:      0,
			BackoffInterval: 0,
		},
		{
			ErrorType:       ErrorTypeContractCallFailed,
			MaxSeverity:     SeverityMedium,
			Action:          ActionRetryWithBackoff,
			MaxRetries:      3,
			BackoffInterval: 2 * time.Second,
		},
		{
			ErrorType:               ErrorTypeRPCConnectionFailed,
			MaxSeverity:             SeverityHigh,
			Action:                  ActionCircuitBreaker,
			MaxRetries:              5,
			BackoffInterval:         5 * time.Second,
			CircuitBreakerThreshold: 10,
		},
		{
			ErrorType:       ErrorTypeDataParsingFailed,
			MaxSeverity:     SeverityMedium,
			Action:          ActionUseFallbackData,
			MaxRetries:      2,
			BackoffInterval: 1 * time.Second,
		},
		{
			ErrorType:       ErrorTypeValidationFailed,
			MaxSeverity:     SeverityLow,
			Action:          ActionSkipAndContinue,
			MaxRetries:      0,
			BackoffInterval: 0,
		},
		{
			ErrorType:       ErrorTypeValidationFailed,
			MaxSeverity:     SeverityHigh,
			Action:          ActionRetryWithBackoff,
			MaxRetries:      1,
			BackoffInterval: 500 * time.Millisecond,
		},
		{
			ErrorType:       ErrorTypeTimeoutError,
			MaxSeverity:     SeverityMedium,
			Action:          ActionRetryWithBackoff,
			MaxRetries:      3,
			BackoffInterval: 3 * time.Second,
		},
	}
}

// initializeAlertThresholds sets up alert thresholds for different error types
func (eh *ErrorHandler) initializeAlertThresholds() {
	eh.alertThresholds[ErrorTypeAddressCorruption] = 5
	eh.alertThresholds[ErrorTypeContractCallFailed] = 20
	eh.alertThresholds[ErrorTypeRPCConnectionFailed] = 10
	eh.alertThresholds[ErrorTypeDataParsingFailed] = 15
	eh.alertThresholds[ErrorTypeValidationFailed] = 25
	eh.alertThresholds[ErrorTypeTimeoutError] = 30
}

// HandleError processes an error and determines the appropriate recovery action.
// The last parameter is named errContext rather than context so it does not
// shadow the context package.
func (eh *ErrorHandler) HandleError(ctx context.Context, errorType ErrorType, severity ErrorSeverity, component string, address common.Address, message string, errContext map[string]interface{}) RecoveryAction {
	if !eh.enabled {
		return ActionSkipAndContinue
	}

	eh.mu.Lock()
	defer eh.mu.Unlock()

	// Record the error event
	event := ErrorEvent{
		Timestamp:    time.Now(),
		Type:         errorType,
		Severity:     severity,
		Component:    component,
		Address:      address,
		Message:      message,
		Context:      errContext,
		AttemptCount: 1,
		LastAttempt:  time.Now(),
	}

	// Update error history
	eh.addToHistory(event)

	// Update component statistics
	eh.updateComponentStats(component, errorType, severity)

	// Check circuit breakers
	if eh.shouldTriggerCircuitBreaker(component, errorType) {
		eh.triggerCircuitBreaker(component)
		return ActionCircuitBreaker
	}

	// Find a matching recovery rule
	rule := eh.findRecoveryRule(errorType, severity, errContext)
	if rule == nil {
		// Default action for unmatched errors
		return ActionSkipAndContinue
	}

	// Log the error and recovery action
	eh.logger.Error("Error handled by recovery system",
		"type", errorType.String(),
		"severity", severity.String(),
		"component", component,
		"address", address.Hex(),
		"message", message,
		"action", rule.Action.String())

	// Check if an alert threshold has been reached
	eh.checkAlertThresholds(errorType)

	return rule.Action
}

// addToHistory adds an error event to the history buffer
func (eh *ErrorHandler) addToHistory(event ErrorEvent) {
	eh.errorHistory = append(eh.errorHistory, event)

	// Trim history if it exceeds the max size
	if len(eh.errorHistory) > eh.maxHistorySize {
		eh.errorHistory = eh.errorHistory[len(eh.errorHistory)-eh.maxHistorySize:]
	}
}

// updateComponentStats updates statistics for a component
func (eh *ErrorHandler) updateComponentStats(component string, errorType ErrorType, severity ErrorSeverity) {
	stats, exists := eh.componentStats[component]
	if !exists {
		stats = &ComponentStats{
			Component:        component,
			ErrorsByType:     make(map[ErrorType]int),
			ErrorsBySeverity: make(map[ErrorSeverity]int),
			IsHealthy:        true,
		}
		eh.componentStats[component] = stats
	}

	stats.mu.Lock()
	defer stats.mu.Unlock()

	stats.TotalErrors++
	stats.ErrorsByType[errorType]++
	stats.ErrorsBySeverity[severity]++
	stats.LastError = time.Now()
	stats.ConsecutiveFailures++

	// Mark as unhealthy after too many consecutive failures
	if stats.ConsecutiveFailures > 10 {
		stats.IsHealthy = false
	}
}

// findRecoveryRule finds the best matching recovery rule for an error.
// Rules are evaluated in declaration order and the first match wins, so
// narrower rules must be listed before broader ones.
func (eh *ErrorHandler) findRecoveryRule(errorType ErrorType, severity ErrorSeverity, errContext map[string]interface{}) *RecoveryRule {
	for _, rule := range eh.recoveryRules {
		if rule.ErrorType == errorType && severity <= rule.MaxSeverity {
			// Check context matchers if present
			if len(rule.ContextMatchers) > 0 {
				if !eh.matchesContext(errContext, rule.ContextMatchers) {
					continue
				}
			}
			return &rule
		}
	}
	return nil
}
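Rule matching is first-match-wins in declaration order, so the relative order of the two ErrorTypeAddressCorruption rules in initializeDefaultRules matters. An illustrative in-package sketch (assuming the default rules and a logger `log` in scope):

```go
// Illustrative only: severity decides which ErrorTypeAddressCorruption rule
// matches first under the default rule set.
eh := NewErrorHandler(log)

low := eh.findRecoveryRule(ErrorTypeAddressCorruption, SeverityLow, nil)
// low.Action == ActionRetryWithBackoff (first rule, MaxSeverity: SeverityMedium)

crit := eh.findRecoveryRule(ErrorTypeAddressCorruption, SeverityCritical, nil)
// crit.Action == ActionUseFallbackData (only the MaxSeverity: SeverityCritical rule admits it)
```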
// matchesContext checks if the error context matches the rule's context matchers
func (eh *ErrorHandler) matchesContext(errorContext, ruleMatchers map[string]interface{}) bool {
	for key, expectedValue := range ruleMatchers {
		if actualValue, exists := errorContext[key]; !exists || actualValue != expectedValue {
			return false
		}
	}
	return true
}

// shouldTriggerCircuitBreaker determines if a circuit breaker should be triggered
func (eh *ErrorHandler) shouldTriggerCircuitBreaker(component string, errorType ErrorType) bool {
	stats, exists := eh.componentStats[component]
	if !exists {
		return false
	}

	stats.mu.RLock()
	defer stats.mu.RUnlock()

	// Trigger if consecutive failures exceed the threshold for critical error types
	if errorType == ErrorTypeRPCConnectionFailed && stats.ConsecutiveFailures >= 5 {
		return true
	}

	if errorType == ErrorTypeAddressCorruption && stats.ConsecutiveFailures >= 3 {
		return true
	}

	return false
}

// triggerCircuitBreaker activates a circuit breaker for a component
func (eh *ErrorHandler) triggerCircuitBreaker(component string) {
	breaker := &CircuitBreaker{
		Name:         component,
		State:        CircuitOpen,
		FailureCount: 0,
		Threshold:    5,
		Timeout:      30 * time.Second,
		LastFailure:  time.Now(),
	}

	eh.circuitBreakers[component] = breaker

	eh.logger.Warn("Circuit breaker triggered",
		"component", component,
		"timeout", breaker.Timeout)
}

// checkAlertThresholds checks if error counts have reached alert thresholds
func (eh *ErrorHandler) checkAlertThresholds(errorType ErrorType) {
	threshold, exists := eh.alertThresholds[errorType]
	if !exists {
		return
	}

	// Count recent errors of this type (last hour)
	recentCount := 0
	cutoff := time.Now().Add(-1 * time.Hour)

	for _, event := range eh.errorHistory {
		if event.Type == errorType && event.Timestamp.After(cutoff) {
			recentCount++
		}
	}

	if recentCount >= threshold {
		eh.logger.Warn("Error threshold reached - alert triggered",
			"error_type", errorType.String(),
			"count", recentCount,
			"threshold", threshold)
		// Here you would trigger your alerting system
	}
}

// GetComponentHealth returns the health status of all components
func (eh *ErrorHandler) GetComponentHealth() map[string]*ComponentStats {
	eh.mu.RLock()
	defer eh.mu.RUnlock()

	// Return a copy to prevent external modification
	result := make(map[string]*ComponentStats)
	for name, stats := range eh.componentStats {
		stats.mu.RLock() // hold the per-component lock while copying its fields
		result[name] = &ComponentStats{
			Component:           stats.Component,
			TotalErrors:         stats.TotalErrors,
			ErrorsByType:        make(map[ErrorType]int),
			ErrorsBySeverity:    make(map[ErrorSeverity]int),
			LastError:           stats.LastError,
			ConsecutiveFailures: stats.ConsecutiveFailures,
			SuccessCount:        stats.SuccessCount,
			IsHealthy:           stats.IsHealthy,
			LastHealthCheck:     stats.LastHealthCheck,
		}

		// Copy maps
		for k, v := range stats.ErrorsByType {
			result[name].ErrorsByType[k] = v
		}
		for k, v := range stats.ErrorsBySeverity {
			result[name].ErrorsBySeverity[k] = v
		}
		stats.mu.RUnlock()
	}

	return result
}

// RecordSuccess records a successful operation for a component
func (eh *ErrorHandler) RecordSuccess(component string) {
	eh.mu.Lock()
	defer eh.mu.Unlock()

	stats, exists := eh.componentStats[component]
	if !exists {
		stats = &ComponentStats{
			Component:        component,
			ErrorsByType:     make(map[ErrorType]int),
			ErrorsBySeverity: make(map[ErrorSeverity]int),
			IsHealthy:        true,
		}
		eh.componentStats[component] = stats
	}

	stats.mu.Lock()
	defer stats.mu.Unlock()

	stats.SuccessCount++
	stats.ConsecutiveFailures = 0
	stats.IsHealthy = true
	stats.LastHealthCheck = time.Now()

	// Reset the circuit breaker if it exists
	if breaker, exists := eh.circuitBreakers[component]; exists {
		breaker.mu.Lock()
		breaker.State = CircuitClosed
		breaker.FailureCount = 0
		breaker.LastSuccess = time.Now()
		breaker.mu.Unlock()
	}
}

// IsCircuitOpen checks if a circuit breaker is open for a component
func (eh *ErrorHandler) IsCircuitOpen(component string) bool {
	eh.mu.RLock()
	defer eh.mu.RUnlock()

	breaker, exists := eh.circuitBreakers[component]
	if !exists {
		return false
	}

	// Take the write lock: the open -> half-open transition below mutates
	// the breaker, which would be a data race under a read lock.
	breaker.mu.Lock()
	defer breaker.mu.Unlock()

	if breaker.State == CircuitOpen {
		// Check if the timeout has passed
		if time.Since(breaker.LastFailure) > breaker.Timeout {
			breaker.State = CircuitHalfOpen
			breaker.HalfOpenAllowed = true
			return false
		}
		return true
	}

	return false
}

// SetFallbackProvider sets the fallback data provider
func (eh *ErrorHandler) SetFallbackProvider(provider FallbackDataProvider) {
	eh.mu.Lock()
	defer eh.mu.Unlock()
	eh.fallbackProvider = provider
}

// GetErrorSummary returns a summary of recent errors
func (eh *ErrorHandler) GetErrorSummary(duration time.Duration) map[string]interface{} {
	eh.mu.RLock()
	defer eh.mu.RUnlock()

	cutoff := time.Now().Add(-duration)
	summary := map[string]interface{}{
		"total_errors":        0,
		"errors_by_type":      make(map[string]int),
		"errors_by_severity":  make(map[string]int),
		"errors_by_component": make(map[string]int),
		"time_range":          duration.String(),
	}

	for _, event := range eh.errorHistory {
		if event.Timestamp.After(cutoff) {
			summary["total_errors"] = summary["total_errors"].(int) + 1

			typeKey := event.Type.String()
			summary["errors_by_type"].(map[string]int)[typeKey]++

			severityKey := event.Severity.String()
			summary["errors_by_severity"].(map[string]int)[severityKey]++

			summary["errors_by_component"].(map[string]int)[event.Component]++
		}
	}

	return summary
}
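For orientation, a minimal caller-side sketch of how the handler's verdict might be consumed. The component name, `poolAddr`, and the retry/fallback wiring are assumptions; only ErrorHandler's own API comes from the file above.

```go
// Sketch: dispatching on the recovery verdict for a failed contract call.
action := handler.HandleError(ctx, ErrorTypeContractCallFailed, SeverityMedium,
	"pool_scanner", poolAddr, "token0() call reverted", nil)

switch action {
case ActionRetryWithBackoff:
	// hand the operation to a RetryHandler (see retry_handler.go below)
case ActionUseFallbackData:
	info, err := fallback.GetFallbackPoolInfo(ctx, poolAddr)
	_ = info
	_ = err
case ActionCircuitBreaker, ActionEmergencyStop:
	// stop calling the component until IsCircuitOpen("pool_scanner") is false
default: // ActionSkipAndContinue
	// drop this item and move on
}

// On success paths, report health so consecutive-failure counters reset
// and the circuit breaker can close again:
handler.RecordSuccess("pool_scanner")
```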
orig/internal/recovery/fallback_provider.go · new file · 384 lines
@@ -0,0 +1,384 @@
package recovery

import (
	"context"
	"fmt"
	"sync"
	"time"

	"github.com/ethereum/go-ethereum/common"

	"github.com/fraktal/mev-beta/internal/logger"
	"github.com/fraktal/mev-beta/internal/registry"
)

// DefaultFallbackProvider implements FallbackDataProvider with multiple data sources
type DefaultFallbackProvider struct {
	mu               sync.RWMutex
	logger           *logger.Logger
	contractRegistry *registry.ContractRegistry
	staticTokenData  map[common.Address]*FallbackTokenInfo
	staticPoolData   map[common.Address]*FallbackPoolInfo
	cacheTimeout     time.Duration
	enabled          bool
}

// NewDefaultFallbackProvider creates a new fallback data provider
func NewDefaultFallbackProvider(logger *logger.Logger, contractRegistry *registry.ContractRegistry) *DefaultFallbackProvider {
	provider := &DefaultFallbackProvider{
		logger:           logger,
		contractRegistry: contractRegistry,
		staticTokenData:  make(map[common.Address]*FallbackTokenInfo),
		staticPoolData:   make(map[common.Address]*FallbackPoolInfo),
		cacheTimeout:     5 * time.Minute,
		enabled:          true,
	}

	// Initialize with known safe data
	provider.initializeStaticData()

	return provider
}

// initializeStaticData populates the provider with known good data for critical Arbitrum contracts
func (fp *DefaultFallbackProvider) initializeStaticData() {
	fp.mu.Lock()
	defer fp.mu.Unlock()

	// Major Arbitrum tokens with verified addresses
	fp.staticTokenData[common.HexToAddress("0x82aF49447D8a07e3bd95BD0d56f35241523fBab1")] = &FallbackTokenInfo{
		Address:    common.HexToAddress("0x82aF49447D8a07e3bd95BD0d56f35241523fBab1"),
		Symbol:     "WETH",
		Name:       "Wrapped Ether",
		Decimals:   18,
		IsVerified: true,
		Source:     "static_fallback",
		Confidence: 1.0,
	}

	fp.staticTokenData[common.HexToAddress("0xaf88d065e77c8cC2239327C5EDb3A432268e5831")] = &FallbackTokenInfo{
		Address:    common.HexToAddress("0xaf88d065e77c8cC2239327C5EDb3A432268e5831"),
		Symbol:     "USDC",
		Name:       "USD Coin",
		Decimals:   6,
		IsVerified: true,
		Source:     "static_fallback",
		Confidence: 1.0,
	}

	fp.staticTokenData[common.HexToAddress("0xFd086bC7CD5C481DCC9C85ebE478A1C0b69FCbb9")] = &FallbackTokenInfo{
		Address:    common.HexToAddress("0xFd086bC7CD5C481DCC9C85ebE478A1C0b69FCbb9"),
		Symbol:     "USDT",
		Name:       "Tether USD",
		Decimals:   6,
		IsVerified: true,
		Source:     "static_fallback",
		Confidence: 1.0,
	}

	fp.staticTokenData[common.HexToAddress("0x2f2a2543B76A4166549F7aaB2e75Bef0aefC5B0f")] = &FallbackTokenInfo{
		Address:    common.HexToAddress("0x2f2a2543B76A4166549F7aaB2e75Bef0aefC5B0f"),
		Symbol:     "WBTC",
		Name:       "Wrapped BTC",
		Decimals:   8,
		IsVerified: true,
		Source:     "static_fallback",
		Confidence: 1.0,
	}

	fp.staticTokenData[common.HexToAddress("0x912CE59144191C1204E64559FE8253a0e49E6548")] = &FallbackTokenInfo{
		Address:    common.HexToAddress("0x912CE59144191C1204E64559FE8253a0e49E6548"),
		Symbol:     "ARB",
		Name:       "Arbitrum",
		Decimals:   18,
		IsVerified: true,
		Source:     "static_fallback",
		Confidence: 1.0,
	}

	// High-volume Uniswap V3 pools with verified addresses and token pairs
	fp.staticPoolData[common.HexToAddress("0xC6962004f452bE9203591991D15f6b388e09E8D0")] = &FallbackPoolInfo{
		Address:    common.HexToAddress("0xC6962004f452bE9203591991D15f6b388e09E8D0"),
		Token0:     common.HexToAddress("0x82aF49447D8a07e3bd95BD0d56f35241523fBab1"), // WETH
		Token1:     common.HexToAddress("0xaf88d065e77c8cC2239327C5EDb3A432268e5831"), // USDC
		Protocol:   "UniswapV3",
		Fee:        500, // 0.05%
		IsVerified: true,
		Source:     "static_fallback",
		Confidence: 1.0,
	}

	fp.staticPoolData[common.HexToAddress("0x641C00A822e8b671738d32a431a4Fb6074E5c79d")] = &FallbackPoolInfo{
		Address:    common.HexToAddress("0x641C00A822e8b671738d32a431a4Fb6074E5c79d"),
		Token0:     common.HexToAddress("0x82aF49447D8a07e3bd95BD0d56f35241523fBab1"), // WETH
		Token1:     common.HexToAddress("0xaf88d065e77c8cC2239327C5EDb3A432268e5831"), // USDC
		Protocol:   "UniswapV3",
		Fee:        3000, // 0.3%
		IsVerified: true,
		Source:     "static_fallback",
		Confidence: 1.0,
	}

	fp.staticPoolData[common.HexToAddress("0x17c14D2c404D167802b16C450d3c99F88F2c4F4d")] = &FallbackPoolInfo{
		Address:    common.HexToAddress("0x17c14D2c404D167802b16C450d3c99F88F2c4F4d"),
		Token0:     common.HexToAddress("0xaf88d065e77c8cC2239327C5EDb3A432268e5831"), // USDC
		Token1:     common.HexToAddress("0xFd086bC7CD5C481DCC9C85ebE478A1C0b69FCbb9"), // USDT
		Protocol:   "UniswapV3",
		Fee:        100, // 0.01%
		IsVerified: true,
		Source:     "static_fallback",
		Confidence: 1.0,
	}

	fp.staticPoolData[common.HexToAddress("0x2f5e87C032bc4F8526F320c012A4e678F1fa6cAB")] = &FallbackPoolInfo{
		Address:    common.HexToAddress("0x2f5e87C032bc4F8526F320c012A4e678F1fa6cAB"),
		Token0:     common.HexToAddress("0x2f2a2543B76A4166549F7aaB2e75Bef0aefC5B0f"), // WBTC
		Token1:     common.HexToAddress("0x82aF49447D8a07e3bd95BD0d56f35241523fBab1"), // WETH
		Protocol:   "UniswapV3",
		Fee:        500, // 0.05%
		IsVerified: true,
		Source:     "static_fallback",
		Confidence: 1.0,
	}

	fp.logger.Info("Initialized fallback provider with static data",
		"tokens", len(fp.staticTokenData),
		"pools", len(fp.staticPoolData))
}

// GetFallbackTokenInfo provides fallback token information
func (fp *DefaultFallbackProvider) GetFallbackTokenInfo(ctx context.Context, address common.Address) (*FallbackTokenInfo, error) {
	if !fp.enabled {
		return nil, fmt.Errorf("fallback provider disabled")
	}

	fp.mu.RLock()
	defer fp.mu.RUnlock()

	// First, try static data
	if tokenInfo, exists := fp.staticTokenData[address]; exists {
		fp.logger.Debug("Fallback token info from static data",
			"address", address.Hex(),
			"symbol", tokenInfo.Symbol,
			"source", tokenInfo.Source)
		return tokenInfo, nil
	}

	// Second, try the contract registry if available
	if fp.contractRegistry != nil {
		if contractInfo, err := fp.contractRegistry.GetContractInfo(ctx, address); err == nil && contractInfo != nil {
			tokenInfo := &FallbackTokenInfo{
				Address:    address,
				Symbol:     contractInfo.Symbol,
				Name:       contractInfo.Name,
				Decimals:   contractInfo.Decimals,
				IsVerified: contractInfo.IsVerified,
				Source:     "contract_registry",
				Confidence: contractInfo.Confidence,
			}

			fp.logger.Debug("Fallback token info from registry",
				"address", address.Hex(),
				"symbol", tokenInfo.Symbol,
				"confidence", tokenInfo.Confidence)

			return tokenInfo, nil
		}
	}

	// Third, provide a minimal safe fallback for unknown tokens
	tokenInfo := &FallbackTokenInfo{
		Address:    address,
		Symbol:     fmt.Sprintf("UNK_%s", address.Hex()[:8]),
		Name:       "Unknown Token",
		Decimals:   18, // Safe default
		IsVerified: false,
		Source:     "generated_fallback",
		Confidence: 0.1,
	}

	fp.logger.Warn("Using generated fallback token info",
		"address", address.Hex(),
		"symbol", tokenInfo.Symbol)

	return tokenInfo, nil
}

// GetFallbackPoolInfo provides fallback pool information
func (fp *DefaultFallbackProvider) GetFallbackPoolInfo(ctx context.Context, address common.Address) (*FallbackPoolInfo, error) {
	if !fp.enabled {
		return nil, fmt.Errorf("fallback provider disabled")
	}

	fp.mu.RLock()
	defer fp.mu.RUnlock()

	// First, try static data
	if poolInfo, exists := fp.staticPoolData[address]; exists {
		fp.logger.Debug("Fallback pool info from static data",
			"address", address.Hex(),
			"protocol", poolInfo.Protocol,
			"token0", poolInfo.Token0.Hex(),
			"token1", poolInfo.Token1.Hex())
		return poolInfo, nil
	}

	// Second, try the contract registry if available
	if fp.contractRegistry != nil {
		if poolInfo := fp.contractRegistry.GetPoolInfo(address); poolInfo != nil {
			fallbackInfo := &FallbackPoolInfo{
				Address:    address,
				Token0:     poolInfo.Token0,
				Token1:     poolInfo.Token1,
				Protocol:   poolInfo.Protocol,
				Fee:        poolInfo.Fee,
				IsVerified: poolInfo.IsVerified,
				Source:     "contract_registry",
				Confidence: poolInfo.Confidence,
			}

			fp.logger.Debug("Fallback pool info from registry",
				"address", address.Hex(),
				"protocol", fallbackInfo.Protocol,
				"confidence", fallbackInfo.Confidence)

			return fallbackInfo, nil
		}
	}

	// No fallback available for unknown pools - return an error
	return nil, fmt.Errorf("no fallback data available for pool %s", address.Hex())
}

// GetFallbackContractType provides fallback contract type information
func (fp *DefaultFallbackProvider) GetFallbackContractType(ctx context.Context, address common.Address) (string, error) {
	if !fp.enabled {
		return "", fmt.Errorf("fallback provider disabled")
	}

	fp.mu.RLock()
	defer fp.mu.RUnlock()

	// Check if it's a known token
	if _, exists := fp.staticTokenData[address]; exists {
		return "ERC20", nil
	}

	// Check if it's a known pool
	if _, exists := fp.staticPoolData[address]; exists {
		return "Pool", nil
	}

	// Try the contract registry
	if fp.contractRegistry != nil {
		if contractInfo, err := fp.contractRegistry.GetContractInfo(ctx, address); err == nil && contractInfo != nil {
			return contractInfo.Type.String(), nil
		}
	}

	// Default to unknown
	return "Unknown", nil
}

// AddStaticTokenData adds static token data for fallback use
func (fp *DefaultFallbackProvider) AddStaticTokenData(address common.Address, info *FallbackTokenInfo) {
	fp.mu.Lock()
	defer fp.mu.Unlock()

	fp.staticTokenData[address] = info
	fp.logger.Debug("Added static token data",
		"address", address.Hex(),
		"symbol", info.Symbol)
}

// AddStaticPoolData adds static pool data for fallback use
func (fp *DefaultFallbackProvider) AddStaticPoolData(address common.Address, info *FallbackPoolInfo) {
	fp.mu.Lock()
	defer fp.mu.Unlock()

	fp.staticPoolData[address] = info
	fp.logger.Debug("Added static pool data",
		"address", address.Hex(),
		"protocol", info.Protocol)
}

// IsAddressKnown checks if an address is in the static fallback data
func (fp *DefaultFallbackProvider) IsAddressKnown(address common.Address) bool {
	fp.mu.RLock()
	defer fp.mu.RUnlock()

	_, isToken := fp.staticTokenData[address]
	_, isPool := fp.staticPoolData[address]

	return isToken || isPool
}

// GetKnownAddresses returns all known addresses in the fallback provider
func (fp *DefaultFallbackProvider) GetKnownAddresses() (tokens []common.Address, pools []common.Address) {
	fp.mu.RLock()
	defer fp.mu.RUnlock()

	for addr := range fp.staticTokenData {
		tokens = append(tokens, addr)
	}

	for addr := range fp.staticPoolData {
		pools = append(pools, addr)
	}

	return tokens, pools
}

// ValidateAddressWithFallback performs validation using fallback data
func (fp *DefaultFallbackProvider) ValidateAddressWithFallback(ctx context.Context, address common.Address, expectedType string) (bool, float64, error) {
	if !fp.enabled {
		return false, 0.0, fmt.Errorf("fallback provider disabled")
	}

	// Check if the address is known in our static data
	if fp.IsAddressKnown(address) {
		actualType, err := fp.GetFallbackContractType(ctx, address)
		if err != nil {
			return false, 0.0, err
		}

		if actualType == expectedType {
			return true, 1.0, nil // High confidence for known addresses
		}

		return false, 0.0, fmt.Errorf("type mismatch: expected %s, got %s", expectedType, actualType)
	}

	// For unknown addresses, provide low-confidence validation
	return true, 0.3, nil // Allow with low confidence
}

// GetStats returns statistics about the fallback provider
func (fp *DefaultFallbackProvider) GetStats() map[string]interface{} {
	fp.mu.RLock()
	defer fp.mu.RUnlock()

	return map[string]interface{}{
		"enabled":             fp.enabled,
		"static_tokens_count": len(fp.staticTokenData),
		"static_pools_count":  len(fp.staticPoolData),
		"cache_timeout":       fp.cacheTimeout.String(),
		"has_registry":        fp.contractRegistry != nil,
	}
}

// Enable enables the fallback provider
func (fp *DefaultFallbackProvider) Enable() {
	fp.mu.Lock()
	defer fp.mu.Unlock()
	fp.enabled = true
	fp.logger.Info("Fallback provider enabled")
}

// Disable disables the fallback provider
func (fp *DefaultFallbackProvider) Disable() {
	fp.mu.Lock()
	defer fp.mu.Unlock()
	fp.enabled = false
	fp.logger.Info("Fallback provider disabled")
}
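A short usage sketch of the provider's three-tier lookup (static map, then contract registry, then a generated low-confidence stub); `provider`, `ctx`, and `addr` are assumed to be in scope:

```go
// Sketch: token lookups only fail when the provider is disabled. Unknown
// addresses come back as "UNK_" stubs with Confidence 0.1, so callers
// should gate on Confidence rather than on the error.
info, err := provider.GetFallbackTokenInfo(ctx, addr)
if err == nil && info.Confidence < 0.5 {
	// generated fallback (Source == "generated_fallback"); treat as unverified
}

// Pool lookups, by contrast, return an error for unknown pools:
if _, err := provider.GetFallbackPoolInfo(ctx, addr); err != nil {
	// no static or registry data for this pool
}
```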
orig/internal/recovery/retry_handler.go · new file · 446 lines
@@ -0,0 +1,446 @@
package recovery

import (
	"context"
	"math"
	"sync"
	"time"

	"github.com/fraktal/mev-beta/internal/logger"
)

// RetryConfig defines retry behavior configuration
type RetryConfig struct {
	MaxAttempts       int
	InitialDelay      time.Duration
	MaxDelay          time.Duration
	BackoffFactor     float64
	JitterEnabled     bool
	TimeoutPerAttempt time.Duration
}

// DefaultRetryConfig returns a sensible default retry configuration
func DefaultRetryConfig() RetryConfig {
	return RetryConfig{
		MaxAttempts:       3,
		InitialDelay:      1 * time.Second,
		MaxDelay:          30 * time.Second,
		BackoffFactor:     2.0,
		JitterEnabled:     true,
		TimeoutPerAttempt: 10 * time.Second,
	}
}

// RetryableOperation represents an operation that can be retried
type RetryableOperation func(ctx context.Context, attempt int) error

// RetryHandler provides exponential backoff retry capabilities
type RetryHandler struct {
	mu      sync.RWMutex
	logger  *logger.Logger
	configs map[string]RetryConfig
	stats   map[string]*RetryStats
	enabled bool
}

// RetryStats tracks retry statistics for operations
type RetryStats struct {
	mu                sync.RWMutex
	OperationType     string
	TotalAttempts     int
	SuccessfulRetries int
	FailedRetries     int
	AverageAttempts   float64
	LastAttempt       time.Time
	LastSuccess       time.Time
	LastFailure       time.Time
}

// RetryResult contains the result of a retry operation
type RetryResult struct {
	Success       bool
	Attempts      int
	TotalDuration time.Duration
	LastError     error
	LastAttemptAt time.Time
}

// NewRetryHandler creates a new retry handler
func NewRetryHandler(logger *logger.Logger) *RetryHandler {
	handler := &RetryHandler{
		logger:  logger,
		configs: make(map[string]RetryConfig),
		stats:   make(map[string]*RetryStats),
		enabled: true,
	}

	// Initialize default configurations for common operations
	handler.initializeDefaultConfigs()

	return handler
}

// initializeDefaultConfigs sets up default retry configurations
func (rh *RetryHandler) initializeDefaultConfigs() {
	// Contract call retries - moderate backoff
	rh.configs["contract_call"] = RetryConfig{
		MaxAttempts:       3,
		InitialDelay:      500 * time.Millisecond,
		MaxDelay:          5 * time.Second,
		BackoffFactor:     2.0,
		JitterEnabled:     true,
		TimeoutPerAttempt: 10 * time.Second,
	}

	// RPC connection retries - aggressive backoff
	rh.configs["rpc_connection"] = RetryConfig{
		MaxAttempts:       5,
		InitialDelay:      1 * time.Second,
		MaxDelay:          30 * time.Second,
		BackoffFactor:     2.5,
		JitterEnabled:     true,
		TimeoutPerAttempt: 15 * time.Second,
	}

	// Data parsing retries - quick retries
	rh.configs["data_parsing"] = RetryConfig{
		MaxAttempts:       2,
		InitialDelay:      100 * time.Millisecond,
		MaxDelay:          1 * time.Second,
		BackoffFactor:     2.0,
		JitterEnabled:     false,
		TimeoutPerAttempt: 5 * time.Second,
	}

	// Block processing retries - conservative
	rh.configs["block_processing"] = RetryConfig{
		MaxAttempts:       3,
		InitialDelay:      2 * time.Second,
		MaxDelay:          10 * time.Second,
		BackoffFactor:     2.0,
		JitterEnabled:     true,
		TimeoutPerAttempt: 30 * time.Second,
	}

	// Token metadata retries - patient backoff
	rh.configs["token_metadata"] = RetryConfig{
		MaxAttempts:       4,
		InitialDelay:      1 * time.Second,
		MaxDelay:          20 * time.Second,
		BackoffFactor:     2.0,
		JitterEnabled:     true,
		TimeoutPerAttempt: 15 * time.Second,
	}
}

// ExecuteWithRetry executes an operation with retry logic
func (rh *RetryHandler) ExecuteWithRetry(ctx context.Context, operationType string, operation RetryableOperation) *RetryResult {
	if !rh.enabled {
		// If retries are disabled, try once
		err := operation(ctx, 1)
		return &RetryResult{
			Success:       err == nil,
			Attempts:      1,
			TotalDuration: 0,
			LastError:     err,
			LastAttemptAt: time.Now(),
		}
	}

	config := rh.getConfig(operationType)
	start := time.Now()
	var lastError error

	rh.mu.Lock()
	stats, exists := rh.stats[operationType]
	if !exists {
		stats = &RetryStats{
			OperationType: operationType,
		}
		rh.stats[operationType] = stats
	}
	rh.mu.Unlock()

retryLoop:
	for attempt := 1; attempt <= config.MaxAttempts; attempt++ {
		// Create a context with a timeout for this attempt
		attemptCtx, cancel := context.WithTimeout(ctx, config.TimeoutPerAttempt)

		rh.logger.Debug("Attempting operation with retry",
			"operation", operationType,
			"attempt", attempt,
			"max_attempts", config.MaxAttempts)

		// Execute the operation
		err := operation(attemptCtx, attempt)
		cancel()

		// Update statistics
		stats.mu.Lock()
		stats.TotalAttempts++
		stats.LastAttempt = time.Now()
		stats.mu.Unlock()

		if err == nil {
			// Success!
			duration := time.Since(start)

			stats.mu.Lock()
			stats.SuccessfulRetries++
			stats.LastSuccess = time.Now()
			denominator := stats.SuccessfulRetries + stats.FailedRetries
			if denominator > 0 {
				stats.AverageAttempts = float64(stats.TotalAttempts) / float64(denominator)
			}
			stats.mu.Unlock()

			rh.logger.Debug("Operation succeeded",
				"operation", operationType,
				"attempt", attempt,
				"duration", duration)

			return &RetryResult{
				Success:       true,
				Attempts:      attempt,
				TotalDuration: duration,
				LastError:     nil,
				LastAttemptAt: time.Now(),
			}
		}

		lastError = err

		// Check if the context was cancelled
		if ctx.Err() != nil {
			rh.logger.Debug("Operation cancelled by context",
				"operation", operationType,
				"attempt", attempt,
				"error", ctx.Err())
			break
		}

		// Don't wait after the last attempt
		if attempt < config.MaxAttempts {
			delay := rh.calculateDelay(config, attempt)

			rh.logger.Debug("Operation failed, retrying",
				"operation", operationType,
				"attempt", attempt,
				"error", err,
				"delay", delay)

			// Wait before the next attempt
			select {
			case <-time.After(delay):
				// Continue to the next attempt
			case <-ctx.Done():
				// Context cancelled during the wait. A bare break here would
				// only exit the select, so break the labeled loop instead.
				break retryLoop
			}
		} else {
			rh.logger.Warn("Operation failed after all retries",
				"operation", operationType,
				"attempts", attempt,
				"error", err)
		}
	}

	// All attempts failed
	duration := time.Since(start)

	stats.mu.Lock()
	stats.FailedRetries++
	stats.LastFailure = time.Now()
	denominator := stats.SuccessfulRetries + stats.FailedRetries
	if denominator > 0 {
		stats.AverageAttempts = float64(stats.TotalAttempts) / float64(denominator)
	}
	stats.mu.Unlock()

	return &RetryResult{
		Success:       false,
		Attempts:      config.MaxAttempts,
		TotalDuration: duration,
		LastError:     lastError,
		LastAttemptAt: time.Now(),
	}
}

// calculateDelay calculates the delay before the next retry attempt
func (rh *RetryHandler) calculateDelay(config RetryConfig, attempt int) time.Duration {
	// Calculate exponential backoff
	delay := float64(config.InitialDelay) * math.Pow(config.BackoffFactor, float64(attempt-1))

	// Apply the maximum delay cap
	if delay > float64(config.MaxDelay) {
		delay = float64(config.MaxDelay)
	}

	duration := time.Duration(delay)

	// Add jitter if enabled
	if config.JitterEnabled {
		jitter := time.Duration(float64(duration) * 0.1 * (2*rh.randomFloat() - 1))
		duration += jitter
	}

	// Ensure a minimum delay
	if duration < 0 {
		duration = config.InitialDelay
	}

	return duration
}
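As a worked example of the formula above, the "contract_call" defaults (InitialDelay 500ms, BackoffFactor 2.0, MaxDelay 5s) yield 500ms, 1s, 2s, 4s, then the 5s cap, each nudged by up to ±10% when jitter is on. A sketch, assuming an in-package `rh` and the `fmt` import:

```go
// Illustrative: print the exact backoff series with jitter disabled.
cfg := RetryConfig{
	InitialDelay:  500 * time.Millisecond,
	MaxDelay:      5 * time.Second,
	BackoffFactor: 2.0,
	JitterEnabled: false,
}
for attempt := 1; attempt <= 5; attempt++ {
	fmt.Println(attempt, rh.calculateDelay(cfg, attempt))
	// 1 500ms, 2 1s, 3 2s, 4 4s, 5 5s (capped at MaxDelay)
}
```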
// randomFloat returns a pseudo-random float between 0 and 1
func (rh *RetryHandler) randomFloat() float64 {
	// Simple pseudo-random number based on the current time
	return float64(time.Now().UnixNano()%1000) / 1000.0
}

// getConfig returns the retry configuration for an operation type
func (rh *RetryHandler) getConfig(operationType string) RetryConfig {
	rh.mu.RLock()
	defer rh.mu.RUnlock()

	if config, exists := rh.configs[operationType]; exists {
		return config
	}

	// Return the default config if no specific config is found
	return DefaultRetryConfig()
}

// SetConfig sets a custom retry configuration for an operation type
func (rh *RetryHandler) SetConfig(operationType string, config RetryConfig) {
	rh.mu.Lock()
	defer rh.mu.Unlock()

	rh.configs[operationType] = config
	rh.logger.Debug("Set retry config",
		"operation", operationType,
		"max_attempts", config.MaxAttempts,
		"initial_delay", config.InitialDelay,
		"max_delay", config.MaxDelay)
}

// GetStats returns retry statistics for all operation types
func (rh *RetryHandler) GetStats() map[string]*RetryStats {
	rh.mu.RLock()
	defer rh.mu.RUnlock()

	// Return a copy to prevent external modification
	result := make(map[string]*RetryStats)
	for opType, stats := range rh.stats {
		stats.mu.RLock()
		result[opType] = &RetryStats{
			OperationType:     stats.OperationType,
			TotalAttempts:     stats.TotalAttempts,
			SuccessfulRetries: stats.SuccessfulRetries,
			FailedRetries:     stats.FailedRetries,
			AverageAttempts:   stats.AverageAttempts,
			LastAttempt:       stats.LastAttempt,
			LastSuccess:       stats.LastSuccess,
			LastFailure:       stats.LastFailure,
		}
		stats.mu.RUnlock()
	}

	return result
}

// GetOperationStats returns statistics for a specific operation type
func (rh *RetryHandler) GetOperationStats(operationType string) *RetryStats {
	rh.mu.RLock()
	defer rh.mu.RUnlock()

	stats, exists := rh.stats[operationType]
	if !exists {
		return nil
	}

	stats.mu.RLock()
	defer stats.mu.RUnlock()

	return &RetryStats{
		OperationType:     stats.OperationType,
		TotalAttempts:     stats.TotalAttempts,
		SuccessfulRetries: stats.SuccessfulRetries,
		FailedRetries:     stats.FailedRetries,
		AverageAttempts:   stats.AverageAttempts,
		LastAttempt:       stats.LastAttempt,
		LastSuccess:       stats.LastSuccess,
		LastFailure:       stats.LastFailure,
	}
}

// ResetStats resets statistics for all operation types
func (rh *RetryHandler) ResetStats() {
	rh.mu.Lock()
	defer rh.mu.Unlock()

	rh.stats = make(map[string]*RetryStats)
	rh.logger.Info("Reset retry statistics")
}

// Enable enables the retry handler
func (rh *RetryHandler) Enable() {
	rh.mu.Lock()
	defer rh.mu.Unlock()
	rh.enabled = true
	rh.logger.Info("Retry handler enabled")
}

// Disable disables the retry handler
func (rh *RetryHandler) Disable() {
	rh.mu.Lock()
	defer rh.mu.Unlock()
	rh.enabled = false
	rh.logger.Info("Retry handler disabled")
}

// IsEnabled returns whether the retry handler is enabled
func (rh *RetryHandler) IsEnabled() bool {
	rh.mu.RLock()
	defer rh.mu.RUnlock()
	return rh.enabled
}

// GetHealthSummary returns a health summary based on retry statistics
func (rh *RetryHandler) GetHealthSummary() map[string]interface{} {
	stats := rh.GetStats()

	summary := map[string]interface{}{
		"enabled":              rh.enabled,
		"total_operations":     len(stats),
		"healthy_operations":   0,
		"unhealthy_operations": 0,
		"operation_details":    make(map[string]interface{}),
	}

	for opType, opStats := range stats {
		total := opStats.SuccessfulRetries + opStats.FailedRetries
		successRate := 0.0
		if total > 0 {
			successRate = float64(opStats.SuccessfulRetries) / float64(total)
		}

		isHealthy := successRate >= 0.9 && opStats.AverageAttempts <= 2.0

		if isHealthy {
			summary["healthy_operations"] = summary["healthy_operations"].(int) + 1
		} else {
			summary["unhealthy_operations"] = summary["unhealthy_operations"].(int) + 1
		}

		summary["operation_details"].(map[string]interface{})[opType] = map[string]interface{}{
			"success_rate":     successRate,
			"average_attempts": opStats.AverageAttempts,
			"total_operations": total,
			"is_healthy":       isHealthy,
			"last_success":     opStats.LastSuccess,
			"last_failure":     opStats.LastFailure,
		}
	}

	return summary
}
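A minimal caller sketch for ExecuteWithRetry; the `client.Ping` call inside the closure is hypothetical, everything else is the file's own API:

```go
// Sketch: wrapping a flaky RPC call. The closure receives a per-attempt
// context already bounded by the config's TimeoutPerAttempt.
result := rh.ExecuteWithRetry(ctx, "rpc_connection",
	func(ctx context.Context, attempt int) error {
		return client.Ping(ctx) // hypothetical; any non-nil error triggers backoff
	})
if !result.Success {
	log.Warn("giving up", "attempts", result.Attempts, "error", result.LastError)
}
```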
orig/internal/recovery/retry_handler_test.go · new file · 362 lines
@@ -0,0 +1,362 @@
package recovery

import (
	"context"
	"errors"
	"fmt"
	"testing"
	"time"

	"github.com/stretchr/testify/assert"
	"github.com/stretchr/testify/require"

	"github.com/fraktal/mev-beta/internal/logger"
)

func TestRetryHandler_ExecuteWithRetry_Success(t *testing.T) {
	log := logger.New("debug", "text", "")
	handler := NewRetryHandler(log)

	attempts := 0
	operation := func(ctx context.Context, attempt int) error {
		attempts++
		if attempts == 2 {
			return nil // Success on second attempt
		}
		return errors.New("temporary failure")
	}

	result := handler.ExecuteWithRetry(context.Background(), "test_operation", operation)

	assert.True(t, result.Success)
	assert.Equal(t, 2, result.Attempts)
	assert.Nil(t, result.LastError)
	assert.Equal(t, 2, attempts)
}

func TestRetryHandler_ExecuteWithRetry_MaxAttemptsReached(t *testing.T) {
	log := logger.New("debug", "text", "")
	handler := NewRetryHandler(log)

	attempts := 0
	operation := func(ctx context.Context, attempt int) error {
		attempts++
		return errors.New("persistent failure")
	}

	result := handler.ExecuteWithRetry(context.Background(), "test_operation", operation)

	assert.False(t, result.Success)
	assert.Equal(t, 3, result.Attempts) // Default max attempts
	assert.NotNil(t, result.LastError)
	assert.Equal(t, "persistent failure", result.LastError.Error())
	assert.Equal(t, 3, attempts)
}

func TestRetryHandler_ExecuteWithRetry_ContextCanceled(t *testing.T) {
	log := logger.New("debug", "text", "")
	handler := NewRetryHandler(log)

	ctx, cancel := context.WithCancel(context.Background())

	attempts := 0
	operation := func(ctx context.Context, attempt int) error {
		attempts++
		if attempts == 2 {
			cancel() // Cancel context on second attempt
		}
		return errors.New("failure")
	}

	result := handler.ExecuteWithRetry(ctx, "test_operation", operation)

	assert.False(t, result.Success)
	assert.LessOrEqual(t, result.Attempts, 3)
	assert.NotNil(t, result.LastError)
}

func TestRetryHandler_ExecuteWithRetry_CustomConfig(t *testing.T) {
	log := logger.New("debug", "text", "")
	handler := NewRetryHandler(log)

	// Set a custom configuration
	customConfig := RetryConfig{
		MaxAttempts:       5,
		InitialDelay:      10 * time.Millisecond,
		MaxDelay:          100 * time.Millisecond,
		BackoffFactor:     2.0,
		JitterEnabled:     false,
		TimeoutPerAttempt: 1 * time.Second,
	}
	handler.SetConfig("custom_operation", customConfig)

	attempts := 0
	operation := func(ctx context.Context, attempt int) error {
		attempts++
		return errors.New("persistent failure")
	}

	start := time.Now()
	result := handler.ExecuteWithRetry(context.Background(), "custom_operation", operation)
	duration := time.Since(start)

	assert.False(t, result.Success)
	assert.Equal(t, 5, result.Attempts) // Custom max attempts
	assert.Equal(t, 5, attempts)

	// Should have taken some time due to the delays (at least 150ms total)
	expectedMinDuration := 10*time.Millisecond + 20*time.Millisecond + 40*time.Millisecond + 80*time.Millisecond
	assert.GreaterOrEqual(t, duration, expectedMinDuration)
}

func TestRetryHandler_ExecuteWithRetry_Disabled(t *testing.T) {
	log := logger.New("debug", "text", "")
	handler := NewRetryHandler(log)
	handler.Disable()

	attempts := 0
	operation := func(ctx context.Context, attempt int) error {
		attempts++
		return errors.New("failure")
	}

	result := handler.ExecuteWithRetry(context.Background(), "test_operation", operation)

	assert.False(t, result.Success)
	assert.Equal(t, 1, result.Attempts) // Only one attempt when disabled
	assert.Equal(t, 1, attempts)
}

func TestRetryHandler_CalculateDelay(t *testing.T) {
	log := logger.New("debug", "text", "")
	handler := NewRetryHandler(log)

	config := RetryConfig{
		InitialDelay:  100 * time.Millisecond,
		MaxDelay:      1 * time.Second,
		BackoffFactor: 2.0,
		JitterEnabled: false,
	}

	tests := []struct {
		attempt     int
		expectedMin time.Duration
		expectedMax time.Duration
	}{
		{1, 100 * time.Millisecond, 100 * time.Millisecond},
		{2, 200 * time.Millisecond, 200 * time.Millisecond},
		{3, 400 * time.Millisecond, 400 * time.Millisecond},
		{4, 800 * time.Millisecond, 800 * time.Millisecond},
		{5, 1 * time.Second, 1 * time.Second}, // Should be capped at MaxDelay
	}

	for _, tt := range tests {
		t.Run(fmt.Sprintf("attempt_%d", tt.attempt), func(t *testing.T) {
			delay := handler.calculateDelay(config, tt.attempt)
			assert.GreaterOrEqual(t, delay, tt.expectedMin)
			assert.LessOrEqual(t, delay, tt.expectedMax)
		})
	}
}

func TestRetryHandler_CalculateDelay_WithJitter(t *testing.T) {
	log := logger.New("debug", "text", "")
	handler := NewRetryHandler(log)

	config := RetryConfig{
		InitialDelay:  100 * time.Millisecond,
		MaxDelay:      1 * time.Second,
		BackoffFactor: 2.0,
		JitterEnabled: true,
	}

	// Test jitter variation
	delays := make([]time.Duration, 10)
	for i := 0; i < 10; i++ {
		delays[i] = handler.calculateDelay(config, 2) // 200ms base
	}

	// Should have some variation due to jitter
	allSame := true
	for i := 1; i < len(delays); i++ {
		if delays[i] != delays[0] {
			allSame = false
			break
		}
	}
	assert.False(t, allSame, "Jitter should cause variation in delays")

	// All delays should be reasonable (within 10% of base)
	baseDelay := 200 * time.Millisecond
	for _, delay := range delays {
		assert.GreaterOrEqual(t, delay, baseDelay*9/10) // 10% below
		assert.LessOrEqual(t, delay, baseDelay*11/10)   // 10% above
	}
}

func TestRetryHandler_GetStats(t *testing.T) {
	log := logger.New("debug", "text", "")
	handler := NewRetryHandler(log)

	// Execute some operations
	successOp := func(ctx context.Context, attempt int) error {
		return nil
	}
	failOp := func(ctx context.Context, attempt int) error {
		return errors.New("failure")
	}

	handler.ExecuteWithRetry(context.Background(), "test_success", successOp)
	handler.ExecuteWithRetry(context.Background(), "test_success", successOp)
	handler.ExecuteWithRetry(context.Background(), "test_fail", failOp)

	stats := handler.GetStats()

	// Check success stats
	successStats := stats["test_success"]
	require.NotNil(t, successStats)
	assert.Equal(t, 2, successStats.TotalAttempts)
	assert.Equal(t, 2, successStats.SuccessfulRetries)
	assert.Equal(t, 0, successStats.FailedRetries)

	// Check failure stats
	failStats := stats["test_fail"]
	require.NotNil(t, failStats)
	assert.Equal(t, 3, failStats.TotalAttempts) // Default max attempts
	assert.Equal(t, 0, failStats.SuccessfulRetries)
	assert.Equal(t, 1, failStats.FailedRetries)
}

func TestRetryHandler_GetHealthSummary(t *testing.T) {
	log := logger.New("debug", "text", "")
	handler := NewRetryHandler(log)

	// Execute some operations to generate stats
	successOp := func(ctx context.Context, attempt int) error {
		return nil
	}
	partialFailOp := func(ctx context.Context, attempt int) error {
		if attempt < 2 {
			return errors.New("temporary failure")
		}
		return nil
	}

	// 2 immediate successes
	handler.ExecuteWithRetry(context.Background(), "immediate_success", successOp)
	handler.ExecuteWithRetry(context.Background(), "immediate_success", successOp)

	// 1 success after a retry
	handler.ExecuteWithRetry(context.Background(), "retry_success", partialFailOp)

	summary := handler.GetHealthSummary()

	assert.True(t, summary["enabled"].(bool))
	assert.Equal(t, 2, summary["total_operations"].(int))
	assert.Equal(t, 2, summary["healthy_operations"].(int))
	assert.Equal(t, 0, summary["unhealthy_operations"].(int))

	// Check operation details
	details := summary["operation_details"].(map[string]interface{})

	immediateDetails := details["immediate_success"].(map[string]interface{})
	assert.Equal(t, 1.0, immediateDetails["success_rate"].(float64))
	assert.Equal(t, 1.0, immediateDetails["average_attempts"].(float64))
	assert.True(t, immediateDetails["is_healthy"].(bool))

	retryDetails := details["retry_success"].(map[string]interface{})
	assert.Equal(t, 1.0, retryDetails["success_rate"].(float64))
	assert.Equal(t, 2.0, retryDetails["average_attempts"].(float64))
	assert.True(t, retryDetails["is_healthy"].(bool)) // Still healthy despite retries
}

func TestRetryHandler_ConcurrentExecution(t *testing.T) {
	log := logger.New("debug", "text", "")
	handler := NewRetryHandler(log)

	const numGoroutines = 50
	const operationsPerGoroutine = 20

	done := make(chan bool, numGoroutines)
	successCount := make(chan int, numGoroutines)

	operation := func(ctx context.Context, attempt int) error {
		// Roughly 80% of first attempts succeed
		if attempt <= 1 && time.Now().UnixNano()%5 != 0 {
			return nil
		}
		if attempt == 2 {
			return nil // Always succeed on the second attempt
		}
		return errors.New("failure")
	}

	// Launch concurrent retry operations
	for i := 0; i < numGoroutines; i++ {
		go func(id int) {
			defer func() { done <- true }()

			successes := 0
			for j := 0; j < operationsPerGoroutine; j++ {
				result := handler.ExecuteWithRetry(context.Background(),
					fmt.Sprintf("concurrent_op_%d", id), operation)
				if result.Success {
					successes++
				}
			}
			successCount <- successes
		}(i)
	}

	// Collect results
	totalSuccesses := 0
	for i := 0; i < numGoroutines; i++ {
		select {
		case <-done:
			totalSuccesses += <-successCount
		case <-time.After(30 * time.Second):
			t.Fatal("Concurrent retry test timed out")
		}
	}

	totalOperations := numGoroutines * operationsPerGoroutine
	successRate := float64(totalSuccesses) / float64(totalOperations)

	t.Logf("Concurrent execution: %d/%d operations succeeded (%.2f%%)",
		totalSuccesses, totalOperations, successRate*100)

	// Should have a high success rate due to retries
	assert.GreaterOrEqual(t, successRate, 0.8, "Success rate should be at least 80%")

	// Verify stats are consistent
	stats := handler.GetStats()
	assert.NotEmpty(t, stats, "Should have recorded stats")
}

func TestRetryHandler_EdgeCases(t *testing.T) {
	log := logger.New("debug", "text", "")
	handler := NewRetryHandler(log)

	t.Run("nil operation", func(t *testing.T) {
		assert.Panics(t, func() {
			handler.ExecuteWithRetry(context.Background(), "nil_op", nil)
		})
	})

	t.Run("empty operation type", func(t *testing.T) {
		operation := func(ctx context.Context, attempt int) error {
			return nil
		}
		result := handler.ExecuteWithRetry(context.Background(), "", operation)
		assert.True(t, result.Success)
	})

	t.Run("very long operation type", func(t *testing.T) {
		longName := string(make([]byte, 1000))
		operation := func(ctx context.Context, attempt int) error {
			return nil
		}
		result := handler.ExecuteWithRetry(context.Background(), longName, operation)
		assert.True(t, result.Success)
	})
}