package recovery

import (
	"context"
	"math"
	"math/rand"
	"sync"
	"time"

	"github.com/fraktal/mev-beta/internal/logger"
)

// RetryConfig defines retry behavior configuration
type RetryConfig struct {
	MaxAttempts       int
	InitialDelay      time.Duration
	MaxDelay          time.Duration
	BackoffFactor     float64
	JitterEnabled     bool
	TimeoutPerAttempt time.Duration
}

// DefaultRetryConfig returns a sensible default retry configuration
func DefaultRetryConfig() RetryConfig {
	return RetryConfig{
		MaxAttempts:       3,
		InitialDelay:      1 * time.Second,
		MaxDelay:          30 * time.Second,
		BackoffFactor:     2.0,
		JitterEnabled:     true,
		TimeoutPerAttempt: 10 * time.Second,
	}
}

// RetryableOperation represents an operation that can be retried
type RetryableOperation func(ctx context.Context, attempt int) error

// RetryHandler provides exponential backoff retry capabilities
type RetryHandler struct {
	mu      sync.RWMutex
	logger  *logger.Logger
	configs map[string]RetryConfig
	stats   map[string]*RetryStats
	enabled bool
}

// RetryStats tracks retry statistics for operations
type RetryStats struct {
	mu                sync.RWMutex
	OperationType     string
	TotalAttempts     int
	SuccessfulRetries int
	FailedRetries     int
	AverageAttempts   float64
	LastAttempt       time.Time
	LastSuccess       time.Time
	LastFailure       time.Time
}

// RetryResult contains the result of a retry operation
type RetryResult struct {
	Success       bool
	Attempts      int
	TotalDuration time.Duration
	LastError     error
	LastAttemptAt time.Time
}

// NewRetryHandler creates a new retry handler
func NewRetryHandler(logger *logger.Logger) *RetryHandler {
	handler := &RetryHandler{
		logger:  logger,
		configs: make(map[string]RetryConfig),
		stats:   make(map[string]*RetryStats),
		enabled: true,
	}

	// Initialize default configurations for common operations
	handler.initializeDefaultConfigs()

	return handler
}

// initializeDefaultConfigs sets up default retry configurations
func (rh *RetryHandler) initializeDefaultConfigs() {
	// Contract call retries - moderate backoff
	rh.configs["contract_call"] = RetryConfig{
		MaxAttempts:       3,
		InitialDelay:      500 * time.Millisecond,
		MaxDelay:          5 * time.Second,
		BackoffFactor:     2.0,
		JitterEnabled:     true,
		TimeoutPerAttempt: 10 * time.Second,
	}

	// RPC connection retries - aggressive backoff
	rh.configs["rpc_connection"] = RetryConfig{
		MaxAttempts:       5,
		InitialDelay:      1 * time.Second,
		MaxDelay:          30 * time.Second,
		BackoffFactor:     2.5,
		JitterEnabled:     true,
		TimeoutPerAttempt: 15 * time.Second,
	}

	// Data parsing retries - quick retries
	rh.configs["data_parsing"] = RetryConfig{
		MaxAttempts:       2,
		InitialDelay:      100 * time.Millisecond,
		MaxDelay:          1 * time.Second,
		BackoffFactor:     2.0,
		JitterEnabled:     false,
		TimeoutPerAttempt: 5 * time.Second,
	}

	// Block processing retries - conservative
	rh.configs["block_processing"] = RetryConfig{
		MaxAttempts:       3,
		InitialDelay:      2 * time.Second,
		MaxDelay:          10 * time.Second,
		BackoffFactor:     2.0,
		JitterEnabled:     true,
		TimeoutPerAttempt: 30 * time.Second,
	}

	// Token metadata retries - patient backoff
	rh.configs["token_metadata"] = RetryConfig{
		MaxAttempts:       4,
		InitialDelay:      1 * time.Second,
		MaxDelay:          20 * time.Second,
		BackoffFactor:     2.0,
		JitterEnabled:     true,
		TimeoutPerAttempt: 15 * time.Second,
	}
}
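// retryUsageSketch is an illustrative usage sketch, not part of the original
// file: it shows how a caller might push a flaky RPC ping through the handler
// under the "rpc_connection" defaults registered above. The ping closure is a
// hypothetical stand-in for real work; any error it returns triggers another
// backed-off attempt.
func retryUsageSketch(ctx context.Context, rh *RetryHandler, ping func(context.Context) error) error {
	result := rh.ExecuteWithRetry(ctx, "rpc_connection", func(ctx context.Context, attempt int) error {
		// attempt is 1-based; a caller could, e.g., switch endpoints on later attempts.
		return ping(ctx)
	})
	if !result.Success {
		return result.LastError
	}
	return nil
}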
// ExecuteWithRetry executes an operation with retry logic
func (rh *RetryHandler) ExecuteWithRetry(ctx context.Context, operationType string, operation RetryableOperation) *RetryResult {
	if !rh.IsEnabled() {
		// If retries are disabled, try once
		start := time.Now()
		err := operation(ctx, 1)
		return &RetryResult{
			Success:       err == nil,
			Attempts:      1,
			TotalDuration: time.Since(start),
			LastError:     err,
			LastAttemptAt: time.Now(),
		}
	}

	config := rh.getConfig(operationType)
	start := time.Now()

	var lastError error
	attemptsMade := 0

	rh.mu.Lock()
	stats, exists := rh.stats[operationType]
	if !exists {
		stats = &RetryStats{
			OperationType: operationType,
		}
		rh.stats[operationType] = stats
	}
	rh.mu.Unlock()

retryLoop:
	for attempt := 1; attempt <= config.MaxAttempts; attempt++ {
		// Create context with timeout for this attempt
		attemptCtx, cancel := context.WithTimeout(ctx, config.TimeoutPerAttempt)

		rh.logger.Debug("Attempting operation with retry",
			"operation", operationType,
			"attempt", attempt,
			"max_attempts", config.MaxAttempts)

		// Execute the operation
		err := operation(attemptCtx, attempt)
		cancel()
		attemptsMade = attempt

		// Update statistics
		stats.mu.Lock()
		stats.TotalAttempts++
		stats.LastAttempt = time.Now()
		stats.mu.Unlock()

		if err == nil {
			// Success!
			duration := time.Since(start)

			stats.mu.Lock()
			stats.SuccessfulRetries++
			stats.LastSuccess = time.Now()
			denominator := stats.SuccessfulRetries + stats.FailedRetries
			if denominator > 0 {
				stats.AverageAttempts = float64(stats.TotalAttempts) / float64(denominator)
			}
			stats.mu.Unlock()

			rh.logger.Debug("Operation succeeded",
				"operation", operationType,
				"attempt", attempt,
				"duration", duration)

			return &RetryResult{
				Success:       true,
				Attempts:      attempt,
				TotalDuration: duration,
				LastError:     nil,
				LastAttemptAt: time.Now(),
			}
		}

		lastError = err

		// Check if the parent context was cancelled
		if ctx.Err() != nil {
			rh.logger.Debug("Operation cancelled by context",
				"operation", operationType,
				"attempt", attempt,
				"error", ctx.Err())
			break
		}

		// Don't wait after the last attempt
		if attempt < config.MaxAttempts {
			delay := rh.calculateDelay(config, attempt)

			rh.logger.Debug("Operation failed, retrying",
				"operation", operationType,
				"attempt", attempt,
				"error", err,
				"delay", delay)

			// Wait before the next attempt. A bare "break" here would only
			// exit the select, so a labeled break is needed to leave the
			// retry loop when the context is cancelled mid-wait.
			select {
			case <-time.After(delay):
				// Continue to the next attempt
			case <-ctx.Done():
				break retryLoop
			}
		} else {
			rh.logger.Warn("Operation failed after all retries",
				"operation", operationType,
				"attempts", attempt,
				"error", err)
		}
	}

	// All attempts failed
	duration := time.Since(start)

	stats.mu.Lock()
	stats.FailedRetries++
	stats.LastFailure = time.Now()
	denominator := stats.SuccessfulRetries + stats.FailedRetries
	if denominator > 0 {
		stats.AverageAttempts = float64(stats.TotalAttempts) / float64(denominator)
	}
	stats.mu.Unlock()

	return &RetryResult{
		Success:       false,
		Attempts:      attemptsMade,
		TotalDuration: duration,
		LastError:     lastError,
		LastAttemptAt: time.Now(),
	}
}

// calculateDelay calculates the delay before the next retry attempt
func (rh *RetryHandler) calculateDelay(config RetryConfig, attempt int) time.Duration {
	// Calculate exponential backoff: InitialDelay * BackoffFactor^(attempt-1)
	delay := float64(config.InitialDelay) * math.Pow(config.BackoffFactor, float64(attempt-1))

	// Apply maximum delay cap
	if delay > float64(config.MaxDelay) {
		delay = float64(config.MaxDelay)
	}

	duration := time.Duration(delay)

	// Add up to ±10% jitter if enabled
	if config.JitterEnabled {
		jitter := time.Duration(float64(duration) * 0.1 * (2*rh.randomFloat() - 1))
		duration += jitter
	}

	// Ensure a sane minimum delay
	if duration < 0 {
		duration = config.InitialDelay
	}

	return duration
}

// randomFloat returns a pseudo-random float in [0, 1). The math/rand global
// source is used rather than deriving a value from the wall clock, which
// produced a poorly distributed value; the global source is safe for
// concurrent use.
func (rh *RetryHandler) randomFloat() float64 {
	return rand.Float64()
}
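// backoffScheduleSketch is an illustrative helper (a sketch, not part of the
// original file): it returns the nominal, jitter-free delays that
// calculateDelay above would produce between attempts of a given config. For
// example, the "rpc_connection" defaults (1s initial, 2.5x factor, 30s cap,
// 5 attempts) yield 1s, 2.5s, 6.25s, 15.625s.
func backoffScheduleSketch(config RetryConfig) []time.Duration {
	delays := make([]time.Duration, 0, config.MaxAttempts-1)
	for attempt := 1; attempt < config.MaxAttempts; attempt++ {
		// Mirror the exponential formula used by calculateDelay.
		d := float64(config.InitialDelay) * math.Pow(config.BackoffFactor, float64(attempt-1))
		if d > float64(config.MaxDelay) {
			d = float64(config.MaxDelay)
		}
		delays = append(delays, time.Duration(d))
	}
	return delays
}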
// getConfig returns the retry configuration for an operation type
func (rh *RetryHandler) getConfig(operationType string) RetryConfig {
	rh.mu.RLock()
	defer rh.mu.RUnlock()

	if config, exists := rh.configs[operationType]; exists {
		return config
	}

	// Return the default config if no specific config was found
	return DefaultRetryConfig()
}

// SetConfig sets a custom retry configuration for an operation type
func (rh *RetryHandler) SetConfig(operationType string, config RetryConfig) {
	rh.mu.Lock()
	defer rh.mu.Unlock()

	rh.configs[operationType] = config
	rh.logger.Debug("Set retry config",
		"operation", operationType,
		"max_attempts", config.MaxAttempts,
		"initial_delay", config.InitialDelay,
		"max_delay", config.MaxDelay)
}

// GetStats returns retry statistics for all operation types
func (rh *RetryHandler) GetStats() map[string]*RetryStats {
	rh.mu.RLock()
	defer rh.mu.RUnlock()

	// Return a copy to prevent external modification
	result := make(map[string]*RetryStats)
	for opType, stats := range rh.stats {
		stats.mu.RLock()
		result[opType] = &RetryStats{
			OperationType:     stats.OperationType,
			TotalAttempts:     stats.TotalAttempts,
			SuccessfulRetries: stats.SuccessfulRetries,
			FailedRetries:     stats.FailedRetries,
			AverageAttempts:   stats.AverageAttempts,
			LastAttempt:       stats.LastAttempt,
			LastSuccess:       stats.LastSuccess,
			LastFailure:       stats.LastFailure,
		}
		stats.mu.RUnlock()
	}

	return result
}

// GetOperationStats returns statistics for a specific operation type
func (rh *RetryHandler) GetOperationStats(operationType string) *RetryStats {
	rh.mu.RLock()
	defer rh.mu.RUnlock()

	stats, exists := rh.stats[operationType]
	if !exists {
		return nil
	}

	stats.mu.RLock()
	defer stats.mu.RUnlock()

	return &RetryStats{
		OperationType:     stats.OperationType,
		TotalAttempts:     stats.TotalAttempts,
		SuccessfulRetries: stats.SuccessfulRetries,
		FailedRetries:     stats.FailedRetries,
		AverageAttempts:   stats.AverageAttempts,
		LastAttempt:       stats.LastAttempt,
		LastSuccess:       stats.LastSuccess,
		LastFailure:       stats.LastFailure,
	}
}

// ResetStats resets statistics for all operation types
func (rh *RetryHandler) ResetStats() {
	rh.mu.Lock()
	defer rh.mu.Unlock()

	rh.stats = make(map[string]*RetryStats)
	rh.logger.Info("Reset retry statistics")
}

// Enable enables the retry handler
func (rh *RetryHandler) Enable() {
	rh.mu.Lock()
	defer rh.mu.Unlock()

	rh.enabled = true
	rh.logger.Info("Retry handler enabled")
}

// Disable disables the retry handler
func (rh *RetryHandler) Disable() {
	rh.mu.Lock()
	defer rh.mu.Unlock()

	rh.enabled = false
	rh.logger.Info("Retry handler disabled")
}

// IsEnabled returns whether the retry handler is enabled
func (rh *RetryHandler) IsEnabled() bool {
	rh.mu.RLock()
	defer rh.mu.RUnlock()

	return rh.enabled
}

// GetHealthSummary returns a health summary based on retry statistics
func (rh *RetryHandler) GetHealthSummary() map[string]interface{} {
	stats := rh.GetStats()

	summary := map[string]interface{}{
		"enabled":              rh.IsEnabled(),
		"total_operations":     len(stats),
		"healthy_operations":   0,
		"unhealthy_operations": 0,
		"operation_details":    make(map[string]interface{}),
	}

	for opType, opStats := range stats {
		total := opStats.SuccessfulRetries + opStats.FailedRetries
		successRate := 0.0
		if total > 0 {
			successRate = float64(opStats.SuccessfulRetries) / float64(total)
		}

		// An operation is considered healthy if at least 90% of runs succeed
		// and it needs no more than two attempts on average.
		isHealthy := successRate >= 0.9 && opStats.AverageAttempts <= 2.0
		if isHealthy {
			summary["healthy_operations"] = summary["healthy_operations"].(int) + 1
		} else {
			summary["unhealthy_operations"] = summary["unhealthy_operations"].(int) + 1
		}

		summary["operation_details"].(map[string]interface{})[opType] = map[string]interface{}{
			"success_rate":     successRate,
			"average_attempts": opStats.AverageAttempts,
			"total_operations": total,
			"is_healthy":       isHealthy,
			"last_success":     opStats.LastSuccess,
			"last_failure":     opStats.LastFailure,
		}
	}

	return summary
}
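// exampleCustomConfig is a usage sketch, not part of the original file: it
// shows how a caller might register a tighter policy for a hypothetical
// "price_feed" operation via SetConfig, run work under it, and then read
// back the health summary. The no-op operation body is a placeholder.
func exampleCustomConfig(ctx context.Context, rh *RetryHandler) (map[string]interface{}, error) {
	rh.SetConfig("price_feed", RetryConfig{
		MaxAttempts:       2,
		InitialDelay:      250 * time.Millisecond,
		MaxDelay:          1 * time.Second,
		BackoffFactor:     2.0,
		JitterEnabled:     true,
		TimeoutPerAttempt: 3 * time.Second,
	})

	result := rh.ExecuteWithRetry(ctx, "price_feed", func(ctx context.Context, attempt int) error {
		// A real caller would query its data source here.
		return nil
	})
	if !result.Success {
		return nil, result.LastError
	}

	return rh.GetHealthSummary(), nil
}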