feat: create v2-prep branch with comprehensive planning
Restructured the project for the V2 refactor.

**Structure Changes:**
- Moved all V1 code to orig/ folder (preserved with git mv)
- Created docs/planning/ directory
- Added orig/README_V1.md explaining V1 preservation

**Planning Documents:**
- 00_V2_MASTER_PLAN.md: Complete architecture overview
  - Executive summary of critical V1 issues
  - High-level component architecture diagrams
  - 5-phase implementation roadmap
  - Success metrics and risk mitigation
- 07_TASK_BREAKDOWN.md: Atomic task breakdown
  - 99+ hours of detailed tasks
  - Every task < 2 hours (atomic)
  - Clear dependencies and success criteria
  - Organized by implementation phase

**V2 Key Improvements:**
- Per-exchange parsers (factory pattern; see the illustrative sketch after this message)
- Multi-layer strict validation
- Multi-index pool cache
- Background validation pipeline
- Comprehensive observability

**Critical Issues Addressed:**
- Zero-address tokens (strict validation + cache enrichment)
- Parsing accuracy (protocol-specific parsers)
- No audit trail (background validation channel)
- Inefficient lookups (multi-index cache)
- Stats disconnection (event-driven metrics)

Next Steps:
1. Review planning documents
2. Begin Phase 1: Foundation (P1-001 through P1-010)
3. Implement parsers in Phase 2
4. Build cache system in Phase 3
5. Add validation pipeline in Phase 4
6. Migrate and test in Phase 5

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
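To make the first improvement concrete, here is a minimal sketch of the planned per-exchange parser factory. Every name in it (`SwapEvent`, `Parser`, `NewParser`, the exchange identifiers) is illustrative, not the final V2 API.

```go
package parser

import (
	"errors"
	"strings"
)

// SwapEvent is a hypothetical normalized swap record.
type SwapEvent struct {
	Pool, TokenIn, TokenOut string
}

// Parser is the per-exchange parsing contract.
type Parser interface {
	Exchange() string
	ParseLog(data []byte) (*SwapEvent, error)
}

// NewParser returns the protocol-specific parser for an exchange name.
func NewParser(exchange string) (Parser, error) {
	switch strings.ToLower(exchange) {
	case "uniswap_v2":
		return &uniswapV2Parser{}, nil
	case "uniswap_v3":
		return &uniswapV3Parser{}, nil
	default:
		return nil, errors.New("unsupported exchange: " + exchange)
	}
}

type uniswapV2Parser struct{}

func (p *uniswapV2Parser) Exchange() string { return "uniswap_v2" }
func (p *uniswapV2Parser) ParseLog(data []byte) (*SwapEvent, error) {
	// Protocol-specific decoding would live here in the real V2 parser.
	return nil, errors.New("decoding not implemented in this sketch")
}

type uniswapV3Parser struct{}

func (p *uniswapV3Parser) Exchange() string { return "uniswap_v3" }
func (p *uniswapV3Parser) ParseLog(data []byte) (*SwapEvent, error) {
	return nil, errors.New("decoding not implemented in this sketch")
}
```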
orig/internal/recovery/retry_handler.go (new file, 446 lines)
@@ -0,0 +1,446 @@
package recovery

import (
	"context"
	"math"
	"math/rand"
	"sync"
	"time"

	"github.com/fraktal/mev-beta/internal/logger"
)

// RetryConfig defines retry behavior configuration
type RetryConfig struct {
	MaxAttempts       int
	InitialDelay      time.Duration
	MaxDelay          time.Duration
	BackoffFactor     float64
	JitterEnabled     bool
	TimeoutPerAttempt time.Duration
}

// DefaultRetryConfig returns a sensible default retry configuration
func DefaultRetryConfig() RetryConfig {
	return RetryConfig{
		MaxAttempts:       3,
		InitialDelay:      1 * time.Second,
		MaxDelay:          30 * time.Second,
		BackoffFactor:     2.0,
		JitterEnabled:     true,
		TimeoutPerAttempt: 10 * time.Second,
	}
}

// RetryableOperation represents an operation that can be retried
type RetryableOperation func(ctx context.Context, attempt int) error

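// A RetryableOperation is typically a closure that wraps an external call and
// respects the per-attempt context. Hypothetical example (fetchPoolMetadata
// and poolAddr are placeholders, not symbols defined in this package):
//
//	op := func(ctx context.Context, attempt int) error {
//		_, err := fetchPoolMetadata(ctx, poolAddr)
//		return err
//	}
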
// RetryHandler provides exponential backoff retry capabilities
type RetryHandler struct {
	mu      sync.RWMutex
	logger  *logger.Logger
	configs map[string]RetryConfig
	stats   map[string]*RetryStats
	enabled bool
}

// RetryStats tracks retry statistics for operations
type RetryStats struct {
	mu                sync.RWMutex
	OperationType     string
	TotalAttempts     int
	SuccessfulRetries int
	FailedRetries     int
	AverageAttempts   float64
	LastAttempt       time.Time
	LastSuccess       time.Time
	LastFailure       time.Time
}

// RetryResult contains the result of a retry operation
type RetryResult struct {
	Success       bool
	Attempts      int
	TotalDuration time.Duration
	LastError     error
	LastAttemptAt time.Time
}

// NewRetryHandler creates a new retry handler
func NewRetryHandler(logger *logger.Logger) *RetryHandler {
	handler := &RetryHandler{
		logger:  logger,
		configs: make(map[string]RetryConfig),
		stats:   make(map[string]*RetryStats),
		enabled: true,
	}

	// Initialize default configurations for common operations
	handler.initializeDefaultConfigs()

	return handler
}

// initializeDefaultConfigs sets up default retry configurations
func (rh *RetryHandler) initializeDefaultConfigs() {
	// Contract call retries - moderate backoff
	rh.configs["contract_call"] = RetryConfig{
		MaxAttempts:       3,
		InitialDelay:      500 * time.Millisecond,
		MaxDelay:          5 * time.Second,
		BackoffFactor:     2.0,
		JitterEnabled:     true,
		TimeoutPerAttempt: 10 * time.Second,
	}

	// RPC connection retries - aggressive backoff
	rh.configs["rpc_connection"] = RetryConfig{
		MaxAttempts:       5,
		InitialDelay:      1 * time.Second,
		MaxDelay:          30 * time.Second,
		BackoffFactor:     2.5,
		JitterEnabled:     true,
		TimeoutPerAttempt: 15 * time.Second,
	}

	// Data parsing retries - quick retries
	rh.configs["data_parsing"] = RetryConfig{
		MaxAttempts:       2,
		InitialDelay:      100 * time.Millisecond,
		MaxDelay:          1 * time.Second,
		BackoffFactor:     2.0,
		JitterEnabled:     false,
		TimeoutPerAttempt: 5 * time.Second,
	}

	// Block processing retries - conservative
	rh.configs["block_processing"] = RetryConfig{
		MaxAttempts:       3,
		InitialDelay:      2 * time.Second,
		MaxDelay:          10 * time.Second,
		BackoffFactor:     2.0,
		JitterEnabled:     true,
		TimeoutPerAttempt: 30 * time.Second,
	}

	// Token metadata retries - patient backoff
	rh.configs["token_metadata"] = RetryConfig{
		MaxAttempts:       4,
		InitialDelay:      1 * time.Second,
		MaxDelay:          20 * time.Second,
		BackoffFactor:     2.0,
		JitterEnabled:     true,
		TimeoutPerAttempt: 15 * time.Second,
	}
}

// ExecuteWithRetry executes an operation with retry logic
func (rh *RetryHandler) ExecuteWithRetry(ctx context.Context, operationType string, operation RetryableOperation) *RetryResult {
	if !rh.IsEnabled() {
		// If retries are disabled, try once
		start := time.Now()
		err := operation(ctx, 1)
		return &RetryResult{
			Success:       err == nil,
			Attempts:      1,
			TotalDuration: time.Since(start),
			LastError:     err,
			LastAttemptAt: time.Now(),
		}
	}

	config := rh.getConfig(operationType)
	start := time.Now()
	var lastError error
	attempts := 0

	rh.mu.Lock()
	stats, exists := rh.stats[operationType]
	if !exists {
		stats = &RetryStats{
			OperationType: operationType,
		}
		rh.stats[operationType] = stats
	}
	rh.mu.Unlock()

retryLoop:
	for attempt := 1; attempt <= config.MaxAttempts; attempt++ {
		attempts = attempt

		// Create context with timeout for this attempt
		attemptCtx, cancel := context.WithTimeout(ctx, config.TimeoutPerAttempt)

		rh.logger.Debug("Attempting operation with retry",
			"operation", operationType,
			"attempt", attempt,
			"max_attempts", config.MaxAttempts)

		// Execute the operation
		err := operation(attemptCtx, attempt)
		cancel()

		// Update statistics
		stats.mu.Lock()
		stats.TotalAttempts++
		stats.LastAttempt = time.Now()
		stats.mu.Unlock()

		if err == nil {
			// Success!
			duration := time.Since(start)

			stats.mu.Lock()
			stats.SuccessfulRetries++
			stats.LastSuccess = time.Now()
			denominator := stats.SuccessfulRetries + stats.FailedRetries
			if denominator > 0 {
				stats.AverageAttempts = float64(stats.TotalAttempts) / float64(denominator)
			}
			stats.mu.Unlock()

			rh.logger.Debug("Operation succeeded",
				"operation", operationType,
				"attempt", attempt,
				"duration", duration)

			return &RetryResult{
				Success:       true,
				Attempts:      attempt,
				TotalDuration: duration,
				LastError:     nil,
				LastAttemptAt: time.Now(),
			}
		}

		lastError = err

		// Check if context was cancelled
		if ctx.Err() != nil {
			rh.logger.Debug("Operation cancelled by context",
				"operation", operationType,
				"attempt", attempt,
				"error", ctx.Err())
			break
		}

		// Don't wait after the last attempt
		if attempt < config.MaxAttempts {
			delay := rh.calculateDelay(config, attempt)

			rh.logger.Debug("Operation failed, retrying",
				"operation", operationType,
				"attempt", attempt,
				"error", err,
				"delay", delay)

			// Wait before next attempt
			select {
			case <-time.After(delay):
				// Continue to next attempt
			case <-ctx.Done():
				// Context cancelled during wait; stop retrying
				break retryLoop
			}
		} else {
			rh.logger.Warn("Operation failed after all retries",
				"operation", operationType,
				"attempts", attempt,
				"error", err)
		}
	}

	// All attempts failed
	duration := time.Since(start)

	stats.mu.Lock()
	stats.FailedRetries++
	stats.LastFailure = time.Now()
	denominator := stats.SuccessfulRetries + stats.FailedRetries
	if denominator > 0 {
		stats.AverageAttempts = float64(stats.TotalAttempts) / float64(denominator)
	}
	stats.mu.Unlock()

	return &RetryResult{
		Success:       false,
		Attempts:      attempts,
		TotalDuration: duration,
		LastError:     lastError,
		LastAttemptAt: time.Now(),
	}
}

// calculateDelay calculates the delay before the next retry attempt
func (rh *RetryHandler) calculateDelay(config RetryConfig, attempt int) time.Duration {
	// Calculate exponential backoff
	delay := float64(config.InitialDelay) * math.Pow(config.BackoffFactor, float64(attempt-1))

	// Apply maximum delay cap
	if delay > float64(config.MaxDelay) {
		delay = float64(config.MaxDelay)
	}

	duration := time.Duration(delay)

	// Add jitter if enabled
	if config.JitterEnabled {
		jitter := time.Duration(float64(duration) * 0.1 * (2*rh.randomFloat() - 1))
		duration += jitter
	}

	// Ensure minimum delay
	if duration < 0 {
		duration = config.InitialDelay
	}

	return duration
}

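// Worked example: with the "rpc_connection" defaults above (InitialDelay 1s,
// BackoffFactor 2.5, MaxDelay 30s, MaxAttempts 5), the four waits between
// attempts are 1s, 2.5s, 6.25s and 15.625s before jitter; a longer schedule
// would be capped at MaxDelay. With JitterEnabled, each wait then shifts by
// up to ±10%.
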
// randomFloat returns a pseudo-random float between 0 and 1
func (rh *RetryHandler) randomFloat() float64 {
	// math/rand is sufficient here; jitter does not need cryptographic randomness
	return rand.Float64()
}

// getConfig returns the retry configuration for an operation type
func (rh *RetryHandler) getConfig(operationType string) RetryConfig {
	rh.mu.RLock()
	defer rh.mu.RUnlock()

	if config, exists := rh.configs[operationType]; exists {
		return config
	}

	// Return default config if no specific config found
	return DefaultRetryConfig()
}

// SetConfig sets a custom retry configuration for an operation type
func (rh *RetryHandler) SetConfig(operationType string, config RetryConfig) {
	rh.mu.Lock()
	defer rh.mu.Unlock()

	rh.configs[operationType] = config
	rh.logger.Debug("Set retry config",
		"operation", operationType,
		"max_attempts", config.MaxAttempts,
		"initial_delay", config.InitialDelay,
		"max_delay", config.MaxDelay)
}

// GetStats returns retry statistics for all operation types
func (rh *RetryHandler) GetStats() map[string]*RetryStats {
	rh.mu.RLock()
	defer rh.mu.RUnlock()

	// Return a copy to prevent external modification
	result := make(map[string]*RetryStats)
	for opType, stats := range rh.stats {
		stats.mu.RLock()
		result[opType] = &RetryStats{
			OperationType:     stats.OperationType,
			TotalAttempts:     stats.TotalAttempts,
			SuccessfulRetries: stats.SuccessfulRetries,
			FailedRetries:     stats.FailedRetries,
			AverageAttempts:   stats.AverageAttempts,
			LastAttempt:       stats.LastAttempt,
			LastSuccess:       stats.LastSuccess,
			LastFailure:       stats.LastFailure,
		}
		stats.mu.RUnlock()
	}

	return result
}

// GetOperationStats returns statistics for a specific operation type
func (rh *RetryHandler) GetOperationStats(operationType string) *RetryStats {
	rh.mu.RLock()
	defer rh.mu.RUnlock()

	stats, exists := rh.stats[operationType]
	if !exists {
		return nil
	}

	stats.mu.RLock()
	defer stats.mu.RUnlock()

	return &RetryStats{
		OperationType:     stats.OperationType,
		TotalAttempts:     stats.TotalAttempts,
		SuccessfulRetries: stats.SuccessfulRetries,
		FailedRetries:     stats.FailedRetries,
		AverageAttempts:   stats.AverageAttempts,
		LastAttempt:       stats.LastAttempt,
		LastSuccess:       stats.LastSuccess,
		LastFailure:       stats.LastFailure,
	}
}

// ResetStats resets statistics for all operation types
func (rh *RetryHandler) ResetStats() {
	rh.mu.Lock()
	defer rh.mu.Unlock()

	rh.stats = make(map[string]*RetryStats)
	rh.logger.Info("Reset retry statistics")
}

// Enable enables the retry handler
func (rh *RetryHandler) Enable() {
	rh.mu.Lock()
	defer rh.mu.Unlock()
	rh.enabled = true
	rh.logger.Info("Retry handler enabled")
}

// Disable disables the retry handler
func (rh *RetryHandler) Disable() {
	rh.mu.Lock()
	defer rh.mu.Unlock()
	rh.enabled = false
	rh.logger.Info("Retry handler disabled")
}

// IsEnabled returns whether the retry handler is enabled
func (rh *RetryHandler) IsEnabled() bool {
	rh.mu.RLock()
	defer rh.mu.RUnlock()
	return rh.enabled
}

// GetHealthSummary returns a health summary based on retry statistics
func (rh *RetryHandler) GetHealthSummary() map[string]interface{} {
	stats := rh.GetStats()

	summary := map[string]interface{}{
		"enabled":              rh.IsEnabled(),
		"total_operations":     len(stats),
		"healthy_operations":   0,
		"unhealthy_operations": 0,
		"operation_details":    make(map[string]interface{}),
	}

	for opType, opStats := range stats {
		total := opStats.SuccessfulRetries + opStats.FailedRetries
		successRate := 0.0
		if total > 0 {
			successRate = float64(opStats.SuccessfulRetries) / float64(total)
		}

		isHealthy := successRate >= 0.9 && opStats.AverageAttempts <= 2.0

		if isHealthy {
			summary["healthy_operations"] = summary["healthy_operations"].(int) + 1
		} else {
			summary["unhealthy_operations"] = summary["unhealthy_operations"].(int) + 1
		}

		summary["operation_details"].(map[string]interface{})[opType] = map[string]interface{}{
			"success_rate":     successRate,
			"average_attempts": opStats.AverageAttempts,
			"total_operations": total,
			"is_healthy":       isHealthy,
			"last_success":     opStats.LastSuccess,
			"last_failure":     opStats.LastFailure,
		}
	}

	return summary
}
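A minimal usage sketch of the handler above, assuming the recovery package is importable at the same module path as its own logger import. `runWithRetries` and `rpcCall` are illustrative placeholders; only `NewRetryHandler`, `SetConfig`, `ExecuteWithRetry`, and `RetryConfig` come from this file.

```go
package usage

import (
	"context"
	"fmt"
	"time"

	"github.com/fraktal/mev-beta/internal/logger"
	"github.com/fraktal/mev-beta/internal/recovery"
)

// runWithRetries wraps a hypothetical RPC call with the retry handler.
func runWithRetries(ctx context.Context, log *logger.Logger) error {
	handler := recovery.NewRetryHandler(log)

	// Optionally tighten the built-in rpc_connection policy.
	handler.SetConfig("rpc_connection", recovery.RetryConfig{
		MaxAttempts:       4,
		InitialDelay:      250 * time.Millisecond,
		MaxDelay:          5 * time.Second,
		BackoffFactor:     2.0,
		JitterEnabled:     true,
		TimeoutPerAttempt: 8 * time.Second,
	})

	result := handler.ExecuteWithRetry(ctx, "rpc_connection", func(ctx context.Context, attempt int) error {
		// rpcCall is a placeholder for a real client invocation.
		return rpcCall(ctx, "eth_blockNumber")
	})
	if !result.Success {
		return fmt.Errorf("rpc failed after %d attempts: %w", result.Attempts, result.LastError)
	}
	return nil
}

// rpcCall stands in for an actual RPC client call.
func rpcCall(ctx context.Context, method string) error {
	_, _ = ctx, method
	return nil
}
```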