feat: create v2-prep branch with comprehensive planning
Restructured project for V2 refactor:

**Structure Changes:**
- Moved all V1 code to orig/ folder (preserved with git mv)
- Created docs/planning/ directory
- Added orig/README_V1.md explaining V1 preservation

**Planning Documents:**
- 00_V2_MASTER_PLAN.md: Complete architecture overview
  - Executive summary of critical V1 issues
  - High-level component architecture diagrams
  - 5-phase implementation roadmap
  - Success metrics and risk mitigation
- 07_TASK_BREAKDOWN.md: Atomic task breakdown
  - 99+ hours of detailed tasks
  - Every task < 2 hours (atomic)
  - Clear dependencies and success criteria
  - Organized by implementation phase

**V2 Key Improvements:**
- Per-exchange parsers (factory pattern; see the illustrative sketch after this message)
- Multi-layer strict validation
- Multi-index pool cache
- Background validation pipeline
- Comprehensive observability

**Critical Issues Addressed:**
- Zero address tokens (strict validation + cache enrichment)
- Parsing accuracy (protocol-specific parsers)
- No audit trail (background validation channel)
- Inefficient lookups (multi-index cache)
- Stats disconnection (event-driven metrics)

Next Steps:
1. Review planning documents
2. Begin Phase 1: Foundation (P1-001 through P1-010)
3. Implement parsers in Phase 2
4. Build cache system in Phase 3
5. Add validation pipeline in Phase 4
6. Migrate and test in Phase 5

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
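The per-exchange parser factory named under "V2 Key Improvements" is planned but not implemented in this commit. The following is a minimal Go sketch of what that design could look like; `SwapParser`, `ParsedSwap`, `Register`, and `ForProtocol` are hypothetical names chosen for illustration, not types from the planning documents or this repository.

```go
// Illustrative sketch only: a factory-pattern parser registry as described in
// the V2 plan. All names here are hypothetical placeholders.
package parsers

import (
	"fmt"

	"github.com/ethereum/go-ethereum/core/types"
)

// ParsedSwap is a placeholder for the normalized swap a parser emits.
type ParsedSwap struct {
	Pool     string
	TokenIn  string
	TokenOut string
}

// SwapParser is the per-exchange parsing contract.
type SwapParser interface {
	// Protocol returns the exchange/protocol identifier, e.g. "uniswap_v3".
	Protocol() string
	// ParseLog converts a raw log into a normalized swap, or reports an error.
	ParseLog(log types.Log) (*ParsedSwap, error)
}

// parserRegistry maps a protocol identifier to its parser implementation.
var parserRegistry = map[string]SwapParser{}

// Register adds a protocol-specific parser to the factory.
func Register(p SwapParser) {
	parserRegistry[p.Protocol()] = p
}

// ForProtocol returns the parser for a protocol, rejecting unknown protocols
// instead of falling back to a generic best-effort parse.
func ForProtocol(protocol string) (SwapParser, error) {
	p, ok := parserRegistry[protocol]
	if !ok {
		return nil, fmt.Errorf("no parser registered for protocol %q", protocol)
	}
	return p, nil
}
```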
orig/pkg/arbitrum/connection.go (new file, 440 lines)
@@ -0,0 +1,440 @@
```go
package arbitrum

import (
	"context"
	"fmt"
	"os"
	"strings"
	"time"

	"github.com/ethereum/go-ethereum/ethclient"
	"golang.org/x/time/rate"

	"github.com/fraktal/mev-beta/internal/config"
	"github.com/fraktal/mev-beta/internal/logger"
	pkgerrors "github.com/fraktal/mev-beta/pkg/errors"
)

// RateLimitedClient wraps ethclient.Client with rate limiting and circuit breaker
type RateLimitedClient struct {
	*ethclient.Client
	limiter        *rate.Limiter
	circuitBreaker *CircuitBreaker
	logger         *logger.Logger
}

// RateLimitConfig represents the configuration for rate limiting
type RateLimitConfig struct {
	RequestsPerSecond float64 `yaml:"requests_per_second"`
	MaxConcurrent     int     `yaml:"max_concurrent"`
	Burst             int     `yaml:"burst"`
}

// NewRateLimitedClient creates a new rate limited client
func NewRateLimitedClient(client *ethclient.Client, requestsPerSecond float64, logger *logger.Logger) *RateLimitedClient {
	// Create a rate limiter
	limiter := rate.NewLimiter(rate.Limit(requestsPerSecond), int(requestsPerSecond*2))

	// Create circuit breaker with default configuration
	circuitBreakerConfig := &CircuitBreakerConfig{
		FailureThreshold: 5,
		Timeout:          30 * time.Second,
		SuccessThreshold: 3,
	}
	circuitBreaker := NewCircuitBreaker(circuitBreakerConfig)
	circuitBreaker.SetLogger(logger)

	return &RateLimitedClient{
		Client:         client,
		limiter:        limiter,
		circuitBreaker: circuitBreaker,
		logger:         logger,
	}
}

// CallWithRateLimit executes a call with rate limiting and circuit breaker protection
func (rlc *RateLimitedClient) CallWithRateLimit(ctx context.Context, call func() error) error {
	// Check circuit breaker state
	if rlc.circuitBreaker.GetState() == Open {
		return fmt.Errorf("circuit breaker is open")
	}

	// Wait for rate limiter
	if err := rlc.limiter.Wait(ctx); err != nil {
		return fmt.Errorf("rate limiter wait error: %w", err)
	}

	// Execute the call through circuit breaker with retry on rate limit errors
	var lastErr error
	maxRetries := 3

	for attempt := 0; attempt < maxRetries; attempt++ {
		err := rlc.circuitBreaker.Call(ctx, call)

		// Check if this is a rate limit error
		if err != nil && strings.Contains(err.Error(), "RPS limit") {
			rlc.logger.Warn(fmt.Sprintf("⚠️ RPC rate limit hit (attempt %d/%d), applying exponential backoff", attempt+1, maxRetries))

			// Exponential backoff: 1s, 2s, 4s
			backoffDuration := time.Duration(1<<uint(attempt)) * time.Second

			select {
			case <-ctx.Done():
				return pkgerrors.WrapContextError(ctx.Err(), "RateLimitedClient.ExecuteWithRetry.rateLimitBackoff",
					map[string]interface{}{
						"attempt":         attempt + 1,
						"maxRetries":      maxRetries,
						"backoffDuration": backoffDuration.String(),
						"lastError":       err.Error(),
					})
			case <-time.After(backoffDuration):
				lastErr = err
				continue // Retry
			}
		}

		// Not a rate limit error or call succeeded
		if err != nil {
			// Log circuit breaker state transitions
			if rlc.circuitBreaker.GetState() == Open {
				rlc.logger.Warn("🚨 Circuit breaker OPENED due to failed RPC calls")
			}
		}

		return err
	}

	// All retries exhausted
	rlc.logger.Error(fmt.Sprintf("❌ Rate limit retries exhausted after %d attempts", maxRetries))
	return fmt.Errorf("rate limit exceeded after %d retries: %w", maxRetries, lastErr)
}

// GetCircuitBreaker returns the circuit breaker for external access
func (rlc *RateLimitedClient) GetCircuitBreaker() *CircuitBreaker {
	return rlc.circuitBreaker
}

// ResetCircuitBreaker resets the circuit breaker
func (rlc *RateLimitedClient) ResetCircuitBreaker() {
	rlc.circuitBreaker.Reset()
	rlc.logger.Info("✅ Circuit breaker reset to closed state")
}

// ConnectionManager manages Arbitrum RPC connections with fallback support and round-robin load balancing
type ConnectionManager struct {
	config             *config.ArbitrumConfig
	primaryClient      *RateLimitedClient
	fallbackClients    []*RateLimitedClient
	currentClientIndex int
	logger             *logger.Logger
	rpcManager         *RPCManager
	useRoundRobin      bool
}

// NewConnectionManager creates a new connection manager
func NewConnectionManager(cfg *config.ArbitrumConfig, logger *logger.Logger) *ConnectionManager {
	rpcManager := NewRPCManager(logger)
	return &ConnectionManager{
		config:        cfg,
		logger:        logger,
		rpcManager:    rpcManager,
		useRoundRobin: true, // Enable round-robin by default
	}
}

// EnableRoundRobin enables round-robin load balancing across RPC endpoints
func (cm *ConnectionManager) EnableRoundRobin(enabled bool) {
	cm.useRoundRobin = enabled
	if enabled {
		cm.logger.Info("✅ Round-robin RPC load balancing ENABLED")
	} else {
		cm.logger.Info("⚠️ Round-robin RPC load balancing DISABLED")
	}
}

// SetRPCRotationPolicy sets the rotation policy for the RPC manager
func (cm *ConnectionManager) SetRPCRotationPolicy(policy RotationPolicy) {
	cm.rpcManager.SetRotationPolicy(policy)
}

// GetClient returns a connected Ethereum client with automatic fallback
func (cm *ConnectionManager) GetClient(ctx context.Context) (*RateLimitedClient, error) {
	// If using round-robin, try to get from RPC manager
	if cm.useRoundRobin && len(cm.rpcManager.endpoints) > 0 {
		client, idx, err := cm.rpcManager.GetNextClient(ctx)
		if err == nil && client != nil {
			// Test connection health
			if cm.testConnection(ctx, client.Client) == nil {
				return client, nil
			}
			// Record the failure in RPC manager
			cm.rpcManager.RecordFailure(idx)
		}
	}

	// Fallback to primary/fallback endpoint logic if round-robin fails
	// Try primary endpoint first
	if cm.primaryClient == nil {
		primaryEndpoint := cm.getPrimaryEndpoint()
		client, err := cm.connectWithTimeout(ctx, primaryEndpoint)
		if err == nil {
			cm.primaryClient = client
			cm.logger.Info(fmt.Sprintf("✅ Connected to primary endpoint: %s", primaryEndpoint))
			// Add to RPC manager if not already there
			if cm.useRoundRobin && len(cm.rpcManager.endpoints) == 0 {
				_ = cm.rpcManager.AddEndpoint(client, primaryEndpoint)
			}
			return client, nil
		}
		cm.logger.Warn(fmt.Sprintf("⚠️ Primary endpoint failed: %s - %v", primaryEndpoint, err))
	} else {
		// Test if primary client is still connected
		if cm.testConnection(ctx, cm.primaryClient.Client) == nil {
			return cm.primaryClient, nil
		}
		// Primary client failed, close it
		cm.primaryClient.Client.Close()
		cm.primaryClient = nil
	}

	// Try fallback endpoints
	fallbackEndpoints := cm.getFallbackEndpoints()
	for i, endpoint := range fallbackEndpoints {
		client, err := cm.connectWithTimeout(ctx, endpoint)
		if err == nil {
			// Store successful fallback client
			if i < len(cm.fallbackClients) {
				if cm.fallbackClients[i] != nil {
					cm.fallbackClients[i].Client.Close()
				}
				cm.fallbackClients[i] = client
			} else {
				cm.fallbackClients = append(cm.fallbackClients, client)
			}
			cm.currentClientIndex = i

			// Add to RPC manager for round-robin
			if cm.useRoundRobin {
				_ = cm.rpcManager.AddEndpoint(client, endpoint)
			}

			return client, nil
		}
	}

	return nil, fmt.Errorf("all RPC endpoints failed to connect")
}

// getPrimaryEndpoint returns the primary RPC endpoint
func (cm *ConnectionManager) getPrimaryEndpoint() string {
	// Check environment variable first
	if endpoint := os.Getenv("ARBITRUM_RPC_ENDPOINT"); endpoint != "" {
		return endpoint
	}

	// Use config value
	if cm.config != nil && cm.config.RPCEndpoint != "" {
		return cm.config.RPCEndpoint
	}

	// Default fallback
	return "wss://arbitrum-mainnet.core.chainstack.com/53c30e7a941160679fdcc396c894fc57"
}

// getFallbackEndpoints returns fallback RPC endpoints
func (cm *ConnectionManager) getFallbackEndpoints() []string {
	var endpoints []string

	// Check environment variable first
	if envEndpoints := os.Getenv("ARBITRUM_FALLBACK_ENDPOINTS"); envEndpoints != "" {
		for _, endpoint := range strings.Split(envEndpoints, ",") {
			if endpoint = strings.TrimSpace(endpoint); endpoint != "" {
				endpoints = append(endpoints, endpoint)
			}
		}
		// If environment variables are set, use only those and return
		return endpoints
	}

	// Add configured reading and execution endpoints
	if cm.config != nil {
		// Add reading endpoints
		for _, endpoint := range cm.config.ReadingEndpoints {
			if endpoint.URL != "" {
				endpoints = append(endpoints, endpoint.URL)
			}
		}
		// Add execution endpoints
		for _, endpoint := range cm.config.ExecutionEndpoints {
			if endpoint.URL != "" {
				endpoints = append(endpoints, endpoint.URL)
			}
		}
	}

	// Default fallbacks if none configured - enhanced list from providers_runtime.yaml
	if len(endpoints) == 0 {
		endpoints = []string{
			"https://arb1.arbitrum.io/rpc",                   // Official Arbitrum
			"https://arbitrum-one.publicnode.com",            // PublicNode
			"https://arbitrum-one.public.blastapi.io",        // BlastAPI
			"https://1rpc.io/42161",                          // 1RPC
			"https://rpc.arb1.arbitrum.gateway.fm",           // Gateway FM
			"https://arb-mainnet-public.unifra.io",           // Unifra
			"https://arbitrum.blockpi.network/v1/rpc/public", // BlockPI
			"https://arbitrum.llamarpc.com",                  // LlamaNodes
			"wss://arbitrum-one.publicnode.com",              // PublicNode WebSocket
			"https://arbitrum-one-rpc.publicnode.com",        // PublicNode Alternative
			"https://arb-mainnet.g.alchemy.com/v2/demo",      // Alchemy demo
		}
		cm.logger.Info(fmt.Sprintf("📋 Using %d default RPC endpoints for failover", len(endpoints)))
	}

	return endpoints
}

// connectWithTimeout attempts to connect to an RPC endpoint with timeout
func (cm *ConnectionManager) connectWithTimeout(ctx context.Context, endpoint string) (*RateLimitedClient, error) {
	// Create timeout context with extended timeout for production stability
	// Increased from 10s to 30s to handle network congestion and slow RPC responses
	connectCtx, cancel := context.WithTimeout(ctx, 30*time.Second)
	defer cancel()

	cm.logger.Info(fmt.Sprintf("🔌 Attempting connection to endpoint: %s (timeout: 30s)", endpoint))

	// Create client
	client, err := ethclient.DialContext(connectCtx, endpoint)
	if err != nil {
		return nil, fmt.Errorf("failed to connect to %s: %w", endpoint, err)
	}

	cm.logger.Info("✅ Client connected, testing connection health...")

	// Test connection with a simple call
	if err := cm.testConnection(connectCtx, client); err != nil {
		client.Close()
		return nil, fmt.Errorf("connection test failed for %s: %w", endpoint, err)
	}

	cm.logger.Info("✅ Connection health check passed")

	// Wrap with rate limiting
	// Get rate limit from config or use conservative defaults
	// Lowered from 10 RPS to 5 RPS to avoid Chainstack rate limits
	requestsPerSecond := 5.0 // Default 5 requests per second (conservative for free/basic plans)
	if cm.config != nil && cm.config.RateLimit.RequestsPerSecond > 0 {
		requestsPerSecond = float64(cm.config.RateLimit.RequestsPerSecond)
	}

	cm.logger.Info(fmt.Sprintf("📊 Rate limiting configured: %.1f requests/second", requestsPerSecond))
	rateLimitedClient := NewRateLimitedClient(client, requestsPerSecond, cm.logger)

	return rateLimitedClient, nil
}

// testConnection tests if a client connection is working
func (cm *ConnectionManager) testConnection(ctx context.Context, client *ethclient.Client) error {
	// Increased timeout from 5s to 15s for production stability
	testCtx, cancel := context.WithTimeout(ctx, 15*time.Second)
	defer cancel()

	// Try to get chain ID as a simple connection test
	chainID, err := client.ChainID(testCtx)
	if err != nil {
		return err
	}

	cm.logger.Info(fmt.Sprintf("✅ Connected to chain ID: %s", chainID.String()))
	return nil
}

// Close closes all client connections
func (cm *ConnectionManager) Close() {
	if cm.primaryClient != nil {
		cm.primaryClient.Client.Close()
		cm.primaryClient = nil
	}

	for _, client := range cm.fallbackClients {
		if client != nil {
			client.Client.Close()
		}
	}
	cm.fallbackClients = nil

	// Close RPC manager
	if cm.rpcManager != nil {
		_ = cm.rpcManager.Close()
	}
}

// GetRPCManagerStats returns statistics about RPC endpoint usage and health
func (cm *ConnectionManager) GetRPCManagerStats() map[string]interface{} {
	if cm.rpcManager == nil {
		return map[string]interface{}{
			"error": "RPC manager not initialized",
		}
	}
	return cm.rpcManager.GetStats()
}

// PerformRPCHealthCheck performs a health check on all RPC endpoints
func (cm *ConnectionManager) PerformRPCHealthCheck(ctx context.Context) error {
	if cm.rpcManager == nil {
		return fmt.Errorf("RPC manager not initialized")
	}
	return cm.rpcManager.HealthCheckAll(ctx)
}

// GetClientWithRetry returns a client with automatic retry on failure
func (cm *ConnectionManager) GetClientWithRetry(ctx context.Context, maxRetries int) (*RateLimitedClient, error) {
	var lastErr error

	cm.logger.Info(fmt.Sprintf("🔄 Starting connection attempts (max retries: %d)", maxRetries))

	for attempt := 0; attempt < maxRetries; attempt++ {
		cm.logger.Info(fmt.Sprintf("📡 Connection attempt %d/%d", attempt+1, maxRetries))

		client, err := cm.GetClient(ctx)
		if err == nil {
			cm.logger.Info("✅ Successfully connected to RPC endpoint")
			return client, nil
		}

		lastErr = err
		cm.logger.Warn(fmt.Sprintf("❌ Connection attempt %d failed: %v", attempt+1, err))

		// Wait before retry (exponential backoff with cap at 8 seconds)
		if attempt < maxRetries-1 {
			waitTime := time.Duration(1<<uint(attempt)) * time.Second
			if waitTime > 8*time.Second {
				waitTime = 8 * time.Second
			}
			cm.logger.Info(fmt.Sprintf("⏳ Waiting %v before retry...", waitTime))

			select {
			case <-ctx.Done():
				return nil, pkgerrors.WrapContextError(ctx.Err(), "ConnectionManager.GetClientWithRetry.retryBackoff",
					map[string]interface{}{
						"attempt":    attempt + 1,
						"maxRetries": maxRetries,
						"waitTime":   waitTime.String(),
						"lastError":  err.Error(),
					})
			case <-time.After(waitTime):
				// Continue to next attempt
			}
		}
	}

	return nil, fmt.Errorf("failed to connect after %d attempts (last error: %w)", maxRetries, lastErr)
}

// GetHealthyClient returns a client that passes health checks
func GetHealthyClient(ctx context.Context, logger *logger.Logger) (*RateLimitedClient, error) {
	cfg := &config.ArbitrumConfig{} // Use default config
	cm := NewConnectionManager(cfg, logger)
	defer cm.Close()

	return cm.GetClientWithRetry(ctx, 3)
}
```
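For context, here is a minimal, hypothetical caller showing how this V1 connection layer is intended to be used: obtain a rate-limited client through the connection manager, then route each RPC call through CallWithRateLimit so the limiter and circuit breaker both apply. The `fetchLatestBlock` name, the `example` package, and the `pkg/arbitrum` import path are assumptions for illustration only; `config` and `logger` are the internal packages imported by the file above.

```go
// Hypothetical usage sketch, not part of this commit.
package example

import (
	"context"
	"fmt"

	"github.com/fraktal/mev-beta/internal/config"
	"github.com/fraktal/mev-beta/internal/logger"
	"github.com/fraktal/mev-beta/pkg/arbitrum" // assumed pre-move import path
)

// fetchLatestBlock connects with retry, then wraps an RPC call in
// CallWithRateLimit so the token-bucket limiter and circuit breaker apply.
func fetchLatestBlock(ctx context.Context, cfg *config.ArbitrumConfig, log *logger.Logger) (uint64, error) {
	cm := arbitrum.NewConnectionManager(cfg, log)
	defer cm.Close() // closes primary, fallback, and round-robin clients

	client, err := cm.GetClientWithRetry(ctx, 3) // up to 3 attempts with capped backoff
	if err != nil {
		return 0, fmt.Errorf("no healthy RPC endpoint: %w", err)
	}

	var blockNumber uint64
	err = client.CallWithRateLimit(ctx, func() error {
		n, callErr := client.BlockNumber(ctx) // method promoted from the embedded *ethclient.Client
		if callErr != nil {
			return callErr
		}
		blockNumber = n
		return nil
	})
	return blockNumber, err
}
```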