// mev-beta/pkg/transport/provider_manager.go

package transport

import (
	"context"
	"fmt"
	"net/http"
	"os"
	"strings"
	"sync"
	"sync/atomic"
	"time"

	"github.com/ethereum/go-ethereum/ethclient"
	"github.com/ethereum/go-ethereum/rpc"
	"golang.org/x/time/rate"
	"gopkg.in/yaml.v3"
)

// ProviderConfig represents a single RPC provider configuration, decoded
// from one entry of the "providers" list in the YAML file.
//
// The code visible in this file consumes the fields as follows:
//   - Name identifies the provider in error messages and in the statistics map.
//   - HTTPEndpoint and WSEndpoint are dialed by createProvider; validateConfig
//     rejects a provider that sets neither.
//   - Priority is compared by getPriorityProvider, where the numerically
//     lowest value is preferred.
//   - RateLimit feeds the per-provider rate limiter and the HTTP client timeout.
//   - HealthCheck.Timeout bounds each individual health-check call.
//
// Type, Features and AnvilConfig are decoded but not read by the code visible
// here.
type ProviderConfig struct {
	Name         string            `yaml:"name"`
	Type         string            `yaml:"type"`
	HTTPEndpoint string            `yaml:"http_endpoint"`
	WSEndpoint   string            `yaml:"ws_endpoint"`
	Priority     int               `yaml:"priority"`
	RateLimit    RateLimitConfig   `yaml:"rate_limit"`
	Features     []string          `yaml:"features"`
	HealthCheck  HealthCheckConfig `yaml:"health_check"`
	AnvilConfig  *AnvilConfig      `yaml:"anvil_config,omitempty"` // For Anvil fork providers
}

// AnvilConfig defines Anvil-specific configuration, decoded from the optional
// "anvil_config" key of a provider entry.
//
// None of these fields are interpreted by the code visible in this file; they
// are only carried along inside ProviderConfig.
// NOTE(review): BlockTime and StateInterval are presumably expressed in
// seconds — confirm against whatever launches Anvil.
type AnvilConfig struct {
	ForkURL         string `yaml:"fork_url"`
	ChainID         int    `yaml:"chain_id"`
	Port            int    `yaml:"port"`
	BlockTime       int    `yaml:"block_time"`
	AutoImpersonate bool   `yaml:"auto_impersonate"`
	StateInterval   int    `yaml:"state_interval"`
}

// RateLimitConfig defines rate limiting parameters for one provider.
//
// RequestsPerSecond must be positive (enforced by validateConfig) and,
// together with Burst, is passed to rate.NewLimiter in createProvider.
// Timeout is used as the timeout of the provider's HTTP client.
// RetryDelay and MaxRetries are decoded but not read by the code visible in
// this file.
type RateLimitConfig struct {
	RequestsPerSecond int           `yaml:"requests_per_second"`
	Burst             int           `yaml:"burst"`
	Timeout           time.Duration `yaml:"timeout"`
	RetryDelay        time.Duration `yaml:"retry_delay"`
	MaxRetries        int           `yaml:"max_retries"`
}

// HealthCheckConfig defines health check parameters for one provider.
//
// Timeout is the deadline applied to each health-check call in
// performProviderHealthCheck. Enabled and Interval are decoded but not
// consulted by the code visible in this file: the health-check loop is gated
// by MonitoringConfig.Enabled and runs on a fixed one-minute ticker.
type HealthCheckConfig struct {
	Enabled  bool          `yaml:"enabled"`
	Interval time.Duration `yaml:"interval"`
	Timeout  time.Duration `yaml:"timeout"`
}

// RotationConfig defines the provider rotation strategy.
//
// Strategy is matched by GetHealthyProvider against "round_robin", "weighted"
// and "priority_based"; any other value falls back to round robin.
// HealthCheckRequired, when true, makes isProviderUsable skip providers that
// are currently marked unhealthy. FallbackEnabled and RetryFailedAfter are
// decoded but not read by the code visible in this file.
type RotationConfig struct {
	Strategy            string        `yaml:"strategy"`
	HealthCheckRequired bool          `yaml:"health_check_required"`
	FallbackEnabled     bool          `yaml:"fallback_enabled"`
	RetryFailedAfter    time.Duration `yaml:"retry_failed_after"`
}

// ProviderPoolConfig defines configuration for a provider pool, decoded from
// one value of the "provider_pools" map.
//
// The code visible in this file never reads these fields. Note that
// HealthCheckInterval is a plain string here, unlike the time.Duration
// fields used elsewhere in the configuration.
// NOTE(review): Providers presumably lists ProviderConfig.Name values —
// confirm against the pool consumer.
type ProviderPoolConfig struct {
	Strategy                 string   `yaml:"strategy"`
	MaxConcurrentConnections int      `yaml:"max_concurrent_connections"`
	HealthCheckInterval      string   `yaml:"health_check_interval"`
	FailoverEnabled          bool     `yaml:"failover_enabled"`
	Providers                []string `yaml:"providers"`
}

// ProvidersConfig represents the complete provider configuration; it is the
// root value that LoadProvidersConfig unmarshals the YAML document into.
//
// Providers must contain at least one entry (see validateConfig). Rotation
// drives provider selection and Monitoring gates the background loops.
// ProviderPools and GlobalLimits are decoded but not read by the code visible
// in this file.
type ProvidersConfig struct {
	ProviderPools map[string]ProviderPoolConfig `yaml:"provider_pools"`
	Providers     []ProviderConfig              `yaml:"providers"`
	Rotation      RotationConfig                `yaml:"rotation"`
	GlobalLimits  GlobalLimits                  `yaml:"global_limits"`
	Monitoring    MonitoringConfig              `yaml:"monitoring"`
}

// GlobalLimits defines global connection limits, decoded from the
// "global_limits" key.
//
// These values are not applied anywhere in the code visible in this file;
// in particular, the HTTP client timeout comes from RateLimitConfig.Timeout,
// not from the timeouts declared here.
type GlobalLimits struct {
	MaxConcurrentConnections int           `yaml:"max_concurrent_connections"`
	ConnectionTimeout        time.Duration `yaml:"connection_timeout"`
	ReadTimeout              time.Duration `yaml:"read_timeout"`
	WriteTimeout             time.Duration `yaml:"write_timeout"`
	IdleTimeout              time.Duration `yaml:"idle_timeout"`
}

// MonitoringConfig defines monitoring settings.
//
// Enabled gates startBackgroundTasks: when false, neither the health-check
// loop nor the metrics loop is started. MetricsInterval is the period of the
// metrics ticker. LogSlowRequests, SlowRequestThreshold and
// TrackProviderPerformance are decoded but not read by the code visible in
// this file.
type MonitoringConfig struct {
	Enabled                  bool          `yaml:"enabled"`
	MetricsInterval          time.Duration `yaml:"metrics_interval"`
	LogSlowRequests          bool          `yaml:"log_slow_requests"`
	SlowRequestThreshold     time.Duration `yaml:"slow_request_threshold"`
	TrackProviderPerformance bool          `yaml:"track_provider_performance"`
}

// Provider represents an active RPC provider connection.
//
// HTTPConn and WSConn are the raw RPC clients; HTTPClient and WSClient are
// the ethclient wrappers built on top of them. Either pair may be nil when
// the corresponding endpoint is not configured or could not be dialed.
//
// IsHealthy, LastHealthCheck and AvgResponseTime are written under mutex by
// performProviderHealthCheck and should be read while holding mutex.
// RequestCount and ErrorCount are updated with sync/atomic; use the
// Increment*/Get* methods rather than touching them directly.
//
// A Provider embeds a mutex and must therefore be handled by pointer, never
// copied.
type Provider struct {
	Config          ProviderConfig
	HTTPClient      *ethclient.Client
	WSClient        *ethclient.Client
	RateLimiter     *rate.Limiter
	HTTPConn        *rpc.Client
	WSConn          *rpc.Client
	IsHealthy       bool
	LastHealthCheck time.Time
	RequestCount    int64
	ErrorCount      int64
	AvgResponseTime time.Duration
	mutex           sync.RWMutex
}

// ProviderManager manages multiple RPC providers with rotation and failover.
//
// providers is populated once by initializeProviders. currentProvider is the
// index at which the next round-robin scan starts. healthTicker and
// metricsTicker drive the background loops and are only created when
// monitoring is enabled, so they may be nil. stopChan is closed by Close to
// make both loops return.
type ProviderManager struct {
	providers       []*Provider
	config          ProvidersConfig
	currentProvider int
	mutex           sync.RWMutex
	healthTicker    *time.Ticker
	metricsTicker   *time.Ticker
	stopChan        chan struct{}
}

// NewProviderManager builds a ProviderManager from the YAML file at
// configPath.
//
// It loads and validates the configuration, connects to the configured
// providers, and then starts the background health-check and metrics loops.
// An error is returned when the configuration cannot be loaded or when
// provider initialization fails; in that case no manager is returned.
func NewProviderManager(configPath string) (*ProviderManager, error) {
	cfg, loadErr := LoadProvidersConfig(configPath)
	if loadErr != nil {
		return nil, fmt.Errorf("failed to load provider config: %w", loadErr)
	}

	manager := new(ProviderManager)
	manager.config = cfg
	manager.stopChan = make(chan struct{})

	initErr := manager.initializeProviders()
	if initErr != nil {
		return nil, fmt.Errorf("failed to initialize providers: %w", initErr)
	}

	// Background loops are only launched once at least one provider exists.
	manager.startBackgroundTasks()
	return manager, nil
}

// LoadProvidersConfig loads provider configuration from the YAML file at path.
//
// The raw file content is passed through environment-variable expansion
// ($VAR and ${VAR}) before it is parsed. Every referenced variable must be
// set in the environment: os.ExpandEnv would silently substitute an empty
// string for an unset variable, so expansion is done with os.Expand and
// os.LookupEnv to detect and report the missing names instead. A variable
// that is set to the empty string is accepted.
//
// Returns the parsed and validated configuration. On any error (unreadable
// file, unresolved variables, malformed YAML, failed validation) a zero
// ProvidersConfig is returned together with the error.
func LoadProvidersConfig(path string) (ProvidersConfig, error) {
	// Read the YAML file
	data, err := os.ReadFile(path)
	if err != nil {
		return ProvidersConfig{}, fmt.Errorf("failed to read config file %s: %w", path, err)
	}

	// Expand environment variables, remembering each distinct name that is
	// not defined so the caller gets one actionable error.
	var missing []string
	reported := make(map[string]bool)
	expanded := os.Expand(string(data), func(name string) string {
		value, ok := os.LookupEnv(name)
		if !ok && !reported[name] {
			reported[name] = true
			missing = append(missing, name)
		}
		return value
	})
	if len(missing) > 0 {
		return ProvidersConfig{}, fmt.Errorf(
			"unresolved environment variables found in provider config %s: %s",
			path, strings.Join(missing, ", "),
		)
	}

	// A "${" that survives expansion can only come from a variable's value;
	// treat it as a placeholder that was never resolved.
	if strings.Contains(expanded, "${") {
		return ProvidersConfig{}, fmt.Errorf("unresolved environment variables found in provider config %s", path)
	}

	// Unmarshal the YAML data
	var config ProvidersConfig
	if err := yaml.Unmarshal([]byte(expanded), &config); err != nil {
		return ProvidersConfig{}, fmt.Errorf("failed to parse YAML config: %w", err)
	}

	// Validate the configuration
	if err := validateConfig(&config); err != nil {
		return ProvidersConfig{}, fmt.Errorf("invalid configuration: %w", err)
	}

	return config, nil
}

// validateConfig checks that config describes at least one provider and that
// every provider entry is minimally complete.
//
// Providers are examined in order and the first violation is returned:
// a missing name, no HTTP and no WebSocket endpoint, or a non-positive
// requests-per-second value. Returns nil when all entries pass.
func validateConfig(config *ProvidersConfig) error {
	entries := config.Providers
	if len(entries) == 0 {
		return fmt.Errorf("no providers configured")
	}

	for idx := range entries {
		entry := &entries[idx]
		switch {
		case entry.Name == "":
			// Without a name the entry can only be identified by position.
			return fmt.Errorf("provider %d has no name", idx)
		case entry.HTTPEndpoint == "" && entry.WSEndpoint == "":
			return fmt.Errorf("provider %s has no endpoints", entry.Name)
		case entry.RateLimit.RequestsPerSecond <= 0:
			return fmt.Errorf("provider %s has invalid rate limit", entry.Name)
		}
	}

	return nil
}

// initializeProviders creates a Provider for every configured entry and
// stores the successful ones in pm.providers.
//
// A provider that cannot be created is skipped so that the remaining ones can
// still be used, but the failure is reported on standard output (the same
// channel createProvider uses for its WebSocket warning) instead of being
// dropped silently. An error is returned only when no provider at all could
// be initialized; it wraps the most recent creation error when there is one.
func (pm *ProviderManager) initializeProviders() error {
	pm.providers = make([]*Provider, 0, len(pm.config.Providers))

	var lastErr error
	for _, providerConfig := range pm.config.Providers {
		provider, err := createProvider(providerConfig)
		if err != nil {
			// Report the failure but continue with the other providers.
			fmt.Printf("Warning: skipping provider %s: %v\n", providerConfig.Name, err)
			lastErr = err
			continue
		}
		pm.providers = append(pm.providers, provider)
	}

	if len(pm.providers) == 0 {
		if lastErr != nil {
			return fmt.Errorf("no providers successfully initialized: %w", lastErr)
		}
		return fmt.Errorf("no providers successfully initialized")
	}

	return nil
}

// createProvider creates a new provider instance (shared utility function).
//
// It builds the provider's rate limiter from config.RateLimit and dials the
// configured HTTP and/or WebSocket endpoints. The provider starts out marked
// healthy.
//
// A burst of zero or less would produce a limiter that never allows a
// request, making the provider permanently unusable; in that case the burst
// defaults to RequestsPerSecond (and to at least 1).
//
// Returns an error when the HTTP endpoint cannot be dialed, or when no
// connection of either kind could be established. A WebSocket failure alone
// is only reported as a warning as long as an HTTP connection exists.
func createProvider(config ProviderConfig) (*Provider, error) {
	// Create rate limiter
	burst := config.RateLimit.Burst
	if burst <= 0 {
		burst = config.RateLimit.RequestsPerSecond
		if burst <= 0 {
			burst = 1
		}
	}
	rateLimiter := rate.NewLimiter(
		rate.Limit(config.RateLimit.RequestsPerSecond),
		burst,
	)

	provider := &Provider{
		Config:      config,
		RateLimiter: rateLimiter,
		IsHealthy:   true, // Assume healthy until proven otherwise
	}

	// Initialize HTTP connection
	if config.HTTPEndpoint != "" {
		httpClient := &http.Client{
			Timeout: config.RateLimit.Timeout, // Use config timeout
		}

		rpcClient, err := rpc.DialHTTPWithClient(config.HTTPEndpoint, httpClient)
		if err != nil {
			return nil, fmt.Errorf("failed to connect to HTTP endpoint %s: %w", config.HTTPEndpoint, err)
		}

		provider.HTTPConn = rpcClient
		provider.HTTPClient = ethclient.NewClient(rpcClient)
	}

	// Initialize WebSocket connection
	if config.WSEndpoint != "" {
		wsClient, err := rpc.DialWebsocket(context.Background(), config.WSEndpoint, "")
		if err != nil {
			// Don't fail if WS connection fails, HTTP might still work
			fmt.Printf("Warning: failed to connect to WebSocket endpoint %s: %v\n", config.WSEndpoint, err)
		} else {
			provider.WSConn = wsClient
			provider.WSClient = ethclient.NewClient(wsClient)
		}
	}

	// A provider without any client can never serve a request or pass a
	// health check, so refuse to hand it out.
	if provider.HTTPConn == nil && provider.WSConn == nil {
		return nil, fmt.Errorf("provider %s has no usable connection", config.Name)
	}

	return provider, nil
}

// GetHealthyProvider returns the next usable provider according to the
// configured rotation strategy ("round_robin", "weighted" or
// "priority_based"; any other value is treated as round robin).
//
// The manager's write lock is held for the whole selection because the
// round-robin strategy advances pm.currentProvider; a read lock would let
// concurrent callers race on that field.
//
// Returns an error when no providers are registered or when the selected
// strategy finds none that is currently usable.
func (pm *ProviderManager) GetHealthyProvider() (*Provider, error) {
	pm.mutex.Lock()
	defer pm.mutex.Unlock()

	if len(pm.providers) == 0 {
		return nil, fmt.Errorf("no providers available")
	}

	switch pm.config.Rotation.Strategy {
	case "weighted":
		return pm.getWeightedProvider()
	case "priority_based":
		return pm.getPriorityProvider()
	default:
		// Covers "round_robin" as well as unknown or empty strategy names.
		return pm.getNextRoundRobin()
	}
}

// getNextRoundRobin picks the first usable provider at or after the position
// stored in pm.currentProvider, wrapping around the provider list at most
// once.
//
// On success pm.currentProvider is moved to the slot following the chosen
// provider. This method takes no lock itself; the caller is expected to hold
// pm.mutex. Returns an error when no provider is usable.
func (pm *ProviderManager) getNextRoundRobin() (*Provider, error) {
	total := len(pm.providers)
	first := pm.currentProvider

	for offset := 0; offset < total; offset++ {
		slot := (first + offset) % total
		candidate := pm.providers[slot]
		if !pm.isProviderUsable(candidate) {
			continue
		}

		// Resume after the chosen provider on the next call.
		pm.currentProvider = (slot + 1) % total
		return candidate, nil
	}

	return nil, fmt.Errorf("no healthy providers available")
}

// getPriorityProvider returns the usable provider with the numerically lowest
// Config.Priority; among equal priorities the one listed first wins.
//
// Providers are probed in ascending priority order and probing stops at the
// first usable one. This matters because isProviderUsable consumes a
// rate-limiter token whenever it succeeds: probing every provider up front
// would drain tokens from providers that are not returned.
//
// This method takes no lock itself; the caller is expected to hold pm.mutex.
// Returns an error when no provider is usable.
func (pm *ProviderManager) getPriorityProvider() (*Provider, error) {
	tried := make([]bool, len(pm.providers))

	for attempts := 0; attempts < len(pm.providers); attempts++ {
		// Find the best-priority provider that has not been probed yet.
		best := -1
		for i, provider := range pm.providers {
			if tried[i] {
				continue
			}
			if best == -1 || provider.Config.Priority < pm.providers[best].Config.Priority {
				best = i
			}
		}
		if best == -1 {
			break
		}

		tried[best] = true
		if pm.isProviderUsable(pm.providers[best]) {
			return pm.providers[best], nil
		}
	}

	return nil, fmt.Errorf("no healthy providers available")
}

// getWeightedProvider is the entry point for the "weighted" strategy.
//
// No weighting is implemented yet: the call is forwarded unchanged to
// getPriorityProvider and its result is returned as-is. A real
// implementation would take response times and success rates into account.
func (pm *ProviderManager) getWeightedProvider() (*Provider, error) {
	selected, err := pm.getPriorityProvider()
	return selected, err
}

// isProviderUsable reports whether provider may serve a request right now.
//
// The health flag is evaluated first and only matters when the rotation
// config sets HealthCheckRequired. The rate limiter is consulted only if the
// health gate passes; a successful Allow call uses up one of the provider's
// rate-limit tokens, so this check is not side-effect free.
func (pm *ProviderManager) isProviderUsable(provider *Provider) bool {
	provider.mutex.RLock()
	defer provider.mutex.RUnlock()

	healthGate := !pm.config.Rotation.HealthCheckRequired || provider.IsHealthy

	// Short-circuit keeps the limiter untouched for providers failing the gate.
	return healthGate && provider.RateLimiter.Allow()
}

// GetHTTPClient selects a provider via GetHealthyProvider and returns its
// HTTP-backed ethclient.
//
// Returns the selection error unchanged, or an error naming the provider
// when the selected provider has no HTTP client.
func (pm *ProviderManager) GetHTTPClient() (*ethclient.Client, error) {
	chosen, err := pm.GetHealthyProvider()
	if err != nil {
		return nil, err
	}

	if client := chosen.HTTPClient; client != nil {
		return client, nil
	}

	return nil, fmt.Errorf("provider %s has no HTTP client", chosen.Config.Name)
}

// GetWSClient selects a provider via GetHealthyProvider and returns its
// WebSocket-backed ethclient.
//
// Returns the selection error unchanged, or an error naming the provider
// when the selected provider has no WebSocket client.
func (pm *ProviderManager) GetWSClient() (*ethclient.Client, error) {
	chosen, err := pm.GetHealthyProvider()
	if err != nil {
		return nil, err
	}

	if client := chosen.WSClient; client != nil {
		return client, nil
	}

	return nil, fmt.Errorf("provider %s has no WebSocket client", chosen.Config.Name)
}

// GetRPCClient selects a provider via GetHealthyProvider and returns one of
// its raw RPC clients for advanced operations.
//
// With preferWS set, the WebSocket connection is returned when the provider
// has one. In every other case the HTTP connection is returned; the
// WebSocket connection is never used as a fallback when preferWS is false.
// Returns an error when selection fails or no suitable connection exists.
func (pm *ProviderManager) GetRPCClient(preferWS bool) (*rpc.Client, error) {
	chosen, err := pm.GetHealthyProvider()
	if err != nil {
		return nil, err
	}

	switch {
	case preferWS && chosen.WSConn != nil:
		return chosen.WSConn, nil
	case chosen.HTTPConn != nil:
		return chosen.HTTPConn, nil
	default:
		return nil, fmt.Errorf("provider %s has no available RPC client", chosen.Config.Name)
	}
}

// startBackgroundTasks starts the health-check and metrics goroutines.
//
// Nothing is started unless monitoring is enabled in the configuration.
// Health checks run on a fixed one-minute ticker. The metrics ticker uses
// Monitoring.MetricsInterval; because time.NewTicker panics on a
// non-positive duration, a missing or invalid interval falls back to one
// minute instead of crashing the process.
//
// Both goroutines run until Close closes pm.stopChan.
func (pm *ProviderManager) startBackgroundTasks() {
	if !pm.config.Monitoring.Enabled {
		return
	}

	const defaultInterval = time.Minute

	// Start health checks
	pm.healthTicker = time.NewTicker(defaultInterval)
	go pm.healthCheckLoop()

	// Start metrics collection
	metricsInterval := pm.config.Monitoring.MetricsInterval
	if metricsInterval <= 0 {
		metricsInterval = defaultInterval
	}
	pm.metricsTicker = time.NewTicker(metricsInterval)
	go pm.metricsLoop()
}

// healthCheckLoop runs performHealthChecks on every tick of pm.healthTicker
// and returns once pm.stopChan is closed. It is meant to run in its own
// goroutine.
func (pm *ProviderManager) healthCheckLoop() {
	for {
		select {
		case <-pm.stopChan:
			// Shutdown requested.
			return
		case <-pm.healthTicker.C:
			pm.performHealthChecks()
		}
	}
}

// metricsLoop runs collectMetrics on every tick of pm.metricsTicker and
// returns once pm.stopChan is closed. It is meant to run in its own
// goroutine.
func (pm *ProviderManager) metricsLoop() {
	for {
		select {
		case <-pm.stopChan:
			// Shutdown requested.
			return
		case <-pm.metricsTicker.C:
			pm.collectMetrics()
		}
	}
}

// performHealthChecks launches one health check per registered provider.
//
// Each check runs in its own goroutine and this method returns immediately
// without waiting for any of them to finish.
func (pm *ProviderManager) performHealthChecks() {
	for idx := range pm.providers {
		go pm.checkProviderHealth(pm.providers[idx])
	}
}

// checkProviderHealth runs the standard health probe against one provider.
//
// The probe asks for the latest block number, preferring the HTTP client and
// using the WebSocket client only when no HTTP client exists. The outcome is
// recorded by performProviderHealthCheck.
func (pm *ProviderManager) checkProviderHealth(provider *Provider) {
	probe := func(ctx context.Context, target *Provider) error {
		var client *ethclient.Client
		switch {
		case target.HTTPClient != nil:
			client = target.HTTPClient
		case target.WSClient != nil:
			client = target.WSClient
		default:
			return fmt.Errorf("no client available for health check")
		}

		// Only reachability matters; the block number itself is discarded.
		_, err := client.BlockNumber(ctx)
		return err
	}

	pm.performProviderHealthCheck(provider, probe)
}

// performProviderHealthCheck runs healthChecker against provider and records
// the outcome.
//
// The check is bounded by provider.Config.HealthCheck.Timeout. A zero or
// negative timeout would create a context that is already expired, causing
// every check to fail and the provider to be marked unhealthy; in that case
// a default of ten seconds is used instead.
//
// After the check, the request counter (and, on failure, the error counter)
// is incremented atomically, and under the provider mutex the health flag,
// the last-check timestamp and the average response time are updated. The
// average is an exponential moving average that weights the new sample at
// 20%; the very first sample is stored as-is.
func (pm *ProviderManager) performProviderHealthCheck(provider *Provider, healthChecker func(context.Context, *Provider) error) {
	const defaultHealthCheckTimeout = 10 * time.Second

	timeout := provider.Config.HealthCheck.Timeout
	if timeout <= 0 {
		timeout = defaultHealthCheckTimeout
	}

	ctx, cancel := context.WithTimeout(context.Background(), timeout)
	defer cancel()

	// The probe runs without holding the provider mutex so that readers are
	// not blocked for the duration of a network call.
	start := time.Now()
	err := healthChecker(ctx, provider)
	duration := time.Since(start)

	// Counters are atomic so they can also be bumped by request paths that
	// do not take the mutex.
	atomic.AddInt64(&provider.RequestCount, 1)
	if err != nil {
		atomic.AddInt64(&provider.ErrorCount, 1)
	}

	provider.mutex.Lock()
	defer provider.mutex.Unlock()

	provider.LastHealthCheck = time.Now()
	provider.IsHealthy = err == nil

	// Update average response time
	if provider.AvgResponseTime == 0 {
		provider.AvgResponseTime = duration
	} else {
		// Weight new measurement at 20% to smooth out spikes
		provider.AvgResponseTime = time.Duration(
			float64(provider.AvgResponseTime)*0.8 + float64(duration)*0.2,
		)
	}
}

// IncrementRequestCount adds one to the provider's request counter.
//
// The update is performed with an atomic add, so it may be called from
// multiple goroutines without holding the provider mutex.
func (p *Provider) IncrementRequestCount() {
	atomic.AddInt64(&p.RequestCount, 1)
}

// IncrementErrorCount adds one to the provider's error counter.
//
// The update is performed with an atomic add, so it may be called from
// multiple goroutines without holding the provider mutex.
func (p *Provider) IncrementErrorCount() {
	atomic.AddInt64(&p.ErrorCount, 1)
}

// GetRequestCount returns the current value of the provider's request
// counter, read with an atomic load so it pairs safely with the atomic
// increments performed elsewhere.
func (p *Provider) GetRequestCount() int64 {
	return atomic.LoadInt64(&p.RequestCount)
}

// GetErrorCount returns the current value of the provider's error counter,
// read with an atomic load so it pairs safely with the atomic increments
// performed elsewhere.
func (p *Provider) GetErrorCount() int64 {
	return atomic.LoadInt64(&p.ErrorCount)
}

// collectMetrics is called by metricsLoop on every metrics tick.
//
// It is currently a placeholder with an empty body: no metrics are gathered,
// logged or reported yet.
func (pm *ProviderManager) collectMetrics() {
	// TODO: collect and report per-provider metrics.
	// Nothing is logged or exported at the moment.
}

// Close shuts down the provider manager: it signals the background loops to
// stop, stops both tickers, and closes every provider's HTTP and WebSocket
// RPC connection.
//
// Close is safe to call more than once. Closing an already-closed channel
// panics, so the stop channel is inspected first and repeated calls return
// immediately. The manager mutex is held throughout so that concurrent Close
// calls cannot both reach the channel close.
//
// The returned error is always nil.
func (pm *ProviderManager) Close() error {
	pm.mutex.Lock()
	defer pm.mutex.Unlock()

	if pm.stopChan != nil {
		select {
		case <-pm.stopChan:
			// Already closed by an earlier call; nothing left to release.
			return nil
		default:
			close(pm.stopChan)
		}
	}

	// The tickers only exist when monitoring was enabled.
	if pm.healthTicker != nil {
		pm.healthTicker.Stop()
	}
	if pm.metricsTicker != nil {
		pm.metricsTicker.Stop()
	}

	// Close all connections
	for _, provider := range pm.providers {
		if provider.HTTPConn != nil {
			provider.HTTPConn.Close()
		}
		if provider.WSConn != nil {
			provider.WSConn.Close()
		}
	}

	return nil
}

// GetProviderStats returns a snapshot of per-provider statistics keyed by
// provider name.
//
// Each value is itself a map with the keys "name", "healthy",
// "last_health_check", "request_count", "error_count" and
// "avg_response_time". Every provider is read under its own read lock, so
// the fields of one entry are mutually consistent. Providers sharing a name
// overwrite one another in the result.
func (pm *ProviderManager) GetProviderStats() map[string]interface{} {
	pm.mutex.RLock()
	defer pm.mutex.RUnlock()

	result := make(map[string]interface{})
	for _, p := range pm.providers {
		entry := make(map[string]interface{}, 6)

		p.mutex.RLock()
		entry["name"] = p.Config.Name
		entry["healthy"] = p.IsHealthy
		entry["last_health_check"] = p.LastHealthCheck
		entry["request_count"] = p.GetRequestCount() // atomic read
		entry["error_count"] = p.GetErrorCount()     // atomic read
		entry["avg_response_time"] = p.AvgResponseTime
		p.mutex.RUnlock()

		result[p.Config.Name] = entry
	}

	return result
}