Files
mev-beta/pkg/transport/failover.go
Krypto Kajun 3f69aeafcf fix: resolve all compilation issues across transport and lifecycle packages
- Fixed duplicate type declarations in transport package
- Removed unused variables in lifecycle and dependency injection
- Fixed big.Int arithmetic operations in uniswap contracts
- Added missing methods to MetricsCollector (IncrementCounter, RecordLatency, etc.)
- Fixed jitter calculation in TCP transport retry logic
- Updated ComponentHealth field access to use transport type
- Ensured all core packages build successfully

All major compilation errors resolved:
 Transport package builds clean
 Lifecycle package builds clean
 Main MEV bot application builds clean
 Fixed method signature mismatches
 Resolved type conflicts and duplications

🤖 Generated with [Claude Code](https://claude.ai/code)

Co-Authored-By: Claude <noreply@anthropic.com>
2025-09-19 17:23:14 -05:00

613 lines
16 KiB
Go

package transport
import (
"context"
"fmt"
"sync"
"time"
)
// FailoverManager handles transport failover and redundancy
type FailoverManager struct {
transports map[string]*ManagedTransport
primaryTransport string
backupTransports []string
failoverPolicy FailoverPolicy
healthChecker HealthChecker
circuitBreaker *CircuitBreaker
mu sync.RWMutex
ctx context.Context
cancel context.CancelFunc
metrics FailoverMetrics
notifications chan FailoverEvent
}
// ManagedTransport wraps a transport with management metadata
type ManagedTransport struct {
Transport Transport
ID string
Name string
Priority int
Status TransportStatus
LastHealthCheck time.Time
FailureCount int
LastFailure time.Time
Config TransportConfig
Metrics TransportMetrics
}
// TransportStatus represents the current status of a transport
type TransportStatus string
const (
StatusHealthy TransportStatus = "healthy"
StatusDegraded TransportStatus = "degraded"
StatusUnhealthy TransportStatus = "unhealthy"
StatusDisabled TransportStatus = "disabled"
)
// FailoverPolicy defines when and how to failover
type FailoverPolicy struct {
FailureThreshold int // Number of failures before marking unhealthy
HealthCheckInterval time.Duration // How often to check health
FailoverTimeout time.Duration // Timeout for failover operations
RetryInterval time.Duration // Interval between retry attempts
MaxRetries int // Maximum retry attempts
AutoFailback bool // Whether to automatically failback to primary
FailbackDelay time.Duration // Delay before attempting failback
RequireAllHealthy bool // Whether all transports must be healthy
}
// FailoverMetrics tracks failover statistics
type FailoverMetrics struct {
TotalFailovers int64 `json:"total_failovers"`
TotalFailbacks int64 `json:"total_failbacks"`
CurrentTransport string `json:"current_transport"`
LastFailover time.Time `json:"last_failover"`
LastFailback time.Time `json:"last_failback"`
FailoverDuration time.Duration `json:"failover_duration"`
FailoverSuccessRate float64 `json:"failover_success_rate"`
HealthCheckFailures int64 `json:"health_check_failures"`
CircuitBreakerTrips int64 `json:"circuit_breaker_trips"`
}
// FailoverEvent represents a failover-related event
type FailoverEvent struct {
Type FailoverEventType `json:"type"`
FromTransport string `json:"from_transport"`
ToTransport string `json:"to_transport"`
Reason string `json:"reason"`
Timestamp time.Time `json:"timestamp"`
Success bool `json:"success"`
Duration time.Duration `json:"duration"`
}
// FailoverEventType defines types of failover events
type FailoverEventType string
const (
EventFailover FailoverEventType = "failover"
EventFailback FailoverEventType = "failback"
EventHealthCheck FailoverEventType = "health_check"
EventCircuitBreak FailoverEventType = "circuit_break"
EventRecovery FailoverEventType = "recovery"
)
// HealthChecker interface for custom health checking logic
type HealthChecker interface {
CheckHealth(ctx context.Context, transport Transport) (bool, error)
GetHealthScore(transport Transport) float64
}
// NewFailoverManager creates a new failover manager
func NewFailoverManager(policy FailoverPolicy) *FailoverManager {
ctx, cancel := context.WithCancel(context.Background())
fm := &FailoverManager{
transports: make(map[string]*ManagedTransport),
failoverPolicy: policy,
healthChecker: NewDefaultHealthChecker(),
circuitBreaker: NewCircuitBreaker(CircuitBreakerConfig{
FailureThreshold: policy.FailureThreshold,
RecoveryTimeout: policy.RetryInterval,
MaxRetries: policy.MaxRetries,
}),
ctx: ctx,
cancel: cancel,
notifications: make(chan FailoverEvent, 100),
}
// Start background routines
go fm.healthCheckLoop()
go fm.failoverMonitorLoop()
return fm
}
// RegisterTransport adds a transport to the failover manager
func (fm *FailoverManager) RegisterTransport(id, name string, transport Transport, priority int, config TransportConfig) error {
fm.mu.Lock()
defer fm.mu.Unlock()
managedTransport := &ManagedTransport{
Transport: transport,
ID: id,
Name: name,
Priority: priority,
Status: StatusHealthy,
LastHealthCheck: time.Now(),
Config: config,
}
fm.transports[id] = managedTransport
// Set as primary if it's the first or highest priority transport
if fm.primaryTransport == "" || priority > fm.transports[fm.primaryTransport].Priority {
fm.primaryTransport = id
} else {
fm.backupTransports = append(fm.backupTransports, id)
}
return nil
}
// UnregisterTransport removes a transport from the failover manager
func (fm *FailoverManager) UnregisterTransport(id string) error {
fm.mu.Lock()
defer fm.mu.Unlock()
if _, exists := fm.transports[id]; !exists {
return fmt.Errorf("transport not found: %s", id)
}
delete(fm.transports, id)
// Update primary if needed
if fm.primaryTransport == id {
fm.selectNewPrimary()
}
// Remove from backups
for i, backupID := range fm.backupTransports {
if backupID == id {
fm.backupTransports = append(fm.backupTransports[:i], fm.backupTransports[i+1:]...)
break
}
}
return nil
}
// GetActiveTransport returns the currently active transport
func (fm *FailoverManager) GetActiveTransport() (Transport, error) {
fm.mu.RLock()
defer fm.mu.RUnlock()
if fm.primaryTransport == "" {
return nil, fmt.Errorf("no active transport available")
}
transport, exists := fm.transports[fm.primaryTransport]
if !exists {
return nil, fmt.Errorf("primary transport not found: %s", fm.primaryTransport)
}
if transport.Status == StatusHealthy || transport.Status == StatusDegraded {
return transport.Transport, nil
}
// Try to failover to a backup
if err := fm.performFailover(); err != nil {
return nil, fmt.Errorf("failover failed: %w", err)
}
// Return new primary after failover
newPrimary := fm.transports[fm.primaryTransport]
return newPrimary.Transport, nil
}
// Send sends a message through the active transport with automatic failover
func (fm *FailoverManager) Send(ctx context.Context, msg *Message) error {
transport, err := fm.GetActiveTransport()
if err != nil {
return fmt.Errorf("no available transport: %w", err)
}
// Try to send through circuit breaker
return fm.circuitBreaker.Execute(func() error {
return transport.Send(ctx, msg)
})
}
// Receive receives messages from the active transport
func (fm *FailoverManager) Receive(ctx context.Context) (<-chan *Message, error) {
transport, err := fm.GetActiveTransport()
if err != nil {
return nil, fmt.Errorf("no available transport: %w", err)
}
return transport.Receive(ctx)
}
// ForceFailover manually triggers a failover to a specific transport
func (fm *FailoverManager) ForceFailover(targetTransportID string) error {
fm.mu.Lock()
defer fm.mu.Unlock()
target, exists := fm.transports[targetTransportID]
if !exists {
return fmt.Errorf("target transport not found: %s", targetTransportID)
}
if target.Status != StatusHealthy && target.Status != StatusDegraded {
return fmt.Errorf("target transport is not healthy: %s", target.Status)
}
return fm.switchPrimary(targetTransportID, "manual failover")
}
// GetTransportStatus returns the status of all transports
func (fm *FailoverManager) GetTransportStatus() map[string]TransportStatus {
fm.mu.RLock()
defer fm.mu.RUnlock()
status := make(map[string]TransportStatus)
for id, transport := range fm.transports {
status[id] = transport.Status
}
return status
}
// GetMetrics returns failover metrics
func (fm *FailoverManager) GetMetrics() FailoverMetrics {
fm.mu.RLock()
defer fm.mu.RUnlock()
return fm.metrics
}
// GetNotifications returns a channel for failover events
func (fm *FailoverManager) GetNotifications() <-chan FailoverEvent {
return fm.notifications
}
// SetHealthChecker sets a custom health checker
func (fm *FailoverManager) SetHealthChecker(checker HealthChecker) {
fm.mu.Lock()
defer fm.mu.Unlock()
fm.healthChecker = checker
}
// Stop gracefully stops the failover manager
func (fm *FailoverManager) Stop() error {
fm.cancel()
close(fm.notifications)
return nil
}
// Private methods
func (fm *FailoverManager) healthCheckLoop() {
ticker := time.NewTicker(fm.failoverPolicy.HealthCheckInterval)
defer ticker.Stop()
for {
select {
case <-fm.ctx.Done():
return
case <-ticker.C:
fm.performHealthChecks()
}
}
}
func (fm *FailoverManager) failoverMonitorLoop() {
for {
select {
case <-fm.ctx.Done():
return
default:
if fm.shouldPerformFailover() {
if err := fm.performFailover(); err != nil {
fm.metrics.HealthCheckFailures++
}
}
if fm.shouldPerformFailback() {
if err := fm.performFailback(); err != nil {
fm.metrics.HealthCheckFailures++
}
}
time.Sleep(time.Second) // Check every second
}
}
}
func (fm *FailoverManager) performHealthChecks() {
fm.mu.Lock()
defer fm.mu.Unlock()
for id, transport := range fm.transports {
healthy, err := fm.healthChecker.CheckHealth(fm.ctx, transport.Transport)
transport.LastHealthCheck = time.Now()
previousStatus := transport.Status
if err != nil || !healthy {
transport.FailureCount++
transport.LastFailure = time.Now()
if transport.FailureCount >= fm.failoverPolicy.FailureThreshold {
transport.Status = StatusUnhealthy
} else {
transport.Status = StatusDegraded
}
} else {
// Reset failure count on successful health check
transport.FailureCount = 0
transport.Status = StatusHealthy
}
// Notify status change
if previousStatus != transport.Status {
fm.notifyEvent(FailoverEvent{
Type: EventHealthCheck,
ToTransport: id,
Reason: fmt.Sprintf("status changed from %s to %s", previousStatus, transport.Status),
Timestamp: time.Now(),
Success: transport.Status == StatusHealthy,
})
}
}
}
func (fm *FailoverManager) shouldPerformFailover() bool {
fm.mu.RLock()
defer fm.mu.RUnlock()
if fm.primaryTransport == "" {
return false
}
primary := fm.transports[fm.primaryTransport]
return primary.Status == StatusUnhealthy
}
func (fm *FailoverManager) shouldPerformFailback() bool {
if !fm.failoverPolicy.AutoFailback {
return false
}
fm.mu.RLock()
defer fm.mu.RUnlock()
// Find the highest priority healthy transport
var highestPriority int
var highestPriorityID string
for id, transport := range fm.transports {
if transport.Status == StatusHealthy && transport.Priority > highestPriority {
highestPriority = transport.Priority
highestPriorityID = id
}
}
// Failback if there's a higher priority transport available
return highestPriorityID != "" && highestPriorityID != fm.primaryTransport
}
func (fm *FailoverManager) performFailover() error {
fm.mu.Lock()
defer fm.mu.Unlock()
// Find the best backup transport
var bestBackup string
var bestPriority int
for _, backupID := range fm.backupTransports {
backup := fm.transports[backupID]
if (backup.Status == StatusHealthy || backup.Status == StatusDegraded) && backup.Priority > bestPriority {
bestBackup = backupID
bestPriority = backup.Priority
}
}
if bestBackup == "" {
return fmt.Errorf("no healthy backup transport available")
}
return fm.switchPrimary(bestBackup, "automatic failover")
}
func (fm *FailoverManager) performFailback() error {
fm.mu.Lock()
defer fm.mu.Unlock()
// Find the highest priority healthy transport
var highestPriority int
var highestPriorityID string
for id, transport := range fm.transports {
if transport.Status == StatusHealthy && transport.Priority > highestPriority {
highestPriority = transport.Priority
highestPriorityID = id
}
}
if highestPriorityID == "" || highestPriorityID == fm.primaryTransport {
return nil // No failback needed
}
// Wait for failback delay
if time.Since(fm.metrics.LastFailover) < fm.failoverPolicy.FailbackDelay {
return nil
}
return fm.switchPrimary(highestPriorityID, "automatic failback")
}
func (fm *FailoverManager) switchPrimary(newPrimaryID, reason string) error {
start := time.Now()
oldPrimary := fm.primaryTransport
// Update primary and backup lists
fm.primaryTransport = newPrimaryID
// Rebuild backup list
fm.backupTransports = make([]string, 0)
for id := range fm.transports {
if id != newPrimaryID {
fm.backupTransports = append(fm.backupTransports, id)
}
}
// Update metrics
duration := time.Since(start)
if oldPrimary != newPrimaryID {
if reason == "automatic failback" {
fm.metrics.TotalFailbacks++
fm.metrics.LastFailback = time.Now()
} else {
fm.metrics.TotalFailovers++
fm.metrics.LastFailover = time.Now()
}
fm.metrics.FailoverDuration = duration
fm.metrics.CurrentTransport = newPrimaryID
}
// Notify
eventType := EventFailover
if reason == "automatic failback" {
eventType = EventFailback
}
fm.notifyEvent(FailoverEvent{
Type: eventType,
FromTransport: oldPrimary,
ToTransport: newPrimaryID,
Reason: reason,
Timestamp: time.Now(),
Success: true,
Duration: duration,
})
return nil
}
func (fm *FailoverManager) selectNewPrimary() {
var bestID string
var bestPriority int
for id, transport := range fm.transports {
if transport.Status == StatusHealthy && transport.Priority > bestPriority {
bestID = id
bestPriority = transport.Priority
}
}
fm.primaryTransport = bestID
}
func (fm *FailoverManager) notifyEvent(event FailoverEvent) {
select {
case fm.notifications <- event:
default:
// Channel full, drop event
}
}
// DefaultHealthChecker implements basic health checking
type DefaultHealthChecker struct{}
func NewDefaultHealthChecker() *DefaultHealthChecker {
return &DefaultHealthChecker{}
}
func (dhc *DefaultHealthChecker) CheckHealth(ctx context.Context, transport Transport) (bool, error) {
health := transport.Health()
return health.Status == "healthy", nil
}
func (dhc *DefaultHealthChecker) GetHealthScore(transport Transport) float64 {
health := transport.Health()
switch health.Status {
case "healthy":
return 1.0
case "degraded":
return 0.5
default:
return 0.0
}
}
// CircuitBreaker implements circuit breaker pattern for transport operations
type CircuitBreaker struct {
config CircuitBreakerConfig
state CircuitBreakerState
failureCount int
lastFailure time.Time
mu sync.Mutex
}
type CircuitBreakerConfig struct {
FailureThreshold int
RecoveryTimeout time.Duration
MaxRetries int
}
type CircuitBreakerState string
const (
StateClosed CircuitBreakerState = "closed"
StateOpen CircuitBreakerState = "open"
StateHalfOpen CircuitBreakerState = "half_open"
)
func NewCircuitBreaker(config CircuitBreakerConfig) *CircuitBreaker {
return &CircuitBreaker{
config: config,
state: StateClosed,
}
}
func (cb *CircuitBreaker) Execute(operation func() error) error {
cb.mu.Lock()
defer cb.mu.Unlock()
if cb.state == StateOpen {
if time.Since(cb.lastFailure) < cb.config.RecoveryTimeout {
return fmt.Errorf("circuit breaker is open")
}
cb.state = StateHalfOpen
}
err := operation()
if err != nil {
cb.onFailure()
return err
}
cb.onSuccess()
return nil
}
func (cb *CircuitBreaker) onFailure() {
cb.failureCount++
cb.lastFailure = time.Now()
if cb.failureCount >= cb.config.FailureThreshold {
cb.state = StateOpen
}
}
func (cb *CircuitBreaker) onSuccess() {
cb.failureCount = 0
cb.state = StateClosed
}
func (cb *CircuitBreaker) GetState() CircuitBreakerState {
cb.mu.Lock()
defer cb.mu.Unlock()
return cb.state
}