feat: create v2-prep branch with comprehensive planning
Restructured project for V2 refactor: **Structure Changes:** - Moved all V1 code to orig/ folder (preserved with git mv) - Created docs/planning/ directory - Added orig/README_V1.md explaining V1 preservation **Planning Documents:** - 00_V2_MASTER_PLAN.md: Complete architecture overview - Executive summary of critical V1 issues - High-level component architecture diagrams - 5-phase implementation roadmap - Success metrics and risk mitigation - 07_TASK_BREAKDOWN.md: Atomic task breakdown - 99+ hours of detailed tasks - Every task < 2 hours (atomic) - Clear dependencies and success criteria - Organized by implementation phase **V2 Key Improvements:** - Per-exchange parsers (factory pattern) - Multi-layer strict validation - Multi-index pool cache - Background validation pipeline - Comprehensive observability **Critical Issues Addressed:** - Zero address tokens (strict validation + cache enrichment) - Parsing accuracy (protocol-specific parsers) - No audit trail (background validation channel) - Inefficient lookups (multi-index cache) - Stats disconnection (event-driven metrics) Next Steps: 1. Review planning documents 2. Begin Phase 1: Foundation (P1-001 through P1-010) 3. Implement parsers in Phase 2 4. Build cache system in Phase 3 5. Add validation pipeline in Phase 4 6. Migrate and test in Phase 5 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude <noreply@anthropic.com>
This commit is contained in:
494
orig/internal/ratelimit/adaptive.go
Normal file
494
orig/internal/ratelimit/adaptive.go
Normal file
@@ -0,0 +1,494 @@
|
||||
package ratelimit
|
||||
|
||||
import (
|
||||
"context"
|
||||
"fmt"
|
||||
"sync"
|
||||
"sync/atomic"
|
||||
"time"
|
||||
|
||||
"golang.org/x/time/rate"
|
||||
|
||||
"github.com/fraktal/mev-beta/internal/config"
|
||||
"github.com/fraktal/mev-beta/internal/logger"
|
||||
)
|
||||
|
||||
// AdaptiveRateLimiter implements adaptive rate limiting that adjusts to endpoint capacity
|
||||
type AdaptiveRateLimiter struct {
|
||||
endpoints map[string]*AdaptiveEndpoint
|
||||
mu sync.RWMutex
|
||||
logger *logger.Logger
|
||||
defaultConfig config.RateLimitConfig
|
||||
adjustInterval time.Duration
|
||||
stopChan chan struct{}
|
||||
}
|
||||
|
||||
// AdaptiveEndpoint represents an endpoint with adaptive rate limiting
|
||||
type AdaptiveEndpoint struct {
|
||||
URL string
|
||||
limiter *rate.Limiter
|
||||
config config.RateLimitConfig
|
||||
circuitBreaker *CircuitBreaker
|
||||
metrics *EndpointMetrics
|
||||
healthChecker *HealthChecker
|
||||
lastAdjustment time.Time
|
||||
consecutiveErrors int64
|
||||
consecutiveSuccess int64
|
||||
}
|
||||
|
||||
// EndpointMetrics tracks performance metrics for an endpoint
|
||||
// All fields must be 64-bit aligned for atomic access
|
||||
type EndpointMetrics struct {
|
||||
TotalRequests int64
|
||||
SuccessfulRequests int64
|
||||
FailedRequests int64
|
||||
TotalLatency int64 // nanoseconds
|
||||
LastRequestTime int64 // unix timestamp
|
||||
// Non-atomic fields - must be protected by mutex when accessed
|
||||
mu sync.RWMutex
|
||||
SuccessRate float64
|
||||
AverageLatency float64 // milliseconds
|
||||
}
|
||||
|
||||
// CircuitBreaker implements circuit breaker pattern for failed endpoints
|
||||
type CircuitBreaker struct {
|
||||
state int32 // 0: Closed, 1: Open, 2: HalfOpen
|
||||
failureCount int64
|
||||
lastFailTime int64
|
||||
threshold int64
|
||||
timeout time.Duration // How long to wait before trying again
|
||||
testRequests int64 // Number of test requests in half-open state
|
||||
}
|
||||
|
||||
// Circuit breaker states
|
||||
const (
|
||||
CircuitClosed = 0
|
||||
CircuitOpen = 1
|
||||
CircuitHalfOpen = 2
|
||||
)
|
||||
|
||||
// HealthChecker monitors endpoint health
|
||||
type HealthChecker struct {
|
||||
endpoint string
|
||||
interval time.Duration
|
||||
timeout time.Duration
|
||||
isHealthy int64 // atomic bool
|
||||
lastCheck int64 // unix timestamp
|
||||
stopChan chan struct{}
|
||||
}
|
||||
|
||||
// NewAdaptiveRateLimiter creates a new adaptive rate limiter
|
||||
func NewAdaptiveRateLimiter(cfg *config.ArbitrumConfig, logger *logger.Logger) *AdaptiveRateLimiter {
|
||||
arl := &AdaptiveRateLimiter{
|
||||
endpoints: make(map[string]*AdaptiveEndpoint),
|
||||
logger: logger,
|
||||
defaultConfig: cfg.RateLimit,
|
||||
adjustInterval: 30 * time.Second,
|
||||
stopChan: make(chan struct{}),
|
||||
}
|
||||
|
||||
// Create adaptive endpoint for primary endpoint
|
||||
arl.addEndpoint(cfg.RPCEndpoint, cfg.RateLimit)
|
||||
|
||||
// Create adaptive endpoints for reading endpoints
|
||||
for _, endpoint := range cfg.ReadingEndpoints {
|
||||
arl.addEndpoint(endpoint.URL, endpoint.RateLimit)
|
||||
}
|
||||
|
||||
// Create adaptive endpoints for execution endpoints
|
||||
for _, endpoint := range cfg.ExecutionEndpoints {
|
||||
arl.addEndpoint(endpoint.URL, endpoint.RateLimit)
|
||||
}
|
||||
|
||||
// Start background adjustment routine
|
||||
go arl.adjustmentLoop()
|
||||
|
||||
return arl
|
||||
}
|
||||
|
||||
// addEndpoint adds a new adaptive endpoint
|
||||
func (arl *AdaptiveRateLimiter) addEndpoint(url string, config config.RateLimitConfig) {
|
||||
endpoint := &AdaptiveEndpoint{
|
||||
URL: url,
|
||||
limiter: rate.NewLimiter(rate.Limit(config.RequestsPerSecond), config.Burst),
|
||||
config: config,
|
||||
circuitBreaker: &CircuitBreaker{
|
||||
threshold: 10, // Break after 10 consecutive failures
|
||||
timeout: 60 * time.Second,
|
||||
},
|
||||
metrics: &EndpointMetrics{},
|
||||
healthChecker: &HealthChecker{
|
||||
endpoint: url,
|
||||
interval: 30 * time.Second,
|
||||
timeout: 5 * time.Second,
|
||||
isHealthy: 1, // Start assuming healthy
|
||||
stopChan: make(chan struct{}),
|
||||
},
|
||||
}
|
||||
|
||||
arl.mu.Lock()
|
||||
arl.endpoints[url] = endpoint
|
||||
arl.mu.Unlock()
|
||||
|
||||
// Start health checker for this endpoint
|
||||
go endpoint.healthChecker.start()
|
||||
|
||||
arl.logger.Info(fmt.Sprintf("Added adaptive rate limiter for endpoint: %s", url))
|
||||
}
|
||||
|
||||
// WaitForBestEndpoint waits for the best available endpoint
|
||||
func (arl *AdaptiveRateLimiter) WaitForBestEndpoint(ctx context.Context) (string, error) {
|
||||
// Find the best available endpoint
|
||||
bestEndpoint := arl.getBestEndpoint()
|
||||
if bestEndpoint == "" {
|
||||
return "", fmt.Errorf("no healthy endpoints available")
|
||||
}
|
||||
|
||||
// Wait for rate limiter
|
||||
arl.mu.RLock()
|
||||
endpoint := arl.endpoints[bestEndpoint]
|
||||
arl.mu.RUnlock()
|
||||
|
||||
if endpoint == nil {
|
||||
return "", fmt.Errorf("endpoint not found: %s", bestEndpoint)
|
||||
}
|
||||
|
||||
// Check circuit breaker
|
||||
if !endpoint.circuitBreaker.canExecute() {
|
||||
return "", fmt.Errorf("circuit breaker open for endpoint: %s", bestEndpoint)
|
||||
}
|
||||
|
||||
// Wait for rate limiter
|
||||
err := endpoint.limiter.Wait(ctx)
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
|
||||
return bestEndpoint, nil
|
||||
}
|
||||
|
||||
// RecordResult records the result of a request for adaptive adjustment
|
||||
func (arl *AdaptiveRateLimiter) RecordResult(endpointURL string, success bool, latency time.Duration) {
|
||||
arl.mu.RLock()
|
||||
endpoint, exists := arl.endpoints[endpointURL]
|
||||
arl.mu.RUnlock()
|
||||
|
||||
if !exists {
|
||||
return
|
||||
}
|
||||
|
||||
// Update metrics atomically
|
||||
atomic.AddInt64(&endpoint.metrics.TotalRequests, 1)
|
||||
atomic.AddInt64(&endpoint.metrics.TotalLatency, latency.Nanoseconds())
|
||||
atomic.StoreInt64(&endpoint.metrics.LastRequestTime, time.Now().Unix())
|
||||
|
||||
if success {
|
||||
atomic.AddInt64(&endpoint.metrics.SuccessfulRequests, 1)
|
||||
atomic.AddInt64(&endpoint.consecutiveSuccess, 1)
|
||||
atomic.StoreInt64(&endpoint.consecutiveErrors, 0)
|
||||
endpoint.circuitBreaker.recordSuccess()
|
||||
} else {
|
||||
atomic.AddInt64(&endpoint.metrics.FailedRequests, 1)
|
||||
atomic.AddInt64(&endpoint.consecutiveErrors, 1)
|
||||
atomic.StoreInt64(&endpoint.consecutiveSuccess, 0)
|
||||
endpoint.circuitBreaker.recordFailure()
|
||||
}
|
||||
|
||||
// Update calculated metrics
|
||||
arl.updateCalculatedMetrics(endpoint)
|
||||
}
|
||||
|
||||
// updateCalculatedMetrics updates derived metrics
|
||||
func (arl *AdaptiveRateLimiter) updateCalculatedMetrics(endpoint *AdaptiveEndpoint) {
|
||||
totalReq := atomic.LoadInt64(&endpoint.metrics.TotalRequests)
|
||||
successReq := atomic.LoadInt64(&endpoint.metrics.SuccessfulRequests)
|
||||
totalLatency := atomic.LoadInt64(&endpoint.metrics.TotalLatency)
|
||||
|
||||
if totalReq > 0 {
|
||||
endpoint.metrics.SuccessRate = float64(successReq) / float64(totalReq)
|
||||
endpoint.metrics.AverageLatency = float64(totalLatency) / float64(totalReq) / 1000000 // Convert to milliseconds
|
||||
}
|
||||
}
|
||||
|
||||
// getBestEndpoint selects the best available endpoint based on metrics
|
||||
func (arl *AdaptiveRateLimiter) getBestEndpoint() string {
|
||||
arl.mu.RLock()
|
||||
defer arl.mu.RUnlock()
|
||||
|
||||
bestEndpoint := ""
|
||||
bestScore := float64(-1)
|
||||
|
||||
for url, endpoint := range arl.endpoints {
|
||||
// Skip unhealthy endpoints
|
||||
if atomic.LoadInt64(&endpoint.healthChecker.isHealthy) == 0 {
|
||||
continue
|
||||
}
|
||||
|
||||
// Skip if circuit breaker is open
|
||||
if !endpoint.circuitBreaker.canExecute() {
|
||||
continue
|
||||
}
|
||||
|
||||
// Calculate score based on success rate, latency, and current load
|
||||
score := arl.calculateEndpointScore(endpoint)
|
||||
if score > bestScore {
|
||||
bestScore = score
|
||||
bestEndpoint = url
|
||||
}
|
||||
}
|
||||
|
||||
return bestEndpoint
|
||||
}
|
||||
|
||||
// updateDerivedMetrics safely updates calculated metrics with proper synchronization
|
||||
func (em *EndpointMetrics) updateDerivedMetrics() {
|
||||
totalRequests := atomic.LoadInt64(&em.TotalRequests)
|
||||
successfulRequests := atomic.LoadInt64(&em.SuccessfulRequests)
|
||||
totalLatency := atomic.LoadInt64(&em.TotalLatency)
|
||||
|
||||
em.mu.Lock()
|
||||
defer em.mu.Unlock()
|
||||
|
||||
// Calculate success rate
|
||||
if totalRequests > 0 {
|
||||
em.SuccessRate = float64(successfulRequests) / float64(totalRequests)
|
||||
} else {
|
||||
em.SuccessRate = 0.0
|
||||
}
|
||||
|
||||
// Calculate average latency in milliseconds
|
||||
if totalRequests > 0 {
|
||||
em.AverageLatency = float64(totalLatency) / float64(totalRequests) / 1e6 // ns to ms
|
||||
} else {
|
||||
em.AverageLatency = 0.0
|
||||
}
|
||||
}
|
||||
|
||||
// getCalculatedMetrics safely returns derived metrics
|
||||
func (em *EndpointMetrics) getCalculatedMetrics() (float64, float64) {
|
||||
em.mu.RLock()
|
||||
defer em.mu.RUnlock()
|
||||
return em.SuccessRate, em.AverageLatency
|
||||
}
|
||||
|
||||
// calculateEndpointScore calculates a score for endpoint selection
|
||||
func (arl *AdaptiveRateLimiter) calculateEndpointScore(endpoint *AdaptiveEndpoint) float64 {
|
||||
// Base score on success rate (0-1)
|
||||
successWeight := 0.6
|
||||
latencyWeight := 0.3
|
||||
loadWeight := 0.1
|
||||
|
||||
// Update derived metrics first
|
||||
endpoint.metrics.updateDerivedMetrics()
|
||||
|
||||
// Get calculated metrics safely
|
||||
successScore, avgLatency := endpoint.metrics.getCalculatedMetrics()
|
||||
|
||||
// Invert latency score (lower latency = higher score)
|
||||
latencyScore := 1.0
|
||||
if avgLatency > 0 {
|
||||
// Normalize latency score (assuming 1000ms is poor, 100ms is good)
|
||||
latencyScore = 1.0 - (avgLatency / 1000.0)
|
||||
if latencyScore < 0 {
|
||||
latencyScore = 0
|
||||
}
|
||||
}
|
||||
|
||||
// Load score based on current rate limiter state
|
||||
loadScore := 1.0 // Simplified - could check current tokens in limiter
|
||||
|
||||
return successScore*successWeight + latencyScore*latencyWeight + loadScore*loadWeight
|
||||
}
|
||||
|
||||
// adjustmentLoop runs periodic adjustments to rate limits
|
||||
func (arl *AdaptiveRateLimiter) adjustmentLoop() {
|
||||
ticker := time.NewTicker(arl.adjustInterval)
|
||||
defer ticker.Stop()
|
||||
|
||||
for {
|
||||
select {
|
||||
case <-ticker.C:
|
||||
arl.adjustRateLimits()
|
||||
case <-arl.stopChan:
|
||||
return
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// adjustRateLimits adjusts rate limits based on observed performance
|
||||
func (arl *AdaptiveRateLimiter) adjustRateLimits() {
|
||||
arl.mu.Lock()
|
||||
defer arl.mu.Unlock()
|
||||
|
||||
for url, endpoint := range arl.endpoints {
|
||||
arl.adjustEndpointRateLimit(url, endpoint)
|
||||
}
|
||||
}
|
||||
|
||||
// adjustEndpointRateLimit adjusts rate limit for a specific endpoint
|
||||
func (arl *AdaptiveRateLimiter) adjustEndpointRateLimit(url string, endpoint *AdaptiveEndpoint) {
|
||||
// Don't adjust too frequently
|
||||
if time.Since(endpoint.lastAdjustment) < arl.adjustInterval {
|
||||
return
|
||||
}
|
||||
|
||||
successRate := endpoint.metrics.SuccessRate
|
||||
avgLatency := endpoint.metrics.AverageLatency
|
||||
currentLimit := float64(endpoint.limiter.Limit())
|
||||
|
||||
var newLimit float64 = currentLimit
|
||||
adjustmentFactor := 0.1 // 10% adjustment
|
||||
|
||||
// Increase rate if performing well
|
||||
if successRate > 0.95 && avgLatency < 500 { // 95% success, < 500ms latency
|
||||
newLimit = currentLimit * (1.0 + adjustmentFactor)
|
||||
} else if successRate < 0.8 || avgLatency > 2000 { // < 80% success or > 2s latency
|
||||
newLimit = currentLimit * (1.0 - adjustmentFactor)
|
||||
}
|
||||
|
||||
// Apply bounds
|
||||
minLimit := float64(arl.defaultConfig.RequestsPerSecond) * 0.1 // 10% of default minimum
|
||||
maxLimit := float64(arl.defaultConfig.RequestsPerSecond) * 3.0 // 300% of default maximum
|
||||
|
||||
if newLimit < minLimit {
|
||||
newLimit = minLimit
|
||||
}
|
||||
if newLimit > maxLimit {
|
||||
newLimit = maxLimit
|
||||
}
|
||||
|
||||
// Update if changed significantly
|
||||
if abs(newLimit-currentLimit)/currentLimit > 0.05 { // 5% change threshold
|
||||
endpoint.limiter.SetLimit(rate.Limit(newLimit))
|
||||
endpoint.lastAdjustment = time.Now()
|
||||
|
||||
arl.logger.Info(fmt.Sprintf("Adjusted rate limit for %s: %.2f -> %.2f (success: %.2f%%, latency: %.2fms)",
|
||||
url, currentLimit, newLimit, successRate*100, avgLatency))
|
||||
}
|
||||
}
|
||||
|
||||
// abs returns absolute value of float64
|
||||
func abs(x float64) float64 {
|
||||
if x < 0 {
|
||||
return -x
|
||||
}
|
||||
return x
|
||||
}
|
||||
|
||||
// canExecute checks if circuit breaker allows execution
|
||||
func (cb *CircuitBreaker) canExecute() bool {
|
||||
state := atomic.LoadInt32(&cb.state)
|
||||
now := time.Now().Unix()
|
||||
|
||||
switch state {
|
||||
case CircuitClosed:
|
||||
return true
|
||||
case CircuitOpen:
|
||||
// Check if timeout has passed
|
||||
lastFail := atomic.LoadInt64(&cb.lastFailTime)
|
||||
if now-lastFail > int64(cb.timeout.Seconds()) {
|
||||
// Try to move to half-open
|
||||
if atomic.CompareAndSwapInt32(&cb.state, CircuitOpen, CircuitHalfOpen) {
|
||||
atomic.StoreInt64(&cb.testRequests, 0)
|
||||
return true
|
||||
}
|
||||
}
|
||||
return false
|
||||
case CircuitHalfOpen:
|
||||
// Allow limited test requests
|
||||
testReq := atomic.LoadInt64(&cb.testRequests)
|
||||
if testReq < 3 { // Allow up to 3 test requests
|
||||
atomic.AddInt64(&cb.testRequests, 1)
|
||||
return true
|
||||
}
|
||||
return false
|
||||
}
|
||||
return false
|
||||
}
|
||||
|
||||
// recordSuccess records a successful request
|
||||
func (cb *CircuitBreaker) recordSuccess() {
|
||||
state := atomic.LoadInt32(&cb.state)
|
||||
if state == CircuitHalfOpen {
|
||||
// Move back to closed after successful test
|
||||
atomic.StoreInt32(&cb.state, CircuitClosed)
|
||||
atomic.StoreInt64(&cb.failureCount, 0)
|
||||
}
|
||||
}
|
||||
|
||||
// recordFailure records a failed request
|
||||
func (cb *CircuitBreaker) recordFailure() {
|
||||
failures := atomic.AddInt64(&cb.failureCount, 1)
|
||||
atomic.StoreInt64(&cb.lastFailTime, time.Now().Unix())
|
||||
|
||||
if failures >= cb.threshold {
|
||||
atomic.StoreInt32(&cb.state, CircuitOpen)
|
||||
}
|
||||
}
|
||||
|
||||
// start starts the health checker
|
||||
func (hc *HealthChecker) start() {
|
||||
ticker := time.NewTicker(hc.interval)
|
||||
defer ticker.Stop()
|
||||
|
||||
for {
|
||||
select {
|
||||
case <-ticker.C:
|
||||
hc.checkHealth()
|
||||
case <-hc.stopChan:
|
||||
return
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// checkHealth performs a health check on the endpoint
|
||||
func (hc *HealthChecker) checkHealth() {
|
||||
ctx, cancel := context.WithTimeout(context.Background(), hc.timeout)
|
||||
defer cancel()
|
||||
|
||||
// Simple health check - try to connect
|
||||
// In production, this might make a simple RPC call
|
||||
healthy := hc.performHealthCheck(ctx)
|
||||
|
||||
if healthy {
|
||||
atomic.StoreInt64(&hc.isHealthy, 1)
|
||||
} else {
|
||||
atomic.StoreInt64(&hc.isHealthy, 0)
|
||||
}
|
||||
|
||||
atomic.StoreInt64(&hc.lastCheck, time.Now().Unix())
|
||||
}
|
||||
|
||||
// performHealthCheck performs the actual health check
|
||||
func (hc *HealthChecker) performHealthCheck(ctx context.Context) bool {
|
||||
// Simplified health check - in production would make actual RPC call
|
||||
// For now, just simulate based on endpoint availability
|
||||
return true // Assume healthy for demo
|
||||
}
|
||||
|
||||
// Stop stops the adaptive rate limiter
|
||||
func (arl *AdaptiveRateLimiter) Stop() {
|
||||
close(arl.stopChan)
|
||||
|
||||
// Stop all health checkers
|
||||
arl.mu.RLock()
|
||||
for _, endpoint := range arl.endpoints {
|
||||
close(endpoint.healthChecker.stopChan)
|
||||
}
|
||||
arl.mu.RUnlock()
|
||||
}
|
||||
|
||||
// GetMetrics returns current metrics for all endpoints
|
||||
func (arl *AdaptiveRateLimiter) GetMetrics() map[string]*EndpointMetrics {
|
||||
arl.mu.RLock()
|
||||
defer arl.mu.RUnlock()
|
||||
|
||||
metrics := make(map[string]*EndpointMetrics)
|
||||
for url, endpoint := range arl.endpoints {
|
||||
// Update calculated metrics before returning
|
||||
arl.updateCalculatedMetrics(endpoint)
|
||||
metrics[url] = endpoint.metrics
|
||||
}
|
||||
|
||||
return metrics
|
||||
}
|
||||
Reference in New Issue
Block a user