feat: create v2-prep branch with comprehensive planning
Restructured project for V2 refactor.

**Structure Changes:**
- Moved all V1 code to orig/ folder (preserved with git mv)
- Created docs/planning/ directory
- Added orig/README_V1.md explaining V1 preservation

**Planning Documents:**
- 00_V2_MASTER_PLAN.md: Complete architecture overview
  - Executive summary of critical V1 issues
  - High-level component architecture diagrams
  - 5-phase implementation roadmap
  - Success metrics and risk mitigation
- 07_TASK_BREAKDOWN.md: Atomic task breakdown
  - 99+ hours of detailed tasks
  - Every task < 2 hours (atomic)
  - Clear dependencies and success criteria
  - Organized by implementation phase

**V2 Key Improvements:**
- Per-exchange parsers (factory pattern)
- Multi-layer strict validation
- Multi-index pool cache
- Background validation pipeline
- Comprehensive observability

**Critical Issues Addressed:**
- Zero address tokens (strict validation + cache enrichment)
- Parsing accuracy (protocol-specific parsers)
- No audit trail (background validation channel)
- Inefficient lookups (multi-index cache)
- Stats disconnection (event-driven metrics)

Next Steps:
1. Review planning documents
2. Begin Phase 1: Foundation (P1-001 through P1-010)
3. Implement parsers in Phase 2
4. Build cache system in Phase 3
5. Add validation pipeline in Phase 4
6. Migrate and test in Phase 5

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
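The "per-exchange parsers (factory pattern)" item above is only described at a high level; the planning documents themselves are not part of this diff. A minimal sketch of the idea follows — every package, type, and function name below is hypothetical, not taken from the repository:

```go
package parsers

import "fmt"

// RawLog and SwapEvent are placeholder types; the real V2 event model is
// defined in the planning documents, not in this commit.
type RawLog struct {
	Address string
	Topics  []string
	Data    []byte
}

type SwapEvent struct {
	Pool, Token0, Token1 string
}

// Parser decodes one protocol's logs into normalized swap events.
type Parser interface {
	Parse(log RawLog) (*SwapEvent, error)
}

// Factory maps an exchange identifier to its registered protocol-specific parser.
type Factory struct {
	parsers map[string]Parser
}

func NewFactory() *Factory {
	return &Factory{parsers: make(map[string]Parser)}
}

func (f *Factory) Register(exchange string, p Parser) {
	f.parsers[exchange] = p
}

func (f *Factory) ParserFor(exchange string) (Parser, error) {
	p, ok := f.parsers[exchange]
	if !ok {
		return nil, fmt.Errorf("no parser registered for exchange %q", exchange)
	}
	return p, nil
}
```

Keeping each exchange's decoding quirks behind a single interface is what lets the hot path stay uniform while parsers are added per protocol.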
orig/internal/monitoring/health_checker.go (new file, 447 lines)
@@ -0,0 +1,447 @@
package monitoring

import (
	"context"
	"fmt"
	"sync"
	"time"

	"github.com/fraktal/mev-beta/internal/logger"
)

// HealthCheckRunner performs periodic health checks and monitoring
type HealthCheckRunner struct {
	mu                    sync.RWMutex
	logger                *logger.Logger
	integrityMonitor      *IntegrityMonitor
	checkInterval         time.Duration
	running               bool
	stopChan              chan struct{}
	lastHealthCheck       time.Time
	healthHistory         []HealthSnapshot
	maxHistorySize        int
	warmupSamples         int
	minAddressesForAlerts int64
}

// HealthSnapshot represents a point-in-time health snapshot
type HealthSnapshot struct {
	Timestamp           time.Time
	HealthScore         float64
	CorruptionRate      float64
	ValidationSuccess   float64
	ContractCallSuccess float64
	ActiveAlerts        int
	Trend               HealthTrend
}

// HealthTrend indicates the direction of health metrics
type HealthTrend int

const (
	HealthTrendUnknown HealthTrend = iota
	HealthTrendImproving
	HealthTrendStable
	HealthTrendDeclining
	HealthTrendCritical
)

func (t HealthTrend) String() string {
	switch t {
	case HealthTrendImproving:
		return "IMPROVING"
	case HealthTrendStable:
		return "STABLE"
	case HealthTrendDeclining:
		return "DECLINING"
	case HealthTrendCritical:
		return "CRITICAL"
	default:
		return "UNKNOWN"
	}
}

// NewHealthCheckRunner creates a new health check runner
func NewHealthCheckRunner(logger *logger.Logger, integrityMonitor *IntegrityMonitor) *HealthCheckRunner {
	return &HealthCheckRunner{
		logger:                logger,
		integrityMonitor:      integrityMonitor,
		checkInterval:         30 * time.Second, // Default 30 second intervals
		stopChan:              make(chan struct{}),
		healthHistory:         make([]HealthSnapshot, 0),
		maxHistorySize:        100, // Keep last 100 snapshots (50 minutes at 30s intervals)
		warmupSamples:         3,
		minAddressesForAlerts: 25,
	}
}

// Start begins the periodic health checking routine
func (hcr *HealthCheckRunner) Start(ctx context.Context) {
	hcr.mu.Lock()
	if hcr.running {
		hcr.mu.Unlock()
		return
	}
	hcr.running = true
	hcr.mu.Unlock()

	hcr.logger.Info("Starting health check runner",
		"interval", hcr.checkInterval)

	go hcr.healthCheckLoop(ctx)
}

// Stop stops the health checking routine
func (hcr *HealthCheckRunner) Stop() {
	hcr.mu.Lock()
	defer hcr.mu.Unlock()

	if !hcr.running {
		return
	}

	hcr.running = false
	close(hcr.stopChan)
	hcr.logger.Info("Health check runner stopped")
}

// healthCheckLoop runs the periodic health checking
func (hcr *HealthCheckRunner) healthCheckLoop(ctx context.Context) {
	ticker := time.NewTicker(hcr.checkInterval)
	defer ticker.Stop()

	// Perform initial health check
	hcr.performHealthCheck()

	for {
		select {
		case <-ctx.Done():
			hcr.logger.Info("Health check runner stopped due to context cancellation")
			return
		case <-hcr.stopChan:
			hcr.logger.Info("Health check runner stopped")
			return
		case <-ticker.C:
			hcr.performHealthCheck()
		}
	}
}

// performHealthCheck executes a comprehensive health check
func (hcr *HealthCheckRunner) performHealthCheck() {
	start := time.Now()
	// Record the check time under the lock so concurrent readers
	// (e.g. GetHealthSummary) see a consistent value.
	hcr.mu.Lock()
	hcr.lastHealthCheck = start
	hcr.mu.Unlock()

	if !hcr.integrityMonitor.IsEnabled() {
		hcr.logger.Debug("Skipping health check - integrity monitor disabled")
		return
	}

	// Get current metrics
	metrics := hcr.integrityMonitor.GetMetrics()
	healthSummary := hcr.integrityMonitor.GetHealthSummary()

	// Calculate rates
	corruptionRate := 0.0
	if metrics.TotalAddressesProcessed > 0 {
		corruptionRate = float64(metrics.CorruptAddressesDetected) / float64(metrics.TotalAddressesProcessed)
	}

	validationSuccessRate := 0.0
	totalValidations := metrics.AddressValidationPassed + metrics.AddressValidationFailed
	if totalValidations > 0 {
		validationSuccessRate = float64(metrics.AddressValidationPassed) / float64(totalValidations)
	}

	contractCallSuccessRate := 0.0
	totalCalls := metrics.ContractCallsSucceeded + metrics.ContractCallsFailed
	if totalCalls > 0 {
		contractCallSuccessRate = float64(metrics.ContractCallsSucceeded) / float64(totalCalls)
	}

	// Create health snapshot
	snapshot := HealthSnapshot{
		Timestamp:           start,
		HealthScore:         metrics.HealthScore,
		CorruptionRate:      corruptionRate,
		ValidationSuccess:   validationSuccessRate,
		ContractCallSuccess: contractCallSuccessRate,
		ActiveAlerts:        0, // Will be calculated based on current conditions
		Trend:               hcr.calculateHealthTrend(metrics.HealthScore),
	}

	// Add to history
	hcr.addHealthSnapshot(snapshot)

	// Check for threshold violations and generate alerts
	hcr.checkThresholds(healthSummary, snapshot)

	// Log health status periodically
	hcr.logHealthStatus(snapshot, time.Since(start))
}

// addHealthSnapshot adds a snapshot to the health history
func (hcr *HealthCheckRunner) addHealthSnapshot(snapshot HealthSnapshot) {
	hcr.mu.Lock()
	defer hcr.mu.Unlock()

	hcr.healthHistory = append(hcr.healthHistory, snapshot)

	// Trim history if it exceeds max size
	if len(hcr.healthHistory) > hcr.maxHistorySize {
		hcr.healthHistory = hcr.healthHistory[len(hcr.healthHistory)-hcr.maxHistorySize:]
	}
}

// calculateHealthTrend analyzes recent health scores to determine trend
func (hcr *HealthCheckRunner) calculateHealthTrend(currentScore float64) HealthTrend {
	hcr.mu.RLock()
	defer hcr.mu.RUnlock()

	if len(hcr.healthHistory) < 3 {
		return HealthTrendUnknown
	}

	// Get last few scores for trend analysis
	recentScores := make([]float64, 0, 5)
	start := len(hcr.healthHistory) - 5
	if start < 0 {
		start = 0
	}

	for i := start; i < len(hcr.healthHistory); i++ {
		recentScores = append(recentScores, hcr.healthHistory[i].HealthScore)
	}
	recentScores = append(recentScores, currentScore)

	// Calculate trend
	if currentScore < 0.5 {
		return HealthTrendCritical
	}

	// Simple linear trend calculation
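	// Illustrative numbers (not from the source): a window that moves from
	// 0.80 to 0.88 has diff = +0.08 (> 0.05) and reports IMPROVING; 0.90 down
	// to 0.82 gives -0.08 and reports DECLINING; anything within ±0.05 is STABLE.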
	if len(recentScores) >= 3 {
		first := recentScores[0]
		last := recentScores[len(recentScores)-1]
		diff := last - first

		if diff > 0.05 {
			return HealthTrendImproving
		} else if diff < -0.05 {
			return HealthTrendDeclining
		} else {
			return HealthTrendStable
		}
	}

	return HealthTrendUnknown
}

// checkThresholds checks for threshold violations and generates alerts
func (hcr *HealthCheckRunner) checkThresholds(healthSummary map[string]interface{}, snapshot HealthSnapshot) {
	if !hcr.readyForAlerts(healthSummary, snapshot) {
		hcr.logger.Debug("Health alerts suppressed during warm-up",
			"health_score", snapshot.HealthScore,
			"total_addresses_processed", safeNumericLookup(healthSummary, "total_addresses_processed"),
			"history_size", hcr.historySize())
		return
	}

	// Critical health score alert
	if snapshot.HealthScore < 0.5 {
		alert := CorruptionAlert{
			Timestamp: time.Now(),
			Severity:  AlertSeverityEmergency,
			Message:   fmt.Sprintf("CRITICAL: System health score is %.2f (below 0.5)", snapshot.HealthScore),
			Context: map[string]interface{}{
				"health_score":          snapshot.HealthScore,
				"corruption_rate":       snapshot.CorruptionRate,
				"validation_success":    snapshot.ValidationSuccess,
				"contract_call_success": snapshot.ContractCallSuccess,
				"trend":                 snapshot.Trend.String(),
			},
		}
		hcr.integrityMonitor.sendAlert(alert)
	}

	// High corruption rate alert
	if snapshot.CorruptionRate > 0.10 { // 10% corruption rate
		alert := CorruptionAlert{
			Timestamp: time.Now(),
			Severity:  AlertSeverityCritical,
			Message:   fmt.Sprintf("High corruption rate detected: %.2f%%", snapshot.CorruptionRate*100),
			Context: map[string]interface{}{
				"corruption_rate":    snapshot.CorruptionRate,
				"threshold":          0.10,
				"addresses_affected": snapshot.CorruptionRate,
			},
		}
		hcr.integrityMonitor.sendAlert(alert)
	}

	// Declining trend alert
	if snapshot.Trend == HealthTrendDeclining || snapshot.Trend == HealthTrendCritical {
		alert := CorruptionAlert{
			Timestamp: time.Now(),
			Severity:  AlertSeverityWarning,
			Message:   fmt.Sprintf("System health trend is %s (current score: %.2f)", snapshot.Trend.String(), snapshot.HealthScore),
			Context: map[string]interface{}{
				"trend":            snapshot.Trend.String(),
				"health_score":     snapshot.HealthScore,
				"recent_snapshots": hcr.getRecentSnapshots(5),
			},
		}
		hcr.integrityMonitor.sendAlert(alert)
	}
}

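// readyForAlerts reports whether enough history and traffic have accumulated
// for threshold alerts to be meaningful; it suppresses alerting during warm-up.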
func (hcr *HealthCheckRunner) readyForAlerts(healthSummary map[string]interface{}, snapshot HealthSnapshot) bool {
	hcr.mu.RLock()
	historyLen := len(hcr.healthHistory)
	hcr.mu.RUnlock()

	if historyLen < hcr.warmupSamples {
		return false
	}

	totalProcessed := safeNumericLookup(healthSummary, "total_addresses_processed")
	if totalProcessed >= 0 && totalProcessed < float64(hcr.minAddressesForAlerts) {
		return false
	}

	// Require at least one validation or contract call attempt before alarming.
	if snapshot.ValidationSuccess == 0 && snapshot.ContractCallSuccess == 0 && totalProcessed == 0 {
		return false
	}

	return true
}

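// safeNumericLookup extracts a numeric field from the health summary map,
// returning -1 when the map is nil, the key is absent, or the value is not a
// supported numeric type.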
func safeNumericLookup(summary map[string]interface{}, key string) float64 {
	if summary == nil {
		return -1
	}

	value, ok := summary[key]
	if !ok {
		return -1
	}

	switch v := value.(type) {
	case int:
		return float64(v)
	case int32:
		return float64(v)
	case int64:
		return float64(v)
	case uint:
		return float64(v)
	case uint32:
		return float64(v)
	case uint64:
		return float64(v)
	case float32:
		return float64(v)
	case float64:
		return v
	default:
		return -1
	}
}

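// historySize returns the number of snapshots currently held in history.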
func (hcr *HealthCheckRunner) historySize() int {
	hcr.mu.RLock()
	defer hcr.mu.RUnlock()
	return len(hcr.healthHistory)
}

// logHealthStatus logs periodic health status information
func (hcr *HealthCheckRunner) logHealthStatus(snapshot HealthSnapshot, duration time.Duration) {
	// Log detailed status every 5 minutes (10 checks at 30s intervals)
	if len(hcr.healthHistory)%10 == 0 {
		hcr.logger.Info("System health status",
			"health_score", snapshot.HealthScore,
			"corruption_rate", fmt.Sprintf("%.4f", snapshot.CorruptionRate),
			"validation_success", fmt.Sprintf("%.4f", snapshot.ValidationSuccess),
			"contract_call_success", fmt.Sprintf("%.4f", snapshot.ContractCallSuccess),
			"trend", snapshot.Trend.String(),
			"check_duration", duration)
	} else {
		// Brief status for regular checks
		hcr.logger.Debug("Health check completed",
			"health_score", snapshot.HealthScore,
			"trend", snapshot.Trend.String(),
			"duration", duration)
	}
}

// GetRecentSnapshots returns the most recent health snapshots
func (hcr *HealthCheckRunner) GetRecentSnapshots(count int) []HealthSnapshot {
	return hcr.getRecentSnapshots(count)
}

// getRecentSnapshots internal implementation
func (hcr *HealthCheckRunner) getRecentSnapshots(count int) []HealthSnapshot {
	hcr.mu.RLock()
	defer hcr.mu.RUnlock()

	if len(hcr.healthHistory) == 0 {
		return []HealthSnapshot{}
	}

	start := len(hcr.healthHistory) - count
	if start < 0 {
		start = 0
	}

	// Create a copy to avoid external modification
	snapshots := make([]HealthSnapshot, len(hcr.healthHistory[start:]))
	copy(snapshots, hcr.healthHistory[start:])

	return snapshots
}

// GetHealthSummary returns a comprehensive health summary
func (hcr *HealthCheckRunner) GetHealthSummary() map[string]interface{} {
	hcr.mu.RLock()
	defer hcr.mu.RUnlock()

	if len(hcr.healthHistory) == 0 {
		return map[string]interface{}{
			"running":        hcr.running,
			"check_interval": hcr.checkInterval.String(),
			"history_size":   0,
			"last_check":     nil,
		}
	}

	lastSnapshot := hcr.healthHistory[len(hcr.healthHistory)-1]

	return map[string]interface{}{
		"running":               hcr.running,
		"check_interval":        hcr.checkInterval.String(),
		"history_size":          len(hcr.healthHistory),
		"last_check":            hcr.lastHealthCheck,
		"current_health_score":  lastSnapshot.HealthScore,
		"current_trend":         lastSnapshot.Trend.String(),
		"corruption_rate":       lastSnapshot.CorruptionRate,
		"validation_success":    lastSnapshot.ValidationSuccess,
		"contract_call_success": lastSnapshot.ContractCallSuccess,
		"recent_snapshots":      hcr.getRecentSnapshots(10),
	}
}

// SetCheckInterval sets the health check interval. The new value takes effect
// the next time the runner is started; an already-running ticker is not reset.
func (hcr *HealthCheckRunner) SetCheckInterval(interval time.Duration) {
	hcr.mu.Lock()
	defer hcr.mu.Unlock()
	hcr.checkInterval = interval
	hcr.logger.Info("Health check interval updated", "interval", interval)
}

// IsRunning returns whether the health checker is running
func (hcr *HealthCheckRunner) IsRunning() bool {
	hcr.mu.RLock()
	defer hcr.mu.RUnlock()
	return hcr.running
}
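
For context, a minimal sketch of how a caller might wire up this runner and read its summary. This is illustrative only and not part of the commit: the logger and IntegrityMonitor values are assumed to come from the application's existing constructors, which are not shown in this diff.

```go
package monitoring

import (
	"context"
	"encoding/json"
	"fmt"
	"time"
)

// exampleHealthCheckWiring is an illustrative sketch. The log and monitor
// arguments stand in for the application's real instances.
func exampleHealthCheckWiring(log *logger.Logger, monitor *IntegrityMonitor) {
	runner := NewHealthCheckRunner(log, monitor)
	runner.SetCheckInterval(15 * time.Second) // override the 30s default before starting

	ctx, cancel := context.WithCancel(context.Background())
	defer cancel()

	runner.Start(ctx)
	defer runner.Stop()

	// Let a few checks accumulate, then expose the aggregated summary,
	// e.g. from an ops/health endpoint.
	time.Sleep(time.Minute)

	if summary, err := json.MarshalIndent(runner.GetHealthSummary(), "", "  "); err == nil {
		fmt.Println(string(summary))
	}
}
```

Because alerting is gated by readyForAlerts, a freshly started runner stays quiet until at least warmupSamples snapshots and minAddressesForAlerts processed addresses have accumulated.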