feat: create v2-prep branch with comprehensive planning

Restructured project for V2 refactor:

**Structure Changes:**
- Moved all V1 code to orig/ folder (preserved with git mv)
- Created docs/planning/ directory
- Added orig/README_V1.md explaining V1 preservation

**Planning Documents:**
- 00_V2_MASTER_PLAN.md: Complete architecture overview
  - Executive summary of critical V1 issues
  - High-level component architecture diagrams
  - 5-phase implementation roadmap
  - Success metrics and risk mitigation
- 07_TASK_BREAKDOWN.md: Atomic task breakdown
  - 99+ hours of detailed tasks
  - Every task < 2 hours (atomic)
  - Clear dependencies and success criteria
  - Organized by implementation phase

**V2 Key Improvements:**
- Per-exchange parsers (factory pattern)
- Multi-layer strict validation
- Multi-index pool cache
- Background validation pipeline
- Comprehensive observability

**Critical Issues Addressed:**
- Zero address tokens (strict validation + cache enrichment)
- Parsing accuracy (protocol-specific parsers)
- No audit trail (background validation channel)
- Inefficient lookups (multi-index cache)
- Stats disconnection (event-driven metrics)

Next Steps:
1. Review planning documents
2. Begin Phase 1: Foundation (P1-001 through P1-010)
3. Implement parsers in Phase 2
4. Build cache system in Phase 3
5. Add validation pipeline in Phase 4
6. Migrate and test in Phase 5

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
2025-11-10 10:14:26 +01:00
parent 1773daffe7
commit 803de231ba
411 changed files with 20390 additions and 8680 deletions
--- a/orig/internal/monitoring/integrity_monitor.go
+++ b/orig/internal/monitoring/integrity_monitor.go
@@ -0,0 +1,533 @@
+package monitoring
+
+import (
+	"context"
+	"fmt"
+	"sync"
+	"time"
+
+	"github.com/ethereum/go-ethereum/common"
+
+	"github.com/fraktal/mev-beta/internal/logger"
+	"github.com/fraktal/mev-beta/internal/recovery"
+)
+
+// IntegrityMetrics tracks data integrity statistics.
+//
+// The fields are mutated by IntegrityMonitor's Record* methods and by
+// updateHealthScore while mu is held, and are read under mu by GetMetrics,
+// which copies them into a MetricsSnapshot. Because the struct embeds a mutex
+// it is handled through a pointer and must not be copied by value.
+//
+// AverageCorruptionScore is the mean of every corruption score recorded so
+// far and MaxCorruptionScore is the largest single score. HealthScore is the
+// weighted value recomputed by updateHealthScore and clamped to [0, 1];
+// HighScore is raised whenever HealthScore exceeds it.
+type IntegrityMetrics struct {
+	mu                       sync.RWMutex
+	TotalAddressesProcessed  int64
+	CorruptAddressesDetected int64
+	AddressValidationPassed  int64
+	AddressValidationFailed  int64
+	ContractCallsSucceeded   int64
+	ContractCallsFailed      int64
+	RetryOperationsTriggered int64
+	FallbackOperationsUsed   int64
+	CircuitBreakersTripped   int64
+	LastCorruptionDetection  time.Time
+	AverageCorruptionScore   float64
+	MaxCorruptionScore       int
+	HealthScore              float64
+	HighScore                float64
+	RecoveryActions          map[recovery.RecoveryAction]int64
+	ErrorsByType             map[recovery.ErrorType]int64
+}
+
+// MetricsSnapshot represents a copy of metrics without mutex for safe external access.
+//
+// It mirrors the data fields of IntegrityMetrics one-for-one and adds JSON
+// tags. GetMetrics fills it while holding the metrics read lock and gives the
+// snapshot its own RecoveryActions and ErrorsByType maps, so a caller may keep
+// or modify a snapshot without touching the monitor's live state.
+type MetricsSnapshot struct {
+	TotalAddressesProcessed  int64                             `json:"total_addresses_processed"`
+	CorruptAddressesDetected int64                             `json:"corrupt_addresses_detected"`
+	AddressValidationPassed  int64                             `json:"address_validation_passed"`
+	AddressValidationFailed  int64                             `json:"address_validation_failed"`
+	ContractCallsSucceeded   int64                             `json:"contract_calls_succeeded"`
+	ContractCallsFailed      int64                             `json:"contract_calls_failed"`
+	RetryOperationsTriggered int64                             `json:"retry_operations_triggered"`
+	FallbackOperationsUsed   int64                             `json:"fallback_operations_used"`
+	CircuitBreakersTripped   int64                             `json:"circuit_breakers_tripped"`
+	LastCorruptionDetection  time.Time                         `json:"last_corruption_detection"`
+	AverageCorruptionScore   float64                           `json:"average_corruption_score"`
+	MaxCorruptionScore       int                               `json:"max_corruption_score"`
+	HealthScore              float64                           `json:"health_score"`
+	HighScore                float64                           `json:"high_score"`
+	RecoveryActions          map[recovery.RecoveryAction]int64 `json:"recovery_actions"`
+	ErrorsByType             map[recovery.ErrorType]int64      `json:"errors_by_type"`
+}
+
+// CorruptionAlert represents a corruption detection alert.
+//
+// Two kinds of alert are produced in this file. RecordCorruptionDetected fills
+// every field and mirrors the address, score, source and a Unix timestamp into
+// Context. updateHealthScore emits a health-threshold alert that sets only
+// Timestamp, Severity, Message and Context, leaving Address, CorruptionScore
+// and Source at their zero values.
+type CorruptionAlert struct {
+	Timestamp       time.Time
+	Address         common.Address
+	CorruptionScore int
+	Source          string
+	Severity        AlertSeverity
+	Message         string
+	Context         map[string]interface{}
+}
+
+// AlertSeverity defines alert severity levels.
+// Values are ordered from least to most severe, starting at zero.
+type AlertSeverity int
+
+// Severity levels in ascending order. getCorruptionSeverity maps a corruption
+// score to a level: >= 90 is Emergency, >= 70 is Critical, >= 40 is Warning,
+// and anything lower is Info.
+const (
+	AlertSeverityInfo AlertSeverity = iota
+	AlertSeverityWarning
+	AlertSeverityCritical
+	AlertSeverityEmergency
+)
+
+// String returns the upper-case label for the severity level, or "UNKNOWN"
+// for any value outside the declared constants.
+func (s AlertSeverity) String() string {
+	// Indexed by severity value; the constants are contiguous from zero.
+	names := [...]string{
+		AlertSeverityInfo:      "INFO",
+		AlertSeverityWarning:   "WARNING",
+		AlertSeverityCritical:  "CRITICAL",
+		AlertSeverityEmergency: "EMERGENCY",
+	}
+
+	if s < 0 || int(s) >= len(names) {
+		return "UNKNOWN"
+	}
+	return names[s]
+}
+
+// IntegrityMonitor monitors and tracks data integrity metrics.
+//
+// Locking as used in this file:
+//   - mu is held by SetThreshold, AddAlertSubscriber, Enable, Disable and
+//     IsEnabled when they touch alertThresholds, alertSubscribers and enabled.
+//   - metrics carries its own mutex (IntegrityMetrics.mu) for the counters.
+//   - alertsMutex guards alerts, the in-memory alert history that sendAlert
+//     appends to (capped at 1000 entries) and GetRecentAlerts reads.
+//
+// healthCheckRunner is created by NewIntegrityMonitor and driven through
+// StartHealthCheckRunner / StopHealthCheckRunner.
+type IntegrityMonitor struct {
+	mu                sync.RWMutex
+	logger            *logger.Logger
+	metrics           *IntegrityMetrics
+	alertThresholds   map[string]float64
+	alertSubscribers  []AlertSubscriber
+	healthCheckRunner *HealthCheckRunner
+	enabled           bool
+	alerts            []CorruptionAlert
+	alertsMutex       sync.RWMutex
+}
+
+// AlertSubscriber defines the interface for alert handlers.
+//
+// sendAlert calls HandleAlert synchronously, once per alert, for every
+// registered subscriber in registration order. A non-nil error is logged by
+// the monitor and does not stop delivery to the remaining subscribers.
+type AlertSubscriber interface {
+	HandleAlert(alert CorruptionAlert) error
+}
+
+// NewIntegrityMonitor builds an enabled IntegrityMonitor that logs through
+// logger. The returned monitor has empty metrics with health scores at 1.0,
+// the default alert thresholds installed, room for 256 alerts pre-allocated,
+// and a health check runner attached (not yet started).
+func NewIntegrityMonitor(logger *logger.Logger) *IntegrityMonitor {
+	// Both score fields start at the maximum so a fresh monitor reports healthy.
+	initial := &IntegrityMetrics{
+		HealthScore:     1.0,
+		HighScore:       1.0,
+		RecoveryActions: make(map[recovery.RecoveryAction]int64),
+		ErrorsByType:    make(map[recovery.ErrorType]int64),
+	}
+
+	im := &IntegrityMonitor{
+		logger:          logger,
+		metrics:         initial,
+		enabled:         true,
+		alertThresholds: make(map[string]float64),
+		alerts:          make([]CorruptionAlert, 0, 256),
+	}
+	im.setDefaultThresholds()
+
+	// The runner is given a back-reference to the monitor, so it can only be
+	// constructed once the monitor value exists.
+	im.healthCheckRunner = NewHealthCheckRunner(logger, im)
+
+	return im
+}
+
+// setDefaultThresholds installs the built-in alert thresholds, overwriting
+// any values already stored under the same names.
+func (im *IntegrityMonitor) setDefaultThresholds() {
+	defaults := []struct {
+		name  string
+		value float64
+	}{
+		{"corruption_rate", 0.05},      // 5% corruption rate
+		{"failure_rate", 0.10},         // 10% failure rate
+		{"health_score_min", 0.80},     // 80% minimum health
+		{"max_corruption_score", 70.0}, // Maximum individual corruption score
+		{"circuit_breaker_rate", 0.02}, // 2% circuit breaker rate
+	}
+
+	for _, d := range defaults {
+		im.alertThresholds[d.name] = d.value
+	}
+}
+
+// RecordAddressProcessed increments the counter for processed addresses and
+// recomputes the health score. It is a no-op while the monitor is disabled.
+func (im *IntegrityMonitor) RecordAddressProcessed() {
+	// Go through IsEnabled so the flag is read under the same lock that
+	// Enable/Disable write it under.
+	if !im.IsEnabled() {
+		return
+	}
+
+	im.metrics.mu.Lock()
+	im.metrics.TotalAddressesProcessed++
+	im.metrics.mu.Unlock()
+
+	im.updateHealthScore()
+}
+
+// RecordCorruptionDetected records a corruption detection event.
+//
+// It updates the corruption counters (count, last detection time, maximum and
+// mean score), dispatches a CorruptionAlert whose severity is derived from
+// corruptionScore, recomputes the health score and logs a warning. It is a
+// no-op while the monitor is disabled.
+func (im *IntegrityMonitor) RecordCorruptionDetected(address common.Address, corruptionScore int, source string) {
+	// Go through IsEnabled so the flag is read under the same lock that
+	// Enable/Disable write it under.
+	if !im.IsEnabled() {
+		return
+	}
+
+	// One timestamp for the whole event, so the metrics, the alert and the
+	// alert context all agree on when the detection happened.
+	now := time.Now()
+	addressHex := address.Hex()
+
+	im.metrics.mu.Lock()
+	im.metrics.CorruptAddressesDetected++
+	im.metrics.LastCorruptionDetection = now
+
+	// Update corruption statistics
+	if corruptionScore > im.metrics.MaxCorruptionScore {
+		im.metrics.MaxCorruptionScore = corruptionScore
+	}
+
+	// Fold the new score into the cumulative mean of all recorded scores.
+	total := float64(im.metrics.CorruptAddressesDetected)
+	im.metrics.AverageCorruptionScore = ((im.metrics.AverageCorruptionScore * (total - 1)) + float64(corruptionScore)) / total
+	im.metrics.mu.Unlock()
+
+	// Generate alert based on corruption score
+	severity := im.getCorruptionSeverity(corruptionScore)
+	alert := CorruptionAlert{
+		Timestamp:       now,
+		Address:         address,
+		CorruptionScore: corruptionScore,
+		Source:          source,
+		Severity:        severity,
+		Message:         fmt.Sprintf("Corruption detected: address %s, score %d, source %s", addressHex, corruptionScore, source),
+		Context: map[string]interface{}{
+			"address":          addressHex,
+			"corruption_score": corruptionScore,
+			"source":           source,
+			"timestamp":        now.Unix(),
+		},
+	}
+
+	im.sendAlert(alert)
+	im.updateHealthScore()
+
+	im.logger.Warn("Corruption detected",
+		"address", addressHex,
+		"corruption_score", corruptionScore,
+		"source", source,
+		"severity", severity.String())
+}
+
+// RecordValidationResult records the outcome of one address validation:
+// passed increments AddressValidationPassed, otherwise AddressValidationFailed.
+// The health score is recomputed afterwards. It is a no-op while the monitor
+// is disabled.
+func (im *IntegrityMonitor) RecordValidationResult(passed bool) {
+	// Go through IsEnabled so the flag is read under the same lock that
+	// Enable/Disable write it under.
+	if !im.IsEnabled() {
+		return
+	}
+
+	im.metrics.mu.Lock()
+	if passed {
+		im.metrics.AddressValidationPassed++
+	} else {
+		im.metrics.AddressValidationFailed++
+	}
+	im.metrics.mu.Unlock()
+
+	im.updateHealthScore()
+}
+
+// RecordContractCallResult records the outcome of one contract call:
+// succeeded increments ContractCallsSucceeded, otherwise ContractCallsFailed.
+// The health score is recomputed afterwards. It is a no-op while the monitor
+// is disabled.
+func (im *IntegrityMonitor) RecordContractCallResult(succeeded bool) {
+	// Go through IsEnabled so the flag is read under the same lock that
+	// Enable/Disable write it under.
+	if !im.IsEnabled() {
+		return
+	}
+
+	im.metrics.mu.Lock()
+	if succeeded {
+		im.metrics.ContractCallsSucceeded++
+	} else {
+		im.metrics.ContractCallsFailed++
+	}
+	im.metrics.mu.Unlock()
+
+	im.updateHealthScore()
+}
+
+// RecordRecoveryAction records one use of the given recovery action.
+//
+// Every action is counted in the RecoveryActions map; retry, fallback and
+// circuit-breaker actions additionally bump their dedicated counters. The
+// health score is recomputed afterwards. It is a no-op while the monitor is
+// disabled.
+func (im *IntegrityMonitor) RecordRecoveryAction(action recovery.RecoveryAction) {
+	// Go through IsEnabled so the flag is read under the same lock that
+	// Enable/Disable write it under.
+	if !im.IsEnabled() {
+		return
+	}
+
+	im.metrics.mu.Lock()
+	im.metrics.RecoveryActions[action]++
+
+	// Track specific metrics
+	switch action {
+	case recovery.ActionRetryWithBackoff:
+		im.metrics.RetryOperationsTriggered++
+	case recovery.ActionUseFallbackData:
+		im.metrics.FallbackOperationsUsed++
+	case recovery.ActionCircuitBreaker:
+		im.metrics.CircuitBreakersTripped++
+	}
+	im.metrics.mu.Unlock()
+
+	im.updateHealthScore()
+}
+
+// RecordErrorType increments the per-type error counter for errorType.
+// Unlike the other Record* methods it does not recompute the health score.
+// It is a no-op while the monitor is disabled.
+func (im *IntegrityMonitor) RecordErrorType(errorType recovery.ErrorType) {
+	// Go through IsEnabled so the flag is read under the same lock that
+	// Enable/Disable write it under.
+	if !im.IsEnabled() {
+		return
+	}
+
+	im.metrics.mu.Lock()
+	im.metrics.ErrorsByType[errorType]++
+	im.metrics.mu.Unlock()
+}
+
+// getCorruptionSeverity maps a corruption score onto an alert severity:
+// 90 and above is Emergency, 70-89 Critical, 40-69 Warning, anything lower Info.
+func (im *IntegrityMonitor) getCorruptionSeverity(corruptionScore int) AlertSeverity {
+	// Cases are ordered from the highest band down, so the first match wins.
+	switch {
+	case corruptionScore >= 90:
+		return AlertSeverityEmergency
+	case corruptionScore >= 70:
+		return AlertSeverityCritical
+	case corruptionScore >= 40:
+		return AlertSeverityWarning
+	default:
+		return AlertSeverityInfo
+	}
+}
+
+// updateHealthScore calculates overall system health score
+func (im *IntegrityMonitor) updateHealthScore() {
+	im.metrics.mu.Lock()
+	defer im.metrics.mu.Unlock()
+
+	if im.metrics.TotalAddressesProcessed == 0 {
+		im.metrics.HealthScore = 1.0
+		return
+	}
+
+	// Calculate component scores
+	corruptionRate := float64(im.metrics.CorruptAddressesDetected) / float64(im.metrics.TotalAddressesProcessed)
+
+	var validationSuccessRate float64 = 1.0
+	validationTotal := im.metrics.AddressValidationPassed + im.metrics.AddressValidationFailed
+	if validationTotal > 0 {
+		validationSuccessRate = float64(im.metrics.AddressValidationPassed) / float64(validationTotal)
+	}
+
+	var contractCallSuccessRate float64 = 1.0
+	contractTotal := im.metrics.ContractCallsSucceeded + im.metrics.ContractCallsFailed
+	if contractTotal > 0 {
+		contractCallSuccessRate = float64(im.metrics.ContractCallsSucceeded) / float64(contractTotal)
+	}
+
+	// Weighted health score calculation
+	healthScore := 0.0
+	healthScore += (1.0 - corruptionRate) * 0.4  // 40% weight on corruption prevention
+	healthScore += validationSuccessRate * 0.3   // 30% weight on validation success
+	healthScore += contractCallSuccessRate * 0.3 // 30% weight on contract call success
+
+	// Cap at 1.0 and handle edge cases
+	if healthScore > 1.0 {
+		healthScore = 1.0
+	} else if healthScore < 0.0 {
+		healthScore = 0.0
+	}
+
+	im.metrics.HealthScore = healthScore
+	if healthScore > im.metrics.HighScore {
+		im.metrics.HighScore = healthScore
+	}
+
+	// Check for health score threshold alerts
+	if healthScore < im.alertThresholds["health_score_min"] {
+		alert := CorruptionAlert{
+			Timestamp: time.Now(),
+			Severity:  AlertSeverityCritical,
+			Message:   fmt.Sprintf("System health score dropped to %.2f (threshold: %.2f)", healthScore, im.alertThresholds["health_score_min"]),
+			Context: map[string]interface{}{
+				"health_score":          healthScore,
+				"threshold":             im.alertThresholds["health_score_min"],
+				"corruption_rate":       corruptionRate,
+				"validation_success":    validationSuccessRate,
+				"contract_call_success": contractCallSuccessRate,
+			},
+		}
+		im.sendAlert(alert)
+	}
+}
+
+// sendAlert sends alerts to all subscribers
+func (im *IntegrityMonitor) sendAlert(alert CorruptionAlert) {
+	im.alertsMutex.Lock()
+	im.alerts = append(im.alerts, alert)
+	if len(im.alerts) > 1000 {
+		trimmed := make([]CorruptionAlert, 1000)
+		copy(trimmed, im.alerts[len(im.alerts)-1000:])
+		im.alerts = trimmed
+	}
+	im.alertsMutex.Unlock()
+
+	for _, subscriber := range im.alertSubscribers {
+		if err := subscriber.HandleAlert(alert); err != nil {
+			im.logger.Error("Failed to send alert",
+				"subscriber", fmt.Sprintf("%T", subscriber),
+				"error", err)
+		}
+	}
+}
+
+// AddAlertSubscriber registers subscriber to receive every alert sent after
+// this call. A nil subscriber is ignored, since invoking HandleAlert on it
+// during alert delivery would panic.
+func (im *IntegrityMonitor) AddAlertSubscriber(subscriber AlertSubscriber) {
+	if subscriber == nil {
+		return
+	}
+
+	im.mu.Lock()
+	defer im.mu.Unlock()
+	im.alertSubscribers = append(im.alertSubscribers, subscriber)
+}
+
+// GetMetrics returns a point-in-time copy of the current metrics.
+//
+// The copy is taken under the metrics read lock. The two map fields are
+// duplicated entry by entry, so the returned snapshot shares no state with
+// the monitor.
+func (im *IntegrityMonitor) GetMetrics() MetricsSnapshot {
+	src := im.metrics
+
+	src.mu.RLock()
+	defer src.mu.RUnlock()
+
+	// Duplicate the maps first so the snapshot never aliases live state.
+	recoveryActions := make(map[recovery.RecoveryAction]int64, len(src.RecoveryActions))
+	for action, count := range src.RecoveryActions {
+		recoveryActions[action] = count
+	}
+
+	errorsByType := make(map[recovery.ErrorType]int64, len(src.ErrorsByType))
+	for errType, count := range src.ErrorsByType {
+		errorsByType[errType] = count
+	}
+
+	return MetricsSnapshot{
+		TotalAddressesProcessed:  src.TotalAddressesProcessed,
+		CorruptAddressesDetected: src.CorruptAddressesDetected,
+		AddressValidationPassed:  src.AddressValidationPassed,
+		AddressValidationFailed:  src.AddressValidationFailed,
+		ContractCallsSucceeded:   src.ContractCallsSucceeded,
+		ContractCallsFailed:      src.ContractCallsFailed,
+		RetryOperationsTriggered: src.RetryOperationsTriggered,
+		FallbackOperationsUsed:   src.FallbackOperationsUsed,
+		CircuitBreakersTripped:   src.CircuitBreakersTripped,
+		LastCorruptionDetection:  src.LastCorruptionDetection,
+		AverageCorruptionScore:   src.AverageCorruptionScore,
+		MaxCorruptionScore:       src.MaxCorruptionScore,
+		HealthScore:              src.HealthScore,
+		HighScore:                src.HighScore,
+		RecoveryActions:          recoveryActions,
+		ErrorsByType:             errorsByType,
+	}
+}
+
+// GetHealthSummary returns a comprehensive health summary as a generic map.
+//
+// It combines a metrics snapshot with derived rates and the monitor's
+// configuration. Each derived rate is 0.0 when it has no samples; note this
+// differs from updateHealthScore, which treats an empty validation or
+// contract-call rate as 1.0. The "alert_thresholds" entry is a copy, so
+// callers cannot alter the monitor's thresholds through it.
+func (im *IntegrityMonitor) GetHealthSummary() map[string]interface{} {
+	metrics := im.GetMetrics()
+
+	// Read the monitor's own configuration under im.mu, the lock that
+	// Enable/Disable, SetThreshold and AddAlertSubscriber write it under.
+	im.mu.RLock()
+	enabled := im.enabled
+	subscriberCount := len(im.alertSubscribers)
+	thresholds := make(map[string]float64, len(im.alertThresholds))
+	for name, value := range im.alertThresholds {
+		thresholds[name] = value
+	}
+	im.mu.RUnlock()
+
+	corruptionRate := 0.0
+	if metrics.TotalAddressesProcessed > 0 {
+		corruptionRate = float64(metrics.CorruptAddressesDetected) / float64(metrics.TotalAddressesProcessed)
+	}
+
+	validationSuccessRate := 0.0
+	totalValidations := metrics.AddressValidationPassed + metrics.AddressValidationFailed
+	if totalValidations > 0 {
+		validationSuccessRate = float64(metrics.AddressValidationPassed) / float64(totalValidations)
+	}
+
+	contractCallSuccessRate := 0.0
+	totalCalls := metrics.ContractCallsSucceeded + metrics.ContractCallsFailed
+	if totalCalls > 0 {
+		contractCallSuccessRate = float64(metrics.ContractCallsSucceeded) / float64(totalCalls)
+	}
+
+	return map[string]interface{}{
+		"enabled":                    enabled,
+		"health_score":               metrics.HealthScore,
+		"total_addresses_processed":  metrics.TotalAddressesProcessed,
+		"corruption_detections":      metrics.CorruptAddressesDetected,
+		"corruption_rate":            corruptionRate,
+		"validation_success_rate":    validationSuccessRate,
+		"contract_call_success_rate": contractCallSuccessRate,
+		"average_corruption_score":   metrics.AverageCorruptionScore,
+		"max_corruption_score":       metrics.MaxCorruptionScore,
+		"retry_operations":           metrics.RetryOperationsTriggered,
+		"fallback_operations":        metrics.FallbackOperationsUsed,
+		"circuit_breakers_tripped":   metrics.CircuitBreakersTripped,
+		"last_corruption":            metrics.LastCorruptionDetection,
+		"recovery_actions":           metrics.RecoveryActions,
+		"errors_by_type":             metrics.ErrorsByType,
+		"alert_thresholds":           thresholds,
+		"alert_subscribers":          subscriberCount,
+	}
+}
+
+// GetRecentAlerts returns a copy of the newest stored alerts, oldest first.
+// A limit that is zero, negative, or larger than the number of stored alerts
+// yields all of them. The result is never nil; it is empty when no alerts are
+// stored.
+func (im *IntegrityMonitor) GetRecentAlerts(limit int) []CorruptionAlert {
+	im.alertsMutex.RLock()
+	defer im.alertsMutex.RUnlock()
+
+	available := len(im.alerts)
+
+	count := limit
+	if count <= 0 || count > available {
+		count = available
+	}
+
+	// With count == 0 this yields an empty, non-nil slice and copy is a no-op.
+	recent := make([]CorruptionAlert, count)
+	copy(recent, im.alerts[available-count:])
+	return recent
+}
+
+// SetThreshold stores value as the alert threshold called name, replacing any
+// previous value. Names are not validated against the built-in defaults.
+func (im *IntegrityMonitor) SetThreshold(name string, value float64) {
+	im.mu.Lock()
+	im.alertThresholds[name] = value
+	im.mu.Unlock()
+}
+
+// Enable switches the monitor on and logs the change. The log entry is
+// written while the monitor lock is still held.
+func (im *IntegrityMonitor) Enable() {
+	im.mu.Lock()
+	im.enabled = true
+	im.logger.Info("Integrity monitor enabled")
+	im.mu.Unlock()
+}
+
+// Disable switches the monitor off and logs the change. The log entry is
+// written while the monitor lock is still held.
+func (im *IntegrityMonitor) Disable() {
+	im.mu.Lock()
+	im.enabled = false
+	im.logger.Info("Integrity monitor disabled")
+	im.mu.Unlock()
+}
+
+// IsEnabled reports whether the monitor is currently switched on. The flag is
+// read under the monitor's read lock.
+func (im *IntegrityMonitor) IsEnabled() bool {
+	im.mu.RLock()
+	state := im.enabled
+	im.mu.RUnlock()
+	return state
+}
+
+// StartHealthCheckRunner hands ctx to the health check runner's Start method.
+// It does nothing when the monitor has no runner.
+func (im *IntegrityMonitor) StartHealthCheckRunner(ctx context.Context) {
+	runner := im.healthCheckRunner
+	if runner == nil {
+		return
+	}
+	runner.Start(ctx)
+}
+
+// StopHealthCheckRunner calls Stop on the health check runner.
+// It does nothing when the monitor has no runner.
+func (im *IntegrityMonitor) StopHealthCheckRunner() {
+	runner := im.healthCheckRunner
+	if runner == nil {
+		return
+	}
+	runner.Stop()
+}
+
+// GetHealthCheckRunner returns the health check runner.
+// The pointer is the monitor's own runner, not a copy; it is the instance
+// created in NewIntegrityMonitor and is read here without taking any lock.
+func (im *IntegrityMonitor) GetHealthCheckRunner() *HealthCheckRunner {
+	return im.healthCheckRunner
+}