package monitoring import ( "context" "fmt" "sync" "time" "github.com/ethereum/go-ethereum/common" "github.com/fraktal/mev-beta/internal/logger" "github.com/fraktal/mev-beta/internal/recovery" ) // IntegrityMetrics tracks data integrity statistics type IntegrityMetrics struct { mu sync.RWMutex TotalAddressesProcessed int64 CorruptAddressesDetected int64 AddressValidationPassed int64 AddressValidationFailed int64 ContractCallsSucceeded int64 ContractCallsFailed int64 RetryOperationsTriggered int64 FallbackOperationsUsed int64 CircuitBreakersTripped int64 LastCorruptionDetection time.Time AverageCorruptionScore float64 MaxCorruptionScore int HealthScore float64 HighScore float64 RecoveryActions map[recovery.RecoveryAction]int64 ErrorsByType map[recovery.ErrorType]int64 } // MetricsSnapshot represents a copy of metrics without mutex for safe external access type MetricsSnapshot struct { TotalAddressesProcessed int64 `json:"total_addresses_processed"` CorruptAddressesDetected int64 `json:"corrupt_addresses_detected"` AddressValidationPassed int64 `json:"address_validation_passed"` AddressValidationFailed int64 `json:"address_validation_failed"` ContractCallsSucceeded int64 `json:"contract_calls_succeeded"` ContractCallsFailed int64 `json:"contract_calls_failed"` RetryOperationsTriggered int64 `json:"retry_operations_triggered"` FallbackOperationsUsed int64 `json:"fallback_operations_used"` CircuitBreakersTripped int64 `json:"circuit_breakers_tripped"` LastCorruptionDetection time.Time `json:"last_corruption_detection"` AverageCorruptionScore float64 `json:"average_corruption_score"` MaxCorruptionScore int `json:"max_corruption_score"` HealthScore float64 `json:"health_score"` HighScore float64 `json:"high_score"` RecoveryActions map[recovery.RecoveryAction]int64 `json:"recovery_actions"` ErrorsByType map[recovery.ErrorType]int64 `json:"errors_by_type"` } // CorruptionAlert represents a corruption detection alert type CorruptionAlert struct { Timestamp time.Time Address common.Address CorruptionScore int Source string Severity AlertSeverity Message string Context map[string]interface{} } // AlertSeverity defines alert severity levels type AlertSeverity int const ( AlertSeverityInfo AlertSeverity = iota AlertSeverityWarning AlertSeverityCritical AlertSeverityEmergency ) func (s AlertSeverity) String() string { switch s { case AlertSeverityInfo: return "INFO" case AlertSeverityWarning: return "WARNING" case AlertSeverityCritical: return "CRITICAL" case AlertSeverityEmergency: return "EMERGENCY" default: return "UNKNOWN" } } // IntegrityMonitor monitors and tracks data integrity metrics type IntegrityMonitor struct { mu sync.RWMutex logger *logger.Logger metrics *IntegrityMetrics alertThresholds map[string]float64 alertSubscribers []AlertSubscriber healthCheckRunner *HealthCheckRunner enabled bool alerts []CorruptionAlert alertsMutex sync.RWMutex } // AlertSubscriber defines the interface for alert handlers type AlertSubscriber interface { HandleAlert(alert CorruptionAlert) error } // NewIntegrityMonitor creates a new integrity monitoring system func NewIntegrityMonitor(logger *logger.Logger) *IntegrityMonitor { monitor := &IntegrityMonitor{ logger: logger, metrics: &IntegrityMetrics{ RecoveryActions: make(map[recovery.RecoveryAction]int64), ErrorsByType: make(map[recovery.ErrorType]int64), HealthScore: 1.0, HighScore: 1.0, }, alertThresholds: make(map[string]float64), enabled: true, alerts: make([]CorruptionAlert, 0, 256), } // Set default thresholds monitor.setDefaultThresholds() // Initialize health check runner monitor.healthCheckRunner = NewHealthCheckRunner(logger, monitor) return monitor } // setDefaultThresholds configures default alert thresholds func (im *IntegrityMonitor) setDefaultThresholds() { im.alertThresholds["corruption_rate"] = 0.05 // 5% corruption rate im.alertThresholds["failure_rate"] = 0.10 // 10% failure rate im.alertThresholds["health_score_min"] = 0.80 // 80% minimum health im.alertThresholds["max_corruption_score"] = 70.0 // Maximum individual corruption score im.alertThresholds["circuit_breaker_rate"] = 0.02 // 2% circuit breaker rate } // RecordAddressProcessed increments the counter for processed addresses func (im *IntegrityMonitor) RecordAddressProcessed() { if !im.enabled { return } im.metrics.mu.Lock() im.metrics.TotalAddressesProcessed++ im.metrics.mu.Unlock() im.updateHealthScore() } // RecordCorruptionDetected records a corruption detection event func (im *IntegrityMonitor) RecordCorruptionDetected(address common.Address, corruptionScore int, source string) { if !im.enabled { return } im.metrics.mu.Lock() im.metrics.CorruptAddressesDetected++ im.metrics.LastCorruptionDetection = time.Now() // Update corruption statistics if corruptionScore > im.metrics.MaxCorruptionScore { im.metrics.MaxCorruptionScore = corruptionScore } // Calculate rolling average corruption score total := float64(im.metrics.CorruptAddressesDetected) im.metrics.AverageCorruptionScore = ((im.metrics.AverageCorruptionScore * (total - 1)) + float64(corruptionScore)) / total im.metrics.mu.Unlock() // Generate alert based on corruption score severity := im.getCorruptionSeverity(corruptionScore) alert := CorruptionAlert{ Timestamp: time.Now(), Address: address, CorruptionScore: corruptionScore, Source: source, Severity: severity, Message: fmt.Sprintf("Corruption detected: address %s, score %d, source %s", address.Hex(), corruptionScore, source), Context: map[string]interface{}{ "address": address.Hex(), "corruption_score": corruptionScore, "source": source, "timestamp": time.Now().Unix(), }, } im.sendAlert(alert) im.updateHealthScore() im.logger.Warn("Corruption detected", "address", address.Hex(), "corruption_score", corruptionScore, "source", source, "severity", severity.String()) } // RecordValidationResult records address validation results func (im *IntegrityMonitor) RecordValidationResult(passed bool) { if !im.enabled { return } im.metrics.mu.Lock() if passed { im.metrics.AddressValidationPassed++ } else { im.metrics.AddressValidationFailed++ } im.metrics.mu.Unlock() im.updateHealthScore() } // RecordContractCallResult records contract call success/failure func (im *IntegrityMonitor) RecordContractCallResult(succeeded bool) { if !im.enabled { return } im.metrics.mu.Lock() if succeeded { im.metrics.ContractCallsSucceeded++ } else { im.metrics.ContractCallsFailed++ } im.metrics.mu.Unlock() im.updateHealthScore() } // RecordRecoveryAction records recovery action usage func (im *IntegrityMonitor) RecordRecoveryAction(action recovery.RecoveryAction) { if !im.enabled { return } im.metrics.mu.Lock() im.metrics.RecoveryActions[action]++ // Track specific metrics switch action { case recovery.ActionRetryWithBackoff: im.metrics.RetryOperationsTriggered++ case recovery.ActionUseFallbackData: im.metrics.FallbackOperationsUsed++ case recovery.ActionCircuitBreaker: im.metrics.CircuitBreakersTripped++ } im.metrics.mu.Unlock() im.updateHealthScore() } // RecordErrorType records error by type func (im *IntegrityMonitor) RecordErrorType(errorType recovery.ErrorType) { if !im.enabled { return } im.metrics.mu.Lock() im.metrics.ErrorsByType[errorType]++ im.metrics.mu.Unlock() } // getCorruptionSeverity determines alert severity based on corruption score func (im *IntegrityMonitor) getCorruptionSeverity(corruptionScore int) AlertSeverity { if corruptionScore >= 90 { return AlertSeverityEmergency } else if corruptionScore >= 70 { return AlertSeverityCritical } else if corruptionScore >= 40 { return AlertSeverityWarning } return AlertSeverityInfo } // updateHealthScore calculates overall system health score func (im *IntegrityMonitor) updateHealthScore() { im.metrics.mu.Lock() defer im.metrics.mu.Unlock() if im.metrics.TotalAddressesProcessed == 0 { im.metrics.HealthScore = 1.0 return } // Calculate component scores corruptionRate := float64(im.metrics.CorruptAddressesDetected) / float64(im.metrics.TotalAddressesProcessed) var validationSuccessRate float64 = 1.0 validationTotal := im.metrics.AddressValidationPassed + im.metrics.AddressValidationFailed if validationTotal > 0 { validationSuccessRate = float64(im.metrics.AddressValidationPassed) / float64(validationTotal) } var contractCallSuccessRate float64 = 1.0 contractTotal := im.metrics.ContractCallsSucceeded + im.metrics.ContractCallsFailed if contractTotal > 0 { contractCallSuccessRate = float64(im.metrics.ContractCallsSucceeded) / float64(contractTotal) } // Weighted health score calculation healthScore := 0.0 healthScore += (1.0 - corruptionRate) * 0.4 // 40% weight on corruption prevention healthScore += validationSuccessRate * 0.3 // 30% weight on validation success healthScore += contractCallSuccessRate * 0.3 // 30% weight on contract call success // Cap at 1.0 and handle edge cases if healthScore > 1.0 { healthScore = 1.0 } else if healthScore < 0.0 { healthScore = 0.0 } im.metrics.HealthScore = healthScore if healthScore > im.metrics.HighScore { im.metrics.HighScore = healthScore } // Check for health score threshold alerts if healthScore < im.alertThresholds["health_score_min"] { alert := CorruptionAlert{ Timestamp: time.Now(), Severity: AlertSeverityCritical, Message: fmt.Sprintf("System health score dropped to %.2f (threshold: %.2f)", healthScore, im.alertThresholds["health_score_min"]), Context: map[string]interface{}{ "health_score": healthScore, "threshold": im.alertThresholds["health_score_min"], "corruption_rate": corruptionRate, "validation_success": validationSuccessRate, "contract_call_success": contractCallSuccessRate, }, } im.sendAlert(alert) } } // sendAlert sends alerts to all subscribers func (im *IntegrityMonitor) sendAlert(alert CorruptionAlert) { im.alertsMutex.Lock() im.alerts = append(im.alerts, alert) if len(im.alerts) > 1000 { trimmed := make([]CorruptionAlert, 1000) copy(trimmed, im.alerts[len(im.alerts)-1000:]) im.alerts = trimmed } im.alertsMutex.Unlock() for _, subscriber := range im.alertSubscribers { if err := subscriber.HandleAlert(alert); err != nil { im.logger.Error("Failed to send alert", "subscriber", fmt.Sprintf("%T", subscriber), "error", err) } } } // AddAlertSubscriber adds an alert subscriber func (im *IntegrityMonitor) AddAlertSubscriber(subscriber AlertSubscriber) { im.mu.Lock() defer im.mu.Unlock() im.alertSubscribers = append(im.alertSubscribers, subscriber) } // GetMetrics returns a copy of current metrics func (im *IntegrityMonitor) GetMetrics() MetricsSnapshot { im.metrics.mu.RLock() defer im.metrics.mu.RUnlock() // Create a deep copy metrics := IntegrityMetrics{ TotalAddressesProcessed: im.metrics.TotalAddressesProcessed, CorruptAddressesDetected: im.metrics.CorruptAddressesDetected, AddressValidationPassed: im.metrics.AddressValidationPassed, AddressValidationFailed: im.metrics.AddressValidationFailed, ContractCallsSucceeded: im.metrics.ContractCallsSucceeded, ContractCallsFailed: im.metrics.ContractCallsFailed, RetryOperationsTriggered: im.metrics.RetryOperationsTriggered, FallbackOperationsUsed: im.metrics.FallbackOperationsUsed, CircuitBreakersTripped: im.metrics.CircuitBreakersTripped, LastCorruptionDetection: im.metrics.LastCorruptionDetection, AverageCorruptionScore: im.metrics.AverageCorruptionScore, MaxCorruptionScore: im.metrics.MaxCorruptionScore, HealthScore: im.metrics.HealthScore, HighScore: im.metrics.HighScore, RecoveryActions: make(map[recovery.RecoveryAction]int64), ErrorsByType: make(map[recovery.ErrorType]int64), } // Copy maps for k, v := range im.metrics.RecoveryActions { metrics.RecoveryActions[k] = v } for k, v := range im.metrics.ErrorsByType { metrics.ErrorsByType[k] = v } // Return a safe copy without mutex return MetricsSnapshot{ TotalAddressesProcessed: metrics.TotalAddressesProcessed, CorruptAddressesDetected: metrics.CorruptAddressesDetected, AddressValidationPassed: metrics.AddressValidationPassed, AddressValidationFailed: metrics.AddressValidationFailed, ContractCallsSucceeded: metrics.ContractCallsSucceeded, ContractCallsFailed: metrics.ContractCallsFailed, RetryOperationsTriggered: metrics.RetryOperationsTriggered, FallbackOperationsUsed: metrics.FallbackOperationsUsed, CircuitBreakersTripped: metrics.CircuitBreakersTripped, LastCorruptionDetection: metrics.LastCorruptionDetection, AverageCorruptionScore: metrics.AverageCorruptionScore, MaxCorruptionScore: metrics.MaxCorruptionScore, HealthScore: metrics.HealthScore, HighScore: metrics.HighScore, RecoveryActions: metrics.RecoveryActions, ErrorsByType: metrics.ErrorsByType, } } // GetHealthSummary returns a comprehensive health summary func (im *IntegrityMonitor) GetHealthSummary() map[string]interface{} { metrics := im.GetMetrics() corruptionRate := 0.0 if metrics.TotalAddressesProcessed > 0 { corruptionRate = float64(metrics.CorruptAddressesDetected) / float64(metrics.TotalAddressesProcessed) } validationSuccessRate := 0.0 totalValidations := metrics.AddressValidationPassed + metrics.AddressValidationFailed if totalValidations > 0 { validationSuccessRate = float64(metrics.AddressValidationPassed) / float64(totalValidations) } contractCallSuccessRate := 0.0 totalCalls := metrics.ContractCallsSucceeded + metrics.ContractCallsFailed if totalCalls > 0 { contractCallSuccessRate = float64(metrics.ContractCallsSucceeded) / float64(totalCalls) } return map[string]interface{}{ "enabled": im.enabled, "health_score": metrics.HealthScore, "total_addresses_processed": metrics.TotalAddressesProcessed, "corruption_detections": metrics.CorruptAddressesDetected, "corruption_rate": corruptionRate, "validation_success_rate": validationSuccessRate, "contract_call_success_rate": contractCallSuccessRate, "average_corruption_score": metrics.AverageCorruptionScore, "max_corruption_score": metrics.MaxCorruptionScore, "retry_operations": metrics.RetryOperationsTriggered, "fallback_operations": metrics.FallbackOperationsUsed, "circuit_breakers_tripped": metrics.CircuitBreakersTripped, "last_corruption": metrics.LastCorruptionDetection, "recovery_actions": metrics.RecoveryActions, "errors_by_type": metrics.ErrorsByType, "alert_thresholds": im.alertThresholds, "alert_subscribers": len(im.alertSubscribers), } } // GetRecentAlerts returns the most recent corruption alerts up to the specified limit. func (im *IntegrityMonitor) GetRecentAlerts(limit int) []CorruptionAlert { im.alertsMutex.RLock() defer im.alertsMutex.RUnlock() if limit <= 0 || limit > len(im.alerts) { limit = len(im.alerts) } if limit == 0 { return []CorruptionAlert{} } start := len(im.alerts) - limit alertsCopy := make([]CorruptionAlert, limit) copy(alertsCopy, im.alerts[start:]) return alertsCopy } // SetThreshold sets an alert threshold func (im *IntegrityMonitor) SetThreshold(name string, value float64) { im.mu.Lock() defer im.mu.Unlock() im.alertThresholds[name] = value } // Enable enables the integrity monitor func (im *IntegrityMonitor) Enable() { im.mu.Lock() defer im.mu.Unlock() im.enabled = true im.logger.Info("Integrity monitor enabled") } // Disable disables the integrity monitor func (im *IntegrityMonitor) Disable() { im.mu.Lock() defer im.mu.Unlock() im.enabled = false im.logger.Info("Integrity monitor disabled") } // IsEnabled returns whether the monitor is enabled func (im *IntegrityMonitor) IsEnabled() bool { im.mu.RLock() defer im.mu.RUnlock() return im.enabled } // StartHealthCheckRunner starts the periodic health check routine func (im *IntegrityMonitor) StartHealthCheckRunner(ctx context.Context) { if im.healthCheckRunner != nil { im.healthCheckRunner.Start(ctx) } } // StopHealthCheckRunner stops the periodic health check routine func (im *IntegrityMonitor) StopHealthCheckRunner() { if im.healthCheckRunner != nil { im.healthCheckRunner.Stop() } } // GetHealthCheckRunner returns the health check runner func (im *IntegrityMonitor) GetHealthCheckRunner() *HealthCheckRunner { return im.healthCheckRunner }