// Package monitoring provides periodic health checking on top of the
// integrity monitor.
package monitoring

import (
	"context"
	"fmt"
	"sync"
	"time"

	"github.com/fraktal/mev-beta/internal/logger"
)

// defaultHealthCheckInterval is the interval used by NewHealthCheckRunner and
// the fallback when the configured interval is not positive.
const defaultHealthCheckInterval = 30 * time.Second

// HealthCheckRunner performs periodic health checks and monitoring.
//
// All mutable fields are guarded by mu. Methods whose name ends in "Locked"
// expect the caller to already hold mu.
type HealthCheckRunner struct {
	mu               sync.RWMutex
	logger           *logger.Logger
	integrityMonitor *IntegrityMonitor
	checkInterval    time.Duration
	running          bool
	// stopChan is closed by Stop to end the current loop; Start replaces it
	// with a fresh channel on every run.
	stopChan        chan struct{}
	lastHealthCheck time.Time
	healthHistory   []HealthSnapshot
	maxHistorySize  int
	// snapshotsRecorded counts every snapshot ever appended. Unlike
	// len(healthHistory) it keeps growing after the history is trimmed.
	snapshotsRecorded     uint64
	warmupSamples         int
	minAddressesForAlerts int64
}

// HealthSnapshot represents a point-in-time health snapshot.
type HealthSnapshot struct {
	Timestamp           time.Time
	HealthScore         float64
	CorruptionRate      float64
	ValidationSuccess   float64
	ContractCallSuccess float64
	ActiveAlerts        int
	Trend               HealthTrend
}

// HealthTrend indicates the direction of health metrics.
type HealthTrend int

const (
	HealthTrendUnknown HealthTrend = iota
	HealthTrendImproving
	HealthTrendStable
	HealthTrendDeclining
	HealthTrendCritical
)

// String returns the upper-case name of the trend, or "UNKNOWN" for any
// value without a dedicated name.
func (t HealthTrend) String() string {
	switch t {
	case HealthTrendImproving:
		return "IMPROVING"
	case HealthTrendStable:
		return "STABLE"
	case HealthTrendDeclining:
		return "DECLINING"
	case HealthTrendCritical:
		return "CRITICAL"
	default:
		return "UNKNOWN"
	}
}

// NewHealthCheckRunner creates a new health check runner with a 30 second
// interval, a 100-snapshot history, a 3-sample warm-up and a minimum of 25
// processed addresses before alerts are raised.
func NewHealthCheckRunner(logger *logger.Logger, integrityMonitor *IntegrityMonitor) *HealthCheckRunner {
	return &HealthCheckRunner{
		logger:                logger,
		integrityMonitor:      integrityMonitor,
		checkInterval:         defaultHealthCheckInterval,
		stopChan:              make(chan struct{}),
		healthHistory:         make([]HealthSnapshot, 0),
		maxHistorySize:        100, // Keep last 100 snapshots (50 minutes at 30s intervals)
		warmupSamples:         3,
		minAddressesForAlerts: 25,
	}
}

// Start begins the periodic health checking routine in a new goroutine.
// It is a no-op if the runner is already running. The loop ends when ctx is
// cancelled or Stop is called; afterwards Start may be called again.
func (hcr *HealthCheckRunner) Start(ctx context.Context) {
	hcr.mu.Lock()
	if hcr.running {
		hcr.mu.Unlock()
		return
	}
	hcr.running = true
	// Stop closes the channel, so every run needs its own; reusing a closed
	// channel would make the new loop exit immediately.
	stop := make(chan struct{})
	hcr.stopChan = stop
	interval := hcr.checkInterval
	hcr.mu.Unlock()

	hcr.logger.Info("Starting health check runner", "interval", interval)

	go hcr.healthCheckLoop(ctx, stop, interval)
}

// Stop stops the health checking routine. It is a no-op if the runner is not
// running.
func (hcr *HealthCheckRunner) Stop() {
	hcr.mu.Lock()
	if !hcr.running {
		hcr.mu.Unlock()
		return
	}
	hcr.running = false
	close(hcr.stopChan)
	hcr.mu.Unlock()

	hcr.logger.Info("Health check runner stopped")
}

// healthCheckLoop runs one health check immediately and then one per tick
// until ctx is cancelled or stop is closed. stop and interval are passed in
// explicitly so the loop never observes the channel or interval of a later
// run.
func (hcr *HealthCheckRunner) healthCheckLoop(ctx context.Context, stop chan struct{}, interval time.Duration) {
	if interval <= 0 {
		// time.NewTicker panics on a non-positive duration.
		interval = defaultHealthCheckInterval
	}
	ticker := time.NewTicker(interval)
	defer ticker.Stop()

	// Perform initial health check
	hcr.performHealthCheck()

	for {
		select {
		case <-ctx.Done():
			// Clear the running flag so IsRunning is accurate and Start can
			// be called again, but only if this loop still owns the run.
			hcr.mu.Lock()
			if hcr.stopChan == stop {
				hcr.running = false
			}
			hcr.mu.Unlock()
			hcr.logger.Info("Health check runner stopped due to context cancellation")
			return
		case <-stop:
			hcr.logger.Info("Health check runner stopped")
			return
		case <-ticker.C:
			hcr.performHealthCheck()
		}
	}
}

// performHealthCheck executes a comprehensive health check: it reads the
// integrity monitor's metrics, derives success/corruption rates, records a
// snapshot, evaluates alert thresholds and logs the result.
func (hcr *HealthCheckRunner) performHealthCheck() {
	start := time.Now()

	hcr.mu.Lock()
	hcr.lastHealthCheck = start
	hcr.mu.Unlock()

	if hcr.integrityMonitor == nil || !hcr.integrityMonitor.IsEnabled() {
		hcr.logger.Debug("Skipping health check - integrity monitor disabled")
		return
	}

	// Get current metrics
	metrics := hcr.integrityMonitor.GetMetrics()
	healthSummary := hcr.integrityMonitor.GetHealthSummary()

	// Calculate rates; each stays 0 when its denominator is 0.
	corruptionRate := 0.0
	if metrics.TotalAddressesProcessed > 0 {
		corruptionRate = float64(metrics.CorruptAddressesDetected) / float64(metrics.TotalAddressesProcessed)
	}

	validationSuccessRate := 0.0
	totalValidations := metrics.AddressValidationPassed + metrics.AddressValidationFailed
	if totalValidations > 0 {
		validationSuccessRate = float64(metrics.AddressValidationPassed) / float64(totalValidations)
	}

	contractCallSuccessRate := 0.0
	totalCalls := metrics.ContractCallsSucceeded + metrics.ContractCallsFailed
	if totalCalls > 0 {
		contractCallSuccessRate = float64(metrics.ContractCallsSucceeded) / float64(totalCalls)
	}

	// Create health snapshot. The trend is computed against the history as
	// it was before this snapshot is appended.
	snapshot := HealthSnapshot{
		Timestamp:           start,
		HealthScore:         metrics.HealthScore,
		CorruptionRate:      corruptionRate,
		ValidationSuccess:   validationSuccessRate,
		ContractCallSuccess: contractCallSuccessRate,
		// NOTE(review): ActiveAlerts is never populated; alerts are sent
		// after the snapshot has already been stored.
		ActiveAlerts: 0,
		Trend:        hcr.calculateHealthTrend(metrics.HealthScore),
	}

	// Add to history
	hcr.addHealthSnapshot(snapshot)

	// Check for threshold violations and generate alerts
	hcr.checkThresholds(healthSummary, snapshot)

	// Log health status periodically
	hcr.logHealthStatus(snapshot, time.Since(start))
}

// addHealthSnapshot appends a snapshot to the health history and drops the
// oldest entries once the history exceeds maxHistorySize.
func (hcr *HealthCheckRunner) addHealthSnapshot(snapshot HealthSnapshot) {
	hcr.mu.Lock()
	defer hcr.mu.Unlock()

	hcr.healthHistory = append(hcr.healthHistory, snapshot)
	hcr.snapshotsRecorded++

	// Trim history if it exceeds max size
	if len(hcr.healthHistory) > hcr.maxHistorySize {
		hcr.healthHistory = hcr.healthHistory[len(hcr.healthHistory)-hcr.maxHistorySize:]
	}
}

// calculateHealthTrend compares currentScore with the oldest of the last five
// stored scores. It returns HealthTrendUnknown while fewer than three
// snapshots are stored, HealthTrendCritical when currentScore is below 0.5,
// and otherwise Improving/Declining/Stable depending on whether the score
// moved by more than 0.05.
func (hcr *HealthCheckRunner) calculateHealthTrend(currentScore float64) HealthTrend {
	const (
		minHistory       = 3
		window           = 5
		criticalScore    = 0.5
		significantDelta = 0.05
	)

	hcr.mu.RLock()
	defer hcr.mu.RUnlock()

	n := len(hcr.healthHistory)
	if n < minHistory {
		return HealthTrendUnknown
	}
	if currentScore < criticalScore {
		return HealthTrendCritical
	}

	oldest := n - window
	if oldest < 0 {
		oldest = 0
	}

	// Simple linear trend: newest score minus the oldest score in the window.
	diff := currentScore - hcr.healthHistory[oldest].HealthScore
	switch {
	case diff > significantDelta:
		return HealthTrendImproving
	case diff < -significantDelta:
		return HealthTrendDeclining
	default:
		return HealthTrendStable
	}
}

// checkThresholds checks the snapshot for threshold violations and sends an
// alert through the integrity monitor for each one. Nothing is sent until
// readyForAlerts reports that the warm-up conditions are met.
func (hcr *HealthCheckRunner) checkThresholds(healthSummary map[string]interface{}, snapshot HealthSnapshot) {
	if !hcr.readyForAlerts(healthSummary, snapshot) {
		hcr.logger.Debug("Health alerts suppressed during warm-up",
			"health_score", snapshot.HealthScore,
			"total_addresses_processed", safeNumericLookup(healthSummary, "total_addresses_processed"),
			"history_size", hcr.historySize())
		return
	}

	// Critical health score alert
	if snapshot.HealthScore < 0.5 {
		alert := CorruptionAlert{
			Timestamp: time.Now(),
			Severity:  AlertSeverityEmergency,
			Message:   fmt.Sprintf("CRITICAL: System health score is %.2f (below 0.5)", snapshot.HealthScore),
			Context: map[string]interface{}{
				"health_score":          snapshot.HealthScore,
				"corruption_rate":       snapshot.CorruptionRate,
				"validation_success":    snapshot.ValidationSuccess,
				"contract_call_success": snapshot.ContractCallSuccess,
				"trend":                 snapshot.Trend.String(),
			},
		}
		hcr.integrityMonitor.sendAlert(alert)
	}

	// High corruption rate alert
	if snapshot.CorruptionRate > 0.10 { // 10% corruption rate
		alert := CorruptionAlert{
			Timestamp: time.Now(),
			Severity:  AlertSeverityCritical,
			Message:   fmt.Sprintf("High corruption rate detected: %.2f%%", snapshot.CorruptionRate*100),
			Context: map[string]interface{}{
				"corruption_rate": snapshot.CorruptionRate,
				"threshold":       0.10,
				// NOTE(review): this carries the corruption rate, not a
				// count of addresses - confirm what consumers expect.
				"addresses_affected": snapshot.CorruptionRate,
			},
		}
		hcr.integrityMonitor.sendAlert(alert)
	}

	// Declining trend alert
	if snapshot.Trend == HealthTrendDeclining || snapshot.Trend == HealthTrendCritical {
		alert := CorruptionAlert{
			Timestamp: time.Now(),
			Severity:  AlertSeverityWarning,
			Message:   fmt.Sprintf("System health trend is %s (current score: %.2f)", snapshot.Trend.String(), snapshot.HealthScore),
			Context: map[string]interface{}{
				"trend":            snapshot.Trend.String(),
				"health_score":     snapshot.HealthScore,
				"recent_snapshots": hcr.getRecentSnapshots(5),
			},
		}
		hcr.integrityMonitor.sendAlert(alert)
	}
}

// readyForAlerts reports whether enough data has been collected to raise
// alerts: at least warmupSamples snapshots are stored, the summary's
// "total_addresses_processed" (when present) has reached
// minAddressesForAlerts, and some activity has been observed.
func (hcr *HealthCheckRunner) readyForAlerts(healthSummary map[string]interface{}, snapshot HealthSnapshot) bool {
	if hcr.historySize() < hcr.warmupSamples {
		return false
	}

	// A negative value means the key was missing or not numeric; in that
	// case the address-count gate is skipped.
	totalProcessed := safeNumericLookup(healthSummary, "total_addresses_processed")
	if totalProcessed >= 0 && totalProcessed < float64(hcr.minAddressesForAlerts) {
		return false
	}

	// Require at least one validation or contract call attempt before alarming.
	if snapshot.ValidationSuccess == 0 && snapshot.ContractCallSuccess == 0 && totalProcessed == 0 {
		return false
	}

	return true
}

// safeNumericLookup returns summary[key] converted to float64. It returns -1
// when summary is nil, the key is absent, or the value is not one of the
// handled integer or floating-point types.
func safeNumericLookup(summary map[string]interface{}, key string) float64 {
	if summary == nil {
		return -1
	}

	value, ok := summary[key]
	if !ok {
		return -1
	}

	switch v := value.(type) {
	case int:
		return float64(v)
	case int32:
		return float64(v)
	case int64:
		return float64(v)
	case uint:
		return float64(v)
	case uint32:
		return float64(v)
	case uint64:
		return float64(v)
	case float32:
		return float64(v)
	case float64:
		return v
	default:
		return -1
	}
}

// historySize returns the number of snapshots currently stored.
func (hcr *HealthCheckRunner) historySize() int {
	hcr.mu.RLock()
	defer hcr.mu.RUnlock()
	return len(hcr.healthHistory)
}

// logHealthStatus logs the outcome of a health check: a detailed Info entry
// on every tenth recorded snapshot and a brief Debug entry otherwise.
func (hcr *HealthCheckRunner) logHealthStatus(snapshot HealthSnapshot, duration time.Duration) {
	// Count recorded snapshots rather than len(healthHistory): the history
	// is capped, so its length stops changing once it is full.
	hcr.mu.RLock()
	recorded := hcr.snapshotsRecorded
	hcr.mu.RUnlock()

	// Log detailed status every 5 minutes (10 checks at 30s intervals)
	if recorded%10 == 0 {
		hcr.logger.Info("System health status",
			"health_score", snapshot.HealthScore,
			"corruption_rate", fmt.Sprintf("%.4f", snapshot.CorruptionRate),
			"validation_success", fmt.Sprintf("%.4f", snapshot.ValidationSuccess),
			"contract_call_success", fmt.Sprintf("%.4f", snapshot.ContractCallSuccess),
			"trend", snapshot.Trend.String(),
			"check_duration", duration)
		return
	}

	// Brief status for regular checks
	hcr.logger.Debug("Health check completed",
		"health_score", snapshot.HealthScore,
		"trend", snapshot.Trend.String(),
		"duration", duration)
}

// GetRecentSnapshots returns a copy of the most recent health snapshots,
// oldest first, containing at most count entries. A non-positive count
// yields an empty slice.
func (hcr *HealthCheckRunner) GetRecentSnapshots(count int) []HealthSnapshot {
	return hcr.getRecentSnapshots(count)
}

// getRecentSnapshots is the locking implementation behind GetRecentSnapshots.
func (hcr *HealthCheckRunner) getRecentSnapshots(count int) []HealthSnapshot {
	hcr.mu.RLock()
	defer hcr.mu.RUnlock()
	return hcr.recentSnapshotsLocked(count)
}

// recentSnapshotsLocked copies up to count of the newest snapshots. The
// caller must hold hcr.mu (read or write).
func (hcr *HealthCheckRunner) recentSnapshotsLocked(count int) []HealthSnapshot {
	if count <= 0 || len(hcr.healthHistory) == 0 {
		return []HealthSnapshot{}
	}

	start := len(hcr.healthHistory) - count
	if start < 0 {
		start = 0
	}

	// Create a copy to avoid external modification
	snapshots := make([]HealthSnapshot, len(hcr.healthHistory)-start)
	copy(snapshots, hcr.healthHistory[start:])
	return snapshots
}

// GetHealthSummary returns a comprehensive health summary. When no snapshot
// has been recorded yet, only the runner state is reported and "last_check"
// is nil.
func (hcr *HealthCheckRunner) GetHealthSummary() map[string]interface{} {
	hcr.mu.RLock()
	defer hcr.mu.RUnlock()

	if len(hcr.healthHistory) == 0 {
		return map[string]interface{}{
			"running":        hcr.running,
			"check_interval": hcr.checkInterval.String(),
			"history_size":   0,
			"last_check":     nil,
		}
	}

	lastSnapshot := hcr.healthHistory[len(hcr.healthHistory)-1]

	return map[string]interface{}{
		"running":               hcr.running,
		"check_interval":        hcr.checkInterval.String(),
		"history_size":          len(hcr.healthHistory),
		"last_check":            hcr.lastHealthCheck,
		"current_health_score":  lastSnapshot.HealthScore,
		"current_trend":         lastSnapshot.Trend.String(),
		"corruption_rate":       lastSnapshot.CorruptionRate,
		"validation_success":    lastSnapshot.ValidationSuccess,
		"contract_call_success": lastSnapshot.ContractCallSuccess,
		// The read lock is already held; sync.RWMutex must not be
		// read-locked recursively, so use the lock-free helper.
		"recent_snapshots": hcr.recentSnapshotsLocked(10),
	}
}

// SetCheckInterval sets the health check interval. Non-positive intervals are
// ignored because a ticker cannot be created from them. The new value is
// picked up the next time Start is called; a loop that is already running
// keeps its current ticker.
func (hcr *HealthCheckRunner) SetCheckInterval(interval time.Duration) {
	if interval <= 0 {
		hcr.logger.Info("Ignoring non-positive health check interval", "interval", interval)
		return
	}

	hcr.mu.Lock()
	hcr.checkInterval = interval
	hcr.mu.Unlock()

	hcr.logger.Info("Health check interval updated", "interval", interval)
}

// IsRunning returns whether the health checker is running.
func (hcr *HealthCheckRunner) IsRunning() bool {
	hcr.mu.RLock()
	defer hcr.mu.RUnlock()
	return hcr.running
}