package lifecycle

import (
	"context"
	"errors"
	"fmt"
	"os"
	"sync"
	"time"

	"github.com/fraktal/mev-beta/internal/logger"
)

// HealthMonitorImpl implements comprehensive health monitoring for modules
type HealthMonitorImpl struct {
	monitors                 map[string]*ModuleMonitor
	config                   HealthMonitorConfig
	aggregator               HealthAggregator
	notifier                 HealthNotifier
	metrics                  HealthMetrics
	metricsMu                sync.Mutex // guards metrics, which is updated from concurrent health checks
	rules                    []HealthRule
	notificationErrors       []error
	notificationErrorDetails []RecordedError
	notifyMu                 sync.Mutex
	mu                       sync.RWMutex
	ctx                      context.Context
	cancel                   context.CancelFunc
	running                  bool
	logger                   *logger.Logger
}

// ModuleMonitor monitors a specific module's health
type ModuleMonitor struct {
	moduleID      string
	module        *RegisteredModule
	config        ModuleHealthConfig
	lastCheck     time.Time
	checkCount    int64
	successCount  int64
	failureCount  int64
	history       []HealthCheckResult
	currentHealth ModuleHealth
	trend         HealthTrend
	mu            sync.RWMutex
}

// HealthMonitorConfig configures the health monitoring system
type HealthMonitorConfig struct {
	CheckInterval          time.Duration `json:"check_interval"`
	CheckTimeout           time.Duration `json:"check_timeout"`
	HistorySize            int           `json:"history_size"`
	FailureThreshold       int           `json:"failure_threshold"`
	RecoveryThreshold      int           `json:"recovery_threshold"`
	EnableNotifications    bool          `json:"enable_notifications"`
	EnableMetrics          bool          `json:"enable_metrics"`
	EnableTrends           bool          `json:"enable_trends"`
	ParallelChecks         bool          `json:"parallel_checks"`
	MaxConcurrentChecks    int           `json:"max_concurrent_checks"`
	NotificationRetries    int           `json:"notification_retries"`
	NotificationRetryDelay time.Duration `json:"notification_retry_delay"`
}

// ModuleHealthConfig configures health checking for a specific module
type ModuleHealthConfig struct {
	CheckInterval     time.Duration `json:"check_interval"`
	CheckTimeout      time.Duration `json:"check_timeout"`
	Enabled           bool          `json:"enabled"`
	CriticalModule    bool          `json:"critical_module"`
	CustomChecks      []HealthCheck `json:"custom_checks"`
	FailureThreshold  int           `json:"failure_threshold"`
	RecoveryThreshold int           `json:"recovery_threshold"`
	AutoRestart       bool          `json:"auto_restart"`
	MaxRestarts       int           `json:"max_restarts"`
	RestartDelay      time.Duration `json:"restart_delay"`
}

// HealthCheck represents a custom health check
type HealthCheck struct {
	Name        string        `json:"name"`
	Description string        `json:"description"`
	CheckFunc   func() error  `json:"-"`
	Interval    time.Duration `json:"interval"`
	Timeout     time.Duration `json:"timeout"`
	Critical    bool          `json:"critical"`
	Enabled     bool          `json:"enabled"`
}

// HealthCheckResult represents the result of a health check
type HealthCheckResult struct {
	Timestamp    time.Time              `json:"timestamp"`
	Status       HealthStatus           `json:"status"`
	ResponseTime time.Duration          `json:"response_time"`
	Message      string                 `json:"message"`
	Details      map[string]interface{} `json:"details"`
	Checks       map[string]CheckResult `json:"checks"`
	Error        error                  `json:"error,omitempty"`
}

// CheckResult represents the result of an individual check
type CheckResult struct {
	Name         string                 `json:"name"`
	Status       HealthStatus           `json:"status"`
	ResponseTime time.Duration          `json:"response_time"`
	Message      string                 `json:"message"`
	Details      map[string]interface{} `json:"details"`
	Error        error                  `json:"error,omitempty"`
}

// HealthTrend tracks health trends over time
type HealthTrend struct {
	Direction   TrendDirection `json:"direction"`
	Confidence  float64        `json:"confidence"`
	Slope       float64        `json:"slope"`
	Prediction  HealthStatus   `json:"prediction"`
	TimeToAlert time.Duration  `json:"time_to_alert"`
	LastUpdated time.Time      `json:"last_updated"`
}
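// Illustrative sketch (not part of the production configuration): building a
// ModuleHealthConfig with a single critical custom HealthCheck. The ping
// function is a hypothetical dependency supplied by the caller; any
// func() error can serve as a CheckFunc, and the durations are example values.
func exampleModuleHealthConfig(ping func() error) ModuleHealthConfig {
	return ModuleHealthConfig{
		CheckInterval:     15 * time.Second,
		CheckTimeout:      5 * time.Second,
		Enabled:           true,
		CriticalModule:    true,
		FailureThreshold:  3,
		RecoveryThreshold: 2,
		CustomChecks: []HealthCheck{
			{
				Name:        "database_ping",
				Description: "Verifies the backing database responds",
				CheckFunc:   ping,
				Interval:    15 * time.Second,
				Timeout:     2 * time.Second,
				Critical:    true,
				Enabled:     true,
			},
		},
	}
}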
// TrendDirection indicates the health trend direction
type TrendDirection string

const (
	TrendImproving TrendDirection = "improving"
	TrendStable    TrendDirection = "stable"
	TrendDegrading TrendDirection = "degrading"
	TrendUnknown   TrendDirection = "unknown"
)

// HealthAggregator aggregates health status from multiple modules
type HealthAggregator interface {
	AggregateHealth(modules map[string]ModuleHealth) OverallHealth
	CalculateSystemHealth(individual []ModuleHealth) HealthStatus
	GetHealthScore(health ModuleHealth) float64
}

// HealthNotifier sends health notifications
type HealthNotifier interface {
	NotifyHealthChange(moduleID string, oldHealth, newHealth ModuleHealth) error
	NotifySystemHealth(health OverallHealth) error
	NotifyAlert(alert HealthAlert) error
}

// OverallHealth represents the overall system health
type OverallHealth struct {
	Status          HealthStatus            `json:"status"`
	Score           float64                 `json:"score"`
	ModuleCount     int                     `json:"module_count"`
	HealthyCount    int                     `json:"healthy_count"`
	DegradedCount   int                     `json:"degraded_count"`
	UnhealthyCount  int                     `json:"unhealthy_count"`
	CriticalIssues  []string                `json:"critical_issues"`
	Modules         map[string]ModuleHealth `json:"modules"`
	LastUpdated     time.Time               `json:"last_updated"`
	Trends          map[string]HealthTrend  `json:"trends"`
	Recommendations []HealthRecommendation  `json:"recommendations"`
}

// HealthAlert represents a health alert
type HealthAlert struct {
	ID         string                 `json:"id"`
	ModuleID   string                 `json:"module_id"`
	Severity   AlertSeverity          `json:"severity"`
	Type       AlertType              `json:"type"`
	Message    string                 `json:"message"`
	Details    map[string]interface{} `json:"details"`
	Timestamp  time.Time              `json:"timestamp"`
	Resolved   bool                   `json:"resolved"`
	ResolvedAt time.Time              `json:"resolved_at,omitempty"`
}

// AlertSeverity defines alert severity levels
type AlertSeverity string

const (
	SeverityInfo     AlertSeverity = "info"
	SeverityWarning  AlertSeverity = "warning"
	SeverityError    AlertSeverity = "error"
	SeverityCritical AlertSeverity = "critical"
)

// AlertType defines types of alerts
type AlertType string

const (
	AlertHealthChange    AlertType = "health_change"
	AlertThresholdBreach AlertType = "threshold_breach"
	AlertTrendAlert      AlertType = "trend_alert"
	AlertSystemDown      AlertType = "system_down"
	AlertRecovery        AlertType = "recovery"
)

// HealthRule defines rules for health evaluation
type HealthRule struct {
	Name        string                           `json:"name"`
	Description string                           `json:"description"`
	Condition   func(ModuleHealth) bool          `json:"-"`
	Action      func(string, ModuleHealth) error `json:"-"`
	Severity    AlertSeverity                    `json:"severity"`
	Enabled     bool                             `json:"enabled"`
}

// HealthRecommendation provides actionable health recommendations
type HealthRecommendation struct {
	ModuleID    string    `json:"module_id"`
	Type        string    `json:"type"`
	Description string    `json:"description"`
	Action      string    `json:"action"`
	Priority    string    `json:"priority"`
	Timestamp   time.Time `json:"timestamp"`
}

// HealthMetrics tracks health monitoring metrics
type HealthMetrics struct {
	ChecksPerformed    int64              `json:"checks_performed"`
	ChecksSuccessful   int64              `json:"checks_successful"`
	ChecksFailed       int64              `json:"checks_failed"`
	AverageCheckTime   time.Duration      `json:"average_check_time"`
	AlertsGenerated    int64              `json:"alerts_generated"`
	ModuleRestarts     int64              `json:"module_restarts"`
	SystemDowntime     time.Duration      `json:"system_downtime"`
	ModuleHealthScores map[string]float64 `json:"module_health_scores"`
	TrendAccuracy      float64            `json:"trend_accuracy"`
}
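// Illustrative sketch of a custom HealthNotifier: this variant only writes to
// standard output to demonstrate the interface contract. A production notifier
// would typically forward to a webhook, pager, or message bus instead. Install
// it with SetHealthNotifier.
type stdoutHealthNotifier struct{}

func (n *stdoutHealthNotifier) NotifyHealthChange(moduleID string, oldHealth, newHealth ModuleHealth) error {
	fmt.Printf("health change: module=%s %v -> %v\n", moduleID, oldHealth.Status, newHealth.Status)
	return nil
}

func (n *stdoutHealthNotifier) NotifySystemHealth(health OverallHealth) error {
	fmt.Printf("system health: status=%v score=%.2f modules=%d\n", health.Status, health.Score, health.ModuleCount)
	return nil
}

func (n *stdoutHealthNotifier) NotifyAlert(alert HealthAlert) error {
	fmt.Printf("alert [%v] module=%s: %s\n", alert.Severity, alert.ModuleID, alert.Message)
	return nil
}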
// NewHealthMonitor creates a new health monitor
func NewHealthMonitor(config HealthMonitorConfig) *HealthMonitorImpl {
	ctx, cancel := context.WithCancel(context.Background())

	hm := &HealthMonitorImpl{
		monitors:                 make(map[string]*ModuleMonitor),
		config:                   config,
		aggregator:               NewDefaultHealthAggregator(),
		notifier:                 NewDefaultHealthNotifier(),
		rules:                    make([]HealthRule, 0),
		notificationErrors:       make([]error, 0),
		notificationErrorDetails: make([]RecordedError, 0),
		ctx:                      ctx,
		cancel:                   cancel,
		metrics: HealthMetrics{
			ModuleHealthScores: make(map[string]float64),
		},
	}

	// Set default configuration
	if hm.config.CheckInterval == 0 {
		hm.config.CheckInterval = 30 * time.Second
	}
	if hm.config.CheckTimeout == 0 {
		hm.config.CheckTimeout = 10 * time.Second
	}
	if hm.config.HistorySize == 0 {
		hm.config.HistorySize = 100
	}
	if hm.config.FailureThreshold == 0 {
		hm.config.FailureThreshold = 3
	}
	if hm.config.RecoveryThreshold == 0 {
		hm.config.RecoveryThreshold = 3
	}
	if hm.config.MaxConcurrentChecks == 0 {
		hm.config.MaxConcurrentChecks = 10
	}
	if hm.config.NotificationRetries == 0 {
		hm.config.NotificationRetries = 3
	}
	if hm.config.NotificationRetryDelay == 0 {
		hm.config.NotificationRetryDelay = 500 * time.Millisecond
	}

	// Setup default health rules
	hm.setupDefaultRules()

	if err := os.MkdirAll("logs", 0o755); err != nil {
		fmt.Printf("failed to ensure logs directory: %v\n", err)
	}
	hm.logger = logger.New("info", "", "logs/lifecycle_health.log")

	return hm
}

// Start starts the health monitoring system
func (hm *HealthMonitorImpl) Start() error {
	hm.mu.Lock()
	defer hm.mu.Unlock()

	if hm.running {
		return fmt.Errorf("health monitor already running")
	}

	hm.running = true

	// Start monitoring loop
	go hm.monitoringLoop()

	return nil
}

// Stop stops the health monitoring system
func (hm *HealthMonitorImpl) Stop() error {
	hm.mu.Lock()
	defer hm.mu.Unlock()

	if !hm.running {
		return nil
	}

	hm.cancel()
	hm.running = false

	return nil
}

// StartMonitoring starts monitoring a specific module
func (hm *HealthMonitorImpl) StartMonitoring(module *RegisteredModule) error {
	hm.mu.Lock()
	defer hm.mu.Unlock()

	moduleID := module.ID

	// Create module monitor
	monitor := &ModuleMonitor{
		moduleID: moduleID,
		module:   module,
		config: ModuleHealthConfig{
			CheckInterval:     hm.config.CheckInterval,
			CheckTimeout:      hm.config.CheckTimeout,
			Enabled:           true,
			CriticalModule:    module.Config.CriticalModule,
			FailureThreshold:  hm.config.FailureThreshold,
			RecoveryThreshold: hm.config.RecoveryThreshold,
			AutoRestart:       module.Config.MaxRestarts > 0,
			MaxRestarts:       module.Config.MaxRestarts,
			RestartDelay:      module.Config.RestartDelay,
		},
		history: make([]HealthCheckResult, 0),
		currentHealth: ModuleHealth{
			Status:    HealthUnknown,
			LastCheck: time.Now(),
		},
	}

	hm.monitors[moduleID] = monitor

	return nil
}

// StopMonitoring stops monitoring a specific module
func (hm *HealthMonitorImpl) StopMonitoring(moduleID string) error {
	hm.mu.Lock()
	defer hm.mu.Unlock()

	delete(hm.monitors, moduleID)

	hm.metricsMu.Lock()
	delete(hm.metrics.ModuleHealthScores, moduleID)
	hm.metricsMu.Unlock()

	return nil
}

// CheckHealth performs a health check on a specific module
func (hm *HealthMonitorImpl) CheckHealth(module *RegisteredModule) ModuleHealth {
	moduleID := module.ID

	hm.mu.RLock()
	monitor, exists := hm.monitors[moduleID]
	hm.mu.RUnlock()

	if !exists {
		return ModuleHealth{
			Status:  HealthUnknown,
			Message: "Module not monitored",
		}
	}

	return hm.performHealthCheck(monitor)
}

// GetHealthStatus returns the health status of all monitored modules
func (hm *HealthMonitorImpl) GetHealthStatus() map[string]ModuleHealth {
	hm.mu.RLock()
	defer hm.mu.RUnlock()

	status := make(map[string]ModuleHealth)
	for moduleID, monitor := range hm.monitors {
		monitor.mu.RLock()
		status[moduleID] = monitor.currentHealth
		monitor.mu.RUnlock()
	}

	return status
}
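// Illustrative sketch: typical wiring of the monitor. The *RegisteredModule
// value would normally come from the lifecycle registry; the config values are
// example choices, not recommendations.
func exampleStartHealthMonitoring(module *RegisteredModule) (*HealthMonitorImpl, error) {
	hm := NewHealthMonitor(HealthMonitorConfig{
		CheckInterval:       10 * time.Second,
		EnableNotifications: true,
		EnableTrends:        true,
		ParallelChecks:      true,
	})
	if err := hm.Start(); err != nil {
		return nil, err
	}
	if err := hm.StartMonitoring(module); err != nil {
		_ = hm.Stop()
		return nil, err
	}
	return hm, nil
}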
// GetOverallHealth returns the overall system health
func (hm *HealthMonitorImpl) GetOverallHealth() OverallHealth {
	hm.mu.RLock()
	defer hm.mu.RUnlock()

	moduleHealths := make(map[string]ModuleHealth)
	for moduleID, monitor := range hm.monitors {
		monitor.mu.RLock()
		moduleHealths[moduleID] = monitor.currentHealth
		monitor.mu.RUnlock()
	}

	return hm.aggregator.AggregateHealth(moduleHealths)
}

// AddHealthRule adds a custom health rule
func (hm *HealthMonitorImpl) AddHealthRule(rule HealthRule) {
	hm.mu.Lock()
	defer hm.mu.Unlock()

	hm.rules = append(hm.rules, rule)
}

// SetHealthAggregator sets a custom health aggregator
func (hm *HealthMonitorImpl) SetHealthAggregator(aggregator HealthAggregator) {
	hm.mu.Lock()
	defer hm.mu.Unlock()

	hm.aggregator = aggregator
}

// SetHealthNotifier sets a custom health notifier
func (hm *HealthMonitorImpl) SetHealthNotifier(notifier HealthNotifier) {
	hm.mu.Lock()
	defer hm.mu.Unlock()

	hm.notifier = notifier
}

// GetMetrics returns health monitoring metrics
func (hm *HealthMonitorImpl) GetMetrics() HealthMetrics {
	hm.metricsMu.Lock()
	defer hm.metricsMu.Unlock()

	metrics := hm.metrics
	// Copy the score map so callers don't share mutable internal state
	metrics.ModuleHealthScores = make(map[string]float64, len(hm.metrics.ModuleHealthScores))
	for moduleID, score := range hm.metrics.ModuleHealthScores {
		metrics.ModuleHealthScores[moduleID] = score
	}
	return metrics
}

// Private methods

func (hm *HealthMonitorImpl) monitoringLoop() {
	ticker := time.NewTicker(hm.config.CheckInterval)
	defer ticker.Stop()

	for {
		select {
		case <-hm.ctx.Done():
			return
		case <-ticker.C:
			hm.performAllHealthChecks()
		}
	}
}

func (hm *HealthMonitorImpl) performAllHealthChecks() {
	hm.mu.RLock()
	monitors := make([]*ModuleMonitor, 0, len(hm.monitors))
	for _, monitor := range hm.monitors {
		monitors = append(monitors, monitor)
	}
	hm.mu.RUnlock()

	if hm.config.ParallelChecks {
		hm.performHealthChecksParallel(monitors)
	} else {
		hm.performHealthChecksSequential(monitors)
	}

	// Update overall health and send notifications
	overallHealth := hm.GetOverallHealth()
	if hm.config.EnableNotifications {
		if notifyErr := hm.notifyWithRetry(
			func() error { return hm.notifier.NotifySystemHealth(overallHealth) },
			"Failed to notify system health",
			"overall_health_status", overallHealth.Status,
		); notifyErr != nil {
			// Log the system health notification failure but don't fail the health checks
			hm.logger.Warn("Failed to notify system health after retries",
				"error", notifyErr,
				"overall_health_status", overallHealth.Status)
		}
	}
}

func (hm *HealthMonitorImpl) performHealthChecksSequential(monitors []*ModuleMonitor) {
	for _, monitor := range monitors {
		if monitor.config.Enabled {
			hm.performHealthCheck(monitor)
		}
	}
}

func (hm *HealthMonitorImpl) performHealthChecksParallel(monitors []*ModuleMonitor) {
	semaphore := make(chan struct{}, hm.config.MaxConcurrentChecks)
	var wg sync.WaitGroup

	for _, monitor := range monitors {
		if monitor.config.Enabled {
			wg.Add(1)
			go func(m *ModuleMonitor) {
				defer wg.Done()
				semaphore <- struct{}{}
				defer func() { <-semaphore }()
				hm.performHealthCheck(m)
			}(monitor)
		}
	}

	wg.Wait()
}
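// Illustrative sketch: registering a custom rule through AddHealthRule. The
// restart threshold is an arbitrary example value; the alert is delivered
// through whatever notifier is currently configured.
func exampleAddRestartRule(hm *HealthMonitorImpl) {
	hm.AddHealthRule(HealthRule{
		Name:        "excessive_restarts",
		Description: "Alert when a module has restarted more than five times",
		Condition: func(health ModuleHealth) bool {
			return health.RestartCount > 5
		},
		Action: func(moduleID string, health ModuleHealth) error {
			return hm.notifier.NotifyAlert(HealthAlert{
				ID:        fmt.Sprintf("excessive_restarts_%s_%d", moduleID, time.Now().Unix()),
				ModuleID:  moduleID,
				Severity:  SeverityWarning,
				Type:      AlertThresholdBreach,
				Message:   fmt.Sprintf("Module %s restarted %d times", moduleID, health.RestartCount),
				Timestamp: time.Now(),
			})
		},
		Severity: SeverityWarning,
		Enabled:  true,
	})
}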
func (hm *HealthMonitorImpl) performHealthCheck(monitor *ModuleMonitor) ModuleHealth {
	start := time.Now()

	monitor.mu.Lock()
	defer monitor.mu.Unlock()

	monitor.checkCount++
	monitor.lastCheck = start

	// Create check context with timeout
	ctx, cancel := context.WithTimeout(hm.ctx, monitor.config.CheckTimeout)
	defer cancel()

	// Perform basic module health check
	moduleHealth := monitor.module.Instance.GetHealth()

	// Perform custom health checks
	checkResults := make(map[string]CheckResult)
	for _, check := range monitor.config.CustomChecks {
		if check.Enabled {
			checkResult := hm.performCustomCheck(ctx, check)
			checkResults[check.Name] = checkResult

			// Update overall status based on check results
			if check.Critical && checkResult.Status != HealthHealthy {
				moduleHealth.Status = HealthUnhealthy
			}
		}
	}

	// Create health check result
	result := HealthCheckResult{
		Timestamp:    start,
		Status:       moduleHealth.Status,
		ResponseTime: time.Since(start),
		Message:      moduleHealth.Message,
		Details:      moduleHealth.Details,
		Checks:       checkResults,
	}

	// Update statistics
	if result.Status == HealthHealthy {
		monitor.successCount++
	} else {
		monitor.failureCount++
	}

	// Add to history
	monitor.history = append(monitor.history, result)
	if len(monitor.history) > hm.config.HistorySize {
		monitor.history = monitor.history[1:]
	}

	// Update current health
	oldHealth := monitor.currentHealth
	monitor.currentHealth = moduleHealth
	monitor.currentHealth.LastCheck = start
	monitor.currentHealth.RestartCount = int(monitor.module.HealthStatus.RestartCount)

	// Calculate uptime
	if !monitor.module.StartTime.IsZero() {
		monitor.currentHealth.Uptime = time.Since(monitor.module.StartTime)
	}

	// Update trends if enabled
	if hm.config.EnableTrends {
		monitor.trend = hm.calculateHealthTrend(monitor)
	}

	// Apply health rules
	hm.applyHealthRules(monitor.moduleID, monitor.currentHealth)

	// Notify on status transitions; log notification failures but don't fail the health check
	if hm.config.EnableNotifications && oldHealth.Status != monitor.currentHealth.Status {
		if notifyErr := hm.notifyWithRetry(
			func() error {
				return hm.notifier.NotifyHealthChange(monitor.moduleID, oldHealth, monitor.currentHealth)
			},
			"Failed to notify health change",
			"module_id", monitor.moduleID,
			"reason", "status_change",
		); notifyErr != nil {
			hm.logger.Warn("Failed to notify health change after retries",
				"module_id", monitor.moduleID,
				"error", notifyErr,
				"old_status", oldHealth.Status,
				"new_status", monitor.currentHealth.Status)
		}
	}

	// Update metrics
	if hm.config.EnableMetrics {
		hm.updateMetrics(monitor, result)
	}

	return monitor.currentHealth
}

func (hm *HealthMonitorImpl) performCustomCheck(ctx context.Context, check HealthCheck) CheckResult {
	start := time.Now()

	result := CheckResult{
		Name:         check.Name,
		Status:       HealthHealthy,
		ResponseTime: 0,
		Message:      "Check passed",
		Details:      make(map[string]interface{}),
	}

	// Create timeout context for the check
	checkCtx, cancel := context.WithTimeout(ctx, check.Timeout)
	defer cancel()

	// Run the check
	done := make(chan error, 1)
	go func() {
		done <- check.CheckFunc()
	}()

	select {
	case err := <-done:
		result.ResponseTime = time.Since(start)
		if err != nil {
			result.Status = HealthUnhealthy
			result.Message = err.Error()
			result.Error = err
		}
	case <-checkCtx.Done():
		result.ResponseTime = time.Since(start)
		result.Status = HealthUnhealthy
		result.Message = "Check timed out"
		result.Error = checkCtx.Err()
	}

	return result
}
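// Illustrative sketch of a CheckFunc: verifies the logs directory is still
// writable using only the standard library. CheckFuncs take no arguments and
// return an error; performCustomCheck enforces the configured timeout around
// them.
func exampleLogsWritableCheck() error {
	f, err := os.CreateTemp("logs", "healthcheck-*")
	if err != nil {
		return fmt.Errorf("logs directory not writable: %w", err)
	}
	name := f.Name()
	_ = f.Close()
	return os.Remove(name)
}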
func (hm *HealthMonitorImpl) notifyWithRetry(send func() error, failureMessage string, attrs ...interface{}) error {
	if hm.notifier == nil {
		return nil
	}

	retries := hm.config.NotificationRetries
	if retries <= 0 {
		retries = 1
	}
	delay := hm.config.NotificationRetryDelay
	if delay <= 0 {
		delay = 500 * time.Millisecond
	}

	var errs []error
	for attempt := 1; attempt <= retries; attempt++ {
		if err := send(); err != nil {
			errs = append(errs, err)
			if attempt < retries {
				time.Sleep(delay)
				continue
			}
			joined := errors.Join(errs...)
			attemptAttrs := append([]interface{}{}, attrs...)
			attemptAttrs = append(attemptAttrs, "attempts", attempt)
			hm.recordNotificationError(failureMessage, joined, attemptAttrs...)
			return fmt.Errorf("%s after %d attempts: %w", failureMessage, attempt, joined)
		}
		if attempt > 1 && hm.logger != nil {
			attemptAttrs := append([]interface{}{}, attrs...)
			attemptAttrs = append(attemptAttrs, "attempts", attempt)
			hm.logger.Warn(append([]interface{}{"Health notification succeeded after retry"}, attemptAttrs...)...)
		}
		return nil
	}

	return nil
}

func (hm *HealthMonitorImpl) recordNotificationError(message string, err error, attrs ...interface{}) {
	if err == nil {
		return
	}

	attrCopy := append([]interface{}{}, attrs...)
	wrapped, txHash, attrsWithTx := enrichErrorWithTxHash(message, err, attrCopy)

	hm.notifyMu.Lock()
	hm.notificationErrors = append(hm.notificationErrors, wrapped)
	hm.notificationErrorDetails = append(hm.notificationErrorDetails, RecordedError{
		Err:    wrapped,
		TxHash: txHash,
	})
	hm.notifyMu.Unlock()

	if hm.logger != nil {
		kv := append([]interface{}{}, attrsWithTx...)
		kv = append(kv, "error", err)
		args := append([]interface{}{message}, kv...)
		hm.logger.Error(args...)
	}
}

func (hm *HealthMonitorImpl) aggregatedNotificationError() error {
	hm.notifyMu.Lock()
	defer hm.notifyMu.Unlock()

	if len(hm.notificationErrors) == 0 {
		return nil
	}

	errs := make([]error, len(hm.notificationErrors))
	copy(errs, hm.notificationErrors)
	return errors.Join(errs...)
}

// NotificationErrors returns a copy of recorded notification errors for diagnostics.
func (hm *HealthMonitorImpl) NotificationErrors() []error {
	hm.notifyMu.Lock()
	defer hm.notifyMu.Unlock()

	if len(hm.notificationErrors) == 0 {
		return nil
	}

	errs := make([]error, len(hm.notificationErrors))
	copy(errs, hm.notificationErrors)
	return errs
}
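// Illustrative sketch: surfacing accumulated notification failures to an
// operator, for example from a periodic task or an admin endpoint. Here it
// simply prints each recorded error.
func exampleDumpNotificationErrors(hm *HealthMonitorImpl) {
	for _, err := range hm.NotificationErrors() {
		fmt.Printf("pending notification error: %v\n", err)
	}
}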
// NotificationErrorDetails returns recorded notification errors with tx hash metadata.
func (hm *HealthMonitorImpl) NotificationErrorDetails() []RecordedError {
	hm.notifyMu.Lock()
	defer hm.notifyMu.Unlock()

	if len(hm.notificationErrorDetails) == 0 {
		return nil
	}

	details := make([]RecordedError, len(hm.notificationErrorDetails))
	copy(details, hm.notificationErrorDetails)
	return details
}

func (hm *HealthMonitorImpl) calculateHealthTrend(monitor *ModuleMonitor) HealthTrend {
	if len(monitor.history) < 5 {
		return HealthTrend{
			Direction:   TrendUnknown,
			Confidence:  0,
			LastUpdated: time.Now(),
		}
	}

	// Simple trend calculation based on recent health status
	recent := monitor.history[len(monitor.history)-5:]
	healthyCount := 0
	for _, result := range recent {
		if result.Status == HealthHealthy {
			healthyCount++
		}
	}

	healthRatio := float64(healthyCount) / float64(len(recent))

	var direction TrendDirection
	var confidence float64

	if healthRatio > 0.8 {
		direction = TrendImproving
		confidence = healthRatio
	} else if healthRatio < 0.4 {
		direction = TrendDegrading
		confidence = 1.0 - healthRatio
	} else {
		direction = TrendStable
		confidence = 0.5
	}

	return HealthTrend{
		Direction:   direction,
		Confidence:  confidence,
		Slope:       healthRatio - 0.5, // Simplified slope calculation
		Prediction:  hm.predictHealthStatus(healthRatio),
		LastUpdated: time.Now(),
	}
}

func (hm *HealthMonitorImpl) predictHealthStatus(healthRatio float64) HealthStatus {
	if healthRatio > 0.7 {
		return HealthHealthy
	} else if healthRatio > 0.3 {
		return HealthDegraded
	} else {
		return HealthUnhealthy
	}
}

func (hm *HealthMonitorImpl) applyHealthRules(moduleID string, health ModuleHealth) {
	for _, rule := range hm.rules {
		if rule.Enabled && rule.Condition(health) {
			if err := rule.Action(moduleID, health); err != nil {
				// Log the error but continue with other rules
				if hm.logger != nil {
					hm.logger.Warn("Health rule action failed",
						"rule", rule.Name,
						"module_id", moduleID,
						"error", err)
				}
			}
		}
	}
}

func (hm *HealthMonitorImpl) updateMetrics(monitor *ModuleMonitor, result HealthCheckResult) {
	// Metrics are shared across concurrent health checks, so guard them
	hm.metricsMu.Lock()
	defer hm.metricsMu.Unlock()

	hm.metrics.ChecksPerformed++
	if result.Status == HealthHealthy {
		hm.metrics.ChecksSuccessful++
	} else {
		hm.metrics.ChecksFailed++
	}

	// Update average check time
	if hm.metrics.ChecksPerformed > 0 {
		totalTime := hm.metrics.AverageCheckTime * time.Duration(hm.metrics.ChecksPerformed-1)
		hm.metrics.AverageCheckTime = (totalTime + result.ResponseTime) / time.Duration(hm.metrics.ChecksPerformed)
	}

	// Update health score
	score := hm.aggregator.GetHealthScore(monitor.currentHealth)
	hm.metrics.ModuleHealthScores[monitor.moduleID] = score
}

func (hm *HealthMonitorImpl) createHealthRule(name, description, messageFormat string, status HealthStatus, severity AlertSeverity) HealthRule {
	return HealthRule{
		Name:        name,
		Description: description,
		Condition: func(health ModuleHealth) bool {
			return health.Status == status
		},
		Action: func(moduleID string, health ModuleHealth) error {
			alert := HealthAlert{
				ID:        fmt.Sprintf("%s_%s_%d", name, moduleID, time.Now().Unix()),
				ModuleID:  moduleID,
				Severity:  severity,
				Type:      AlertHealthChange,
				Message:   fmt.Sprintf(messageFormat, moduleID, health.Message),
				Timestamp: time.Now(),
			}
			return hm.notifier.NotifyAlert(alert)
		},
		Severity: severity,
		Enabled:  true,
	}
}

func (hm *HealthMonitorImpl) setupDefaultRules() {
	hm.rules = append(hm.rules, hm.createHealthRule(
		"critical_module_unhealthy",
		"Alert when a critical module becomes unhealthy",
		"Critical module %s is unhealthy: %s",
		HealthUnhealthy,
		SeverityCritical,
	))

	hm.rules = append(hm.rules, hm.createHealthRule(
		"degraded_performance",
		"Alert when module performance is degraded",
		"Module %s performance is degraded: %s",
		HealthDegraded,
		SeverityWarning,
	))
}
// DefaultHealthAggregator implements basic health aggregation
type DefaultHealthAggregator struct{}

func NewDefaultHealthAggregator() *DefaultHealthAggregator {
	return &DefaultHealthAggregator{}
}

func (dha *DefaultHealthAggregator) AggregateHealth(modules map[string]ModuleHealth) OverallHealth {
	overall := OverallHealth{
		Modules:     modules,
		LastUpdated: time.Now(),
		Trends:      make(map[string]HealthTrend),
	}

	if len(modules) == 0 {
		overall.Status = HealthUnknown
		return overall
	}

	overall.ModuleCount = len(modules)
	var totalScore float64

	for moduleID, health := range modules {
		score := dha.GetHealthScore(health)
		totalScore += score

		switch health.Status {
		case HealthHealthy:
			overall.HealthyCount++
		case HealthDegraded:
			overall.DegradedCount++
		case HealthUnhealthy:
			overall.UnhealthyCount++
			overall.CriticalIssues = append(overall.CriticalIssues,
				fmt.Sprintf("Module %s is unhealthy: %s", moduleID, health.Message))
		}
	}

	overall.Score = totalScore / float64(len(modules))
	overall.Status = dha.CalculateSystemHealth(getHealthValues(modules))

	return overall
}

func (dha *DefaultHealthAggregator) CalculateSystemHealth(individual []ModuleHealth) HealthStatus {
	if len(individual) == 0 {
		return HealthUnknown
	}

	healthyCount := 0
	degradedCount := 0
	unhealthyCount := 0

	for _, health := range individual {
		switch health.Status {
		case HealthHealthy:
			healthyCount++
		case HealthDegraded:
			degradedCount++
		case HealthUnhealthy:
			unhealthyCount++
		}
	}

	total := len(individual)
	healthyRatio := float64(healthyCount) / float64(total)
	unhealthyRatio := float64(unhealthyCount) / float64(total)

	if unhealthyRatio > 0.3 {
		return HealthUnhealthy
	} else if healthyRatio < 0.7 {
		return HealthDegraded
	} else {
		return HealthHealthy
	}
}

func (dha *DefaultHealthAggregator) GetHealthScore(health ModuleHealth) float64 {
	switch health.Status {
	case HealthHealthy:
		return 1.0
	case HealthDegraded:
		return 0.5
	case HealthUnhealthy:
		return 0.0
	default:
		return 0.0
	}
}

func getHealthValues(modules map[string]ModuleHealth) []ModuleHealth {
	values := make([]ModuleHealth, 0, len(modules))
	for _, health := range modules {
		values = append(values, health)
	}
	return values
}

// DefaultHealthNotifier implements basic health notifications
type DefaultHealthNotifier struct{}

func NewDefaultHealthNotifier() *DefaultHealthNotifier {
	return &DefaultHealthNotifier{}
}

func (dhn *DefaultHealthNotifier) NotifyHealthChange(moduleID string, oldHealth, newHealth ModuleHealth) error {
	// Basic notification implementation - could be extended to send emails, webhooks, etc.
	return nil
}

func (dhn *DefaultHealthNotifier) NotifySystemHealth(health OverallHealth) error {
	// Basic notification implementation
	return nil
}

func (dhn *DefaultHealthNotifier) NotifyAlert(alert HealthAlert) error {
	// Basic notification implementation
	return nil
}
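// Illustrative sketch of a custom HealthAggregator: it reports the whole
// system as unhealthy as soon as any single module is unhealthy, while reusing
// the default aggregator for scoring. AggregateHealth is overridden explicitly
// because DefaultHealthAggregator.AggregateHealth calls its own
// CalculateSystemHealth, so embedding alone would not pick up the stricter
// status. Install it with SetHealthAggregator.
type strictHealthAggregator struct {
	base *DefaultHealthAggregator
}

func newStrictHealthAggregator() *strictHealthAggregator {
	return &strictHealthAggregator{base: NewDefaultHealthAggregator()}
}

func (sa *strictHealthAggregator) AggregateHealth(modules map[string]ModuleHealth) OverallHealth {
	overall := sa.base.AggregateHealth(modules)
	overall.Status = sa.CalculateSystemHealth(getHealthValues(modules))
	return overall
}

func (sa *strictHealthAggregator) CalculateSystemHealth(individual []ModuleHealth) HealthStatus {
	for _, health := range individual {
		if health.Status == HealthUnhealthy {
			return HealthUnhealthy
		}
	}
	return sa.base.CalculateSystemHealth(individual)
}

func (sa *strictHealthAggregator) GetHealthScore(health ModuleHealth) float64 {
	return sa.base.GetHealthScore(health)
}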