package lifecycle

import (
	"context"
	"fmt"
	"sync"
	"time"
)

// HealthMonitorImpl implements comprehensive health monitoring for modules.
//
// Locking: mu guards monitors, aggregator, notifier, metrics, rules, ctx,
// cancel and running. config is written only by NewHealthMonitor and is read
// without locking afterwards. The check path (performHealthCheck and its
// helpers) never holds mu and a ModuleMonitor lock at the same time, and it
// invokes rule actions and notifier callbacks with no lock held.
type HealthMonitorImpl struct {
	monitors   map[string]*ModuleMonitor
	config     HealthMonitorConfig
	aggregator HealthAggregator
	notifier   HealthNotifier
	metrics    HealthMetrics
	rules      []HealthRule
	mu         sync.RWMutex
	ctx        context.Context
	cancel     context.CancelFunc
	running    bool
}

// ModuleMonitor monitors a specific module's health.
//
// moduleID, module and config are set when the monitor is created by
// StartMonitoring and are not modified afterwards in this file; the remaining
// fields are guarded by mu.
type ModuleMonitor struct {
	moduleID      string
	module        *RegisteredModule
	config        ModuleHealthConfig
	lastCheck     time.Time
	checkCount    int64
	successCount  int64
	failureCount  int64
	history       []HealthCheckResult
	currentHealth ModuleHealth
	trend         HealthTrend
	mu            sync.RWMutex
}

// HealthMonitorConfig configures the health monitoring system.
//
// NewHealthMonitor replaces non-positive CheckInterval, CheckTimeout,
// HistorySize, FailureThreshold, RecoveryThreshold and MaxConcurrentChecks
// with built-in defaults.
type HealthMonitorConfig struct {
	CheckInterval       time.Duration `json:"check_interval"`
	CheckTimeout        time.Duration `json:"check_timeout"`
	HistorySize         int           `json:"history_size"`
	FailureThreshold    int           `json:"failure_threshold"`
	RecoveryThreshold   int           `json:"recovery_threshold"`
	EnableNotifications bool          `json:"enable_notifications"`
	EnableMetrics       bool          `json:"enable_metrics"`
	EnableTrends        bool          `json:"enable_trends"`
	ParallelChecks      bool          `json:"parallel_checks"`
	MaxConcurrentChecks int           `json:"max_concurrent_checks"`
}

// ModuleHealthConfig configures health checking for a specific module.
type ModuleHealthConfig struct {
	CheckInterval     time.Duration `json:"check_interval"`
	CheckTimeout      time.Duration `json:"check_timeout"`
	Enabled           bool          `json:"enabled"`
	CriticalModule    bool          `json:"critical_module"`
	CustomChecks      []HealthCheck `json:"custom_checks"`
	FailureThreshold  int           `json:"failure_threshold"`
	RecoveryThreshold int           `json:"recovery_threshold"`
	AutoRestart       bool          `json:"auto_restart"`
	MaxRestarts       int           `json:"max_restarts"`
	RestartDelay      time.Duration `json:"restart_delay"`
}

// HealthCheck represents a custom health check.
//
// CheckFunc reports failure by returning a non-nil error. A Timeout of zero
// or less means the check is bounded only by the enclosing module check's
// deadline.
type HealthCheck struct {
	Name        string        `json:"name"`
	Description string        `json:"description"`
	CheckFunc   func() error  `json:"-"`
	Interval    time.Duration `json:"interval"`
	Timeout     time.Duration `json:"timeout"`
	Critical    bool          `json:"critical"`
	Enabled     bool          `json:"enabled"`
}

// HealthCheckResult represents the result of a health check.
type HealthCheckResult struct {
	Timestamp    time.Time              `json:"timestamp"`
	Status       HealthStatus           `json:"status"`
	ResponseTime time.Duration          `json:"response_time"`
	Message      string                 `json:"message"`
	Details      map[string]interface{} `json:"details"`
	Checks       map[string]CheckResult `json:"checks"`
	Error        error                  `json:"error,omitempty"`
}

// CheckResult represents the result of an individual check.
type CheckResult struct {
	Name         string                 `json:"name"`
	Status       HealthStatus           `json:"status"`
	ResponseTime time.Duration          `json:"response_time"`
	Message      string                 `json:"message"`
	Details      map[string]interface{} `json:"details"`
	Error        error                  `json:"error,omitempty"`
}

// HealthTrend tracks health trends over time.
type HealthTrend struct {
	Direction   TrendDirection `json:"direction"`
	Confidence  float64        `json:"confidence"`
	Slope       float64        `json:"slope"`
	Prediction  HealthStatus   `json:"prediction"`
	TimeToAlert time.Duration  `json:"time_to_alert"`
	LastUpdated time.Time      `json:"last_updated"`
}

// TrendDirection indicates the health trend direction.
type TrendDirection string

// Trend directions produced by calculateHealthTrend.
const (
	TrendImproving TrendDirection = "improving"
	TrendStable    TrendDirection = "stable"
	TrendDegrading TrendDirection = "degrading"
	TrendUnknown   TrendDirection = "unknown"
)

// HealthAggregator aggregates health status from multiple modules.
type HealthAggregator interface {
	AggregateHealth(modules map[string]ModuleHealth) OverallHealth
	CalculateSystemHealth(individual []ModuleHealth) HealthStatus
	GetHealthScore(health ModuleHealth) float64
}

// HealthNotifier sends health notifications.
type HealthNotifier interface {
	NotifyHealthChange(moduleID string, oldHealth, newHealth ModuleHealth) error
	NotifySystemHealth(health OverallHealth) error
	NotifyAlert(alert HealthAlert) error
}

// OverallHealth represents the overall system health.
type OverallHealth struct {
	Status          HealthStatus            `json:"status"`
	Score           float64                 `json:"score"`
	ModuleCount     int                     `json:"module_count"`
	HealthyCount    int                     `json:"healthy_count"`
	DegradedCount   int                     `json:"degraded_count"`
	UnhealthyCount  int                     `json:"unhealthy_count"`
	CriticalIssues  []string                `json:"critical_issues"`
	Modules         map[string]ModuleHealth `json:"modules"`
	LastUpdated     time.Time               `json:"last_updated"`
	Trends          map[string]HealthTrend  `json:"trends"`
	Recommendations []HealthRecommendation  `json:"recommendations"`
}

// HealthAlert represents a health alert.
type HealthAlert struct {
	ID         string                 `json:"id"`
	ModuleID   string                 `json:"module_id"`
	Severity   AlertSeverity          `json:"severity"`
	Type       AlertType              `json:"type"`
	Message    string                 `json:"message"`
	Details    map[string]interface{} `json:"details"`
	Timestamp  time.Time              `json:"timestamp"`
	Resolved   bool                   `json:"resolved"`
	ResolvedAt time.Time              `json:"resolved_at,omitempty"`
}

// AlertSeverity defines alert severity levels.
type AlertSeverity string

// Alert severity levels.
const (
	SeverityInfo     AlertSeverity = "info"
	SeverityWarning  AlertSeverity = "warning"
	SeverityError    AlertSeverity = "error"
	SeverityCritical AlertSeverity = "critical"
)

// AlertType defines types of alerts.
type AlertType string

// Alert types.
const (
	AlertHealthChange    AlertType = "health_change"
	AlertThresholdBreach AlertType = "threshold_breach"
	AlertTrendAlert      AlertType = "trend_alert"
	AlertSystemDown      AlertType = "system_down"
	AlertRecovery        AlertType = "recovery"
)

// HealthRule defines rules for health evaluation.
//
// After each health check, every enabled rule whose Condition returns true
// for the module's new health has its Action invoked. Rules with a nil
// Condition or Action are skipped.
type HealthRule struct {
	Name        string                           `json:"name"`
	Description string                           `json:"description"`
	Condition   func(ModuleHealth) bool          `json:"-"`
	Action      func(string, ModuleHealth) error `json:"-"`
	Severity    AlertSeverity                    `json:"severity"`
	Enabled     bool                             `json:"enabled"`
}

// HealthRecommendation provides actionable health recommendations.
type HealthRecommendation struct {
	ModuleID    string    `json:"module_id"`
	Type        string    `json:"type"`
	Description string    `json:"description"`
	Action      string    `json:"action"`
	Priority    string    `json:"priority"`
	Timestamp   time.Time `json:"timestamp"`
}

// HealthMetrics tracks health monitoring metrics.
type HealthMetrics struct {
	ChecksPerformed    int64              `json:"checks_performed"`
	ChecksSuccessful   int64              `json:"checks_successful"`
	ChecksFailed       int64              `json:"checks_failed"`
	AverageCheckTime   time.Duration      `json:"average_check_time"`
	AlertsGenerated    int64              `json:"alerts_generated"`
	ModuleRestarts     int64              `json:"module_restarts"`
	SystemDowntime     time.Duration      `json:"system_downtime"`
	ModuleHealthScores map[string]float64 `json:"module_health_scores"`
	TrendAccuracy      float64            `json:"trend_accuracy"`
}

// Compile-time checks that the default implementations satisfy their
// interfaces.
var (
	_ HealthAggregator = (*DefaultHealthAggregator)(nil)
	_ HealthNotifier   = (*DefaultHealthNotifier)(nil)
)

// NewHealthMonitor creates a new health monitor.
//
// Non-positive values in config are replaced with defaults: a 30s check
// interval, a 10s check timeout, a history of 100 entries, failure and
// recovery thresholds of 3, and at most 10 concurrent checks. Negative values
// are treated like zero because they would otherwise panic (ticker interval,
// semaphore capacity) or discard every history entry.
func NewHealthMonitor(config HealthMonitorConfig) *HealthMonitorImpl {
	ctx, cancel := context.WithCancel(context.Background())

	hm := &HealthMonitorImpl{
		monitors:   make(map[string]*ModuleMonitor),
		config:     config,
		aggregator: NewDefaultHealthAggregator(),
		notifier:   NewDefaultHealthNotifier(),
		rules:      make([]HealthRule, 0),
		ctx:        ctx,
		cancel:     cancel,
		metrics: HealthMetrics{
			ModuleHealthScores: make(map[string]float64),
		},
	}

	// Set default configuration.
	if hm.config.CheckInterval <= 0 {
		hm.config.CheckInterval = 30 * time.Second
	}
	if hm.config.CheckTimeout <= 0 {
		hm.config.CheckTimeout = 10 * time.Second
	}
	if hm.config.HistorySize <= 0 {
		hm.config.HistorySize = 100
	}
	if hm.config.FailureThreshold <= 0 {
		hm.config.FailureThreshold = 3
	}
	if hm.config.RecoveryThreshold <= 0 {
		hm.config.RecoveryThreshold = 3
	}
	if hm.config.MaxConcurrentChecks <= 0 {
		hm.config.MaxConcurrentChecks = 10
	}

	// Setup default health rules.
	hm.setupDefaultRules()

	return hm
}

// Start starts the health monitoring system.
//
// It returns an error if the monitor is already running. Start may be called
// again after Stop: a fresh context is created when the previous one has been
// cancelled, so the new monitoring loop does not exit immediately.
func (hm *HealthMonitorImpl) Start() error {
	hm.mu.Lock()
	defer hm.mu.Unlock()

	if hm.running {
		return fmt.Errorf("health monitor already running")
	}

	if hm.ctx == nil || hm.ctx.Err() != nil {
		hm.ctx, hm.cancel = context.WithCancel(context.Background())
	}
	hm.running = true

	// The loop receives its context as an argument so that it is tied to this
	// Start call even if the monitor is stopped and restarted later.
	go hm.runMonitoringLoop(hm.ctx)

	return nil
}

// Stop stops the health monitoring system by cancelling the monitor's
// context. It does not wait for an in-flight round of checks to finish.
// Calling Stop on a monitor that is not running is a no-op.
func (hm *HealthMonitorImpl) Stop() error {
	hm.mu.Lock()
	defer hm.mu.Unlock()

	if !hm.running {
		return nil
	}

	if hm.cancel != nil {
		hm.cancel()
	}
	hm.running = false

	return nil
}

// StartMonitoring starts monitoring a specific module.
//
// The module's monitor inherits intervals, timeouts and thresholds from the
// monitor-wide configuration. Registering a module ID that is already
// monitored replaces the existing monitor and its history. A nil module is
// rejected with an error.
func (hm *HealthMonitorImpl) StartMonitoring(module *RegisteredModule) error {
	if module == nil {
		return fmt.Errorf("cannot monitor nil module")
	}

	hm.mu.Lock()
	defer hm.mu.Unlock()

	moduleID := module.ID

	// Create module monitor.
	monitor := &ModuleMonitor{
		moduleID: moduleID,
		module:   module,
		config: ModuleHealthConfig{
			CheckInterval:     hm.config.CheckInterval,
			CheckTimeout:      hm.config.CheckTimeout,
			Enabled:           true,
			CriticalModule:    module.Config.CriticalModule,
			FailureThreshold:  hm.config.FailureThreshold,
			RecoveryThreshold: hm.config.RecoveryThreshold,
			AutoRestart:       module.Config.MaxRestarts > 0,
			MaxRestarts:       module.Config.MaxRestarts,
			RestartDelay:      module.Config.RestartDelay,
		},
		history: make([]HealthCheckResult, 0),
		currentHealth: ModuleHealth{
			Status:    HealthUnknown,
			LastCheck: time.Now(),
		},
	}

	hm.monitors[moduleID] = monitor
	return nil
}

// StopMonitoring stops monitoring a specific module and drops its recorded
// health score. Unknown module IDs are ignored.
func (hm *HealthMonitorImpl) StopMonitoring(moduleID string) error {
	hm.mu.Lock()
	defer hm.mu.Unlock()

	delete(hm.monitors, moduleID)
	delete(hm.metrics.ModuleHealthScores, moduleID)

	return nil
}

// CheckHealth performs a health check on a specific module and returns the
// resulting health. If the module is nil or is not being monitored, it
// returns a ModuleHealth with status HealthUnknown and runs no check.
func (hm *HealthMonitorImpl) CheckHealth(module *RegisteredModule) ModuleHealth {
	notMonitored := ModuleHealth{
		Status:  HealthUnknown,
		Message: "Module not monitored",
	}
	if module == nil {
		return notMonitored
	}

	hm.mu.RLock()
	monitor, exists := hm.monitors[module.ID]
	hm.mu.RUnlock()

	if !exists {
		return notMonitored
	}

	return hm.performHealthCheck(monitor)
}

// GetHealthStatus returns the most recently recorded health of all monitored
// modules, keyed by module ID. It does not trigger new checks.
func (hm *HealthMonitorImpl) GetHealthStatus() map[string]ModuleHealth {
	hm.mu.RLock()
	defer hm.mu.RUnlock()

	status := make(map[string]ModuleHealth, len(hm.monitors))
	for moduleID, monitor := range hm.monitors {
		monitor.mu.RLock()
		status[moduleID] = monitor.currentHealth
		monitor.mu.RUnlock()
	}

	return status
}

// GetOverallHealth returns the overall system health, computed by the
// configured aggregator from the most recently recorded health of every
// monitored module.
func (hm *HealthMonitorImpl) GetOverallHealth() OverallHealth {
	hm.mu.RLock()
	defer hm.mu.RUnlock()

	moduleHealths := make(map[string]ModuleHealth, len(hm.monitors))
	for moduleID, monitor := range hm.monitors {
		monitor.mu.RLock()
		moduleHealths[moduleID] = monitor.currentHealth
		monitor.mu.RUnlock()
	}

	return hm.aggregator.AggregateHealth(moduleHealths)
}

// AddHealthRule adds a custom health rule.
func (hm *HealthMonitorImpl) AddHealthRule(rule HealthRule) {
	hm.mu.Lock()
	defer hm.mu.Unlock()

	hm.rules = append(hm.rules, rule)
}

// SetHealthAggregator sets a custom health aggregator.
func (hm *HealthMonitorImpl) SetHealthAggregator(aggregator HealthAggregator) {
	hm.mu.Lock()
	defer hm.mu.Unlock()

	hm.aggregator = aggregator
}

// SetHealthNotifier sets a custom health notifier.
func (hm *HealthMonitorImpl) SetHealthNotifier(notifier HealthNotifier) {
	hm.mu.Lock()
	defer hm.mu.Unlock()

	hm.notifier = notifier
}

// GetMetrics returns a snapshot of the health monitoring metrics.
//
// ModuleHealthScores in the returned value is a copy, so callers can read it
// while health checks keep updating the monitor's own map.
func (hm *HealthMonitorImpl) GetMetrics() HealthMetrics {
	hm.mu.RLock()
	defer hm.mu.RUnlock()

	snapshot := hm.metrics
	snapshot.ModuleHealthScores = make(map[string]float64, len(hm.metrics.ModuleHealthScores))
	for moduleID, score := range hm.metrics.ModuleHealthScores {
		snapshot.ModuleHealthScores[moduleID] = score
	}

	return snapshot
}

// Private methods

// monitoringLoop runs the periodic check loop against the monitor's current
// context. Start uses runMonitoringLoop directly; this wrapper keeps the
// original no-argument entry point available.
func (hm *HealthMonitorImpl) monitoringLoop() {
	hm.mu.RLock()
	ctx := hm.ctx
	hm.mu.RUnlock()

	if ctx == nil {
		ctx = context.Background()
	}
	hm.runMonitoringLoop(ctx)
}

// runMonitoringLoop performs a round of health checks every
// config.CheckInterval until ctx is cancelled.
func (hm *HealthMonitorImpl) runMonitoringLoop(ctx context.Context) {
	ticker := time.NewTicker(hm.config.CheckInterval)
	defer ticker.Stop()

	for {
		select {
		case <-ctx.Done():
			return
		case <-ticker.C:
			hm.performAllHealthChecks()
		}
	}
}

// performAllHealthChecks checks every enabled monitor (in parallel when
// config.ParallelChecks is set) and then, if notifications are enabled,
// reports the aggregated system health to the notifier.
func (hm *HealthMonitorImpl) performAllHealthChecks() {
	// Snapshot the shared state so no monitor-wide lock is held while the
	// checks themselves run.
	hm.mu.RLock()
	monitors := make([]*ModuleMonitor, 0, len(hm.monitors))
	for _, monitor := range hm.monitors {
		monitors = append(monitors, monitor)
	}
	notifier := hm.notifier
	hm.mu.RUnlock()

	if hm.config.ParallelChecks {
		hm.performHealthChecksParallel(monitors)
	} else {
		hm.performHealthChecksSequential(monitors)
	}

	// Update overall health and send notifications.
	overallHealth := hm.GetOverallHealth()
	if hm.config.EnableNotifications && notifier != nil {
		// Notifications are best-effort; a failing notifier must not stop
		// the monitoring loop.
		_ = notifier.NotifySystemHealth(overallHealth)
	}
}

// performHealthChecksSequential checks the enabled monitors one after
// another.
func (hm *HealthMonitorImpl) performHealthChecksSequential(monitors []*ModuleMonitor) {
	for _, monitor := range monitors {
		if monitor.config.Enabled {
			hm.performHealthCheck(monitor)
		}
	}
}

// performHealthChecksParallel checks the enabled monitors concurrently, with
// at most config.MaxConcurrentChecks checks in flight, and returns once all
// of them have finished.
func (hm *HealthMonitorImpl) performHealthChecksParallel(monitors []*ModuleMonitor) {
	limit := hm.config.MaxConcurrentChecks
	if limit <= 0 {
		// An unbuffered semaphore could never be acquired.
		limit = 1
	}
	semaphore := make(chan struct{}, limit)
	var wg sync.WaitGroup

	for _, monitor := range monitors {
		if !monitor.config.Enabled {
			continue
		}

		// Acquire the slot before spawning so the number of goroutines, not
		// just the number of running checks, is bounded by the limit.
		semaphore <- struct{}{}
		wg.Add(1)
		go func(m *ModuleMonitor) {
			defer wg.Done()
			defer func() { <-semaphore }()

			hm.performHealthCheck(m)
		}(monitor)
	}

	wg.Wait()
}

// performHealthCheck runs one health check for monitor, records the outcome
// on the monitor, and then applies health rules, sends a change notification
// and updates metrics as configured. It returns the module's new health.
//
// The monitor's lock is held only while the check runs and the monitor's own
// state is updated. Rules, notifications and metrics run after it has been
// released, so callbacks may safely call back into the health monitor.
func (hm *HealthMonitorImpl) performHealthCheck(monitor *ModuleMonitor) ModuleHealth {
	hm.mu.RLock()
	parent := hm.ctx
	notifier := hm.notifier
	hm.mu.RUnlock()

	if parent == nil {
		parent = context.Background()
	}

	oldHealth, newHealth, result := hm.runModuleCheck(parent, monitor)

	// Apply health rules.
	hm.applyHealthRules(monitor.moduleID, newHealth)

	// Send notifications if health changed. Best-effort: a notifier error
	// does not affect the check result.
	if hm.config.EnableNotifications && notifier != nil && oldHealth.Status != newHealth.Status {
		_ = notifier.NotifyHealthChange(monitor.moduleID, oldHealth, newHealth)
	}

	// Update metrics.
	if hm.config.EnableMetrics {
		hm.updateMetrics(monitor, result)
	}

	return newHealth
}

// runModuleCheck executes the module's own health check plus its enabled
// custom checks while holding the monitor's lock, and updates the monitor's
// counters, history, current health and trend. It returns the health recorded
// before the check, the health recorded by it, and the history entry that was
// appended.
func (hm *HealthMonitorImpl) runModuleCheck(parent context.Context, monitor *ModuleMonitor) (ModuleHealth, ModuleHealth, HealthCheckResult) {
	start := time.Now()

	monitor.mu.Lock()
	defer monitor.mu.Unlock()

	monitor.checkCount++
	monitor.lastCheck = start

	// Create check context with timeout; it bounds the custom checks.
	ctx, cancel := context.WithTimeout(parent, monitor.config.CheckTimeout)
	defer cancel()

	// Perform basic module health check.
	moduleHealth := monitor.module.Instance.GetHealth()

	// Perform custom health checks.
	checkResults := make(map[string]CheckResult)
	for _, check := range monitor.config.CustomChecks {
		if !check.Enabled {
			continue
		}

		checkResult := hm.performCustomCheck(ctx, check)
		checkResults[check.Name] = checkResult

		// A failing critical check overrides the module's own status.
		if check.Critical && checkResult.Status != HealthHealthy {
			moduleHealth.Status = HealthUnhealthy
		}
	}

	// Create health check result.
	result := HealthCheckResult{
		Timestamp:    start,
		Status:       moduleHealth.Status,
		ResponseTime: time.Since(start),
		Message:      moduleHealth.Message,
		Details:      moduleHealth.Details,
		Checks:       checkResults,
	}

	// Update statistics.
	if result.Status == HealthHealthy {
		monitor.successCount++
	} else {
		monitor.failureCount++
	}

	// Add to history, keeping only the newest HistorySize entries.
	monitor.history = append(monitor.history, result)
	if excess := len(monitor.history) - hm.config.HistorySize; excess > 0 {
		monitor.history = monitor.history[excess:]
	}

	// Update current health.
	oldHealth := monitor.currentHealth
	newHealth := moduleHealth
	newHealth.LastCheck = start
	newHealth.RestartCount = int(monitor.module.HealthStatus.RestartCount)

	// Calculate uptime.
	if !monitor.module.StartTime.IsZero() {
		newHealth.Uptime = time.Since(monitor.module.StartTime)
	}
	monitor.currentHealth = newHealth

	// Update trends if enabled.
	if hm.config.EnableTrends {
		monitor.trend = hm.calculateHealthTrend(monitor)
	}

	return oldHealth, newHealth, result
}

// performCustomCheck runs a single custom check and reports its outcome.
//
// The check is unhealthy if CheckFunc returns an error, if it does not finish
// before ctx is done or check.Timeout elapses, or if CheckFunc is nil. A
// non-positive check.Timeout adds no deadline of its own, so the check is
// bounded by ctx alone. A check that times out keeps running in its own
// goroutine until CheckFunc returns; its result is then discarded.
func (hm *HealthMonitorImpl) performCustomCheck(ctx context.Context, check HealthCheck) CheckResult {
	start := time.Now()

	result := CheckResult{
		Name:         check.Name,
		Status:       HealthHealthy,
		ResponseTime: 0,
		Message:      "Check passed",
		Details:      make(map[string]interface{}),
	}

	if check.CheckFunc == nil {
		// Calling a nil function would panic in the goroutine below and take
		// the whole process down.
		result.Status = HealthUnhealthy
		result.Message = "Check function not configured"
		result.Error = fmt.Errorf("health check %q has no check function", check.Name)
		return result
	}

	// Create timeout context for the check. A zero timeout would yield an
	// already-expired context, so it is treated as "no extra deadline".
	checkCtx := ctx
	if check.Timeout > 0 {
		var cancel context.CancelFunc
		checkCtx, cancel = context.WithTimeout(ctx, check.Timeout)
		defer cancel()
	}

	// Run the check. The channel is buffered so the goroutine can always
	// deliver its result and exit, even after a timeout.
	done := make(chan error, 1)
	go func() {
		done <- check.CheckFunc()
	}()

	select {
	case err := <-done:
		result.ResponseTime = time.Since(start)
		if err != nil {
			result.Status = HealthUnhealthy
			result.Message = err.Error()
			result.Error = err
		}
	case <-checkCtx.Done():
		result.ResponseTime = time.Since(start)
		result.Status = HealthUnhealthy
		result.Message = "Check timed out"
		result.Error = checkCtx.Err()
	}

	return result
}

// calculateHealthTrend derives a trend from the five most recent history
// entries of monitor. With fewer than five entries the trend is unknown.
// The caller must hold monitor.mu.
func (hm *HealthMonitorImpl) calculateHealthTrend(monitor *ModuleMonitor) HealthTrend {
	const window = 5

	if len(monitor.history) < window {
		return HealthTrend{
			Direction:   TrendUnknown,
			Confidence:  0,
			LastUpdated: time.Now(),
		}
	}

	// Simple trend calculation based on recent health status.
	recent := monitor.history[len(monitor.history)-window:]
	healthyCount := 0
	for _, result := range recent {
		if result.Status == HealthHealthy {
			healthyCount++
		}
	}

	healthRatio := float64(healthyCount) / float64(len(recent))

	var direction TrendDirection
	var confidence float64

	switch {
	case healthRatio > 0.8:
		direction = TrendImproving
		confidence = healthRatio
	case healthRatio < 0.4:
		direction = TrendDegrading
		confidence = 1.0 - healthRatio
	default:
		direction = TrendStable
		confidence = 0.5
	}

	return HealthTrend{
		Direction:   direction,
		Confidence:  confidence,
		Slope:       healthRatio - 0.5, // Simplified slope calculation
		Prediction:  hm.predictHealthStatus(healthRatio),
		LastUpdated: time.Now(),
	}
}

// predictHealthStatus maps the share of healthy recent checks to a predicted
// status: above 0.7 healthy, above 0.3 degraded, otherwise unhealthy.
func (hm *HealthMonitorImpl) predictHealthStatus(healthRatio float64) HealthStatus {
	switch {
	case healthRatio > 0.7:
		return HealthHealthy
	case healthRatio > 0.3:
		return HealthDegraded
	default:
		return HealthUnhealthy
	}
}

// applyHealthRules runs the action of every enabled rule whose condition
// matches health. Rules are copied under the lock and evaluated without it,
// so conditions and actions may call back into the health monitor.
func (hm *HealthMonitorImpl) applyHealthRules(moduleID string, health ModuleHealth) {
	hm.mu.RLock()
	rules := make([]HealthRule, len(hm.rules))
	copy(rules, hm.rules)
	hm.mu.RUnlock()

	for _, rule := range rules {
		if !rule.Enabled || rule.Condition == nil || rule.Action == nil {
			continue
		}
		if !rule.Condition(health) {
			continue
		}

		// Rule actions are best-effort: a failing action must not prevent
		// the remaining rules from running.
		_ = rule.Action(moduleID, health)
	}
}

// updateMetrics folds result into the monitor-wide metrics and records the
// module's current health score.
//
// The caller must not hold hm.mu or monitor.mu. The score is recorded only
// while monitor is still registered, so a check that finishes after
// StopMonitoring does not resurrect the module's entry.
func (hm *HealthMonitorImpl) updateMetrics(monitor *ModuleMonitor, result HealthCheckResult) {
	// Read the monitor's state first and release its lock before taking
	// hm.mu, keeping the two locks from ever being held together here.
	monitor.mu.RLock()
	health := monitor.currentHealth
	monitor.mu.RUnlock()

	hm.mu.Lock()
	defer hm.mu.Unlock()

	hm.metrics.ChecksPerformed++
	if result.Status == HealthHealthy {
		hm.metrics.ChecksSuccessful++
	} else {
		hm.metrics.ChecksFailed++
	}

	// Update average check time as a running mean over all checks.
	performed := time.Duration(hm.metrics.ChecksPerformed)
	totalTime := hm.metrics.AverageCheckTime * (performed - 1)
	hm.metrics.AverageCheckTime = (totalTime + result.ResponseTime) / performed

	// Update health score.
	if hm.aggregator == nil || hm.monitors[monitor.moduleID] != monitor {
		return
	}
	if hm.metrics.ModuleHealthScores == nil {
		hm.metrics.ModuleHealthScores = make(map[string]float64)
	}
	hm.metrics.ModuleHealthScores[monitor.moduleID] = hm.aggregator.GetHealthScore(health)
}

// sendAlert delivers alert through the currently configured notifier. It is
// a no-op when no notifier is set. The caller must not hold hm.mu.
func (hm *HealthMonitorImpl) sendAlert(alert HealthAlert) error {
	hm.mu.RLock()
	notifier := hm.notifier
	hm.mu.RUnlock()

	if notifier == nil {
		return nil
	}
	return notifier.NotifyAlert(alert)
}

// setupDefaultRules installs the built-in rules: a critical alert for
// unhealthy modules and a warning alert for degraded modules.
func (hm *HealthMonitorImpl) setupDefaultRules() {
	// Rule: Alert on unhealthy critical modules.
	// NOTE(review): the condition only sees ModuleHealth, so it matches every
	// unhealthy module, not only those configured as critical.
	hm.rules = append(hm.rules, HealthRule{
		Name:        "critical_module_unhealthy",
		Description: "Alert when a critical module becomes unhealthy",
		Condition: func(health ModuleHealth) bool {
			return health.Status == HealthUnhealthy
		},
		Action: func(moduleID string, health ModuleHealth) error {
			now := time.Now()
			return hm.sendAlert(HealthAlert{
				ID:        fmt.Sprintf("critical_%s_%d", moduleID, now.Unix()),
				ModuleID:  moduleID,
				Severity:  SeverityCritical,
				Type:      AlertHealthChange,
				Message:   fmt.Sprintf("Critical module %s is unhealthy: %s", moduleID, health.Message),
				Timestamp: now,
			})
		},
		Severity: SeverityCritical,
		Enabled:  true,
	})

	// Rule: Alert on degraded performance.
	hm.rules = append(hm.rules, HealthRule{
		Name:        "degraded_performance",
		Description: "Alert when module performance is degraded",
		Condition: func(health ModuleHealth) bool {
			return health.Status == HealthDegraded
		},
		Action: func(moduleID string, health ModuleHealth) error {
			now := time.Now()
			return hm.sendAlert(HealthAlert{
				ID:        fmt.Sprintf("degraded_%s_%d", moduleID, now.Unix()),
				ModuleID:  moduleID,
				Severity:  SeverityWarning,
				Type:      AlertHealthChange,
				Message:   fmt.Sprintf("Module %s performance is degraded: %s", moduleID, health.Message),
				Timestamp: now,
			})
		},
		Severity: SeverityWarning,
		Enabled:  true,
	})
}

// DefaultHealthAggregator implements basic health aggregation.
type DefaultHealthAggregator struct{}

// NewDefaultHealthAggregator creates the default health aggregator.
func NewDefaultHealthAggregator() *DefaultHealthAggregator {
	return &DefaultHealthAggregator{}
}

// AggregateHealth summarises the given per-module health into an
// OverallHealth: per-status counts, the mean health score, one critical issue
// per unhealthy module, and the overall status from CalculateSystemHealth.
// With no modules the status is HealthUnknown. The modules map is stored in
// the result as-is, not copied.
func (dha *DefaultHealthAggregator) AggregateHealth(modules map[string]ModuleHealth) OverallHealth {
	overall := OverallHealth{
		Modules:     modules,
		LastUpdated: time.Now(),
		Trends:      make(map[string]HealthTrend),
	}

	if len(modules) == 0 {
		overall.Status = HealthUnknown
		return overall
	}

	overall.ModuleCount = len(modules)

	var totalScore float64
	for moduleID, health := range modules {
		totalScore += dha.GetHealthScore(health)

		switch health.Status {
		case HealthHealthy:
			overall.HealthyCount++
		case HealthDegraded:
			overall.DegradedCount++
		case HealthUnhealthy:
			overall.UnhealthyCount++
			overall.CriticalIssues = append(overall.CriticalIssues,
				fmt.Sprintf("Module %s is unhealthy: %s", moduleID, health.Message))
		}
	}

	overall.Score = totalScore / float64(len(modules))
	overall.Status = dha.CalculateSystemHealth(getHealthValues(modules))

	return overall
}

// CalculateSystemHealth derives a system-wide status from individual module
// health: unhealthy when more than 30% of modules are unhealthy, degraded
// when fewer than 70% are healthy, healthy otherwise. An empty input yields
// HealthUnknown.
func (dha *DefaultHealthAggregator) CalculateSystemHealth(individual []ModuleHealth) HealthStatus {
	if len(individual) == 0 {
		return HealthUnknown
	}

	healthyCount := 0
	unhealthyCount := 0
	for _, health := range individual {
		switch health.Status {
		case HealthHealthy:
			healthyCount++
		case HealthUnhealthy:
			unhealthyCount++
		}
	}

	total := float64(len(individual))
	healthyRatio := float64(healthyCount) / total
	unhealthyRatio := float64(unhealthyCount) / total

	switch {
	case unhealthyRatio > 0.3:
		return HealthUnhealthy
	case healthyRatio < 0.7:
		return HealthDegraded
	default:
		return HealthHealthy
	}
}

// GetHealthScore maps a module's status to a score: 1.0 for healthy, 0.5 for
// degraded and 0.0 for every other status.
func (dha *DefaultHealthAggregator) GetHealthScore(health ModuleHealth) float64 {
	switch health.Status {
	case HealthHealthy:
		return 1.0
	case HealthDegraded:
		return 0.5
	default:
		return 0.0
	}
}

// getHealthValues returns the values of modules as a slice, in map iteration
// order.
func getHealthValues(modules map[string]ModuleHealth) []ModuleHealth {
	values := make([]ModuleHealth, 0, len(modules))
	for _, health := range modules {
		values = append(values, health)
	}
	return values
}

// DefaultHealthNotifier implements basic health notifications. All of its
// methods are no-ops that return nil.
type DefaultHealthNotifier struct{}

// NewDefaultHealthNotifier creates the default health notifier.
func NewDefaultHealthNotifier() *DefaultHealthNotifier {
	return &DefaultHealthNotifier{}
}

// NotifyHealthChange is a no-op in the default notifier.
func (dhn *DefaultHealthNotifier) NotifyHealthChange(moduleID string, oldHealth, newHealth ModuleHealth) error {
	// Basic notification implementation - could be extended to send emails, webhooks, etc.
	return nil
}

// NotifySystemHealth is a no-op in the default notifier.
func (dhn *DefaultHealthNotifier) NotifySystemHealth(health OverallHealth) error {
	// Basic notification implementation
	return nil
}

// NotifyAlert is a no-op in the default notifier.
func (dhn *DefaultHealthNotifier) NotifyAlert(alert HealthAlert) error {
	// Basic notification implementation
	return nil
}