- Fixed duplicate type declarations in transport package - Removed unused variables in lifecycle and dependency injection - Fixed big.Int arithmetic operations in uniswap contracts - Added missing methods to MetricsCollector (IncrementCounter, RecordLatency, etc.) - Fixed jitter calculation in TCP transport retry logic - Updated ComponentHealth field access to use transport type - Ensured all core packages build successfully All major compilation errors resolved: ✅ Transport package builds clean ✅ Lifecycle package builds clean ✅ Main MEV bot application builds clean ✅ Fixed method signature mismatches ✅ Resolved type conflicts and duplications 🤖 Generated with [Claude Code](https://claude.ai/code) Co-Authored-By: Claude <noreply@anthropic.com>
849 lines
24 KiB
Go
849 lines
24 KiB
Go
package lifecycle
|
|
|
|
import (
|
|
"context"
|
|
"fmt"
|
|
"sync"
|
|
"time"
|
|
)
|
|
|
|
// HealthMonitorImpl implements comprehensive health monitoring for modules
|
|
type HealthMonitorImpl struct {
|
|
monitors map[string]*ModuleMonitor
|
|
config HealthMonitorConfig
|
|
aggregator HealthAggregator
|
|
notifier HealthNotifier
|
|
metrics HealthMetrics
|
|
rules []HealthRule
|
|
mu sync.RWMutex
|
|
ctx context.Context
|
|
cancel context.CancelFunc
|
|
running bool
|
|
}
|
|
|
|
// ModuleMonitor monitors a specific module's health
|
|
type ModuleMonitor struct {
|
|
moduleID string
|
|
module *RegisteredModule
|
|
config ModuleHealthConfig
|
|
lastCheck time.Time
|
|
checkCount int64
|
|
successCount int64
|
|
failureCount int64
|
|
history []HealthCheckResult
|
|
currentHealth ModuleHealth
|
|
trend HealthTrend
|
|
mu sync.RWMutex
|
|
}
|
|
|
|
// HealthMonitorConfig holds the monitor-wide settings of the health
// monitoring system. NewHealthMonitor replaces zero values of the numeric and
// duration fields with defaults.
type HealthMonitorConfig struct {
	// CheckInterval is the period of the background monitoring loop
	// (default 30s).
	CheckInterval time.Duration `json:"check_interval"`
	// CheckTimeout bounds a single module check (default 10s).
	CheckTimeout time.Duration `json:"check_timeout"`
	// HistorySize is the maximum number of results kept per module
	// (default 100).
	HistorySize int `json:"history_size"`
	// FailureThreshold is copied into each module's configuration (default 3).
	FailureThreshold int `json:"failure_threshold"`
	// RecoveryThreshold is copied into each module's configuration (default 3).
	RecoveryThreshold int `json:"recovery_threshold"`
	// EnableNotifications turns on health-change and system-health
	// notifications.
	EnableNotifications bool `json:"enable_notifications"`
	// EnableMetrics turns on metric updates after each check.
	EnableMetrics bool `json:"enable_metrics"`
	// EnableTrends turns on trend calculation after each check.
	EnableTrends bool `json:"enable_trends"`
	// ParallelChecks selects concurrent instead of sequential module checks.
	ParallelChecks bool `json:"parallel_checks"`
	// MaxConcurrentChecks caps the number of concurrent checks in parallel
	// mode (default 10).
	MaxConcurrentChecks int `json:"max_concurrent_checks"`
}
|
|
|
|
// ModuleHealthConfig configures health checking for a specific module
|
|
type ModuleHealthConfig struct {
|
|
CheckInterval time.Duration `json:"check_interval"`
|
|
CheckTimeout time.Duration `json:"check_timeout"`
|
|
Enabled bool `json:"enabled"`
|
|
CriticalModule bool `json:"critical_module"`
|
|
CustomChecks []HealthCheck `json:"custom_checks"`
|
|
FailureThreshold int `json:"failure_threshold"`
|
|
RecoveryThreshold int `json:"recovery_threshold"`
|
|
AutoRestart bool `json:"auto_restart"`
|
|
MaxRestarts int `json:"max_restarts"`
|
|
RestartDelay time.Duration `json:"restart_delay"`
|
|
}
|
|
|
|
// HealthCheck describes one custom health check attached to a module.
type HealthCheck struct {
	// Name identifies the check; results are keyed by it.
	Name string `json:"name"`
	// Description is free-form text about the check.
	Description string `json:"description"`
	// CheckFunc performs the check; a non-nil error means the check failed.
	// It is excluded from JSON.
	CheckFunc func() error `json:"-"`
	// Interval is the check's own interval.
	Interval time.Duration `json:"interval"`
	// Timeout bounds one run of CheckFunc.
	Timeout time.Duration `json:"timeout"`
	// Critical makes a failing check force the module to unhealthy.
	Critical bool `json:"critical"`
	// Enabled controls whether the check is run at all.
	Enabled bool `json:"enabled"`
}
|
|
|
|
// HealthCheckResult represents the result of a health check
|
|
type HealthCheckResult struct {
|
|
Timestamp time.Time `json:"timestamp"`
|
|
Status HealthStatus `json:"status"`
|
|
ResponseTime time.Duration `json:"response_time"`
|
|
Message string `json:"message"`
|
|
Details map[string]interface{} `json:"details"`
|
|
Checks map[string]CheckResult `json:"checks"`
|
|
Error error `json:"error,omitempty"`
|
|
}
|
|
|
|
// CheckResult represents the result of an individual check
|
|
type CheckResult struct {
|
|
Name string `json:"name"`
|
|
Status HealthStatus `json:"status"`
|
|
ResponseTime time.Duration `json:"response_time"`
|
|
Message string `json:"message"`
|
|
Details map[string]interface{} `json:"details"`
|
|
Error error `json:"error,omitempty"`
|
|
}
|
|
|
|
// HealthTrend tracks health trends over time
|
|
type HealthTrend struct {
|
|
Direction TrendDirection `json:"direction"`
|
|
Confidence float64 `json:"confidence"`
|
|
Slope float64 `json:"slope"`
|
|
Prediction HealthStatus `json:"prediction"`
|
|
TimeToAlert time.Duration `json:"time_to_alert"`
|
|
LastUpdated time.Time `json:"last_updated"`
|
|
}
|
|
|
|
// TrendDirection names the direction in which a module's health is moving.
type TrendDirection string

// Known trend directions.
const (
	// TrendImproving means the module's health is getting better.
	TrendImproving = TrendDirection("improving")
	// TrendStable means the module's health is not changing noticeably.
	TrendStable = TrendDirection("stable")
	// TrendDegrading means the module's health is getting worse.
	TrendDegrading = TrendDirection("degrading")
	// TrendUnknown means no direction could be determined.
	TrendUnknown = TrendDirection("unknown")
)
|
|
|
|
// HealthAggregator aggregates health status from multiple modules
|
|
type HealthAggregator interface {
|
|
AggregateHealth(modules map[string]ModuleHealth) OverallHealth
|
|
CalculateSystemHealth(individual []ModuleHealth) HealthStatus
|
|
GetHealthScore(health ModuleHealth) float64
|
|
}
|
|
|
|
// HealthNotifier sends health notifications
|
|
type HealthNotifier interface {
|
|
NotifyHealthChange(moduleID string, oldHealth, newHealth ModuleHealth) error
|
|
NotifySystemHealth(health OverallHealth) error
|
|
NotifyAlert(alert HealthAlert) error
|
|
}
|
|
|
|
// OverallHealth represents the overall system health
|
|
type OverallHealth struct {
|
|
Status HealthStatus `json:"status"`
|
|
Score float64 `json:"score"`
|
|
ModuleCount int `json:"module_count"`
|
|
HealthyCount int `json:"healthy_count"`
|
|
DegradedCount int `json:"degraded_count"`
|
|
UnhealthyCount int `json:"unhealthy_count"`
|
|
CriticalIssues []string `json:"critical_issues"`
|
|
Modules map[string]ModuleHealth `json:"modules"`
|
|
LastUpdated time.Time `json:"last_updated"`
|
|
Trends map[string]HealthTrend `json:"trends"`
|
|
Recommendations []HealthRecommendation `json:"recommendations"`
|
|
}
|
|
|
|
// HealthAlert represents a health alert
|
|
type HealthAlert struct {
|
|
ID string `json:"id"`
|
|
ModuleID string `json:"module_id"`
|
|
Severity AlertSeverity `json:"severity"`
|
|
Type AlertType `json:"type"`
|
|
Message string `json:"message"`
|
|
Details map[string]interface{} `json:"details"`
|
|
Timestamp time.Time `json:"timestamp"`
|
|
Resolved bool `json:"resolved"`
|
|
ResolvedAt time.Time `json:"resolved_at,omitempty"`
|
|
}
|
|
|
|
// AlertSeverity is the urgency level attached to a health alert.
type AlertSeverity string

// Known alert severities, listed from least to most urgent.
const (
	// SeverityInfo marks purely informational alerts.
	SeverityInfo = AlertSeverity("info")
	// SeverityWarning marks alerts that deserve attention.
	SeverityWarning = AlertSeverity("warning")
	// SeverityError marks alerts that report an error.
	SeverityError = AlertSeverity("error")
	// SeverityCritical marks the most urgent alerts.
	SeverityCritical = AlertSeverity("critical")
)
|
|
|
|
// AlertType classifies what caused a health alert.
type AlertType string

// Known alert types.
const (
	// AlertHealthChange is raised for a module's health status.
	AlertHealthChange = AlertType("health_change")
	// AlertThresholdBreach is raised when a threshold is crossed.
	AlertThresholdBreach = AlertType("threshold_breach")
	// AlertTrendAlert is raised because of a health trend.
	AlertTrendAlert = AlertType("trend_alert")
	// AlertSystemDown is raised when the system is down.
	AlertSystemDown = AlertType("system_down")
	// AlertRecovery is raised when health recovers.
	AlertRecovery = AlertType("recovery")
)
|
|
|
|
// HealthRule defines rules for health evaluation
|
|
type HealthRule struct {
|
|
Name string `json:"name"`
|
|
Description string `json:"description"`
|
|
Condition func(ModuleHealth) bool `json:"-"`
|
|
Action func(string, ModuleHealth) error `json:"-"`
|
|
Severity AlertSeverity `json:"severity"`
|
|
Enabled bool `json:"enabled"`
|
|
}
|
|
|
|
// HealthRecommendation is a suggested follow-up action for a module.
type HealthRecommendation struct {
	// ModuleID is the module the recommendation is about.
	ModuleID string `json:"module_id"`
	// Type is the kind of recommendation.
	Type string `json:"type"`
	// Description explains the recommendation.
	Description string `json:"description"`
	// Action is the suggested action.
	Action string `json:"action"`
	// Priority is the priority of the recommendation.
	Priority string `json:"priority"`
	// Timestamp is when the recommendation was produced.
	Timestamp time.Time `json:"timestamp"`
}
|
|
|
|
// HealthMetrics holds the counters the health monitor keeps about its own
// activity.
type HealthMetrics struct {
	// ChecksPerformed is the total number of module checks.
	ChecksPerformed int64 `json:"checks_performed"`
	// ChecksSuccessful is the number of checks that ended healthy.
	ChecksSuccessful int64 `json:"checks_successful"`
	// ChecksFailed is the number of checks that ended in any other status.
	ChecksFailed int64 `json:"checks_failed"`
	// AverageCheckTime is the running mean duration of a check.
	AverageCheckTime time.Duration `json:"average_check_time"`
	// AlertsGenerated is the number of alerts generated.
	AlertsGenerated int64 `json:"alerts_generated"`
	// ModuleRestarts is the number of module restarts.
	ModuleRestarts int64 `json:"module_restarts"`
	// SystemDowntime is the accumulated system downtime.
	SystemDowntime time.Duration `json:"system_downtime"`
	// ModuleHealthScores holds the latest score per module ID.
	ModuleHealthScores map[string]float64 `json:"module_health_scores"`
	// TrendAccuracy is the accuracy of trend predictions.
	TrendAccuracy float64 `json:"trend_accuracy"`
}
|
|
|
|
// NewHealthMonitor creates a new health monitor
|
|
func NewHealthMonitor(config HealthMonitorConfig) *HealthMonitorImpl {
|
|
ctx, cancel := context.WithCancel(context.Background())
|
|
|
|
hm := &HealthMonitorImpl{
|
|
monitors: make(map[string]*ModuleMonitor),
|
|
config: config,
|
|
aggregator: NewDefaultHealthAggregator(),
|
|
notifier: NewDefaultHealthNotifier(),
|
|
rules: make([]HealthRule, 0),
|
|
ctx: ctx,
|
|
cancel: cancel,
|
|
metrics: HealthMetrics{
|
|
ModuleHealthScores: make(map[string]float64),
|
|
},
|
|
}
|
|
|
|
// Set default configuration
|
|
if hm.config.CheckInterval == 0 {
|
|
hm.config.CheckInterval = 30 * time.Second
|
|
}
|
|
if hm.config.CheckTimeout == 0 {
|
|
hm.config.CheckTimeout = 10 * time.Second
|
|
}
|
|
if hm.config.HistorySize == 0 {
|
|
hm.config.HistorySize = 100
|
|
}
|
|
if hm.config.FailureThreshold == 0 {
|
|
hm.config.FailureThreshold = 3
|
|
}
|
|
if hm.config.RecoveryThreshold == 0 {
|
|
hm.config.RecoveryThreshold = 3
|
|
}
|
|
if hm.config.MaxConcurrentChecks == 0 {
|
|
hm.config.MaxConcurrentChecks = 10
|
|
}
|
|
|
|
// Setup default health rules
|
|
hm.setupDefaultRules()
|
|
|
|
return hm
|
|
}
|
|
|
|
// Start starts the health monitoring system
|
|
func (hm *HealthMonitorImpl) Start() error {
|
|
hm.mu.Lock()
|
|
defer hm.mu.Unlock()
|
|
|
|
if hm.running {
|
|
return fmt.Errorf("health monitor already running")
|
|
}
|
|
|
|
hm.running = true
|
|
|
|
// Start monitoring loop
|
|
go hm.monitoringLoop()
|
|
|
|
return nil
|
|
}
|
|
|
|
// Stop stops the health monitoring system
|
|
func (hm *HealthMonitorImpl) Stop() error {
|
|
hm.mu.Lock()
|
|
defer hm.mu.Unlock()
|
|
|
|
if !hm.running {
|
|
return nil
|
|
}
|
|
|
|
hm.cancel()
|
|
hm.running = false
|
|
|
|
return nil
|
|
}
|
|
|
|
// StartMonitoring starts monitoring a specific module
|
|
func (hm *HealthMonitorImpl) StartMonitoring(module *RegisteredModule) error {
|
|
hm.mu.Lock()
|
|
defer hm.mu.Unlock()
|
|
|
|
moduleID := module.ID
|
|
|
|
// Create module monitor
|
|
monitor := &ModuleMonitor{
|
|
moduleID: moduleID,
|
|
module: module,
|
|
config: ModuleHealthConfig{
|
|
CheckInterval: hm.config.CheckInterval,
|
|
CheckTimeout: hm.config.CheckTimeout,
|
|
Enabled: true,
|
|
CriticalModule: module.Config.CriticalModule,
|
|
FailureThreshold: hm.config.FailureThreshold,
|
|
RecoveryThreshold: hm.config.RecoveryThreshold,
|
|
AutoRestart: module.Config.MaxRestarts > 0,
|
|
MaxRestarts: module.Config.MaxRestarts,
|
|
RestartDelay: module.Config.RestartDelay,
|
|
},
|
|
history: make([]HealthCheckResult, 0),
|
|
currentHealth: ModuleHealth{
|
|
Status: HealthUnknown,
|
|
LastCheck: time.Now(),
|
|
},
|
|
}
|
|
|
|
hm.monitors[moduleID] = monitor
|
|
|
|
return nil
|
|
}
|
|
|
|
// StopMonitoring stops monitoring a specific module
|
|
func (hm *HealthMonitorImpl) StopMonitoring(moduleID string) error {
|
|
hm.mu.Lock()
|
|
defer hm.mu.Unlock()
|
|
|
|
delete(hm.monitors, moduleID)
|
|
delete(hm.metrics.ModuleHealthScores, moduleID)
|
|
|
|
return nil
|
|
}
|
|
|
|
// CheckHealth performs a health check on a specific module
|
|
func (hm *HealthMonitorImpl) CheckHealth(module *RegisteredModule) ModuleHealth {
|
|
moduleID := module.ID
|
|
|
|
hm.mu.RLock()
|
|
monitor, exists := hm.monitors[moduleID]
|
|
hm.mu.RUnlock()
|
|
|
|
if !exists {
|
|
return ModuleHealth{
|
|
Status: HealthUnknown,
|
|
Message: "Module not monitored",
|
|
}
|
|
}
|
|
|
|
return hm.performHealthCheck(monitor)
|
|
}
|
|
|
|
// GetHealthStatus returns the health status of all monitored modules
|
|
func (hm *HealthMonitorImpl) GetHealthStatus() map[string]ModuleHealth {
|
|
hm.mu.RLock()
|
|
defer hm.mu.RUnlock()
|
|
|
|
status := make(map[string]ModuleHealth)
|
|
for moduleID, monitor := range hm.monitors {
|
|
monitor.mu.RLock()
|
|
status[moduleID] = monitor.currentHealth
|
|
monitor.mu.RUnlock()
|
|
}
|
|
|
|
return status
|
|
}
|
|
|
|
// GetOverallHealth returns the overall system health
|
|
func (hm *HealthMonitorImpl) GetOverallHealth() OverallHealth {
|
|
hm.mu.RLock()
|
|
defer hm.mu.RUnlock()
|
|
|
|
moduleHealths := make(map[string]ModuleHealth)
|
|
for moduleID, monitor := range hm.monitors {
|
|
monitor.mu.RLock()
|
|
moduleHealths[moduleID] = monitor.currentHealth
|
|
monitor.mu.RUnlock()
|
|
}
|
|
|
|
return hm.aggregator.AggregateHealth(moduleHealths)
|
|
}
|
|
|
|
// AddHealthRule adds a custom health rule
|
|
func (hm *HealthMonitorImpl) AddHealthRule(rule HealthRule) {
|
|
hm.mu.Lock()
|
|
defer hm.mu.Unlock()
|
|
hm.rules = append(hm.rules, rule)
|
|
}
|
|
|
|
// SetHealthAggregator sets a custom health aggregator
|
|
func (hm *HealthMonitorImpl) SetHealthAggregator(aggregator HealthAggregator) {
|
|
hm.mu.Lock()
|
|
defer hm.mu.Unlock()
|
|
hm.aggregator = aggregator
|
|
}
|
|
|
|
// SetHealthNotifier sets a custom health notifier
|
|
func (hm *HealthMonitorImpl) SetHealthNotifier(notifier HealthNotifier) {
|
|
hm.mu.Lock()
|
|
defer hm.mu.Unlock()
|
|
hm.notifier = notifier
|
|
}
|
|
|
|
// GetMetrics returns health monitoring metrics
|
|
func (hm *HealthMonitorImpl) GetMetrics() HealthMetrics {
|
|
hm.mu.RLock()
|
|
defer hm.mu.RUnlock()
|
|
return hm.metrics
|
|
}
|
|
|
|
// Private methods
|
|
|
|
func (hm *HealthMonitorImpl) monitoringLoop() {
|
|
ticker := time.NewTicker(hm.config.CheckInterval)
|
|
defer ticker.Stop()
|
|
|
|
for {
|
|
select {
|
|
case <-hm.ctx.Done():
|
|
return
|
|
case <-ticker.C:
|
|
hm.performAllHealthChecks()
|
|
}
|
|
}
|
|
}
|
|
|
|
func (hm *HealthMonitorImpl) performAllHealthChecks() {
|
|
hm.mu.RLock()
|
|
monitors := make([]*ModuleMonitor, 0, len(hm.monitors))
|
|
for _, monitor := range hm.monitors {
|
|
monitors = append(monitors, monitor)
|
|
}
|
|
hm.mu.RUnlock()
|
|
|
|
if hm.config.ParallelChecks {
|
|
hm.performHealthChecksParallel(monitors)
|
|
} else {
|
|
hm.performHealthChecksSequential(monitors)
|
|
}
|
|
|
|
// Update overall health and send notifications
|
|
overallHealth := hm.GetOverallHealth()
|
|
if hm.config.EnableNotifications {
|
|
hm.notifier.NotifySystemHealth(overallHealth)
|
|
}
|
|
}
|
|
|
|
func (hm *HealthMonitorImpl) performHealthChecksSequential(monitors []*ModuleMonitor) {
|
|
for _, monitor := range monitors {
|
|
if monitor.config.Enabled {
|
|
hm.performHealthCheck(monitor)
|
|
}
|
|
}
|
|
}
|
|
|
|
func (hm *HealthMonitorImpl) performHealthChecksParallel(monitors []*ModuleMonitor) {
|
|
semaphore := make(chan struct{}, hm.config.MaxConcurrentChecks)
|
|
var wg sync.WaitGroup
|
|
|
|
for _, monitor := range monitors {
|
|
if monitor.config.Enabled {
|
|
wg.Add(1)
|
|
go func(m *ModuleMonitor) {
|
|
defer wg.Done()
|
|
semaphore <- struct{}{}
|
|
defer func() { <-semaphore }()
|
|
|
|
hm.performHealthCheck(m)
|
|
}(monitor)
|
|
}
|
|
}
|
|
|
|
wg.Wait()
|
|
}
|
|
|
|
func (hm *HealthMonitorImpl) performHealthCheck(monitor *ModuleMonitor) ModuleHealth {
|
|
start := time.Now()
|
|
|
|
monitor.mu.Lock()
|
|
defer monitor.mu.Unlock()
|
|
|
|
monitor.checkCount++
|
|
monitor.lastCheck = start
|
|
|
|
// Create check context with timeout
|
|
ctx, cancel := context.WithTimeout(hm.ctx, monitor.config.CheckTimeout)
|
|
defer cancel()
|
|
|
|
// Perform basic module health check
|
|
moduleHealth := monitor.module.Instance.GetHealth()
|
|
|
|
// Perform custom health checks
|
|
checkResults := make(map[string]CheckResult)
|
|
for _, check := range monitor.config.CustomChecks {
|
|
if check.Enabled {
|
|
checkResult := hm.performCustomCheck(ctx, check)
|
|
checkResults[check.Name] = checkResult
|
|
|
|
// Update overall status based on check results
|
|
if check.Critical && checkResult.Status != HealthHealthy {
|
|
moduleHealth.Status = HealthUnhealthy
|
|
}
|
|
}
|
|
}
|
|
|
|
// Create health check result
|
|
result := HealthCheckResult{
|
|
Timestamp: start,
|
|
Status: moduleHealth.Status,
|
|
ResponseTime: time.Since(start),
|
|
Message: moduleHealth.Message,
|
|
Details: moduleHealth.Details,
|
|
Checks: checkResults,
|
|
}
|
|
|
|
// Update statistics
|
|
if result.Status == HealthHealthy {
|
|
monitor.successCount++
|
|
} else {
|
|
monitor.failureCount++
|
|
}
|
|
|
|
// Add to history
|
|
monitor.history = append(monitor.history, result)
|
|
if len(monitor.history) > hm.config.HistorySize {
|
|
monitor.history = monitor.history[1:]
|
|
}
|
|
|
|
// Update current health
|
|
oldHealth := monitor.currentHealth
|
|
monitor.currentHealth = moduleHealth
|
|
monitor.currentHealth.LastCheck = start
|
|
monitor.currentHealth.RestartCount = int(monitor.module.HealthStatus.RestartCount)
|
|
|
|
// Calculate uptime
|
|
if !monitor.module.StartTime.IsZero() {
|
|
monitor.currentHealth.Uptime = time.Since(monitor.module.StartTime)
|
|
}
|
|
|
|
// Update trends if enabled
|
|
if hm.config.EnableTrends {
|
|
monitor.trend = hm.calculateHealthTrend(monitor)
|
|
}
|
|
|
|
// Apply health rules
|
|
hm.applyHealthRules(monitor.moduleID, monitor.currentHealth)
|
|
|
|
// Send notifications if health changed
|
|
if hm.config.EnableNotifications && oldHealth.Status != monitor.currentHealth.Status {
|
|
hm.notifier.NotifyHealthChange(monitor.moduleID, oldHealth, monitor.currentHealth)
|
|
}
|
|
|
|
// Update metrics
|
|
if hm.config.EnableMetrics {
|
|
hm.updateMetrics(monitor, result)
|
|
}
|
|
|
|
return monitor.currentHealth
|
|
}
|
|
|
|
func (hm *HealthMonitorImpl) performCustomCheck(ctx context.Context, check HealthCheck) CheckResult {
|
|
start := time.Now()
|
|
|
|
result := CheckResult{
|
|
Name: check.Name,
|
|
Status: HealthHealthy,
|
|
ResponseTime: 0,
|
|
Message: "Check passed",
|
|
Details: make(map[string]interface{}),
|
|
}
|
|
|
|
// Create timeout context for the check
|
|
checkCtx, cancel := context.WithTimeout(ctx, check.Timeout)
|
|
defer cancel()
|
|
|
|
// Run the check
|
|
done := make(chan error, 1)
|
|
go func() {
|
|
done <- check.CheckFunc()
|
|
}()
|
|
|
|
select {
|
|
case err := <-done:
|
|
result.ResponseTime = time.Since(start)
|
|
if err != nil {
|
|
result.Status = HealthUnhealthy
|
|
result.Message = err.Error()
|
|
result.Error = err
|
|
}
|
|
case <-checkCtx.Done():
|
|
result.ResponseTime = time.Since(start)
|
|
result.Status = HealthUnhealthy
|
|
result.Message = "Check timed out"
|
|
result.Error = checkCtx.Err()
|
|
}
|
|
|
|
return result
|
|
}
|
|
|
|
func (hm *HealthMonitorImpl) calculateHealthTrend(monitor *ModuleMonitor) HealthTrend {
|
|
if len(monitor.history) < 5 {
|
|
return HealthTrend{
|
|
Direction: TrendUnknown,
|
|
Confidence: 0,
|
|
LastUpdated: time.Now(),
|
|
}
|
|
}
|
|
|
|
// Simple trend calculation based on recent health status
|
|
recent := monitor.history[len(monitor.history)-5:]
|
|
healthyCount := 0
|
|
|
|
for _, result := range recent {
|
|
if result.Status == HealthHealthy {
|
|
healthyCount++
|
|
}
|
|
}
|
|
|
|
healthRatio := float64(healthyCount) / float64(len(recent))
|
|
|
|
var direction TrendDirection
|
|
var confidence float64
|
|
|
|
if healthRatio > 0.8 {
|
|
direction = TrendImproving
|
|
confidence = healthRatio
|
|
} else if healthRatio < 0.4 {
|
|
direction = TrendDegrading
|
|
confidence = 1.0 - healthRatio
|
|
} else {
|
|
direction = TrendStable
|
|
confidence = 0.5
|
|
}
|
|
|
|
return HealthTrend{
|
|
Direction: direction,
|
|
Confidence: confidence,
|
|
Slope: healthRatio - 0.5, // Simplified slope calculation
|
|
Prediction: hm.predictHealthStatus(healthRatio),
|
|
LastUpdated: time.Now(),
|
|
}
|
|
}
|
|
|
|
func (hm *HealthMonitorImpl) predictHealthStatus(healthRatio float64) HealthStatus {
|
|
if healthRatio > 0.7 {
|
|
return HealthHealthy
|
|
} else if healthRatio > 0.3 {
|
|
return HealthDegraded
|
|
} else {
|
|
return HealthUnhealthy
|
|
}
|
|
}
|
|
|
|
func (hm *HealthMonitorImpl) applyHealthRules(moduleID string, health ModuleHealth) {
|
|
for _, rule := range hm.rules {
|
|
if rule.Enabled && rule.Condition(health) {
|
|
if err := rule.Action(moduleID, health); err != nil {
|
|
// Log error but continue with other rules
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
func (hm *HealthMonitorImpl) updateMetrics(monitor *ModuleMonitor, result HealthCheckResult) {
|
|
hm.metrics.ChecksPerformed++
|
|
|
|
if result.Status == HealthHealthy {
|
|
hm.metrics.ChecksSuccessful++
|
|
} else {
|
|
hm.metrics.ChecksFailed++
|
|
}
|
|
|
|
// Update average check time
|
|
if hm.metrics.ChecksPerformed > 0 {
|
|
totalTime := hm.metrics.AverageCheckTime * time.Duration(hm.metrics.ChecksPerformed-1)
|
|
hm.metrics.AverageCheckTime = (totalTime + result.ResponseTime) / time.Duration(hm.metrics.ChecksPerformed)
|
|
}
|
|
|
|
// Update health score
|
|
score := hm.aggregator.GetHealthScore(monitor.currentHealth)
|
|
hm.metrics.ModuleHealthScores[monitor.moduleID] = score
|
|
}
|
|
|
|
func (hm *HealthMonitorImpl) setupDefaultRules() {
|
|
// Rule: Alert on unhealthy critical modules
|
|
hm.rules = append(hm.rules, HealthRule{
|
|
Name: "critical_module_unhealthy",
|
|
Description: "Alert when a critical module becomes unhealthy",
|
|
Condition: func(health ModuleHealth) bool {
|
|
return health.Status == HealthUnhealthy
|
|
},
|
|
Action: func(moduleID string, health ModuleHealth) error {
|
|
alert := HealthAlert{
|
|
ID: fmt.Sprintf("critical_%s_%d", moduleID, time.Now().Unix()),
|
|
ModuleID: moduleID,
|
|
Severity: SeverityCritical,
|
|
Type: AlertHealthChange,
|
|
Message: fmt.Sprintf("Critical module %s is unhealthy: %s", moduleID, health.Message),
|
|
Timestamp: time.Now(),
|
|
}
|
|
return hm.notifier.NotifyAlert(alert)
|
|
},
|
|
Severity: SeverityCritical,
|
|
Enabled: true,
|
|
})
|
|
|
|
// Rule: Alert on degraded performance
|
|
hm.rules = append(hm.rules, HealthRule{
|
|
Name: "degraded_performance",
|
|
Description: "Alert when module performance is degraded",
|
|
Condition: func(health ModuleHealth) bool {
|
|
return health.Status == HealthDegraded
|
|
},
|
|
Action: func(moduleID string, health ModuleHealth) error {
|
|
alert := HealthAlert{
|
|
ID: fmt.Sprintf("degraded_%s_%d", moduleID, time.Now().Unix()),
|
|
ModuleID: moduleID,
|
|
Severity: SeverityWarning,
|
|
Type: AlertHealthChange,
|
|
Message: fmt.Sprintf("Module %s performance is degraded: %s", moduleID, health.Message),
|
|
Timestamp: time.Now(),
|
|
}
|
|
return hm.notifier.NotifyAlert(alert)
|
|
},
|
|
Severity: SeverityWarning,
|
|
Enabled: true,
|
|
})
|
|
}
|
|
|
|
// DefaultHealthAggregator is the built-in, stateless health aggregator.
type DefaultHealthAggregator struct{}

// NewDefaultHealthAggregator returns a ready-to-use DefaultHealthAggregator.
func NewDefaultHealthAggregator() *DefaultHealthAggregator {
	return new(DefaultHealthAggregator)
}
|
|
|
|
func (dha *DefaultHealthAggregator) AggregateHealth(modules map[string]ModuleHealth) OverallHealth {
|
|
overall := OverallHealth{
|
|
Modules: modules,
|
|
LastUpdated: time.Now(),
|
|
Trends: make(map[string]HealthTrend),
|
|
}
|
|
|
|
if len(modules) == 0 {
|
|
overall.Status = HealthUnknown
|
|
return overall
|
|
}
|
|
|
|
overall.ModuleCount = len(modules)
|
|
var totalScore float64
|
|
|
|
for moduleID, health := range modules {
|
|
score := dha.GetHealthScore(health)
|
|
totalScore += score
|
|
|
|
switch health.Status {
|
|
case HealthHealthy:
|
|
overall.HealthyCount++
|
|
case HealthDegraded:
|
|
overall.DegradedCount++
|
|
case HealthUnhealthy:
|
|
overall.UnhealthyCount++
|
|
overall.CriticalIssues = append(overall.CriticalIssues,
|
|
fmt.Sprintf("Module %s is unhealthy: %s", moduleID, health.Message))
|
|
}
|
|
}
|
|
|
|
overall.Score = totalScore / float64(len(modules))
|
|
overall.Status = dha.CalculateSystemHealth(getHealthValues(modules))
|
|
|
|
return overall
|
|
}
|
|
|
|
func (dha *DefaultHealthAggregator) CalculateSystemHealth(individual []ModuleHealth) HealthStatus {
|
|
if len(individual) == 0 {
|
|
return HealthUnknown
|
|
}
|
|
|
|
healthyCount := 0
|
|
degradedCount := 0
|
|
unhealthyCount := 0
|
|
|
|
for _, health := range individual {
|
|
switch health.Status {
|
|
case HealthHealthy:
|
|
healthyCount++
|
|
case HealthDegraded:
|
|
degradedCount++
|
|
case HealthUnhealthy:
|
|
unhealthyCount++
|
|
}
|
|
}
|
|
|
|
total := len(individual)
|
|
healthyRatio := float64(healthyCount) / float64(total)
|
|
unhealthyRatio := float64(unhealthyCount) / float64(total)
|
|
|
|
if unhealthyRatio > 0.3 {
|
|
return HealthUnhealthy
|
|
} else if healthyRatio < 0.7 {
|
|
return HealthDegraded
|
|
} else {
|
|
return HealthHealthy
|
|
}
|
|
}
|
|
|
|
func (dha *DefaultHealthAggregator) GetHealthScore(health ModuleHealth) float64 {
|
|
switch health.Status {
|
|
case HealthHealthy:
|
|
return 1.0
|
|
case HealthDegraded:
|
|
return 0.5
|
|
case HealthUnhealthy:
|
|
return 0.0
|
|
default:
|
|
return 0.0
|
|
}
|
|
}
|
|
|
|
func getHealthValues(modules map[string]ModuleHealth) []ModuleHealth {
|
|
values := make([]ModuleHealth, 0, len(modules))
|
|
for _, health := range modules {
|
|
values = append(values, health)
|
|
}
|
|
return values
|
|
}
|
|
|
|
// DefaultHealthNotifier is the built-in, stateless health notifier. All of
// its notification methods accept the notification and do nothing with it.
type DefaultHealthNotifier struct{}

// NewDefaultHealthNotifier returns a ready-to-use DefaultHealthNotifier.
func NewDefaultHealthNotifier() *DefaultHealthNotifier {
	return new(DefaultHealthNotifier)
}
|
|
|
|
func (dhn *DefaultHealthNotifier) NotifyHealthChange(moduleID string, oldHealth, newHealth ModuleHealth) error {
|
|
// Basic notification implementation - could be extended to send emails, webhooks, etc.
|
|
return nil
|
|
}
|
|
|
|
func (dhn *DefaultHealthNotifier) NotifySystemHealth(health OverallHealth) error {
|
|
// Basic notification implementation
|
|
return nil
|
|
}
|
|
|
|
func (dhn *DefaultHealthNotifier) NotifyAlert(alert HealthAlert) error {
|
|
// Basic notification implementation
|
|
return nil
|
|
}
|