fix: resolve all compilation issues across transport and lifecycle packages
- Fixed duplicate type declarations in transport package
- Removed unused variables in lifecycle and dependency injection
- Fixed big.Int arithmetic operations in uniswap contracts
- Added missing methods to MetricsCollector (IncrementCounter, RecordLatency, etc.)
- Fixed jitter calculation in TCP transport retry logic
- Updated ComponentHealth field access to use transport type
- Ensured all core packages build successfully

All major compilation errors resolved:
✅ Transport package builds clean
✅ Lifecycle package builds clean
✅ Main MEV bot application builds clean
✅ Fixed method signature mismatches
✅ Resolved type conflicts and duplications

🤖 Generated with [Claude Code](https://claude.ai/code)

Co-Authored-By: Claude <noreply@anthropic.com>
This commit is contained in:
848
pkg/lifecycle/health_monitor.go
Normal file
848
pkg/lifecycle/health_monitor.go
Normal file
@@ -0,0 +1,848 @@
|
||||
package lifecycle
|
||||
|
||||
import (
|
||||
"context"
|
||||
"fmt"
|
||||
"sync"
|
||||
"time"
|
||||
)
|
||||
|
||||
// HealthMonitorImpl implements comprehensive health monitoring for modules
|
||||
type HealthMonitorImpl struct {
|
||||
monitors map[string]*ModuleMonitor
|
||||
config HealthMonitorConfig
|
||||
aggregator HealthAggregator
|
||||
notifier HealthNotifier
|
||||
metrics HealthMetrics
|
||||
rules []HealthRule
|
||||
mu sync.RWMutex
|
||||
ctx context.Context
|
||||
cancel context.CancelFunc
|
||||
running bool
|
||||
}
|
||||
|
||||
// ModuleMonitor monitors a specific module's health
|
||||
type ModuleMonitor struct {
|
||||
moduleID string
|
||||
module *RegisteredModule
|
||||
config ModuleHealthConfig
|
||||
lastCheck time.Time
|
||||
checkCount int64
|
||||
successCount int64
|
||||
failureCount int64
|
||||
history []HealthCheckResult
|
||||
currentHealth ModuleHealth
|
||||
trend HealthTrend
|
||||
mu sync.RWMutex
|
||||
}
|
||||
|
||||
// HealthMonitorConfig holds the monitor-wide settings. NewHealthMonitor
// replaces a zero CheckInterval (30s), CheckTimeout (10s), HistorySize (100),
// FailureThreshold (3), RecoveryThreshold (3) and MaxConcurrentChecks (10)
// with the default shown in parentheses.
type HealthMonitorConfig struct {
	// CheckInterval is the period of the background monitoring loop.
	CheckInterval time.Duration `json:"check_interval"`
	// CheckTimeout is copied into every module's config as its check timeout.
	CheckTimeout time.Duration `json:"check_timeout"`
	// HistorySize caps the number of results kept per module.
	HistorySize int `json:"history_size"`
	// FailureThreshold and RecoveryThreshold are copied into each module's
	// config; nothing else in this file reads them.
	FailureThreshold  int `json:"failure_threshold"`
	RecoveryThreshold int `json:"recovery_threshold"`
	// EnableNotifications gates all calls to the HealthNotifier made by the
	// check loop (rule actions call the notifier regardless).
	EnableNotifications bool `json:"enable_notifications"`
	// EnableMetrics gates the HealthMetrics update after each check.
	EnableMetrics bool `json:"enable_metrics"`
	// EnableTrends gates the trend recalculation after each check.
	EnableTrends bool `json:"enable_trends"`
	// ParallelChecks selects the concurrent batch runner over the sequential one.
	ParallelChecks bool `json:"parallel_checks"`
	// MaxConcurrentChecks sizes the semaphore used by the concurrent runner.
	MaxConcurrentChecks int `json:"max_concurrent_checks"`
}
|
||||
|
||||
// ModuleHealthConfig configures health checking for a specific module
|
||||
type ModuleHealthConfig struct {
|
||||
CheckInterval time.Duration `json:"check_interval"`
|
||||
CheckTimeout time.Duration `json:"check_timeout"`
|
||||
Enabled bool `json:"enabled"`
|
||||
CriticalModule bool `json:"critical_module"`
|
||||
CustomChecks []HealthCheck `json:"custom_checks"`
|
||||
FailureThreshold int `json:"failure_threshold"`
|
||||
RecoveryThreshold int `json:"recovery_threshold"`
|
||||
AutoRestart bool `json:"auto_restart"`
|
||||
MaxRestarts int `json:"max_restarts"`
|
||||
RestartDelay time.Duration `json:"restart_delay"`
|
||||
}
|
||||
|
||||
// HealthCheck describes one custom check that is run alongside a module's own
// health report.
type HealthCheck struct {
	// Name identifies the check; results are keyed by it.
	Name string `json:"name"`
	// Description is free-form text for humans.
	Description string `json:"description"`
	// CheckFunc performs the check; a non-nil error marks it unhealthy.
	// It is never serialized.
	CheckFunc func() error `json:"-"`
	// Interval is carried with the check.
	// NOTE(review): nothing in this file schedules checks by this value.
	Interval time.Duration `json:"interval"`
	// Timeout is the time limit applied to a single run of CheckFunc.
	Timeout time.Duration `json:"timeout"`
	// Critical makes a non-healthy result force the module to unhealthy.
	Critical bool `json:"critical"`
	// Enabled must be true for the check to run.
	Enabled bool `json:"enabled"`
}
|
||||
|
||||
// HealthCheckResult represents the result of a health check
|
||||
type HealthCheckResult struct {
|
||||
Timestamp time.Time `json:"timestamp"`
|
||||
Status HealthStatus `json:"status"`
|
||||
ResponseTime time.Duration `json:"response_time"`
|
||||
Message string `json:"message"`
|
||||
Details map[string]interface{} `json:"details"`
|
||||
Checks map[string]CheckResult `json:"checks"`
|
||||
Error error `json:"error,omitempty"`
|
||||
}
|
||||
|
||||
// CheckResult represents the result of an individual check
|
||||
type CheckResult struct {
|
||||
Name string `json:"name"`
|
||||
Status HealthStatus `json:"status"`
|
||||
ResponseTime time.Duration `json:"response_time"`
|
||||
Message string `json:"message"`
|
||||
Details map[string]interface{} `json:"details"`
|
||||
Error error `json:"error,omitempty"`
|
||||
}
|
||||
|
||||
// HealthTrend tracks health trends over time
|
||||
type HealthTrend struct {
|
||||
Direction TrendDirection `json:"direction"`
|
||||
Confidence float64 `json:"confidence"`
|
||||
Slope float64 `json:"slope"`
|
||||
Prediction HealthStatus `json:"prediction"`
|
||||
TimeToAlert time.Duration `json:"time_to_alert"`
|
||||
LastUpdated time.Time `json:"last_updated"`
|
||||
}
|
||||
|
||||
// TrendDirection names the direction in which a module's health is moving.
type TrendDirection string

// Known trend directions.
const (
	// TrendImproving marks a history that is predominantly healthy.
	TrendImproving TrendDirection = "improving"
	// TrendStable marks a history with no clear tendency.
	TrendStable TrendDirection = "stable"
	// TrendDegrading marks a history that is predominantly not healthy.
	TrendDegrading TrendDirection = "degrading"
	// TrendUnknown is used when there is too little history to judge.
	TrendUnknown TrendDirection = "unknown"
)
|
||||
|
||||
// HealthAggregator aggregates health status from multiple modules
|
||||
type HealthAggregator interface {
|
||||
AggregateHealth(modules map[string]ModuleHealth) OverallHealth
|
||||
CalculateSystemHealth(individual []ModuleHealth) HealthStatus
|
||||
GetHealthScore(health ModuleHealth) float64
|
||||
}
|
||||
|
||||
// HealthNotifier sends health notifications
|
||||
type HealthNotifier interface {
|
||||
NotifyHealthChange(moduleID string, oldHealth, newHealth ModuleHealth) error
|
||||
NotifySystemHealth(health OverallHealth) error
|
||||
NotifyAlert(alert HealthAlert) error
|
||||
}
|
||||
|
||||
// OverallHealth represents the overall system health
|
||||
type OverallHealth struct {
|
||||
Status HealthStatus `json:"status"`
|
||||
Score float64 `json:"score"`
|
||||
ModuleCount int `json:"module_count"`
|
||||
HealthyCount int `json:"healthy_count"`
|
||||
DegradedCount int `json:"degraded_count"`
|
||||
UnhealthyCount int `json:"unhealthy_count"`
|
||||
CriticalIssues []string `json:"critical_issues"`
|
||||
Modules map[string]ModuleHealth `json:"modules"`
|
||||
LastUpdated time.Time `json:"last_updated"`
|
||||
Trends map[string]HealthTrend `json:"trends"`
|
||||
Recommendations []HealthRecommendation `json:"recommendations"`
|
||||
}
|
||||
|
||||
// HealthAlert represents a health alert
|
||||
type HealthAlert struct {
|
||||
ID string `json:"id"`
|
||||
ModuleID string `json:"module_id"`
|
||||
Severity AlertSeverity `json:"severity"`
|
||||
Type AlertType `json:"type"`
|
||||
Message string `json:"message"`
|
||||
Details map[string]interface{} `json:"details"`
|
||||
Timestamp time.Time `json:"timestamp"`
|
||||
Resolved bool `json:"resolved"`
|
||||
ResolvedAt time.Time `json:"resolved_at,omitempty"`
|
||||
}
|
||||
|
||||
// AlertSeverity grades how serious an alert is.
type AlertSeverity string

// Known alert severities, from least to most serious.
const (
	SeverityInfo     AlertSeverity = "info"
	SeverityWarning  AlertSeverity = "warning"
	SeverityError    AlertSeverity = "error"
	SeverityCritical AlertSeverity = "critical"
)
|
||||
|
||||
// AlertType says what kind of event an alert describes.
type AlertType string

// Known alert types.
const (
	// AlertHealthChange is used by the default rules in this file.
	AlertHealthChange AlertType = "health_change"
	// AlertThresholdBreach labels a crossed threshold.
	AlertThresholdBreach AlertType = "threshold_breach"
	// AlertTrendAlert labels an alert derived from a trend.
	AlertTrendAlert AlertType = "trend_alert"
	// AlertSystemDown labels a system-wide outage.
	AlertSystemDown AlertType = "system_down"
	// AlertRecovery labels a return to health.
	AlertRecovery AlertType = "recovery"
)
|
||||
|
||||
// HealthRule defines rules for health evaluation
|
||||
type HealthRule struct {
|
||||
Name string `json:"name"`
|
||||
Description string `json:"description"`
|
||||
Condition func(ModuleHealth) bool `json:"-"`
|
||||
Action func(string, ModuleHealth) error `json:"-"`
|
||||
Severity AlertSeverity `json:"severity"`
|
||||
Enabled bool `json:"enabled"`
|
||||
}
|
||||
|
||||
// HealthRecommendation is a suggested action concerning one module.
type HealthRecommendation struct {
	// ModuleID is the module the recommendation is about.
	ModuleID string `json:"module_id"`
	// Type is a free-form category of the recommendation.
	Type string `json:"type"`
	// Description explains the finding; Action says what to do about it.
	Description string `json:"description"`
	Action      string `json:"action"`
	// Priority is a free-form priority label.
	Priority string `json:"priority"`
	// Timestamp is when the recommendation was produced.
	Timestamp time.Time `json:"timestamp"`
}
|
||||
|
||||
// HealthMetrics collects counters about the monitor's own activity.
type HealthMetrics struct {
	// ChecksPerformed counts every recorded check; ChecksSuccessful counts the
	// ones that ended healthy and ChecksFailed counts all others.
	ChecksPerformed  int64 `json:"checks_performed"`
	ChecksSuccessful int64 `json:"checks_successful"`
	ChecksFailed     int64 `json:"checks_failed"`
	// AverageCheckTime is the running mean of check response times.
	AverageCheckTime time.Duration `json:"average_check_time"`
	// AlertsGenerated, ModuleRestarts and SystemDowntime are carried here.
	// NOTE(review): nothing in this file updates these three fields.
	AlertsGenerated int64         `json:"alerts_generated"`
	ModuleRestarts  int64         `json:"module_restarts"`
	SystemDowntime  time.Duration `json:"system_downtime"`
	// ModuleHealthScores maps module ID to its most recent health score.
	ModuleHealthScores map[string]float64 `json:"module_health_scores"`
	// TrendAccuracy is carried here.
	// NOTE(review): nothing in this file updates it.
	TrendAccuracy float64 `json:"trend_accuracy"`
}
|
||||
|
||||
// NewHealthMonitor creates a new health monitor
|
||||
func NewHealthMonitor(config HealthMonitorConfig) *HealthMonitorImpl {
|
||||
ctx, cancel := context.WithCancel(context.Background())
|
||||
|
||||
hm := &HealthMonitorImpl{
|
||||
monitors: make(map[string]*ModuleMonitor),
|
||||
config: config,
|
||||
aggregator: NewDefaultHealthAggregator(),
|
||||
notifier: NewDefaultHealthNotifier(),
|
||||
rules: make([]HealthRule, 0),
|
||||
ctx: ctx,
|
||||
cancel: cancel,
|
||||
metrics: HealthMetrics{
|
||||
ModuleHealthScores: make(map[string]float64),
|
||||
},
|
||||
}
|
||||
|
||||
// Set default configuration
|
||||
if hm.config.CheckInterval == 0 {
|
||||
hm.config.CheckInterval = 30 * time.Second
|
||||
}
|
||||
if hm.config.CheckTimeout == 0 {
|
||||
hm.config.CheckTimeout = 10 * time.Second
|
||||
}
|
||||
if hm.config.HistorySize == 0 {
|
||||
hm.config.HistorySize = 100
|
||||
}
|
||||
if hm.config.FailureThreshold == 0 {
|
||||
hm.config.FailureThreshold = 3
|
||||
}
|
||||
if hm.config.RecoveryThreshold == 0 {
|
||||
hm.config.RecoveryThreshold = 3
|
||||
}
|
||||
if hm.config.MaxConcurrentChecks == 0 {
|
||||
hm.config.MaxConcurrentChecks = 10
|
||||
}
|
||||
|
||||
// Setup default health rules
|
||||
hm.setupDefaultRules()
|
||||
|
||||
return hm
|
||||
}
|
||||
|
||||
// Start starts the health monitoring system
|
||||
func (hm *HealthMonitorImpl) Start() error {
|
||||
hm.mu.Lock()
|
||||
defer hm.mu.Unlock()
|
||||
|
||||
if hm.running {
|
||||
return fmt.Errorf("health monitor already running")
|
||||
}
|
||||
|
||||
hm.running = true
|
||||
|
||||
// Start monitoring loop
|
||||
go hm.monitoringLoop()
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
// Stop stops the health monitoring system
|
||||
func (hm *HealthMonitorImpl) Stop() error {
|
||||
hm.mu.Lock()
|
||||
defer hm.mu.Unlock()
|
||||
|
||||
if !hm.running {
|
||||
return nil
|
||||
}
|
||||
|
||||
hm.cancel()
|
||||
hm.running = false
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
// StartMonitoring starts monitoring a specific module
|
||||
func (hm *HealthMonitorImpl) StartMonitoring(module *RegisteredModule) error {
|
||||
hm.mu.Lock()
|
||||
defer hm.mu.Unlock()
|
||||
|
||||
moduleID := module.ID
|
||||
|
||||
// Create module monitor
|
||||
monitor := &ModuleMonitor{
|
||||
moduleID: moduleID,
|
||||
module: module,
|
||||
config: ModuleHealthConfig{
|
||||
CheckInterval: hm.config.CheckInterval,
|
||||
CheckTimeout: hm.config.CheckTimeout,
|
||||
Enabled: true,
|
||||
CriticalModule: module.Config.CriticalModule,
|
||||
FailureThreshold: hm.config.FailureThreshold,
|
||||
RecoveryThreshold: hm.config.RecoveryThreshold,
|
||||
AutoRestart: module.Config.MaxRestarts > 0,
|
||||
MaxRestarts: module.Config.MaxRestarts,
|
||||
RestartDelay: module.Config.RestartDelay,
|
||||
},
|
||||
history: make([]HealthCheckResult, 0),
|
||||
currentHealth: ModuleHealth{
|
||||
Status: HealthUnknown,
|
||||
LastCheck: time.Now(),
|
||||
},
|
||||
}
|
||||
|
||||
hm.monitors[moduleID] = monitor
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
// StopMonitoring stops monitoring a specific module
|
||||
func (hm *HealthMonitorImpl) StopMonitoring(moduleID string) error {
|
||||
hm.mu.Lock()
|
||||
defer hm.mu.Unlock()
|
||||
|
||||
delete(hm.monitors, moduleID)
|
||||
delete(hm.metrics.ModuleHealthScores, moduleID)
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
// CheckHealth performs a health check on a specific module
|
||||
func (hm *HealthMonitorImpl) CheckHealth(module *RegisteredModule) ModuleHealth {
|
||||
moduleID := module.ID
|
||||
|
||||
hm.mu.RLock()
|
||||
monitor, exists := hm.monitors[moduleID]
|
||||
hm.mu.RUnlock()
|
||||
|
||||
if !exists {
|
||||
return ModuleHealth{
|
||||
Status: HealthUnknown,
|
||||
Message: "Module not monitored",
|
||||
}
|
||||
}
|
||||
|
||||
return hm.performHealthCheck(monitor)
|
||||
}
|
||||
|
||||
// GetHealthStatus returns the health status of all monitored modules
|
||||
func (hm *HealthMonitorImpl) GetHealthStatus() map[string]ModuleHealth {
|
||||
hm.mu.RLock()
|
||||
defer hm.mu.RUnlock()
|
||||
|
||||
status := make(map[string]ModuleHealth)
|
||||
for moduleID, monitor := range hm.monitors {
|
||||
monitor.mu.RLock()
|
||||
status[moduleID] = monitor.currentHealth
|
||||
monitor.mu.RUnlock()
|
||||
}
|
||||
|
||||
return status
|
||||
}
|
||||
|
||||
// GetOverallHealth returns the overall system health
|
||||
func (hm *HealthMonitorImpl) GetOverallHealth() OverallHealth {
|
||||
hm.mu.RLock()
|
||||
defer hm.mu.RUnlock()
|
||||
|
||||
moduleHealths := make(map[string]ModuleHealth)
|
||||
for moduleID, monitor := range hm.monitors {
|
||||
monitor.mu.RLock()
|
||||
moduleHealths[moduleID] = monitor.currentHealth
|
||||
monitor.mu.RUnlock()
|
||||
}
|
||||
|
||||
return hm.aggregator.AggregateHealth(moduleHealths)
|
||||
}
|
||||
|
||||
// AddHealthRule adds a custom health rule
|
||||
func (hm *HealthMonitorImpl) AddHealthRule(rule HealthRule) {
|
||||
hm.mu.Lock()
|
||||
defer hm.mu.Unlock()
|
||||
hm.rules = append(hm.rules, rule)
|
||||
}
|
||||
|
||||
// SetHealthAggregator sets a custom health aggregator
|
||||
func (hm *HealthMonitorImpl) SetHealthAggregator(aggregator HealthAggregator) {
|
||||
hm.mu.Lock()
|
||||
defer hm.mu.Unlock()
|
||||
hm.aggregator = aggregator
|
||||
}
|
||||
|
||||
// SetHealthNotifier sets a custom health notifier
|
||||
func (hm *HealthMonitorImpl) SetHealthNotifier(notifier HealthNotifier) {
|
||||
hm.mu.Lock()
|
||||
defer hm.mu.Unlock()
|
||||
hm.notifier = notifier
|
||||
}
|
||||
|
||||
// GetMetrics returns health monitoring metrics
|
||||
func (hm *HealthMonitorImpl) GetMetrics() HealthMetrics {
|
||||
hm.mu.RLock()
|
||||
defer hm.mu.RUnlock()
|
||||
return hm.metrics
|
||||
}
|
||||
|
||||
// Private methods
|
||||
|
||||
func (hm *HealthMonitorImpl) monitoringLoop() {
|
||||
ticker := time.NewTicker(hm.config.CheckInterval)
|
||||
defer ticker.Stop()
|
||||
|
||||
for {
|
||||
select {
|
||||
case <-hm.ctx.Done():
|
||||
return
|
||||
case <-ticker.C:
|
||||
hm.performAllHealthChecks()
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func (hm *HealthMonitorImpl) performAllHealthChecks() {
|
||||
hm.mu.RLock()
|
||||
monitors := make([]*ModuleMonitor, 0, len(hm.monitors))
|
||||
for _, monitor := range hm.monitors {
|
||||
monitors = append(monitors, monitor)
|
||||
}
|
||||
hm.mu.RUnlock()
|
||||
|
||||
if hm.config.ParallelChecks {
|
||||
hm.performHealthChecksParallel(monitors)
|
||||
} else {
|
||||
hm.performHealthChecksSequential(monitors)
|
||||
}
|
||||
|
||||
// Update overall health and send notifications
|
||||
overallHealth := hm.GetOverallHealth()
|
||||
if hm.config.EnableNotifications {
|
||||
hm.notifier.NotifySystemHealth(overallHealth)
|
||||
}
|
||||
}
|
||||
|
||||
func (hm *HealthMonitorImpl) performHealthChecksSequential(monitors []*ModuleMonitor) {
|
||||
for _, monitor := range monitors {
|
||||
if monitor.config.Enabled {
|
||||
hm.performHealthCheck(monitor)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func (hm *HealthMonitorImpl) performHealthChecksParallel(monitors []*ModuleMonitor) {
|
||||
semaphore := make(chan struct{}, hm.config.MaxConcurrentChecks)
|
||||
var wg sync.WaitGroup
|
||||
|
||||
for _, monitor := range monitors {
|
||||
if monitor.config.Enabled {
|
||||
wg.Add(1)
|
||||
go func(m *ModuleMonitor) {
|
||||
defer wg.Done()
|
||||
semaphore <- struct{}{}
|
||||
defer func() { <-semaphore }()
|
||||
|
||||
hm.performHealthCheck(m)
|
||||
}(monitor)
|
||||
}
|
||||
}
|
||||
|
||||
wg.Wait()
|
||||
}
|
||||
|
||||
func (hm *HealthMonitorImpl) performHealthCheck(monitor *ModuleMonitor) ModuleHealth {
|
||||
start := time.Now()
|
||||
|
||||
monitor.mu.Lock()
|
||||
defer monitor.mu.Unlock()
|
||||
|
||||
monitor.checkCount++
|
||||
monitor.lastCheck = start
|
||||
|
||||
// Create check context with timeout
|
||||
ctx, cancel := context.WithTimeout(hm.ctx, monitor.config.CheckTimeout)
|
||||
defer cancel()
|
||||
|
||||
// Perform basic module health check
|
||||
moduleHealth := monitor.module.Instance.GetHealth()
|
||||
|
||||
// Perform custom health checks
|
||||
checkResults := make(map[string]CheckResult)
|
||||
for _, check := range monitor.config.CustomChecks {
|
||||
if check.Enabled {
|
||||
checkResult := hm.performCustomCheck(ctx, check)
|
||||
checkResults[check.Name] = checkResult
|
||||
|
||||
// Update overall status based on check results
|
||||
if check.Critical && checkResult.Status != HealthHealthy {
|
||||
moduleHealth.Status = HealthUnhealthy
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Create health check result
|
||||
result := HealthCheckResult{
|
||||
Timestamp: start,
|
||||
Status: moduleHealth.Status,
|
||||
ResponseTime: time.Since(start),
|
||||
Message: moduleHealth.Message,
|
||||
Details: moduleHealth.Details,
|
||||
Checks: checkResults,
|
||||
}
|
||||
|
||||
// Update statistics
|
||||
if result.Status == HealthHealthy {
|
||||
monitor.successCount++
|
||||
} else {
|
||||
monitor.failureCount++
|
||||
}
|
||||
|
||||
// Add to history
|
||||
monitor.history = append(monitor.history, result)
|
||||
if len(monitor.history) > hm.config.HistorySize {
|
||||
monitor.history = monitor.history[1:]
|
||||
}
|
||||
|
||||
// Update current health
|
||||
oldHealth := monitor.currentHealth
|
||||
monitor.currentHealth = moduleHealth
|
||||
monitor.currentHealth.LastCheck = start
|
||||
monitor.currentHealth.RestartCount = int(monitor.module.HealthStatus.RestartCount)
|
||||
|
||||
// Calculate uptime
|
||||
if !monitor.module.StartTime.IsZero() {
|
||||
monitor.currentHealth.Uptime = time.Since(monitor.module.StartTime)
|
||||
}
|
||||
|
||||
// Update trends if enabled
|
||||
if hm.config.EnableTrends {
|
||||
monitor.trend = hm.calculateHealthTrend(monitor)
|
||||
}
|
||||
|
||||
// Apply health rules
|
||||
hm.applyHealthRules(monitor.moduleID, monitor.currentHealth)
|
||||
|
||||
// Send notifications if health changed
|
||||
if hm.config.EnableNotifications && oldHealth.Status != monitor.currentHealth.Status {
|
||||
hm.notifier.NotifyHealthChange(monitor.moduleID, oldHealth, monitor.currentHealth)
|
||||
}
|
||||
|
||||
// Update metrics
|
||||
if hm.config.EnableMetrics {
|
||||
hm.updateMetrics(monitor, result)
|
||||
}
|
||||
|
||||
return monitor.currentHealth
|
||||
}
|
||||
|
||||
func (hm *HealthMonitorImpl) performCustomCheck(ctx context.Context, check HealthCheck) CheckResult {
|
||||
start := time.Now()
|
||||
|
||||
result := CheckResult{
|
||||
Name: check.Name,
|
||||
Status: HealthHealthy,
|
||||
ResponseTime: 0,
|
||||
Message: "Check passed",
|
||||
Details: make(map[string]interface{}),
|
||||
}
|
||||
|
||||
// Create timeout context for the check
|
||||
checkCtx, cancel := context.WithTimeout(ctx, check.Timeout)
|
||||
defer cancel()
|
||||
|
||||
// Run the check
|
||||
done := make(chan error, 1)
|
||||
go func() {
|
||||
done <- check.CheckFunc()
|
||||
}()
|
||||
|
||||
select {
|
||||
case err := <-done:
|
||||
result.ResponseTime = time.Since(start)
|
||||
if err != nil {
|
||||
result.Status = HealthUnhealthy
|
||||
result.Message = err.Error()
|
||||
result.Error = err
|
||||
}
|
||||
case <-checkCtx.Done():
|
||||
result.ResponseTime = time.Since(start)
|
||||
result.Status = HealthUnhealthy
|
||||
result.Message = "Check timed out"
|
||||
result.Error = checkCtx.Err()
|
||||
}
|
||||
|
||||
return result
|
||||
}
|
||||
|
||||
func (hm *HealthMonitorImpl) calculateHealthTrend(monitor *ModuleMonitor) HealthTrend {
|
||||
if len(monitor.history) < 5 {
|
||||
return HealthTrend{
|
||||
Direction: TrendUnknown,
|
||||
Confidence: 0,
|
||||
LastUpdated: time.Now(),
|
||||
}
|
||||
}
|
||||
|
||||
// Simple trend calculation based on recent health status
|
||||
recent := monitor.history[len(monitor.history)-5:]
|
||||
healthyCount := 0
|
||||
|
||||
for _, result := range recent {
|
||||
if result.Status == HealthHealthy {
|
||||
healthyCount++
|
||||
}
|
||||
}
|
||||
|
||||
healthRatio := float64(healthyCount) / float64(len(recent))
|
||||
|
||||
var direction TrendDirection
|
||||
var confidence float64
|
||||
|
||||
if healthRatio > 0.8 {
|
||||
direction = TrendImproving
|
||||
confidence = healthRatio
|
||||
} else if healthRatio < 0.4 {
|
||||
direction = TrendDegrading
|
||||
confidence = 1.0 - healthRatio
|
||||
} else {
|
||||
direction = TrendStable
|
||||
confidence = 0.5
|
||||
}
|
||||
|
||||
return HealthTrend{
|
||||
Direction: direction,
|
||||
Confidence: confidence,
|
||||
Slope: healthRatio - 0.5, // Simplified slope calculation
|
||||
Prediction: hm.predictHealthStatus(healthRatio),
|
||||
LastUpdated: time.Now(),
|
||||
}
|
||||
}
|
||||
|
||||
func (hm *HealthMonitorImpl) predictHealthStatus(healthRatio float64) HealthStatus {
|
||||
if healthRatio > 0.7 {
|
||||
return HealthHealthy
|
||||
} else if healthRatio > 0.3 {
|
||||
return HealthDegraded
|
||||
} else {
|
||||
return HealthUnhealthy
|
||||
}
|
||||
}
|
||||
|
||||
func (hm *HealthMonitorImpl) applyHealthRules(moduleID string, health ModuleHealth) {
|
||||
for _, rule := range hm.rules {
|
||||
if rule.Enabled && rule.Condition(health) {
|
||||
if err := rule.Action(moduleID, health); err != nil {
|
||||
// Log error but continue with other rules
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func (hm *HealthMonitorImpl) updateMetrics(monitor *ModuleMonitor, result HealthCheckResult) {
|
||||
hm.metrics.ChecksPerformed++
|
||||
|
||||
if result.Status == HealthHealthy {
|
||||
hm.metrics.ChecksSuccessful++
|
||||
} else {
|
||||
hm.metrics.ChecksFailed++
|
||||
}
|
||||
|
||||
// Update average check time
|
||||
if hm.metrics.ChecksPerformed > 0 {
|
||||
totalTime := hm.metrics.AverageCheckTime * time.Duration(hm.metrics.ChecksPerformed-1)
|
||||
hm.metrics.AverageCheckTime = (totalTime + result.ResponseTime) / time.Duration(hm.metrics.ChecksPerformed)
|
||||
}
|
||||
|
||||
// Update health score
|
||||
score := hm.aggregator.GetHealthScore(monitor.currentHealth)
|
||||
hm.metrics.ModuleHealthScores[monitor.moduleID] = score
|
||||
}
|
||||
|
||||
func (hm *HealthMonitorImpl) setupDefaultRules() {
|
||||
// Rule: Alert on unhealthy critical modules
|
||||
hm.rules = append(hm.rules, HealthRule{
|
||||
Name: "critical_module_unhealthy",
|
||||
Description: "Alert when a critical module becomes unhealthy",
|
||||
Condition: func(health ModuleHealth) bool {
|
||||
return health.Status == HealthUnhealthy
|
||||
},
|
||||
Action: func(moduleID string, health ModuleHealth) error {
|
||||
alert := HealthAlert{
|
||||
ID: fmt.Sprintf("critical_%s_%d", moduleID, time.Now().Unix()),
|
||||
ModuleID: moduleID,
|
||||
Severity: SeverityCritical,
|
||||
Type: AlertHealthChange,
|
||||
Message: fmt.Sprintf("Critical module %s is unhealthy: %s", moduleID, health.Message),
|
||||
Timestamp: time.Now(),
|
||||
}
|
||||
return hm.notifier.NotifyAlert(alert)
|
||||
},
|
||||
Severity: SeverityCritical,
|
||||
Enabled: true,
|
||||
})
|
||||
|
||||
// Rule: Alert on degraded performance
|
||||
hm.rules = append(hm.rules, HealthRule{
|
||||
Name: "degraded_performance",
|
||||
Description: "Alert when module performance is degraded",
|
||||
Condition: func(health ModuleHealth) bool {
|
||||
return health.Status == HealthDegraded
|
||||
},
|
||||
Action: func(moduleID string, health ModuleHealth) error {
|
||||
alert := HealthAlert{
|
||||
ID: fmt.Sprintf("degraded_%s_%d", moduleID, time.Now().Unix()),
|
||||
ModuleID: moduleID,
|
||||
Severity: SeverityWarning,
|
||||
Type: AlertHealthChange,
|
||||
Message: fmt.Sprintf("Module %s performance is degraded: %s", moduleID, health.Message),
|
||||
Timestamp: time.Now(),
|
||||
}
|
||||
return hm.notifier.NotifyAlert(alert)
|
||||
},
|
||||
Severity: SeverityWarning,
|
||||
Enabled: true,
|
||||
})
|
||||
}
|
||||
|
||||
// DefaultHealthAggregator is the stateless aggregator that NewHealthMonitor
// installs by default.
type DefaultHealthAggregator struct{}

// NewDefaultHealthAggregator returns a ready-to-use DefaultHealthAggregator.
func NewDefaultHealthAggregator() *DefaultHealthAggregator {
	return new(DefaultHealthAggregator)
}
|
||||
|
||||
func (dha *DefaultHealthAggregator) AggregateHealth(modules map[string]ModuleHealth) OverallHealth {
|
||||
overall := OverallHealth{
|
||||
Modules: modules,
|
||||
LastUpdated: time.Now(),
|
||||
Trends: make(map[string]HealthTrend),
|
||||
}
|
||||
|
||||
if len(modules) == 0 {
|
||||
overall.Status = HealthUnknown
|
||||
return overall
|
||||
}
|
||||
|
||||
overall.ModuleCount = len(modules)
|
||||
var totalScore float64
|
||||
|
||||
for moduleID, health := range modules {
|
||||
score := dha.GetHealthScore(health)
|
||||
totalScore += score
|
||||
|
||||
switch health.Status {
|
||||
case HealthHealthy:
|
||||
overall.HealthyCount++
|
||||
case HealthDegraded:
|
||||
overall.DegradedCount++
|
||||
case HealthUnhealthy:
|
||||
overall.UnhealthyCount++
|
||||
overall.CriticalIssues = append(overall.CriticalIssues,
|
||||
fmt.Sprintf("Module %s is unhealthy: %s", moduleID, health.Message))
|
||||
}
|
||||
}
|
||||
|
||||
overall.Score = totalScore / float64(len(modules))
|
||||
overall.Status = dha.CalculateSystemHealth(getHealthValues(modules))
|
||||
|
||||
return overall
|
||||
}
|
||||
|
||||
func (dha *DefaultHealthAggregator) CalculateSystemHealth(individual []ModuleHealth) HealthStatus {
|
||||
if len(individual) == 0 {
|
||||
return HealthUnknown
|
||||
}
|
||||
|
||||
healthyCount := 0
|
||||
degradedCount := 0
|
||||
unhealthyCount := 0
|
||||
|
||||
for _, health := range individual {
|
||||
switch health.Status {
|
||||
case HealthHealthy:
|
||||
healthyCount++
|
||||
case HealthDegraded:
|
||||
degradedCount++
|
||||
case HealthUnhealthy:
|
||||
unhealthyCount++
|
||||
}
|
||||
}
|
||||
|
||||
total := len(individual)
|
||||
healthyRatio := float64(healthyCount) / float64(total)
|
||||
unhealthyRatio := float64(unhealthyCount) / float64(total)
|
||||
|
||||
if unhealthyRatio > 0.3 {
|
||||
return HealthUnhealthy
|
||||
} else if healthyRatio < 0.7 {
|
||||
return HealthDegraded
|
||||
} else {
|
||||
return HealthHealthy
|
||||
}
|
||||
}
|
||||
|
||||
func (dha *DefaultHealthAggregator) GetHealthScore(health ModuleHealth) float64 {
|
||||
switch health.Status {
|
||||
case HealthHealthy:
|
||||
return 1.0
|
||||
case HealthDegraded:
|
||||
return 0.5
|
||||
case HealthUnhealthy:
|
||||
return 0.0
|
||||
default:
|
||||
return 0.0
|
||||
}
|
||||
}
|
||||
|
||||
func getHealthValues(modules map[string]ModuleHealth) []ModuleHealth {
|
||||
values := make([]ModuleHealth, 0, len(modules))
|
||||
for _, health := range modules {
|
||||
values = append(values, health)
|
||||
}
|
||||
return values
|
||||
}
|
||||
|
||||
// DefaultHealthNotifier implements basic health notifications
|
||||
type DefaultHealthNotifier struct{}
|
||||
|
||||
func NewDefaultHealthNotifier() *DefaultHealthNotifier {
|
||||
return &DefaultHealthNotifier{}
|
||||
}
|
||||
|
||||
func (dhn *DefaultHealthNotifier) NotifyHealthChange(moduleID string, oldHealth, newHealth ModuleHealth) error {
|
||||
// Basic notification implementation - could be extended to send emails, webhooks, etc.
|
||||
return nil
|
||||
}
|
||||
|
||||
func (dhn *DefaultHealthNotifier) NotifySystemHealth(health OverallHealth) error {
|
||||
// Basic notification implementation
|
||||
return nil
|
||||
}
|
||||
|
||||
func (dhn *DefaultHealthNotifier) NotifyAlert(alert HealthAlert) error {
|
||||
// Basic notification implementation
|
||||
return nil
|
||||
}
|
||||
Reference in New Issue
Block a user