Files
mev-beta/internal/monitoring/health_checker.go
Krypto Kajun 850223a953 fix(multicall): resolve critical multicall parsing corruption issues
- Added comprehensive bounds checking to prevent buffer overruns in multicall parsing
- Implemented graduated validation system (Strict/Moderate/Permissive) to reduce false positives
- Added LRU caching system for address validation with 10-minute TTL
- Enhanced ABI decoder with missing Universal Router and Arbitrum-specific DEX signatures
- Fixed duplicate function declarations and import conflicts across multiple files
- Added error recovery mechanisms with multiple fallback strategies
- Updated tests to handle new validation behavior for suspicious addresses
- Fixed parser test expectations for improved validation system
- Applied gofmt formatting fixes to ensure code style compliance
- Fixed mutex copying issues in monitoring package by introducing MetricsSnapshot
- Resolved critical security vulnerabilities in heuristic address extraction
- Progress: Updated TODO audit from 10% to 35% complete

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
2025-10-17 00:12:55 -05:00

448 lines
12 KiB
Go

package monitoring
import (
"context"
"fmt"
"sync"
"time"
"github.com/fraktal/mev-beta/internal/logger"
)
// HealthCheckRunner performs periodic health checks and monitoring.
//
// It polls an IntegrityMonitor on a ticker, records each result as a
// HealthSnapshot in a bounded history, and sends alerts through the monitor
// when a snapshot crosses one of the thresholds in checkThresholds.
type HealthCheckRunner struct {
// mu is the lock the methods of this type take around running,
// healthHistory and checkInterval.
mu sync.RWMutex
// logger receives the Info/Debug output of the runner.
logger *logger.Logger
// integrityMonitor supplies the sampled metrics and is the sink for the
// alerts generated by checkThresholds.
integrityMonitor *IntegrityMonitor
// checkInterval is the ticker period used by healthCheckLoop.
checkInterval time.Duration
// running is set by Start and cleared by Stop.
running bool
// stopChan is closed by Stop to end healthCheckLoop.
stopChan chan struct{}
// lastHealthCheck is the start time of the most recent performHealthCheck.
lastHealthCheck time.Time
// healthHistory holds past snapshots in append order (oldest first).
healthHistory []HealthSnapshot
// maxHistorySize caps len(healthHistory); the oldest entries are dropped.
maxHistorySize int
// warmupSamples is the history length required before alerts are sent.
warmupSamples int
// minAddressesForAlerts is the minimum "total_addresses_processed" value
// required before alerts are sent.
minAddressesForAlerts int64
}
// HealthSnapshot represents a point-in-time health snapshot.
// performHealthCheck fills one in per check from the integrity monitor's
// metrics.
type HealthSnapshot struct {
// Timestamp is when the health check that produced this snapshot started.
Timestamp time.Time
// HealthScore is the monitor's HealthScore metric, copied as-is.
HealthScore float64
// CorruptionRate is CorruptAddressesDetected / TotalAddressesProcessed,
// or 0 when nothing has been processed.
CorruptionRate float64
// ValidationSuccess is passed / (passed + failed) address validations,
// or 0 when there were none.
ValidationSuccess float64
// ContractCallSuccess is succeeded / (succeeded + failed) contract calls,
// or 0 when there were none.
ContractCallSuccess float64
// ActiveAlerts is currently always written as 0 by performHealthCheck.
ActiveAlerts int
// Trend is the result of calculateHealthTrend for this check's score.
Trend HealthTrend
}
// HealthTrend indicates the direction of health metrics.
// Its String method renders the value as an upper-case label.
type HealthTrend int
// Trend values, as assigned by calculateHealthTrend.
const (
// HealthTrendUnknown is the zero value; it is returned while the history
// holds fewer than three snapshots.
HealthTrendUnknown HealthTrend = iota
// HealthTrendImproving means the score rose by more than 0.05 across the
// comparison window.
HealthTrendImproving
// HealthTrendStable means the score moved by at most 0.05 either way.
HealthTrendStable
// HealthTrendDeclining means the score fell by more than 0.05 across the
// comparison window.
HealthTrendDeclining
// HealthTrendCritical means the current score is below 0.5.
HealthTrendCritical
)
// String returns the upper-case label for the trend. Any value that is not
// one of the named non-zero trends is rendered as "UNKNOWN".
func (t HealthTrend) String() string {
	// Index 0 (HealthTrendUnknown) is deliberately left empty; it is handled
	// by the fallback below together with out-of-range values.
	labels := [...]string{
		HealthTrendImproving: "IMPROVING",
		HealthTrendStable:    "STABLE",
		HealthTrendDeclining: "DECLINING",
		HealthTrendCritical:  "CRITICAL",
	}
	if t <= HealthTrendUnknown || int(t) >= len(labels) {
		return "UNKNOWN"
	}
	return labels[t]
}
// NewHealthCheckRunner creates a new health check runner bound to the given
// logger and integrity monitor. The runner is returned stopped; call Start to
// begin checking.
//
// Defaults: 30 second check interval, a history of 100 snapshots (50 minutes
// at that interval), 3 warm-up samples and at least 25 processed addresses
// before alerts are raised.
func NewHealthCheckRunner(logger *logger.Logger, integrityMonitor *IntegrityMonitor) *HealthCheckRunner {
	runner := new(HealthCheckRunner)

	runner.logger = logger
	runner.integrityMonitor = integrityMonitor

	runner.checkInterval = 30 * time.Second
	runner.stopChan = make(chan struct{})

	runner.healthHistory = make([]HealthSnapshot, 0)
	runner.maxHistorySize = 100

	runner.warmupSamples = 3
	runner.minAddressesForAlerts = 25

	return runner
}
// Start begins the periodic health checking routine in a new goroutine.
//
// Calling Start while the runner is already running is a no-op. Every run
// gets its own stop channel, so the runner can be started again after Stop or
// after the context of a previous run was cancelled. The loop ends when ctx
// is done or Stop is called.
func (hcr *HealthCheckRunner) Start(ctx context.Context) {
	hcr.mu.Lock()
	if hcr.running {
		hcr.mu.Unlock()
		return
	}
	hcr.running = true
	// Stop closes the channel of the run it ends. Reusing that closed channel
	// would make a restarted loop exit right after its first check.
	stop := make(chan struct{})
	hcr.stopChan = stop
	// Read the interval while the lock is held; SetCheckInterval writes it
	// under the same lock.
	interval := hcr.checkInterval
	hcr.mu.Unlock()

	hcr.logger.Info("Starting health check runner",
		"interval", interval)

	go hcr.runHealthCheckLoop(ctx, stop, interval)
}

// Stop stops the health checking routine by closing the current run's stop
// channel. It does not wait for the loop goroutine to exit. Calling Stop on a
// runner that is not running is a no-op.
func (hcr *HealthCheckRunner) Stop() {
	hcr.mu.Lock()
	defer hcr.mu.Unlock()

	if !hcr.running {
		return
	}

	hcr.running = false
	close(hcr.stopChan)
	hcr.logger.Info("Health check runner stopped")
}

// healthCheckLoop runs the periodic health checking against the runner's
// current stop channel and check interval until ctx is done or the channel is
// closed.
func (hcr *HealthCheckRunner) healthCheckLoop(ctx context.Context) {
	hcr.mu.RLock()
	stop := hcr.stopChan
	interval := hcr.checkInterval
	hcr.mu.RUnlock()

	hcr.runHealthCheckLoop(ctx, stop, interval)
}

// runHealthCheckLoop performs one health check immediately and then one per
// tick of interval, until ctx is done or stop is closed. stop and interval are
// passed in so the loop never re-reads fields that a later Start may replace.
func (hcr *HealthCheckRunner) runHealthCheckLoop(ctx context.Context, stop chan struct{}, interval time.Duration) {
	ticker := time.NewTicker(interval)
	defer ticker.Stop()

	// Perform initial health check
	hcr.performHealthCheck()

	for {
		select {
		case <-ctx.Done():
			// Cancellation ends this run without Stop being called, so the
			// running flag has to be cleared here. Skip that if a newer run
			// has already installed its own stop channel.
			hcr.mu.Lock()
			if hcr.stopChan == stop {
				hcr.running = false
			}
			hcr.mu.Unlock()
			hcr.logger.Info("Health check runner stopped due to context cancellation")
			return
		case <-stop:
			hcr.logger.Info("Health check runner stopped")
			return
		case <-ticker.C:
			hcr.performHealthCheck()
		}
	}
}
// performHealthCheck executes a comprehensive health check.
//
// It records the check time, then, unless the integrity monitor is disabled,
// reads the monitor's metrics, derives the corruption / validation /
// contract-call ratios, appends a HealthSnapshot to the history, evaluates the
// alert thresholds and logs the outcome.
func (hcr *HealthCheckRunner) performHealthCheck() {
	start := time.Now()

	// GetHealthSummary reads lastHealthCheck under mu from other goroutines,
	// so the write has to hold the lock as well.
	hcr.mu.Lock()
	hcr.lastHealthCheck = start
	hcr.mu.Unlock()

	if !hcr.integrityMonitor.IsEnabled() {
		hcr.logger.Debug("Skipping health check - integrity monitor disabled")
		return
	}

	// Get current metrics
	metrics := hcr.integrityMonitor.GetMetrics()
	healthSummary := hcr.integrityMonitor.GetHealthSummary()

	// Calculate rates; each one stays 0 when its denominator is 0.
	corruptionRate := 0.0
	if metrics.TotalAddressesProcessed > 0 {
		corruptionRate = float64(metrics.CorruptAddressesDetected) / float64(metrics.TotalAddressesProcessed)
	}

	validationSuccessRate := 0.0
	totalValidations := metrics.AddressValidationPassed + metrics.AddressValidationFailed
	if totalValidations > 0 {
		validationSuccessRate = float64(metrics.AddressValidationPassed) / float64(totalValidations)
	}

	contractCallSuccessRate := 0.0
	totalCalls := metrics.ContractCallsSucceeded + metrics.ContractCallsFailed
	if totalCalls > 0 {
		contractCallSuccessRate = float64(metrics.ContractCallsSucceeded) / float64(totalCalls)
	}

	// Create health snapshot. The trend is computed against the history as it
	// stands before this snapshot is appended.
	snapshot := HealthSnapshot{
		Timestamp:           start,
		HealthScore:         metrics.HealthScore,
		CorruptionRate:      corruptionRate,
		ValidationSuccess:   validationSuccessRate,
		ContractCallSuccess: contractCallSuccessRate,
		ActiveAlerts:        0, // Will be calculated based on current conditions
		Trend:               hcr.calculateHealthTrend(metrics.HealthScore),
	}

	// Add to history
	hcr.addHealthSnapshot(snapshot)

	// Check for threshold violations and generate alerts
	hcr.checkThresholds(healthSummary, snapshot)

	// Log health status periodically
	hcr.logHealthStatus(snapshot, time.Since(start))
}
// addHealthSnapshot appends snapshot to the health history and, when the
// history has grown past maxHistorySize, drops the oldest entries so that only
// the newest maxHistorySize remain.
func (hcr *HealthCheckRunner) addHealthSnapshot(snapshot HealthSnapshot) {
	hcr.mu.Lock()
	defer hcr.mu.Unlock()

	history := append(hcr.healthHistory, snapshot)
	if excess := len(history) - hcr.maxHistorySize; excess > 0 {
		history = history[excess:]
	}
	hcr.healthHistory = history
}
// calculateHealthTrend analyzes recent health scores to determine trend.
//
// The result is:
//   - HealthTrendUnknown while the history holds fewer than three snapshots;
//   - HealthTrendCritical when currentScore is below 0.5;
//   - otherwise Improving / Declining / Stable depending on whether
//     currentScore differs by more than 0.05 from the score recorded five
//     snapshots ago (or from the oldest snapshot when fewer than five exist).
//
// currentScore is the score of the check in progress; it is not expected to be
// in the history yet.
func (hcr *HealthCheckRunner) calculateHealthTrend(currentScore float64) HealthTrend {
	hcr.mu.RLock()
	defer hcr.mu.RUnlock()

	samples := len(hcr.healthHistory)
	if samples < 3 {
		return HealthTrendUnknown
	}

	if currentScore < 0.5 {
		return HealthTrendCritical
	}

	// Only the two ends of the window matter for this simple linear trend,
	// so there is no need to collect the scores in between.
	windowStart := samples - 5
	if windowStart < 0 {
		windowStart = 0
	}
	diff := currentScore - hcr.healthHistory[windowStart].HealthScore

	switch {
	case diff > 0.05:
		return HealthTrendImproving
	case diff < -0.05:
		return HealthTrendDeclining
	default:
		return HealthTrendStable
	}
}
// checkThresholds inspects snapshot and sends up to three alerts through the
// integrity monitor, in this order:
//
//   - emergency when the health score is below 0.5;
//   - critical when the corruption rate is above 10%;
//   - warning when the trend is declining or critical.
//
// Nothing is sent while readyForAlerts reports that the runner is still
// warming up; that case is logged at debug level instead.
func (hcr *HealthCheckRunner) checkThresholds(healthSummary map[string]interface{}, snapshot HealthSnapshot) {
	if ready := hcr.readyForAlerts(healthSummary, snapshot); !ready {
		hcr.logger.Debug("Health alerts suppressed during warm-up",
			"health_score", snapshot.HealthScore,
			"total_addresses_processed", safeNumericLookup(healthSummary, "total_addresses_processed"),
			"history_size", hcr.historySize())
		return
	}

	score := snapshot.HealthScore
	rate := snapshot.CorruptionRate
	trend := snapshot.Trend

	// Critical health score alert
	if score < 0.5 {
		var emergency CorruptionAlert
		emergency.Timestamp = time.Now()
		emergency.Severity = AlertSeverityEmergency
		emergency.Message = fmt.Sprintf("CRITICAL: System health score is %.2f (below 0.5)", score)
		emergency.Context = map[string]interface{}{
			"health_score":          score,
			"corruption_rate":       rate,
			"validation_success":    snapshot.ValidationSuccess,
			"contract_call_success": snapshot.ContractCallSuccess,
			"trend":                 trend.String(),
		}
		hcr.integrityMonitor.sendAlert(emergency)
	}

	// High corruption rate alert (threshold: 10%)
	if rate > 0.10 {
		var critical CorruptionAlert
		critical.Timestamp = time.Now()
		critical.Severity = AlertSeverityCritical
		critical.Message = fmt.Sprintf("High corruption rate detected: %.2f%%", rate*100)
		critical.Context = map[string]interface{}{
			"corruption_rate": rate,
			"threshold":       0.10,
			// NOTE(review): this carries the rate, not a count of addresses;
			// confirm what consumers of "addresses_affected" expect.
			"addresses_affected": rate,
		}
		hcr.integrityMonitor.sendAlert(critical)
	}

	// Declining trend alert
	switch trend {
	case HealthTrendDeclining, HealthTrendCritical:
		var warning CorruptionAlert
		warning.Timestamp = time.Now()
		warning.Severity = AlertSeverityWarning
		warning.Message = fmt.Sprintf("System health trend is %s (current score: %.2f)", trend.String(), score)
		warning.Context = map[string]interface{}{
			"trend":            trend.String(),
			"health_score":     score,
			"recent_snapshots": hcr.getRecentSnapshots(5),
		}
		hcr.integrityMonitor.sendAlert(warning)
	}
}
// readyForAlerts reports whether enough data has been gathered for alerts to
// be meaningful. It returns false when any of these holds:
//
//   - the history holds fewer than warmupSamples snapshots;
//   - "total_addresses_processed" is present in healthSummary and lower than
//     minAddressesForAlerts;
//   - nothing was processed and both success ratios in snapshot are zero.
func (hcr *HealthCheckRunner) readyForAlerts(healthSummary map[string]interface{}, snapshot HealthSnapshot) bool {
	hcr.mu.RLock()
	samples := len(hcr.healthHistory)
	hcr.mu.RUnlock()

	if samples < hcr.warmupSamples {
		return false
	}

	// A negative value means the key was missing or not numeric; in that
	// case the volume check below is skipped.
	processed := safeNumericLookup(healthSummary, "total_addresses_processed")
	noActivity := snapshot.ValidationSuccess == 0 && snapshot.ContractCallSuccess == 0

	switch {
	case processed >= 0 && processed < float64(hcr.minAddressesForAlerts):
		return false
	case noActivity && processed == 0:
		// Require at least one validation or contract call attempt before alarming.
		return false
	default:
		return true
	}
}
// safeNumericLookup returns summary[key] converted to float64.
//
// It returns -1 when summary is nil, the key is absent, or the stored value is
// not one of Go's fixed-width or platform-sized integer or float types, so
// callers treat a negative result as "unknown". Integers with a magnitude
// above 2^53 lose precision in the conversion.
func safeNumericLookup(summary map[string]interface{}, key string) float64 {
	// Indexing a nil map is safe and simply reports the key as missing.
	value, ok := summary[key]
	if !ok {
		return -1
	}

	switch v := value.(type) {
	case int:
		return float64(v)
	case int8:
		return float64(v)
	case int16:
		return float64(v)
	case int32:
		return float64(v)
	case int64:
		return float64(v)
	case uint:
		return float64(v)
	case uint8:
		return float64(v)
	case uint16:
		return float64(v)
	case uint32:
		return float64(v)
	case uint64:
		return float64(v)
	case float32:
		return float64(v)
	case float64:
		return v
	default:
		return -1
	}
}
// historySize returns the number of snapshots currently held in the history,
// read under the runner's read lock.
func (hcr *HealthCheckRunner) historySize() int {
	hcr.mu.RLock()
	size := len(hcr.healthHistory)
	hcr.mu.RUnlock()
	return size
}
// logHealthStatus logs periodic health status information.
//
// A detailed Info line is written whenever the history length is a multiple
// of 10 (every 5 minutes at the default 30s interval); every other check gets
// a brief Debug line. duration is the time the health check took.
//
// NOTE(review): once the history is capped at maxHistorySize its length stops
// changing, so if that cap is a multiple of 10 every check logs at Info level.
// A dedicated check counter would be needed to keep the cadence.
func (hcr *HealthCheckRunner) logHealthStatus(snapshot HealthSnapshot, duration time.Duration) {
	// healthHistory is guarded by mu; read its length through the locking
	// helper like the rest of the type does.
	if hcr.historySize()%10 != 0 {
		// Brief status for regular checks
		hcr.logger.Debug("Health check completed",
			"health_score", snapshot.HealthScore,
			"trend", snapshot.Trend.String(),
			"duration", duration)
		return
	}

	hcr.logger.Info("System health status",
		"health_score", snapshot.HealthScore,
		"corruption_rate", fmt.Sprintf("%.4f", snapshot.CorruptionRate),
		"validation_success", fmt.Sprintf("%.4f", snapshot.ValidationSuccess),
		"contract_call_success", fmt.Sprintf("%.4f", snapshot.ContractCallSuccess),
		"trend", snapshot.Trend.String(),
		"check_duration", duration)
}
// GetRecentSnapshots returns a copy of up to count of the most recent health
// snapshots, oldest first. It is the exported entry point for
// getRecentSnapshots.
func (hcr *HealthCheckRunner) GetRecentSnapshots(count int) []HealthSnapshot {
	recent := hcr.getRecentSnapshots(count)
	return recent
}
// getRecentSnapshots returns a copy of the last count snapshots in the
// history, oldest first.
//
// When count exceeds the history length the whole history is returned. A
// zero or negative count, or an empty history, yields an empty non-nil slice.
// The result never aliases the internal history, so callers may modify it.
func (hcr *HealthCheckRunner) getRecentSnapshots(count int) []HealthSnapshot {
	hcr.mu.RLock()
	defer hcr.mu.RUnlock()

	total := len(hcr.healthHistory)
	if count > total {
		count = total
	}
	// A negative count would otherwise place the start index past the end
	// of the history and make the slice expression panic.
	if count <= 0 {
		return []HealthSnapshot{}
	}

	// Create a copy to avoid external modification
	snapshots := make([]HealthSnapshot, count)
	copy(snapshots, hcr.healthHistory[total-count:])
	return snapshots
}
// GetHealthSummary returns a comprehensive health summary as a map.
//
// The keys "running", "check_interval", "history_size" and "last_check" are
// always present. While no snapshot has been recorded "last_check" is nil and
// no other keys are set. Afterwards the map also carries the fields of the
// latest snapshot and, under "recent_snapshots", a copy of up to the last ten
// snapshots.
func (hcr *HealthCheckRunner) GetHealthSummary() map[string]interface{} {
	hcr.mu.RLock()
	defer hcr.mu.RUnlock()

	total := len(hcr.healthHistory)
	if total == 0 {
		return map[string]interface{}{
			"running":        hcr.running,
			"check_interval": hcr.checkInterval.String(),
			"history_size":   0,
			"last_check":     nil,
		}
	}

	// Copy the recent snapshots here rather than through getRecentSnapshots:
	// that helper takes the read lock itself, and acquiring an RWMutex read
	// lock recursively can deadlock once a writer is waiting in between.
	const recentCount = 10
	first := total - recentCount
	if first < 0 {
		first = 0
	}
	recent := make([]HealthSnapshot, total-first)
	copy(recent, hcr.healthHistory[first:])

	lastSnapshot := hcr.healthHistory[total-1]

	return map[string]interface{}{
		"running":               hcr.running,
		"check_interval":        hcr.checkInterval.String(),
		"history_size":          total,
		"last_check":            hcr.lastHealthCheck,
		"current_health_score":  lastSnapshot.HealthScore,
		"current_trend":         lastSnapshot.Trend.String(),
		"corruption_rate":       lastSnapshot.CorruptionRate,
		"validation_success":    lastSnapshot.ValidationSuccess,
		"contract_call_success": lastSnapshot.ContractCallSuccess,
		"recent_snapshots":      recent,
	}
}
// SetCheckInterval sets the health check interval.
//
// The ticker of a loop that is already running is not touched; the new value
// is picked up the next time a loop is started. Non-positive intervals are
// ignored, because time.NewTicker panics when given one.
func (hcr *HealthCheckRunner) SetCheckInterval(interval time.Duration) {
	hcr.mu.Lock()
	defer hcr.mu.Unlock()

	if interval <= 0 {
		hcr.logger.Info("Ignoring non-positive health check interval", "interval", interval)
		return
	}

	hcr.checkInterval = interval
	hcr.logger.Info("Health check interval updated", "interval", interval)
}
// IsRunning returns whether the health checker is running, i.e. the value of
// the running flag read under the runner's read lock.
func (hcr *HealthCheckRunner) IsRunning() bool {
	hcr.mu.RLock()
	active := hcr.running
	hcr.mu.RUnlock()
	return active
}