- Added comprehensive bounds checking to prevent buffer overruns in multicall parsing
- Implemented graduated validation system (Strict/Moderate/Permissive) to reduce false positives
- Added LRU caching system for address validation with 10-minute TTL
- Enhanced ABI decoder with missing Universal Router and Arbitrum-specific DEX signatures
- Fixed duplicate function declarations and import conflicts across multiple files
- Added error recovery mechanisms with multiple fallback strategies
- Updated tests to handle new validation behavior for suspicious addresses
- Fixed parser test expectations for improved validation system
- Applied gofmt formatting fixes to ensure code style compliance
- Fixed mutex copying issues in monitoring package by introducing MetricsSnapshot
- Resolved critical security vulnerabilities in heuristic address extraction
- Progress: Updated TODO audit from 10% to 35% complete

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
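The MetricsSnapshot item above refers to a standard Go pattern: instead of returning (and thereby copying) a struct that embeds a mutex, copy its plain data fields into a value type. A minimal, standalone sketch of that pattern follows; the field names are illustrative and do not match the package's actual Metrics struct.

// Standalone sketch, not part of the file below; field names are illustrative.
package sketch

import "sync"

// Metrics holds live counters guarded by a lock.
type Metrics struct {
	mu                      sync.RWMutex
	TotalAddressesProcessed int64
	HealthScore             float64
}

// MetricsSnapshot is a plain value with no mutex, so it can be copied and
// returned from getters without tripping go vet's copylocks check.
type MetricsSnapshot struct {
	TotalAddressesProcessed int64
	HealthScore             float64
}

// Snapshot copies the current values while holding the read lock.
func (m *Metrics) Snapshot() MetricsSnapshot {
	m.mu.RLock()
	defer m.mu.RUnlock()
	return MetricsSnapshot{
		TotalAddressesProcessed: m.TotalAddressesProcessed,
		HealthScore:             m.HealthScore,
	}
}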
448 lines
12 KiB
Go
package monitoring

import (
	"context"
	"fmt"
	"sync"
	"time"

	"github.com/fraktal/mev-beta/internal/logger"
)

// HealthCheckRunner performs periodic health checks and monitoring
type HealthCheckRunner struct {
	mu                    sync.RWMutex
	logger                *logger.Logger
	integrityMonitor      *IntegrityMonitor
	checkInterval         time.Duration
	running               bool
	stopChan              chan struct{}
	lastHealthCheck       time.Time
	healthHistory         []HealthSnapshot
	maxHistorySize        int
	warmupSamples         int
	minAddressesForAlerts int64
}

// HealthSnapshot represents a point-in-time health snapshot
type HealthSnapshot struct {
	Timestamp           time.Time
	HealthScore         float64
	CorruptionRate      float64
	ValidationSuccess   float64
	ContractCallSuccess float64
	ActiveAlerts        int
	Trend               HealthTrend
}

// HealthTrend indicates the direction of health metrics
type HealthTrend int

const (
	HealthTrendUnknown HealthTrend = iota
	HealthTrendImproving
	HealthTrendStable
	HealthTrendDeclining
	HealthTrendCritical
)

func (t HealthTrend) String() string {
	switch t {
	case HealthTrendImproving:
		return "IMPROVING"
	case HealthTrendStable:
		return "STABLE"
	case HealthTrendDeclining:
		return "DECLINING"
	case HealthTrendCritical:
		return "CRITICAL"
	default:
		return "UNKNOWN"
	}
}

// NewHealthCheckRunner creates a new health check runner
func NewHealthCheckRunner(logger *logger.Logger, integrityMonitor *IntegrityMonitor) *HealthCheckRunner {
	return &HealthCheckRunner{
		logger:                logger,
		integrityMonitor:      integrityMonitor,
		checkInterval:         30 * time.Second, // Default 30 second intervals
		stopChan:              make(chan struct{}),
		healthHistory:         make([]HealthSnapshot, 0),
		maxHistorySize:        100, // Keep last 100 snapshots (50 minutes at 30s intervals)
		warmupSamples:         3,
		minAddressesForAlerts: 25,
	}
}

// Start begins the periodic health checking routine
func (hcr *HealthCheckRunner) Start(ctx context.Context) {
	hcr.mu.Lock()
	if hcr.running {
		hcr.mu.Unlock()
		return
	}
	hcr.running = true
	hcr.mu.Unlock()

	hcr.logger.Info("Starting health check runner",
		"interval", hcr.checkInterval)

	go hcr.healthCheckLoop(ctx)
}

// Stop stops the health checking routine
func (hcr *HealthCheckRunner) Stop() {
	hcr.mu.Lock()
	defer hcr.mu.Unlock()

	if !hcr.running {
		return
	}

	hcr.running = false
	close(hcr.stopChan)
	hcr.logger.Info("Health check runner stopped")
}
// healthCheckLoop runs the periodic health checking
|
|
func (hcr *HealthCheckRunner) healthCheckLoop(ctx context.Context) {
|
|
ticker := time.NewTicker(hcr.checkInterval)
|
|
defer ticker.Stop()
|
|
|
|
// Perform initial health check
|
|
hcr.performHealthCheck()
|
|
|
|
for {
|
|
select {
|
|
case <-ctx.Done():
|
|
hcr.logger.Info("Health check runner stopped due to context cancellation")
|
|
return
|
|
case <-hcr.stopChan:
|
|
hcr.logger.Info("Health check runner stopped")
|
|
return
|
|
case <-ticker.C:
|
|
hcr.performHealthCheck()
|
|
}
|
|
}
|
|
}
|
|
|
|
// performHealthCheck executes a comprehensive health check
func (hcr *HealthCheckRunner) performHealthCheck() {
	start := time.Now()

	// Record the check time under the lock; GetHealthSummary reads
	// lastHealthCheck from other goroutines.
	hcr.mu.Lock()
	hcr.lastHealthCheck = start
	hcr.mu.Unlock()

	if !hcr.integrityMonitor.IsEnabled() {
		hcr.logger.Debug("Skipping health check - integrity monitor disabled")
		return
	}

	// Get current metrics
	metrics := hcr.integrityMonitor.GetMetrics()
	healthSummary := hcr.integrityMonitor.GetHealthSummary()

	// Calculate rates
	corruptionRate := 0.0
	if metrics.TotalAddressesProcessed > 0 {
		corruptionRate = float64(metrics.CorruptAddressesDetected) / float64(metrics.TotalAddressesProcessed)
	}

	validationSuccessRate := 0.0
	totalValidations := metrics.AddressValidationPassed + metrics.AddressValidationFailed
	if totalValidations > 0 {
		validationSuccessRate = float64(metrics.AddressValidationPassed) / float64(totalValidations)
	}

	contractCallSuccessRate := 0.0
	totalCalls := metrics.ContractCallsSucceeded + metrics.ContractCallsFailed
	if totalCalls > 0 {
		contractCallSuccessRate = float64(metrics.ContractCallsSucceeded) / float64(totalCalls)
	}

	// Create health snapshot
	snapshot := HealthSnapshot{
		Timestamp:           start,
		HealthScore:         metrics.HealthScore,
		CorruptionRate:      corruptionRate,
		ValidationSuccess:   validationSuccessRate,
		ContractCallSuccess: contractCallSuccessRate,
		ActiveAlerts:        0, // Will be calculated based on current conditions
		Trend:               hcr.calculateHealthTrend(metrics.HealthScore),
	}

	// Add to history
	hcr.addHealthSnapshot(snapshot)

	// Check for threshold violations and generate alerts
	hcr.checkThresholds(healthSummary, snapshot)

	// Log health status periodically
	hcr.logHealthStatus(snapshot, time.Since(start))
}
// addHealthSnapshot adds a snapshot to the health history
func (hcr *HealthCheckRunner) addHealthSnapshot(snapshot HealthSnapshot) {
	hcr.mu.Lock()
	defer hcr.mu.Unlock()

	hcr.healthHistory = append(hcr.healthHistory, snapshot)

	// Trim history if it exceeds max size
	if len(hcr.healthHistory) > hcr.maxHistorySize {
		hcr.healthHistory = hcr.healthHistory[len(hcr.healthHistory)-hcr.maxHistorySize:]
	}
}

// calculateHealthTrend analyzes recent health scores to determine trend
func (hcr *HealthCheckRunner) calculateHealthTrend(currentScore float64) HealthTrend {
	hcr.mu.RLock()
	defer hcr.mu.RUnlock()

	if len(hcr.healthHistory) < 3 {
		return HealthTrendUnknown
	}

	// Get last few scores for trend analysis
	recentScores := make([]float64, 0, 5)
	start := len(hcr.healthHistory) - 5
	if start < 0 {
		start = 0
	}

	for i := start; i < len(hcr.healthHistory); i++ {
		recentScores = append(recentScores, hcr.healthHistory[i].HealthScore)
	}
	recentScores = append(recentScores, currentScore)

	// Calculate trend
	if currentScore < 0.5 {
		return HealthTrendCritical
	}

	// Simple linear trend calculation
	if len(recentScores) >= 3 {
		first := recentScores[0]
		last := recentScores[len(recentScores)-1]
		diff := last - first

		if diff > 0.05 {
			return HealthTrendImproving
		} else if diff < -0.05 {
			return HealthTrendDeclining
		} else {
			return HealthTrendStable
		}
	}

	return HealthTrendUnknown
}

// checkThresholds checks for threshold violations and generates alerts
func (hcr *HealthCheckRunner) checkThresholds(healthSummary map[string]interface{}, snapshot HealthSnapshot) {
	if !hcr.readyForAlerts(healthSummary, snapshot) {
		hcr.logger.Debug("Health alerts suppressed during warm-up",
			"health_score", snapshot.HealthScore,
			"total_addresses_processed", safeNumericLookup(healthSummary, "total_addresses_processed"),
			"history_size", hcr.historySize())
		return
	}

	// Critical health score alert
	if snapshot.HealthScore < 0.5 {
		alert := CorruptionAlert{
			Timestamp: time.Now(),
			Severity:  AlertSeverityEmergency,
			Message:   fmt.Sprintf("CRITICAL: System health score is %.2f (below 0.5)", snapshot.HealthScore),
			Context: map[string]interface{}{
				"health_score":          snapshot.HealthScore,
				"corruption_rate":       snapshot.CorruptionRate,
				"validation_success":    snapshot.ValidationSuccess,
				"contract_call_success": snapshot.ContractCallSuccess,
				"trend":                 snapshot.Trend.String(),
			},
		}
		hcr.integrityMonitor.sendAlert(alert)
	}

	// High corruption rate alert
	if snapshot.CorruptionRate > 0.10 { // 10% corruption rate
		alert := CorruptionAlert{
			Timestamp: time.Now(),
			Severity:  AlertSeverityCritical,
			Message:   fmt.Sprintf("High corruption rate detected: %.2f%%", snapshot.CorruptionRate*100),
			Context: map[string]interface{}{
				"corruption_rate":    snapshot.CorruptionRate,
				"threshold":          0.10,
				"addresses_affected": snapshot.CorruptionRate,
			},
		}
		hcr.integrityMonitor.sendAlert(alert)
	}

	// Declining trend alert
	if snapshot.Trend == HealthTrendDeclining || snapshot.Trend == HealthTrendCritical {
		alert := CorruptionAlert{
			Timestamp: time.Now(),
			Severity:  AlertSeverityWarning,
			Message:   fmt.Sprintf("System health trend is %s (current score: %.2f)", snapshot.Trend.String(), snapshot.HealthScore),
			Context: map[string]interface{}{
				"trend":            snapshot.Trend.String(),
				"health_score":     snapshot.HealthScore,
				"recent_snapshots": hcr.getRecentSnapshots(5),
			},
		}
		hcr.integrityMonitor.sendAlert(alert)
	}
}

func (hcr *HealthCheckRunner) readyForAlerts(healthSummary map[string]interface{}, snapshot HealthSnapshot) bool {
	hcr.mu.RLock()
	historyLen := len(hcr.healthHistory)
	hcr.mu.RUnlock()

	if historyLen < hcr.warmupSamples {
		return false
	}

	totalProcessed := safeNumericLookup(healthSummary, "total_addresses_processed")
	if totalProcessed >= 0 && totalProcessed < float64(hcr.minAddressesForAlerts) {
		return false
	}

	// Require at least one validation or contract call attempt before alarming.
	if snapshot.ValidationSuccess == 0 && snapshot.ContractCallSuccess == 0 && totalProcessed == 0 {
		return false
	}

	return true
}

func safeNumericLookup(summary map[string]interface{}, key string) float64 {
	if summary == nil {
		return -1
	}

	value, ok := summary[key]
	if !ok {
		return -1
	}

	switch v := value.(type) {
	case int:
		return float64(v)
	case int32:
		return float64(v)
	case int64:
		return float64(v)
	case uint:
		return float64(v)
	case uint32:
		return float64(v)
	case uint64:
		return float64(v)
	case float32:
		return float64(v)
	case float64:
		return v
	default:
		return -1
	}
}

func (hcr *HealthCheckRunner) historySize() int {
	hcr.mu.RLock()
	defer hcr.mu.RUnlock()
	return len(hcr.healthHistory)
}

// logHealthStatus logs periodic health status information
func (hcr *HealthCheckRunner) logHealthStatus(snapshot HealthSnapshot, duration time.Duration) {
	// Log detailed status every 5 minutes (10 checks at 30s intervals)
	if len(hcr.healthHistory)%10 == 0 {
		hcr.logger.Info("System health status",
			"health_score", snapshot.HealthScore,
			"corruption_rate", fmt.Sprintf("%.4f", snapshot.CorruptionRate),
			"validation_success", fmt.Sprintf("%.4f", snapshot.ValidationSuccess),
			"contract_call_success", fmt.Sprintf("%.4f", snapshot.ContractCallSuccess),
			"trend", snapshot.Trend.String(),
			"check_duration", duration)
	} else {
		// Brief status for regular checks
		hcr.logger.Debug("Health check completed",
			"health_score", snapshot.HealthScore,
			"trend", snapshot.Trend.String(),
			"duration", duration)
	}
}

// GetRecentSnapshots returns the most recent health snapshots
func (hcr *HealthCheckRunner) GetRecentSnapshots(count int) []HealthSnapshot {
	return hcr.getRecentSnapshots(count)
}

// getRecentSnapshots internal implementation
func (hcr *HealthCheckRunner) getRecentSnapshots(count int) []HealthSnapshot {
	hcr.mu.RLock()
	defer hcr.mu.RUnlock()

	if len(hcr.healthHistory) == 0 {
		return []HealthSnapshot{}
	}

	start := len(hcr.healthHistory) - count
	if start < 0 {
		start = 0
	}

	// Create a copy to avoid external modification
	snapshots := make([]HealthSnapshot, len(hcr.healthHistory[start:]))
	copy(snapshots, hcr.healthHistory[start:])

	return snapshots
}
// GetHealthSummary returns a comprehensive health summary
func (hcr *HealthCheckRunner) GetHealthSummary() map[string]interface{} {
	hcr.mu.RLock()
	defer hcr.mu.RUnlock()

	if len(hcr.healthHistory) == 0 {
		return map[string]interface{}{
			"running":        hcr.running,
			"check_interval": hcr.checkInterval.String(),
			"history_size":   0,
			"last_check":     nil,
		}
	}

	lastSnapshot := hcr.healthHistory[len(hcr.healthHistory)-1]

	// Copy the most recent snapshots inline rather than calling
	// getRecentSnapshots, which would re-acquire the read lock while it is
	// already held and can deadlock if a writer is waiting.
	recentStart := len(hcr.healthHistory) - 10
	if recentStart < 0 {
		recentStart = 0
	}
	recentSnapshots := make([]HealthSnapshot, len(hcr.healthHistory[recentStart:]))
	copy(recentSnapshots, hcr.healthHistory[recentStart:])

	return map[string]interface{}{
		"running":               hcr.running,
		"check_interval":        hcr.checkInterval.String(),
		"history_size":          len(hcr.healthHistory),
		"last_check":            hcr.lastHealthCheck,
		"current_health_score":  lastSnapshot.HealthScore,
		"current_trend":         lastSnapshot.Trend.String(),
		"corruption_rate":       lastSnapshot.CorruptionRate,
		"validation_success":    lastSnapshot.ValidationSuccess,
		"contract_call_success": lastSnapshot.ContractCallSuccess,
		"recent_snapshots":      recentSnapshots,
	}
}
// SetCheckInterval sets the health check interval
func (hcr *HealthCheckRunner) SetCheckInterval(interval time.Duration) {
	hcr.mu.Lock()
	defer hcr.mu.Unlock()
	hcr.checkInterval = interval
	hcr.logger.Info("Health check interval updated", "interval", interval)
}

// IsRunning returns whether the health checker is running
func (hcr *HealthCheckRunner) IsRunning() bool {
	hcr.mu.RLock()
	defer hcr.mu.RUnlock()
	return hcr.running
}
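For context, a minimal usage sketch of the runner defined above. It is not part of this file; it assumes the caller already holds a *logger.Logger and an *IntegrityMonitor and sits in the same package, so no constructors are invented here.

package monitoring

import (
	"context"
	"time"

	"github.com/fraktal/mev-beta/internal/logger"
)

// runHealthChecks wires up a HealthCheckRunner and keeps it alive until the
// caller cancels the context. Sketch only; log and monitor come from elsewhere.
func runHealthChecks(ctx context.Context, log *logger.Logger, monitor *IntegrityMonitor) {
	runner := NewHealthCheckRunner(log, monitor)
	runner.SetCheckInterval(15 * time.Second) // optional: the default is 30s
	runner.Start(ctx)
	defer runner.Stop()

	<-ctx.Done() // run until the caller cancels the context
}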