feat: create v2-prep branch with comprehensive planning
Restructured project for V2 refactor.

**Structure Changes:**
- Moved all V1 code to orig/ folder (preserved with git mv)
- Created docs/planning/ directory
- Added orig/README_V1.md explaining V1 preservation

**Planning Documents:**
- 00_V2_MASTER_PLAN.md: Complete architecture overview
  - Executive summary of critical V1 issues
  - High-level component architecture diagrams
  - 5-phase implementation roadmap
  - Success metrics and risk mitigation
- 07_TASK_BREAKDOWN.md: Atomic task breakdown
  - 99+ hours of detailed tasks
  - Every task < 2 hours (atomic)
  - Clear dependencies and success criteria
  - Organized by implementation phase

**V2 Key Improvements:**
- Per-exchange parsers (factory pattern)
- Multi-layer strict validation
- Multi-index pool cache
- Background validation pipeline
- Comprehensive observability

**Critical Issues Addressed:**
- Zero address tokens (strict validation + cache enrichment)
- Parsing accuracy (protocol-specific parsers)
- No audit trail (background validation channel)
- Inefficient lookups (multi-index cache)
- Stats disconnection (event-driven metrics)

Next Steps:
1. Review planning documents
2. Begin Phase 1: Foundation (P1-001 through P1-010)
3. Implement parsers in Phase 2
4. Build cache system in Phase 3
5. Add validation pipeline in Phase 4
6. Migrate and test in Phase 5

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
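The "per-exchange parsers (factory pattern)" item above is only described at a high level; the planning documents themselves are not part of this diff. A minimal sketch of the idea follows — every package, type, and function name below is hypothetical, not taken from the repository:

```go
package parsers

import "fmt"

// RawLog and SwapEvent are placeholder types; the real V2 event model is
// defined in the planning documents, not in this commit.
type RawLog struct {
	Address string
	Topics  []string
	Data    []byte
}

type SwapEvent struct {
	Pool, Token0, Token1 string
}

// Parser decodes one protocol's logs into normalized swap events.
type Parser interface {
	Parse(log RawLog) (*SwapEvent, error)
}

// Factory maps an exchange identifier to its registered protocol-specific parser.
type Factory struct {
	parsers map[string]Parser
}

func NewFactory() *Factory {
	return &Factory{parsers: make(map[string]Parser)}
}

func (f *Factory) Register(exchange string, p Parser) {
	f.parsers[exchange] = p
}

func (f *Factory) ParserFor(exchange string) (Parser, error) {
	p, ok := f.parsers[exchange]
	if !ok {
		return nil, fmt.Errorf("no parser registered for exchange %q", exchange)
	}
	return p, nil
}
```

Keeping each exchange's decoding quirks behind a single interface is what lets the hot path stay uniform while parsers are added per protocol.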
orig/internal/monitoring/health_checker.go (new file, 447 lines)
@@ -0,0 +1,447 @@
package monitoring

import (
	"context"
	"fmt"
	"sync"
	"time"

	"github.com/fraktal/mev-beta/internal/logger"
)

// HealthCheckRunner performs periodic health checks and monitoring
type HealthCheckRunner struct {
	mu                    sync.RWMutex
	logger                *logger.Logger
	integrityMonitor      *IntegrityMonitor
	checkInterval         time.Duration
	running               bool
	stopChan              chan struct{}
	lastHealthCheck       time.Time
	healthHistory         []HealthSnapshot
	maxHistorySize        int
	warmupSamples         int
	minAddressesForAlerts int64
}

// HealthSnapshot represents a point-in-time health snapshot
type HealthSnapshot struct {
	Timestamp           time.Time
	HealthScore         float64
	CorruptionRate      float64
	ValidationSuccess   float64
	ContractCallSuccess float64
	ActiveAlerts        int
	Trend               HealthTrend
}

// HealthTrend indicates the direction of health metrics
type HealthTrend int

const (
	HealthTrendUnknown HealthTrend = iota
	HealthTrendImproving
	HealthTrendStable
	HealthTrendDeclining
	HealthTrendCritical
)

func (t HealthTrend) String() string {
	switch t {
	case HealthTrendImproving:
		return "IMPROVING"
	case HealthTrendStable:
		return "STABLE"
	case HealthTrendDeclining:
		return "DECLINING"
	case HealthTrendCritical:
		return "CRITICAL"
	default:
		return "UNKNOWN"
	}
}

// NewHealthCheckRunner creates a new health check runner
func NewHealthCheckRunner(logger *logger.Logger, integrityMonitor *IntegrityMonitor) *HealthCheckRunner {
	return &HealthCheckRunner{
		logger:                logger,
		integrityMonitor:      integrityMonitor,
		checkInterval:         30 * time.Second, // Default 30 second intervals
		stopChan:              make(chan struct{}),
		healthHistory:         make([]HealthSnapshot, 0),
		maxHistorySize:        100, // Keep last 100 snapshots (50 minutes at 30s intervals)
		warmupSamples:         3,
		minAddressesForAlerts: 25,
	}
}

// Start begins the periodic health checking routine
func (hcr *HealthCheckRunner) Start(ctx context.Context) {
	hcr.mu.Lock()
	if hcr.running {
		hcr.mu.Unlock()
		return
	}
	hcr.running = true
	hcr.mu.Unlock()

	hcr.logger.Info("Starting health check runner",
		"interval", hcr.checkInterval)

	go hcr.healthCheckLoop(ctx)
}

// Stop stops the health checking routine
func (hcr *HealthCheckRunner) Stop() {
	hcr.mu.Lock()
	defer hcr.mu.Unlock()

	if !hcr.running {
		return
	}

	hcr.running = false
	close(hcr.stopChan)
	hcr.logger.Info("Health check runner stopped")
}

// healthCheckLoop runs the periodic health checking
func (hcr *HealthCheckRunner) healthCheckLoop(ctx context.Context) {
	ticker := time.NewTicker(hcr.checkInterval)
	defer ticker.Stop()

	// Perform initial health check
	hcr.performHealthCheck()

	for {
		select {
		case <-ctx.Done():
			hcr.logger.Info("Health check runner stopped due to context cancellation")
			return
		case <-hcr.stopChan:
			hcr.logger.Info("Health check runner stopped")
			return
		case <-ticker.C:
			hcr.performHealthCheck()
		}
	}
}

// performHealthCheck executes a comprehensive health check
func (hcr *HealthCheckRunner) performHealthCheck() {
	start := time.Now()
	// Record the check time under the lock so concurrent readers
	// (e.g. GetHealthSummary) see a consistent value.
	hcr.mu.Lock()
	hcr.lastHealthCheck = start
	hcr.mu.Unlock()

	if !hcr.integrityMonitor.IsEnabled() {
		hcr.logger.Debug("Skipping health check - integrity monitor disabled")
		return
	}

	// Get current metrics
	metrics := hcr.integrityMonitor.GetMetrics()
	healthSummary := hcr.integrityMonitor.GetHealthSummary()

	// Calculate rates
	corruptionRate := 0.0
	if metrics.TotalAddressesProcessed > 0 {
		corruptionRate = float64(metrics.CorruptAddressesDetected) / float64(metrics.TotalAddressesProcessed)
	}

	validationSuccessRate := 0.0
	totalValidations := metrics.AddressValidationPassed + metrics.AddressValidationFailed
	if totalValidations > 0 {
		validationSuccessRate = float64(metrics.AddressValidationPassed) / float64(totalValidations)
	}

	contractCallSuccessRate := 0.0
	totalCalls := metrics.ContractCallsSucceeded + metrics.ContractCallsFailed
	if totalCalls > 0 {
		contractCallSuccessRate = float64(metrics.ContractCallsSucceeded) / float64(totalCalls)
	}

	// Create health snapshot
	snapshot := HealthSnapshot{
		Timestamp:           start,
		HealthScore:         metrics.HealthScore,
		CorruptionRate:      corruptionRate,
		ValidationSuccess:   validationSuccessRate,
		ContractCallSuccess: contractCallSuccessRate,
		ActiveAlerts:        0, // Will be calculated based on current conditions
		Trend:               hcr.calculateHealthTrend(metrics.HealthScore),
	}

	// Add to history
	hcr.addHealthSnapshot(snapshot)

	// Check for threshold violations and generate alerts
	hcr.checkThresholds(healthSummary, snapshot)

	// Log health status periodically
	hcr.logHealthStatus(snapshot, time.Since(start))
}

// addHealthSnapshot adds a snapshot to the health history
func (hcr *HealthCheckRunner) addHealthSnapshot(snapshot HealthSnapshot) {
	hcr.mu.Lock()
	defer hcr.mu.Unlock()

	hcr.healthHistory = append(hcr.healthHistory, snapshot)

	// Trim history if it exceeds max size
	if len(hcr.healthHistory) > hcr.maxHistorySize {
		hcr.healthHistory = hcr.healthHistory[len(hcr.healthHistory)-hcr.maxHistorySize:]
	}
}

// calculateHealthTrend analyzes recent health scores to determine trend
func (hcr *HealthCheckRunner) calculateHealthTrend(currentScore float64) HealthTrend {
	hcr.mu.RLock()
	defer hcr.mu.RUnlock()

	if len(hcr.healthHistory) < 3 {
		return HealthTrendUnknown
	}

	// Get last few scores for trend analysis
	recentScores := make([]float64, 0, 5)
	start := len(hcr.healthHistory) - 5
	if start < 0 {
		start = 0
	}

	for i := start; i < len(hcr.healthHistory); i++ {
		recentScores = append(recentScores, hcr.healthHistory[i].HealthScore)
	}
	recentScores = append(recentScores, currentScore)

	// Calculate trend
	if currentScore < 0.5 {
		return HealthTrendCritical
	}

	// Simple linear trend calculation
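	// Illustrative numbers (not from the source): a window that moves from
	// 0.80 to 0.88 has diff = +0.08 (> 0.05) and reports IMPROVING; 0.90 down
	// to 0.82 gives -0.08 and reports DECLINING; anything within ±0.05 is STABLE.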
	if len(recentScores) >= 3 {
		first := recentScores[0]
		last := recentScores[len(recentScores)-1]
		diff := last - first

		if diff > 0.05 {
			return HealthTrendImproving
		} else if diff < -0.05 {
			return HealthTrendDeclining
		} else {
			return HealthTrendStable
		}
	}

	return HealthTrendUnknown
}

// checkThresholds checks for threshold violations and generates alerts
func (hcr *HealthCheckRunner) checkThresholds(healthSummary map[string]interface{}, snapshot HealthSnapshot) {
	if !hcr.readyForAlerts(healthSummary, snapshot) {
		hcr.logger.Debug("Health alerts suppressed during warm-up",
			"health_score", snapshot.HealthScore,
			"total_addresses_processed", safeNumericLookup(healthSummary, "total_addresses_processed"),
			"history_size", hcr.historySize())
		return
	}

	// Critical health score alert
	if snapshot.HealthScore < 0.5 {
		alert := CorruptionAlert{
			Timestamp: time.Now(),
			Severity:  AlertSeverityEmergency,
			Message:   fmt.Sprintf("CRITICAL: System health score is %.2f (below 0.5)", snapshot.HealthScore),
			Context: map[string]interface{}{
				"health_score":          snapshot.HealthScore,
				"corruption_rate":       snapshot.CorruptionRate,
				"validation_success":    snapshot.ValidationSuccess,
				"contract_call_success": snapshot.ContractCallSuccess,
				"trend":                 snapshot.Trend.String(),
			},
		}
		hcr.integrityMonitor.sendAlert(alert)
	}

	// High corruption rate alert
	if snapshot.CorruptionRate > 0.10 { // 10% corruption rate
		alert := CorruptionAlert{
			Timestamp: time.Now(),
			Severity:  AlertSeverityCritical,
			Message:   fmt.Sprintf("High corruption rate detected: %.2f%%", snapshot.CorruptionRate*100),
			Context: map[string]interface{}{
				"corruption_rate":    snapshot.CorruptionRate,
				"threshold":          0.10,
				"addresses_affected": snapshot.CorruptionRate,
			},
		}
		hcr.integrityMonitor.sendAlert(alert)
	}

	// Declining trend alert
	if snapshot.Trend == HealthTrendDeclining || snapshot.Trend == HealthTrendCritical {
		alert := CorruptionAlert{
			Timestamp: time.Now(),
			Severity:  AlertSeverityWarning,
			Message:   fmt.Sprintf("System health trend is %s (current score: %.2f)", snapshot.Trend.String(), snapshot.HealthScore),
			Context: map[string]interface{}{
				"trend":            snapshot.Trend.String(),
				"health_score":     snapshot.HealthScore,
				"recent_snapshots": hcr.getRecentSnapshots(5),
			},
		}
		hcr.integrityMonitor.sendAlert(alert)
	}
}

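// readyForAlerts reports whether enough history and traffic have accumulated
// for threshold alerts to be meaningful; it suppresses alerting during warm-up.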
func (hcr *HealthCheckRunner) readyForAlerts(healthSummary map[string]interface{}, snapshot HealthSnapshot) bool {
	hcr.mu.RLock()
	historyLen := len(hcr.healthHistory)
	hcr.mu.RUnlock()

	if historyLen < hcr.warmupSamples {
		return false
	}

	totalProcessed := safeNumericLookup(healthSummary, "total_addresses_processed")
	if totalProcessed >= 0 && totalProcessed < float64(hcr.minAddressesForAlerts) {
		return false
	}

	// Require at least one validation or contract call attempt before alarming.
	if snapshot.ValidationSuccess == 0 && snapshot.ContractCallSuccess == 0 && totalProcessed == 0 {
		return false
	}

	return true
}

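// safeNumericLookup extracts a numeric field from the health summary map,
// returning -1 when the map is nil, the key is absent, or the value is not a
// supported numeric type.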
func safeNumericLookup(summary map[string]interface{}, key string) float64 {
	if summary == nil {
		return -1
	}

	value, ok := summary[key]
	if !ok {
		return -1
	}

	switch v := value.(type) {
	case int:
		return float64(v)
	case int32:
		return float64(v)
	case int64:
		return float64(v)
	case uint:
		return float64(v)
	case uint32:
		return float64(v)
	case uint64:
		return float64(v)
	case float32:
		return float64(v)
	case float64:
		return v
	default:
		return -1
	}
}

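// historySize returns the number of snapshots currently held in history.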
func (hcr *HealthCheckRunner) historySize() int {
	hcr.mu.RLock()
	defer hcr.mu.RUnlock()
	return len(hcr.healthHistory)
}

// logHealthStatus logs periodic health status information
func (hcr *HealthCheckRunner) logHealthStatus(snapshot HealthSnapshot, duration time.Duration) {
	// Log detailed status every 5 minutes (10 checks at 30s intervals)
	if len(hcr.healthHistory)%10 == 0 {
		hcr.logger.Info("System health status",
			"health_score", snapshot.HealthScore,
			"corruption_rate", fmt.Sprintf("%.4f", snapshot.CorruptionRate),
			"validation_success", fmt.Sprintf("%.4f", snapshot.ValidationSuccess),
			"contract_call_success", fmt.Sprintf("%.4f", snapshot.ContractCallSuccess),
			"trend", snapshot.Trend.String(),
			"check_duration", duration)
	} else {
		// Brief status for regular checks
		hcr.logger.Debug("Health check completed",
			"health_score", snapshot.HealthScore,
			"trend", snapshot.Trend.String(),
			"duration", duration)
	}
}

// GetRecentSnapshots returns the most recent health snapshots
func (hcr *HealthCheckRunner) GetRecentSnapshots(count int) []HealthSnapshot {
	return hcr.getRecentSnapshots(count)
}

// getRecentSnapshots internal implementation
func (hcr *HealthCheckRunner) getRecentSnapshots(count int) []HealthSnapshot {
	hcr.mu.RLock()
	defer hcr.mu.RUnlock()

	if len(hcr.healthHistory) == 0 {
		return []HealthSnapshot{}
	}

	start := len(hcr.healthHistory) - count
	if start < 0 {
		start = 0
	}

	// Create a copy to avoid external modification
	snapshots := make([]HealthSnapshot, len(hcr.healthHistory[start:]))
	copy(snapshots, hcr.healthHistory[start:])

	return snapshots
}

// GetHealthSummary returns a comprehensive health summary
func (hcr *HealthCheckRunner) GetHealthSummary() map[string]interface{} {
	hcr.mu.RLock()
	defer hcr.mu.RUnlock()

	if len(hcr.healthHistory) == 0 {
		return map[string]interface{}{
			"running":        hcr.running,
			"check_interval": hcr.checkInterval.String(),
			"history_size":   0,
			"last_check":     nil,
		}
	}

	lastSnapshot := hcr.healthHistory[len(hcr.healthHistory)-1]

	return map[string]interface{}{
		"running":               hcr.running,
		"check_interval":        hcr.checkInterval.String(),
		"history_size":          len(hcr.healthHistory),
		"last_check":            hcr.lastHealthCheck,
		"current_health_score":  lastSnapshot.HealthScore,
		"current_trend":         lastSnapshot.Trend.String(),
		"corruption_rate":       lastSnapshot.CorruptionRate,
		"validation_success":    lastSnapshot.ValidationSuccess,
		"contract_call_success": lastSnapshot.ContractCallSuccess,
		"recent_snapshots":      hcr.getRecentSnapshots(10),
	}
}

// SetCheckInterval sets the health check interval. The new value takes effect
// the next time the runner is started; an already-running ticker is not reset.
func (hcr *HealthCheckRunner) SetCheckInterval(interval time.Duration) {
	hcr.mu.Lock()
	defer hcr.mu.Unlock()
	hcr.checkInterval = interval
	hcr.logger.Info("Health check interval updated", "interval", interval)
}

// IsRunning returns whether the health checker is running
func (hcr *HealthCheckRunner) IsRunning() bool {
	hcr.mu.RLock()
	defer hcr.mu.RUnlock()
	return hcr.running
}
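
For context, a minimal sketch of how a caller might wire up this runner and read its summary. This is illustrative only and not part of the commit: the logger and IntegrityMonitor values are assumed to come from the application's existing constructors, which are not shown in this diff.

```go
package monitoring

import (
	"context"
	"encoding/json"
	"fmt"
	"time"
)

// exampleHealthCheckWiring is an illustrative sketch. The log and monitor
// arguments stand in for the application's real instances.
func exampleHealthCheckWiring(log *logger.Logger, monitor *IntegrityMonitor) {
	runner := NewHealthCheckRunner(log, monitor)
	runner.SetCheckInterval(15 * time.Second) // override the 30s default before starting

	ctx, cancel := context.WithCancel(context.Background())
	defer cancel()

	runner.Start(ctx)
	defer runner.Stop()

	// Let a few checks accumulate, then expose the aggregated summary,
	// e.g. from an ops/health endpoint.
	time.Sleep(time.Minute)

	if summary, err := json.MarshalIndent(runner.GetHealthSummary(), "", "  "); err == nil {
		fmt.Println(string(summary))
	}
}
```

Because alerting is gated by readyForAlerts, a freshly started runner stays quiet until at least warmupSamples snapshots and minAddressesForAlerts processed addresses have accumulated.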