// mev-beta/internal/monitoring/health_checker.go

package monitoring

import (
	"context"
	"fmt"
	"sync"
	"time"

	"github.com/fraktal/mev-beta/internal/logger"
)

// HealthCheckRunner performs periodic health checks and monitoring.
//
// It polls an IntegrityMonitor on a fixed interval, records each result as a
// HealthSnapshot in a bounded history, and sends alerts through the monitor
// when a snapshot violates one of the thresholds in checkThresholds.
type HealthCheckRunner struct {
	// mu guards the mutable state below, notably running and healthHistory.
	mu                    sync.RWMutex
	// logger receives lifecycle and per-check log messages.
	logger                *logger.Logger
	// integrityMonitor supplies the metrics and receives generated alerts.
	integrityMonitor      *IntegrityMonitor
	// checkInterval is the ticker period used by the check loop.
	checkInterval         time.Duration
	// running is set by Start and cleared by Stop.
	running               bool
	// stopChan is closed by Stop to end the check loop.
	stopChan              chan struct{}
	// lastHealthCheck is the start time of the most recent check attempt.
	lastHealthCheck       time.Time
	// healthHistory holds the most recent snapshots, oldest first.
	healthHistory         []HealthSnapshot
	// maxHistorySize caps len(healthHistory); older entries are dropped.
	maxHistorySize        int
	// warmupSamples is the history length required before alerts are sent.
	warmupSamples         int
	// minAddressesForAlerts is the minimum "total_addresses_processed" value
	// (when the monitor reports one) required before alerts are sent.
	minAddressesForAlerts int64
}

// HealthSnapshot represents a point-in-time health snapshot as recorded by
// performHealthCheck.
type HealthSnapshot struct {
	// Timestamp is the time at which the health check started.
	Timestamp           time.Time
	// HealthScore is copied from the integrity monitor's metrics.
	HealthScore         float64
	// CorruptionRate is corrupt addresses detected divided by total addresses
	// processed, or 0 when nothing has been processed.
	CorruptionRate      float64
	// ValidationSuccess is passed validations divided by all validations, or
	// 0 when no validation has been recorded.
	ValidationSuccess   float64
	// ContractCallSuccess is succeeded contract calls divided by all contract
	// calls, or 0 when no call has been recorded.
	ContractCallSuccess float64
	// ActiveAlerts is currently always stored as 0 by performHealthCheck.
	ActiveAlerts        int
	// Trend is the result of calculateHealthTrend at the time of the check.
	Trend               HealthTrend
}

// HealthTrend indicates the direction of health metrics.
type HealthTrend int

// Trend values, in the order they are assigned by iota. The conditions
// described below are the ones applied by calculateHealthTrend.
const (
	// HealthTrendUnknown is the zero value; it is reported while fewer than
	// three snapshots are in the history.
	HealthTrendUnknown HealthTrend = iota
	// HealthTrendImproving means the score rose by more than 0.05 relative to
	// the oldest snapshot in the recent window.
	HealthTrendImproving
	// HealthTrendStable means the score moved by at most 0.05 either way.
	HealthTrendStable
	// HealthTrendDeclining means the score fell by more than 0.05.
	HealthTrendDeclining
	// HealthTrendCritical means the current score is below 0.5.
	HealthTrendCritical
)

// String returns the upper-case label for the trend. HealthTrendUnknown and
// any value outside the declared constants yield "UNKNOWN".
func (t HealthTrend) String() string {
	// Index 0 (HealthTrendUnknown) is intentionally left empty; it is handled
	// by the fallback below together with out-of-range values.
	labels := [...]string{
		HealthTrendImproving: "IMPROVING",
		HealthTrendStable:    "STABLE",
		HealthTrendDeclining: "DECLINING",
		HealthTrendCritical:  "CRITICAL",
	}

	if t > HealthTrendUnknown && int(t) < len(labels) {
		return labels[t]
	}
	return "UNKNOWN"
}

// NewHealthCheckRunner creates a new health check runner bound to the given
// logger and integrity monitor. The runner is returned in the stopped state
// with its default settings; call Start to begin checking.
func NewHealthCheckRunner(logger *logger.Logger, integrityMonitor *IntegrityMonitor) *HealthCheckRunner {
	const (
		defaultCheckInterval = 30 * time.Second // time between checks
		defaultMaxHistory    = 100              // 50 minutes of history at 30s intervals
		defaultWarmup        = 3                // snapshots required before alerting
		defaultMinAddresses  = 25               // processed addresses required before alerting
	)

	runner := new(HealthCheckRunner)
	runner.logger = logger
	runner.integrityMonitor = integrityMonitor
	runner.checkInterval = defaultCheckInterval
	runner.stopChan = make(chan struct{})
	runner.healthHistory = make([]HealthSnapshot, 0)
	runner.maxHistorySize = defaultMaxHistory
	runner.warmupSamples = defaultWarmup
	runner.minAddressesForAlerts = defaultMinAddresses
	return runner
}

// Start begins the periodic health checking routine in a background
// goroutine. Calling Start while the runner is already running is a no-op.
//
// Every run gets its own stop channel, so the runner can be started again
// after Stop or after a previous context was cancelled. The check interval is
// read once here; changing it later does not affect a loop that is already
// running.
func (hcr *HealthCheckRunner) Start(ctx context.Context) {
	hcr.mu.Lock()
	if hcr.running {
		hcr.mu.Unlock()
		return
	}
	hcr.running = true
	// A channel closed by an earlier Stop must not be reused: the new loop
	// would exit immediately and the next Stop would close it a second time.
	hcr.stopChan = make(chan struct{})
	stopChan := hcr.stopChan
	interval := hcr.checkInterval
	hcr.mu.Unlock()

	hcr.logger.Info("Starting health check runner",
		"interval", interval)

	go hcr.runHealthCheckLoop(ctx, stopChan, interval)
}

// Stop stops the health checking routine by closing the current run's stop
// channel. It does not wait for the loop goroutine to exit. Calling Stop when
// the runner is not running is a no-op.
func (hcr *HealthCheckRunner) Stop() {
	hcr.mu.Lock()
	defer hcr.mu.Unlock()

	if !hcr.running {
		return
	}

	hcr.running = false
	close(hcr.stopChan)
	hcr.logger.Info("Health check runner stopped")
}

// healthCheckLoop runs the periodic health checking using the runner's
// current stop channel and check interval. It blocks until ctx is cancelled
// or the stop channel is closed.
func (hcr *HealthCheckRunner) healthCheckLoop(ctx context.Context) {
	hcr.mu.RLock()
	stopChan := hcr.stopChan
	interval := hcr.checkInterval
	hcr.mu.RUnlock()

	hcr.runHealthCheckLoop(ctx, stopChan, interval)
}

// runHealthCheckLoop performs one health check immediately and then one per
// interval until ctx is cancelled or stopChan is closed. The channel and
// interval are passed in rather than re-read from the struct so that a loop
// keeps observing the run it was started for even if Start is called again.
func (hcr *HealthCheckRunner) runHealthCheckLoop(ctx context.Context, stopChan chan struct{}, interval time.Duration) {
	ticker := time.NewTicker(interval)
	defer ticker.Stop()

	// Perform initial health check
	hcr.performHealthCheck()

	for {
		select {
		case <-ctx.Done():
			// The loop is ending without Stop having been called, so clear
			// the running flag here; otherwise a later Start would be a
			// no-op. Skip this if a newer run has already replaced the
			// channel.
			hcr.mu.Lock()
			if hcr.stopChan == stopChan {
				hcr.running = false
			}
			hcr.mu.Unlock()
			hcr.logger.Info("Health check runner stopped due to context cancellation")
			return
		case <-stopChan:
			hcr.logger.Info("Health check runner stopped")
			return
		case <-ticker.C:
			hcr.performHealthCheck()
		}
	}
}

// performHealthCheck executes a comprehensive health check.
//
// It records the check time, and — unless the integrity monitor is disabled —
// reads the monitor's metrics, derives the corruption, validation-success and
// contract-call-success rates, stores the result as a HealthSnapshot, runs the
// alert thresholds against it and logs the outcome.
func (hcr *HealthCheckRunner) performHealthCheck() {
	start := time.Now()

	// lastHealthCheck is read under the mutex by GetHealthSummary, so the
	// write has to take the lock as well.
	hcr.mu.Lock()
	hcr.lastHealthCheck = start
	hcr.mu.Unlock()

	if !hcr.integrityMonitor.IsEnabled() {
		hcr.logger.Debug("Skipping health check - integrity monitor disabled")
		return
	}

	// Get current metrics
	metrics := hcr.integrityMonitor.GetMetrics()
	healthSummary := hcr.integrityMonitor.GetHealthSummary()

	// ratio returns part/total, or 0 while nothing has been counted yet.
	ratio := func(part, total float64) float64 {
		if total > 0 {
			return part / total
		}
		return 0.0
	}

	totalValidations := metrics.AddressValidationPassed + metrics.AddressValidationFailed
	totalCalls := metrics.ContractCallsSucceeded + metrics.ContractCallsFailed

	// Create health snapshot
	snapshot := HealthSnapshot{
		Timestamp:           start,
		HealthScore:         metrics.HealthScore,
		CorruptionRate:      ratio(float64(metrics.CorruptAddressesDetected), float64(metrics.TotalAddressesProcessed)),
		ValidationSuccess:   ratio(float64(metrics.AddressValidationPassed), float64(totalValidations)),
		ContractCallSuccess: ratio(float64(metrics.ContractCallsSucceeded), float64(totalCalls)),
		ActiveAlerts:        0, // Not populated yet; alerts are sent directly by checkThresholds
		// The trend is computed before the snapshot is appended, so it
		// compares the new score against previously recorded snapshots only.
		Trend: hcr.calculateHealthTrend(metrics.HealthScore),
	}

	// Add to history
	hcr.addHealthSnapshot(snapshot)

	// Check for threshold violations and generate alerts
	hcr.checkThresholds(healthSummary, snapshot)

	// Log health status periodically
	hcr.logHealthStatus(snapshot, time.Since(start))
}

// addHealthSnapshot appends snapshot to the health history and, when the
// history has grown past maxHistorySize, drops the oldest entries so that only
// the most recent maxHistorySize snapshots remain.
func (hcr *HealthCheckRunner) addHealthSnapshot(snapshot HealthSnapshot) {
	hcr.mu.Lock()
	defer hcr.mu.Unlock()

	hcr.healthHistory = append(hcr.healthHistory, snapshot)

	// excess is how many of the oldest snapshots no longer fit.
	if excess := len(hcr.healthHistory) - hcr.maxHistorySize; excess > 0 {
		hcr.healthHistory = hcr.healthHistory[excess:]
	}
}

// calculateHealthTrend analyzes recent health scores to determine trend.
//
// currentScore is the score of the check in progress, which has not been
// added to the history yet. The result is:
//   - HealthTrendUnknown while fewer than three snapshots are recorded;
//   - HealthTrendCritical when currentScore is below 0.5;
//   - otherwise Improving, Declining or Stable depending on whether
//     currentScore differs by more than 0.05 from the oldest of the last
//     five recorded snapshots.
func (hcr *HealthCheckRunner) calculateHealthTrend(currentScore float64) HealthTrend {
	const (
		minHistory        = 3    // snapshots required before a trend is reported
		window            = 5    // how far back the baseline snapshot is taken
		criticalScore     = 0.5  // below this the trend is always critical
		significantChange = 0.05 // smaller score moves count as stable
	)

	hcr.mu.RLock()
	defer hcr.mu.RUnlock()

	historyLen := len(hcr.healthHistory)
	if historyLen < minHistory {
		return HealthTrendUnknown
	}

	if currentScore < criticalScore {
		return HealthTrendCritical
	}

	// Only the two endpoints of the window matter for this simple trend, so
	// there is no need to collect the scores in between.
	baselineIdx := historyLen - window
	if baselineIdx < 0 {
		baselineIdx = 0
	}
	diff := currentScore - hcr.healthHistory[baselineIdx].HealthScore

	switch {
	case diff > significantChange:
		return HealthTrendImproving
	case diff < -significantChange:
		return HealthTrendDeclining
	default:
		return HealthTrendStable
	}
}

// checkThresholds checks for threshold violations and generates alerts.
//
// Nothing is sent while readyForAlerts reports that the runner is still
// warming up. After that, up to three independent alerts are sent through the
// integrity monitor, in this order: an emergency alert when the health score
// is below 0.5, a critical alert when the corruption rate exceeds 10%, and a
// warning when the trend is declining or critical.
func (hcr *HealthCheckRunner) checkThresholds(healthSummary map[string]interface{}, snapshot HealthSnapshot) {
	const (
		criticalHealthScore = 0.5
		maxCorruptionRate   = 0.10 // 10% corruption rate
	)

	if ready := hcr.readyForAlerts(healthSummary, snapshot); !ready {
		hcr.logger.Debug("Health alerts suppressed during warm-up",
			"health_score", snapshot.HealthScore,
			"total_addresses_processed", safeNumericLookup(healthSummary, "total_addresses_processed"),
			"history_size", hcr.historySize())
		return
	}

	score := snapshot.HealthScore
	corruption := snapshot.CorruptionRate
	trendLabel := snapshot.Trend.String()

	// Critical health score alert
	if score < criticalHealthScore {
		details := map[string]interface{}{
			"health_score":          score,
			"corruption_rate":       corruption,
			"validation_success":    snapshot.ValidationSuccess,
			"contract_call_success": snapshot.ContractCallSuccess,
			"trend":                 trendLabel,
		}
		hcr.integrityMonitor.sendAlert(CorruptionAlert{
			Timestamp: time.Now(),
			Severity:  AlertSeverityEmergency,
			Message:   fmt.Sprintf("CRITICAL: System health score is %.2f (below 0.5)", score),
			Context:   details,
		})
	}

	// High corruption rate alert
	if corruption > maxCorruptionRate {
		// NOTE(review): "addresses_affected" carries the corruption rate, not
		// an address count — confirm what consumers of this alert expect.
		details := map[string]interface{}{
			"corruption_rate":    corruption,
			"threshold":          maxCorruptionRate,
			"addresses_affected": corruption,
		}
		hcr.integrityMonitor.sendAlert(CorruptionAlert{
			Timestamp: time.Now(),
			Severity:  AlertSeverityCritical,
			Message:   fmt.Sprintf("High corruption rate detected: %.2f%%", corruption*100),
			Context:   details,
		})
	}

	// Declining trend alert
	switch snapshot.Trend {
	case HealthTrendDeclining, HealthTrendCritical:
		details := map[string]interface{}{
			"trend":            trendLabel,
			"health_score":     score,
			"recent_snapshots": hcr.getRecentSnapshots(5),
		}
		hcr.integrityMonitor.sendAlert(CorruptionAlert{
			Timestamp: time.Now(),
			Severity:  AlertSeverityWarning,
			Message:   fmt.Sprintf("System health trend is %s (current score: %.2f)", trendLabel, score),
			Context:   details,
		})
	}
}

// readyForAlerts reports whether enough data has been collected for alerts to
// be meaningful. It returns false while the history holds fewer than
// warmupSamples snapshots, while the summary's "total_addresses_processed" is
// known but below minAddressesForAlerts, or when that total is zero and both
// success rates in the snapshot are zero. A missing or non-numeric total
// (reported as -1 by safeNumericLookup) does not block alerts.
func (hcr *HealthCheckRunner) readyForAlerts(healthSummary map[string]interface{}, snapshot HealthSnapshot) bool {
	hcr.mu.RLock()
	samples := len(hcr.healthHistory)
	hcr.mu.RUnlock()

	if samples < hcr.warmupSamples {
		return false
	}

	processed := safeNumericLookup(healthSummary, "total_addresses_processed")

	belowMinimum := processed >= 0 && processed < float64(hcr.minAddressesForAlerts)

	// Require at least one validation or contract call attempt before alarming.
	idle := processed == 0 && snapshot.ValidationSuccess == 0 && snapshot.ContractCallSuccess == 0

	return !belowMinimum && !idle
}

// safeNumericLookup returns summary[key] converted to float64.
//
// It returns -1 when summary is nil, when key is absent, or when the stored
// value is not one of Go's built-in integer or floating-point types. Callers
// use the negative result to tell "unknown" apart from a real count. All
// signed and unsigned integer widths are accepted so that a small-width
// counter is not mistaken for a missing value.
func safeNumericLookup(summary map[string]interface{}, key string) float64 {
	// Indexing a nil map is safe and reports ok == false.
	value, ok := summary[key]
	if !ok {
		return -1
	}

	switch v := value.(type) {
	case int:
		return float64(v)
	case int8:
		return float64(v)
	case int16:
		return float64(v)
	case int32:
		return float64(v)
	case int64:
		return float64(v)
	case uint:
		return float64(v)
	case uint8:
		return float64(v)
	case uint16:
		return float64(v)
	case uint32:
		return float64(v)
	case uint64:
		return float64(v)
	case float32:
		return float64(v)
	case float64:
		return v
	default:
		return -1
	}
}

// historySize returns the number of snapshots currently held in the health
// history, read under the runner's read lock.
func (hcr *HealthCheckRunner) historySize() int {
	hcr.mu.RLock()
	size := len(hcr.healthHistory)
	hcr.mu.RUnlock()

	return size
}

// logHealthStatus logs periodic health status information.
//
// A detailed Info line is written whenever the history length is a multiple
// of 10 (every 5 minutes at the default 30s interval); all other checks get a
// brief Debug line. duration is how long the health check took.
//
// NOTE(review): the cadence is derived from the history length, which stops
// growing once it reaches maxHistorySize. With the default limit of 100 the
// length then stays a multiple of 10 and every check is logged at Info level.
// A dedicated check counter on the runner would be needed to fix that.
func (hcr *HealthCheckRunner) logHealthStatus(snapshot HealthSnapshot, duration time.Duration) {
	// Go through historySize so the length is read under the lock, like every
	// other reader of healthHistory.
	if hcr.historySize()%10 != 0 {
		// Brief status for regular checks
		hcr.logger.Debug("Health check completed",
			"health_score", snapshot.HealthScore,
			"trend", snapshot.Trend.String(),
			"duration", duration)
		return
	}

	hcr.logger.Info("System health status",
		"health_score", snapshot.HealthScore,
		"corruption_rate", fmt.Sprintf("%.4f", snapshot.CorruptionRate),
		"validation_success", fmt.Sprintf("%.4f", snapshot.ValidationSuccess),
		"contract_call_success", fmt.Sprintf("%.4f", snapshot.ContractCallSuccess),
		"trend", snapshot.Trend.String(),
		"check_duration", duration)
}

// GetRecentSnapshots returns the most recent health snapshots, at most count
// of them. It is the exported entry point for getRecentSnapshots.
func (hcr *HealthCheckRunner) GetRecentSnapshots(count int) []HealthSnapshot {
	recent := hcr.getRecentSnapshots(count)
	return recent
}

// getRecentSnapshots returns a copy of the last count snapshots in the
// history, oldest first. count is clamped to the range [0, history length],
// so a negative count yields an empty slice instead of a slice-bounds panic
// and an oversized count yields the whole history. The result is never nil
// and does not share storage with the runner's history.
func (hcr *HealthCheckRunner) getRecentSnapshots(count int) []HealthSnapshot {
	hcr.mu.RLock()
	defer hcr.mu.RUnlock()

	total := len(hcr.healthHistory)
	if count < 0 {
		count = 0
	}
	if count > total {
		count = total
	}

	// Create a copy to avoid external modification
	snapshots := make([]HealthSnapshot, count)
	copy(snapshots, hcr.healthHistory[total-count:])

	return snapshots
}

// GetHealthSummary returns a comprehensive health summary as a map.
//
// The keys "running", "check_interval", "history_size" and "last_check" are
// always present ("last_check" is nil until a snapshot has been recorded).
// Once the history is non-empty the map also carries the values of the latest
// snapshot and, under "recent_snapshots", a copy of up to the last 10
// snapshots.
func (hcr *HealthCheckRunner) GetHealthSummary() map[string]interface{} {
	hcr.mu.RLock()
	defer hcr.mu.RUnlock()

	historyLen := len(hcr.healthHistory)
	if historyLen == 0 {
		return map[string]interface{}{
			"running":        hcr.running,
			"check_interval": hcr.checkInterval.String(),
			"history_size":   0,
			"last_check":     nil,
		}
	}

	// Copy the recent snapshots here instead of calling getRecentSnapshots:
	// that helper takes the read lock again, and sync.RWMutex does not allow
	// recursive read locking (it can deadlock when a writer is waiting).
	const recentCount = 10
	start := historyLen - recentCount
	if start < 0 {
		start = 0
	}
	recent := make([]HealthSnapshot, historyLen-start)
	copy(recent, hcr.healthHistory[start:])

	lastSnapshot := hcr.healthHistory[historyLen-1]

	return map[string]interface{}{
		"running":               hcr.running,
		"check_interval":        hcr.checkInterval.String(),
		"history_size":          historyLen,
		"last_check":            hcr.lastHealthCheck,
		"current_health_score":  lastSnapshot.HealthScore,
		"current_trend":         lastSnapshot.Trend.String(),
		"corruption_rate":       lastSnapshot.CorruptionRate,
		"validation_success":    lastSnapshot.ValidationSuccess,
		"contract_call_success": lastSnapshot.ContractCallSuccess,
		"recent_snapshots":      recent,
	}
}

// SetCheckInterval sets the health check interval.
//
// Non-positive intervals are ignored, because time.NewTicker panics when
// given one and the value is handed to it when the check loop starts. The
// check loop creates its ticker once at start-up, so a new interval only
// takes effect the next time the runner is started.
func (hcr *HealthCheckRunner) SetCheckInterval(interval time.Duration) {
	if interval <= 0 {
		hcr.logger.Info("Ignoring non-positive health check interval", "interval", interval)
		return
	}

	hcr.mu.Lock()
	defer hcr.mu.Unlock()
	hcr.checkInterval = interval
	hcr.logger.Info("Health check interval updated", "interval", interval)
}

// IsRunning returns whether the health checker is running, i.e. the current
// value of the running flag read under the runner's read lock.
func (hcr *HealthCheckRunner) IsRunning() bool {
	hcr.mu.RLock()
	active := hcr.running
	hcr.mu.RUnlock()

	return active
}