// mev-beta/internal/monitoring/integrity_monitor.go

package monitoring

import (
	"context"
	"fmt"
	"sync"
	"time"

	"github.com/ethereum/go-ethereum/common"

	"github.com/fraktal/mev-beta/internal/logger"
	"github.com/fraktal/mev-beta/internal/recovery"
)

// IntegrityMetrics holds the running data-integrity counters and derived
// scores maintained by IntegrityMonitor. The monitor's methods take mu before
// touching any other field; external readers should use
// IntegrityMonitor.GetMetrics, which returns a mutex-free MetricsSnapshot.
type IntegrityMetrics struct {
	mu                       sync.RWMutex                      // locked by the monitor around access to the fields below
	TotalAddressesProcessed  int64                             // incremented by RecordAddressProcessed
	CorruptAddressesDetected int64                             // incremented by RecordCorruptionDetected
	AddressValidationPassed  int64                             // RecordValidationResult(true) calls
	AddressValidationFailed  int64                             // RecordValidationResult(false) calls
	ContractCallsSucceeded   int64                             // RecordContractCallResult(true) calls
	ContractCallsFailed      int64                             // RecordContractCallResult(false) calls
	RetryOperationsTriggered int64                             // recovery.ActionRetryWithBackoff occurrences
	FallbackOperationsUsed   int64                             // recovery.ActionUseFallbackData occurrences
	CircuitBreakersTripped   int64                             // recovery.ActionCircuitBreaker occurrences
	LastCorruptionDetection  time.Time                         // time of the most recent RecordCorruptionDetected call
	AverageCorruptionScore   float64                           // running mean of all reported corruption scores
	MaxCorruptionScore       int                               // largest corruption score reported so far
	HealthScore              float64                           // latest weighted score from updateHealthScore, in [0, 1]
	HighScore                float64                           // highest HealthScore recorded; starts at 1.0
	RecoveryActions          map[recovery.RecoveryAction]int64 // per-action counts from RecordRecoveryAction
	ErrorsByType             map[recovery.ErrorType]int64      // per-type counts from RecordErrorType
}

// MetricsSnapshot is a plain-value copy of IntegrityMetrics without the mutex,
// so it can be handed to callers and copied freely. GetMetrics builds it while
// holding the metrics read lock and gives it freshly allocated maps, so
// mutating a snapshot does not affect the monitor. Every field mirrors the
// IntegrityMetrics field of the same name; the struct tags define the JSON
// field names.
type MetricsSnapshot struct {
	TotalAddressesProcessed  int64                             `json:"total_addresses_processed"`
	CorruptAddressesDetected int64                             `json:"corrupt_addresses_detected"`
	AddressValidationPassed  int64                             `json:"address_validation_passed"`
	AddressValidationFailed  int64                             `json:"address_validation_failed"`
	ContractCallsSucceeded   int64                             `json:"contract_calls_succeeded"`
	ContractCallsFailed      int64                             `json:"contract_calls_failed"`
	RetryOperationsTriggered int64                             `json:"retry_operations_triggered"`
	FallbackOperationsUsed   int64                             `json:"fallback_operations_used"`
	CircuitBreakersTripped   int64                             `json:"circuit_breakers_tripped"`
	LastCorruptionDetection  time.Time                         `json:"last_corruption_detection"`
	AverageCorruptionScore   float64                           `json:"average_corruption_score"`
	MaxCorruptionScore       int                               `json:"max_corruption_score"`
	HealthScore              float64                           `json:"health_score"`
	HighScore                float64                           `json:"high_score"`
	RecoveryActions          map[recovery.RecoveryAction]int64 `json:"recovery_actions"`
	ErrorsByType             map[recovery.ErrorType]int64      `json:"errors_by_type"`
}

// CorruptionAlert is the payload stored in the monitor's alert history and
// delivered to every AlertSubscriber. It is produced both for individual
// corruption detections (RecordCorruptionDetected) and for health-score
// threshold breaches (updateHealthScore); the latter leave Address,
// CorruptionScore and Source at their zero values.
type CorruptionAlert struct {
	Timestamp       time.Time              // when the alert was created
	Address         common.Address         // address reported as corrupt, if any
	CorruptionScore int                    // score passed to RecordCorruptionDetected
	Source          string                 // caller-supplied label for where the corruption was seen
	Severity        AlertSeverity          // severity assigned by the monitor
	Message         string                 // human-readable description
	Context         map[string]interface{} // additional key/value details about the event
}

// AlertSeverity classifies how serious an alert is.
type AlertSeverity int

// Known severity levels, numbered from AlertSeverityInfo (0) up to
// AlertSeverityEmergency (3).
const (
	AlertSeverityInfo AlertSeverity = iota
	AlertSeverityWarning
	AlertSeverityCritical
	AlertSeverityEmergency
)

// String returns the upper-case label for the severity, or "UNKNOWN" for any
// value that is not one of the declared levels.
func (s AlertSeverity) String() string {
	// Lookup table indexed by severity value.
	labels := [...]string{
		AlertSeverityInfo:      "INFO",
		AlertSeverityWarning:   "WARNING",
		AlertSeverityCritical:  "CRITICAL",
		AlertSeverityEmergency: "EMERGENCY",
	}

	if s < AlertSeverityInfo || int(s) >= len(labels) {
		return "UNKNOWN"
	}
	return labels[s]
}

// IntegrityMonitor collects data-integrity metrics, derives a health score
// from them, and raises CorruptionAlerts to registered subscribers. Create
// instances with NewIntegrityMonitor, which initialises the maps, thresholds
// and health check runner.
type IntegrityMonitor struct {
	mu                sync.RWMutex       // taken when changing enabled, alertThresholds or alertSubscribers
	logger            *logger.Logger     // destination for the monitor's log output
	metrics           *IntegrityMetrics  // counters and scores; protected by metrics.mu
	alertThresholds   map[string]float64 // named thresholds, e.g. "health_score_min"
	alertSubscribers  []AlertSubscriber  // handlers notified by sendAlert
	healthCheckRunner *HealthCheckRunner // periodic checker created in NewIntegrityMonitor
	enabled           bool               // when false the Record* methods return immediately
	alerts            []CorruptionAlert  // alert history, oldest first, capped by sendAlert
	alertsMutex       sync.RWMutex       // guards alerts
}

// AlertSubscriber is implemented by anything that wants to receive alerts from
// an IntegrityMonitor. Register implementations with AddAlertSubscriber.
type AlertSubscriber interface {
	// HandleAlert is called once for every alert the monitor raises. A
	// non-nil error is logged by the monitor; it does not stop delivery to
	// the remaining subscribers.
	HandleAlert(alert CorruptionAlert) error
}

// NewIntegrityMonitor builds an enabled IntegrityMonitor that logs through the
// given logger. The returned monitor starts with a perfect health score,
// default alert thresholds, an empty alert history and a health check runner
// that has been constructed but not started.
func NewIntegrityMonitor(logger *logger.Logger) *IntegrityMonitor {
	// Start from a healthy baseline with empty per-key counters.
	initialMetrics := &IntegrityMetrics{
		HealthScore:     1.0,
		HighScore:       1.0,
		RecoveryActions: make(map[recovery.RecoveryAction]int64),
		ErrorsByType:    make(map[recovery.ErrorType]int64),
	}

	im := &IntegrityMonitor{
		logger:          logger,
		metrics:         initialMetrics,
		enabled:         true,
		alertThresholds: make(map[string]float64),
		alerts:          make([]CorruptionAlert, 0, 256),
	}

	// Thresholds are populated before the runner is created, since the runner
	// is handed the monitor itself.
	im.setDefaultThresholds()
	im.healthCheckRunner = NewHealthCheckRunner(logger, im)

	return im
}

// setDefaultThresholds installs the built-in alert thresholds, overwriting any
// existing values stored under the same names.
func (im *IntegrityMonitor) setDefaultThresholds() {
	defaults := []struct {
		name  string
		value float64
	}{
		{"corruption_rate", 0.05},      // 5% corruption rate
		{"failure_rate", 0.10},         // 10% failure rate
		{"health_score_min", 0.80},     // 80% minimum health
		{"max_corruption_score", 70.0}, // maximum individual corruption score
		{"circuit_breaker_rate", 0.02}, // 2% circuit breaker rate
	}

	for _, d := range defaults {
		im.alertThresholds[d.name] = d.value
	}
}

// RecordAddressProcessed increments the processed-address counter and then
// recomputes the health score. It does nothing while the monitor is disabled.
func (im *IntegrityMonitor) RecordAddressProcessed() {
	// Go through IsEnabled so the read is synchronised with Enable/Disable,
	// which write the flag under im.mu.
	if !im.IsEnabled() {
		return
	}

	im.metrics.mu.Lock()
	im.metrics.TotalAddressesProcessed++
	im.metrics.mu.Unlock()

	im.updateHealthScore()
}

// RecordCorruptionDetected records one corruption event for address.
//
// It updates the corruption counters (count, last detection time, maximum and
// running average score), raises a CorruptionAlert whose severity is derived
// from corruptionScore, recomputes the health score and logs a warning.
// source is a caller-supplied label that is copied into the alert and the log
// entry. The call is a no-op while the monitor is disabled.
func (im *IntegrityMonitor) RecordCorruptionDetected(address common.Address, corruptionScore int, source string) {
	// Go through IsEnabled so the read is synchronised with Enable/Disable,
	// which write the flag under im.mu.
	if !im.IsEnabled() {
		return
	}

	// Sample the clock once so the metric, the alert and its context all
	// carry the same timestamp for this event.
	now := time.Now()
	addressHex := address.Hex()

	im.metrics.mu.Lock()
	im.metrics.CorruptAddressesDetected++
	im.metrics.LastCorruptionDetection = now

	// Update corruption statistics
	if corruptionScore > im.metrics.MaxCorruptionScore {
		im.metrics.MaxCorruptionScore = corruptionScore
	}

	// Incremental mean: fold the new score into the average of the previous
	// (total - 1) samples.
	total := float64(im.metrics.CorruptAddressesDetected)
	im.metrics.AverageCorruptionScore = ((im.metrics.AverageCorruptionScore * (total - 1)) + float64(corruptionScore)) / total
	im.metrics.mu.Unlock()

	// Generate alert based on corruption score
	severity := im.getCorruptionSeverity(corruptionScore)
	alert := CorruptionAlert{
		Timestamp:       now,
		Address:         address,
		CorruptionScore: corruptionScore,
		Source:          source,
		Severity:        severity,
		Message:         fmt.Sprintf("Corruption detected: address %s, score %d, source %s", addressHex, corruptionScore, source),
		Context: map[string]interface{}{
			"address":          addressHex,
			"corruption_score": corruptionScore,
			"source":           source,
			"timestamp":        now.Unix(),
		},
	}

	im.sendAlert(alert)
	im.updateHealthScore()

	im.logger.Warn("Corruption detected",
		"address", addressHex,
		"corruption_score", corruptionScore,
		"source", source,
		"severity", severity.String())
}

// RecordValidationResult counts one address validation as passed or failed
// and then recomputes the health score. It does nothing while the monitor is
// disabled.
func (im *IntegrityMonitor) RecordValidationResult(passed bool) {
	// Go through IsEnabled so the read is synchronised with Enable/Disable,
	// which write the flag under im.mu.
	if !im.IsEnabled() {
		return
	}

	im.metrics.mu.Lock()
	if passed {
		im.metrics.AddressValidationPassed++
	} else {
		im.metrics.AddressValidationFailed++
	}
	im.metrics.mu.Unlock()

	im.updateHealthScore()
}

// RecordContractCallResult counts one contract call as succeeded or failed
// and then recomputes the health score. It does nothing while the monitor is
// disabled.
func (im *IntegrityMonitor) RecordContractCallResult(succeeded bool) {
	// Go through IsEnabled so the read is synchronised with Enable/Disable,
	// which write the flag under im.mu.
	if !im.IsEnabled() {
		return
	}

	im.metrics.mu.Lock()
	if succeeded {
		im.metrics.ContractCallsSucceeded++
	} else {
		im.metrics.ContractCallsFailed++
	}
	im.metrics.mu.Unlock()

	im.updateHealthScore()
}

// RecordRecoveryAction counts one use of the given recovery action and then
// recomputes the health score. Besides the per-action map, retry, fallback and
// circuit-breaker actions also bump their dedicated counters. It does nothing
// while the monitor is disabled.
func (im *IntegrityMonitor) RecordRecoveryAction(action recovery.RecoveryAction) {
	// Go through IsEnabled so the read is synchronised with Enable/Disable,
	// which write the flag under im.mu.
	if !im.IsEnabled() {
		return
	}

	im.metrics.mu.Lock()
	im.metrics.RecoveryActions[action]++

	// Track specific metrics
	switch action {
	case recovery.ActionRetryWithBackoff:
		im.metrics.RetryOperationsTriggered++
	case recovery.ActionUseFallbackData:
		im.metrics.FallbackOperationsUsed++
	case recovery.ActionCircuitBreaker:
		im.metrics.CircuitBreakersTripped++
	}
	im.metrics.mu.Unlock()

	im.updateHealthScore()
}

// RecordErrorType counts one occurrence of the given error type. Unlike the
// other Record* methods it does not recompute the health score. It does
// nothing while the monitor is disabled.
func (im *IntegrityMonitor) RecordErrorType(errorType recovery.ErrorType) {
	// Go through IsEnabled so the read is synchronised with Enable/Disable,
	// which write the flag under im.mu.
	if !im.IsEnabled() {
		return
	}

	im.metrics.mu.Lock()
	im.metrics.ErrorsByType[errorType]++
	im.metrics.mu.Unlock()
}

// getCorruptionSeverity maps a corruption score onto an alert severity:
// 90 and above is an emergency, 70-89 critical, 40-69 a warning, and anything
// lower is informational.
func (im *IntegrityMonitor) getCorruptionSeverity(corruptionScore int) AlertSeverity {
	switch {
	case corruptionScore >= 90:
		return AlertSeverityEmergency
	case corruptionScore >= 70:
		return AlertSeverityCritical
	case corruptionScore >= 40:
		return AlertSeverityWarning
	default:
		return AlertSeverityInfo
	}
}

// updateHealthScore recomputes the overall health score from the current
// counters and stores it in the metrics.
//
// The score is a weighted sum clamped to [0, 1]: 40% for the share of
// processed addresses that were not corrupt, 30% for the validation success
// rate and 30% for the contract call success rate. Rates with no samples yet
// count as 1.0, and with no processed addresses the score is reset to 1.0.
// When the score falls below the "health_score_min" threshold a critical alert
// is sent.
//
// The metrics lock is released before the alert is dispatched, so subscriber
// callbacks may safely call back into the monitor (e.g. GetMetrics).
func (im *IntegrityMonitor) updateHealthScore() {
	// SetThreshold writes this map under im.mu, so read it under the same lock.
	// The two locks are taken one after the other, never nested.
	im.mu.RLock()
	minHealth := im.alertThresholds["health_score_min"]
	im.mu.RUnlock()

	im.metrics.mu.Lock()

	if im.metrics.TotalAddressesProcessed == 0 {
		im.metrics.HealthScore = 1.0
		im.metrics.mu.Unlock()
		return
	}

	// Calculate component scores
	corruptionRate := float64(im.metrics.CorruptAddressesDetected) / float64(im.metrics.TotalAddressesProcessed)

	validationSuccessRate := 1.0
	validationTotal := im.metrics.AddressValidationPassed + im.metrics.AddressValidationFailed
	if validationTotal > 0 {
		validationSuccessRate = float64(im.metrics.AddressValidationPassed) / float64(validationTotal)
	}

	contractCallSuccessRate := 1.0
	contractTotal := im.metrics.ContractCallsSucceeded + im.metrics.ContractCallsFailed
	if contractTotal > 0 {
		contractCallSuccessRate = float64(im.metrics.ContractCallsSucceeded) / float64(contractTotal)
	}

	// Weighted health score calculation
	healthScore := 0.0
	healthScore += (1.0 - corruptionRate) * 0.4  // 40% weight on corruption prevention
	healthScore += validationSuccessRate * 0.3   // 30% weight on validation success
	healthScore += contractCallSuccessRate * 0.3 // 30% weight on contract call success

	// Clamp to [0, 1]; the corruption term goes negative when more
	// corruptions than processed addresses have been recorded.
	if healthScore > 1.0 {
		healthScore = 1.0
	} else if healthScore < 0.0 {
		healthScore = 0.0
	}

	im.metrics.HealthScore = healthScore
	if healthScore > im.metrics.HighScore {
		im.metrics.HighScore = healthScore
	}

	// Everything needed for the alert is now in locals; drop the lock before
	// invoking subscriber code.
	im.metrics.mu.Unlock()

	// Check for health score threshold alerts
	if healthScore < minHealth {
		im.sendAlert(CorruptionAlert{
			Timestamp: time.Now(),
			Severity:  AlertSeverityCritical,
			Message:   fmt.Sprintf("System health score dropped to %.2f (threshold: %.2f)", healthScore, minHealth),
			Context: map[string]interface{}{
				"health_score":          healthScore,
				"threshold":             minHealth,
				"corruption_rate":       corruptionRate,
				"validation_success":    validationSuccessRate,
				"contract_call_success": contractCallSuccessRate,
			},
		})
	}
}

// sendAlert appends alert to the in-memory history, keeping only the most
// recent 1000 entries, and then delivers it to every registered subscriber in
// registration order. A subscriber error is logged and does not prevent
// delivery to the remaining subscribers.
func (im *IntegrityMonitor) sendAlert(alert CorruptionAlert) {
	const maxStoredAlerts = 1000

	im.alertsMutex.Lock()
	im.alerts = append(im.alerts, alert)
	if excess := len(im.alerts) - maxStoredAlerts; excess > 0 {
		// Drop the oldest entries by shifting in place, so a full history
		// does not reallocate on every alert.
		kept := copy(im.alerts, im.alerts[excess:])
		// Zero the vacated tail so stale Context maps can be collected.
		for i := kept; i < len(im.alerts); i++ {
			im.alerts[i] = CorruptionAlert{}
		}
		im.alerts = im.alerts[:kept]
	}
	im.alertsMutex.Unlock()

	// Snapshot the subscriber list under im.mu (AddAlertSubscriber appends
	// under the same lock) and call the handlers without holding any lock.
	im.mu.RLock()
	subscribers := make([]AlertSubscriber, len(im.alertSubscribers))
	copy(subscribers, im.alertSubscribers)
	im.mu.RUnlock()

	for _, subscriber := range subscribers {
		if err := subscriber.HandleAlert(alert); err != nil {
			im.logger.Error("Failed to send alert",
				"subscriber", fmt.Sprintf("%T", subscriber),
				"error", err)
		}
	}
}

// AddAlertSubscriber registers subscriber to receive every alert raised from
// now on. A nil subscriber is ignored, because calling HandleAlert on it
// during alert delivery would panic.
func (im *IntegrityMonitor) AddAlertSubscriber(subscriber AlertSubscriber) {
	if subscriber == nil {
		return
	}

	im.mu.Lock()
	defer im.mu.Unlock()
	im.alertSubscribers = append(im.alertSubscribers, subscriber)
}

// GetMetrics returns a point-in-time copy of the metrics taken under the
// metrics read lock. The maps in the result are newly allocated, so the caller
// may keep or modify the snapshot without affecting the monitor.
func (im *IntegrityMonitor) GetMetrics() MetricsSnapshot {
	src := im.metrics

	src.mu.RLock()
	defer src.mu.RUnlock()

	// Duplicate the maps so the snapshot shares no state with the live
	// metrics.
	actionCounts := make(map[recovery.RecoveryAction]int64)
	for action, count := range src.RecoveryActions {
		actionCounts[action] = count
	}

	errorCounts := make(map[recovery.ErrorType]int64)
	for errType, count := range src.ErrorsByType {
		errorCounts[errType] = count
	}

	return MetricsSnapshot{
		TotalAddressesProcessed:  src.TotalAddressesProcessed,
		CorruptAddressesDetected: src.CorruptAddressesDetected,
		AddressValidationPassed:  src.AddressValidationPassed,
		AddressValidationFailed:  src.AddressValidationFailed,
		ContractCallsSucceeded:   src.ContractCallsSucceeded,
		ContractCallsFailed:      src.ContractCallsFailed,
		RetryOperationsTriggered: src.RetryOperationsTriggered,
		FallbackOperationsUsed:   src.FallbackOperationsUsed,
		CircuitBreakersTripped:   src.CircuitBreakersTripped,
		LastCorruptionDetection:  src.LastCorruptionDetection,
		AverageCorruptionScore:   src.AverageCorruptionScore,
		MaxCorruptionScore:       src.MaxCorruptionScore,
		HealthScore:              src.HealthScore,
		HighScore:                src.HighScore,
		RecoveryActions:          actionCounts,
		ErrorsByType:             errorCounts,
	}
}

// GetHealthSummary returns a map describing the monitor's current state: the
// enabled flag, the health score, raw counters, derived corruption/validation/
// contract-call rates, the configured thresholds and the number of
// subscribers.
//
// Rates whose denominator is zero are reported as 0.0. The thresholds map in
// the result is a copy, so callers cannot modify the monitor's configuration
// through it.
func (im *IntegrityMonitor) GetHealthSummary() map[string]interface{} {
	metrics := im.GetMetrics()

	// Configuration fields are written under im.mu by Enable, Disable,
	// SetThreshold and AddAlertSubscriber; read them under the same lock.
	im.mu.RLock()
	enabled := im.enabled
	subscriberCount := len(im.alertSubscribers)
	thresholds := make(map[string]float64, len(im.alertThresholds))
	for name, value := range im.alertThresholds {
		thresholds[name] = value
	}
	im.mu.RUnlock()

	corruptionRate := 0.0
	if metrics.TotalAddressesProcessed > 0 {
		corruptionRate = float64(metrics.CorruptAddressesDetected) / float64(metrics.TotalAddressesProcessed)
	}

	validationSuccessRate := 0.0
	totalValidations := metrics.AddressValidationPassed + metrics.AddressValidationFailed
	if totalValidations > 0 {
		validationSuccessRate = float64(metrics.AddressValidationPassed) / float64(totalValidations)
	}

	contractCallSuccessRate := 0.0
	totalCalls := metrics.ContractCallsSucceeded + metrics.ContractCallsFailed
	if totalCalls > 0 {
		contractCallSuccessRate = float64(metrics.ContractCallsSucceeded) / float64(totalCalls)
	}

	return map[string]interface{}{
		"enabled":                    enabled,
		"health_score":               metrics.HealthScore,
		"total_addresses_processed":  metrics.TotalAddressesProcessed,
		"corruption_detections":      metrics.CorruptAddressesDetected,
		"corruption_rate":            corruptionRate,
		"validation_success_rate":    validationSuccessRate,
		"contract_call_success_rate": contractCallSuccessRate,
		"average_corruption_score":   metrics.AverageCorruptionScore,
		"max_corruption_score":       metrics.MaxCorruptionScore,
		"retry_operations":           metrics.RetryOperationsTriggered,
		"fallback_operations":        metrics.FallbackOperationsUsed,
		"circuit_breakers_tripped":   metrics.CircuitBreakersTripped,
		"last_corruption":            metrics.LastCorruptionDetection,
		"recovery_actions":           metrics.RecoveryActions,
		"errors_by_type":             metrics.ErrorsByType,
		"alert_thresholds":           thresholds,
		"alert_subscribers":          subscriberCount,
	}
}

// GetRecentAlerts returns a copy of the newest stored alerts, oldest first.
// At most limit alerts are returned; a limit that is zero, negative or larger
// than the history yields the whole history. The result is never nil.
func (im *IntegrityMonitor) GetRecentAlerts(limit int) []CorruptionAlert {
	im.alertsMutex.RLock()
	defer im.alertsMutex.RUnlock()

	stored := len(im.alerts)

	// Normalise the requested count to the number of alerts available.
	count := limit
	if count <= 0 || count > stored {
		count = stored
	}
	if count == 0 {
		return []CorruptionAlert{}
	}

	// Copy the tail so the caller never sees the internal backing array.
	recent := make([]CorruptionAlert, count)
	copy(recent, im.alerts[stored-count:])
	return recent
}

// SetThreshold stores value as the alert threshold called name, replacing any
// previous value for that name. The update is made under the monitor's write
// lock.
func (im *IntegrityMonitor) SetThreshold(name string, value float64) {
	im.mu.Lock()
	defer im.mu.Unlock()

	thresholds := im.alertThresholds
	thresholds[name] = value
}

// Enable switches the monitor on so that the Record* methods collect data
// again, and logs the change.
func (im *IntegrityMonitor) Enable() {
	const msg = "Integrity monitor enabled"

	im.mu.Lock()
	defer im.mu.Unlock()

	im.enabled = true
	im.logger.Info(msg)
}

// Disable switches the monitor off so that the Record* methods become no-ops,
// and logs the change.
func (im *IntegrityMonitor) Disable() {
	const msg = "Integrity monitor disabled"

	im.mu.Lock()
	defer im.mu.Unlock()

	im.enabled = false
	im.logger.Info(msg)
}

// IsEnabled reports whether the monitor is currently collecting data. The flag
// is read under the monitor's read lock.
func (im *IntegrityMonitor) IsEnabled() bool {
	im.mu.RLock()
	state := im.enabled
	im.mu.RUnlock()

	return state
}

// StartHealthCheckRunner starts the monitor's health check runner with ctx.
// It does nothing when the monitor has no runner.
func (im *IntegrityMonitor) StartHealthCheckRunner(ctx context.Context) {
	runner := im.healthCheckRunner
	if runner == nil {
		return
	}
	runner.Start(ctx)
}

// StopHealthCheckRunner stops the monitor's health check runner. It does
// nothing when the monitor has no runner.
func (im *IntegrityMonitor) StopHealthCheckRunner() {
	runner := im.healthCheckRunner
	if runner == nil {
		return
	}
	runner.Stop()
}

// GetHealthCheckRunner exposes the health check runner owned by this monitor;
// the result is nil when no runner has been set.
func (im *IntegrityMonitor) GetHealthCheckRunner() *HealthCheckRunner {
	runner := im.healthCheckRunner
	return runner
}